From a43d3ad17656680188599fd524d1f334f123d48b Mon Sep 17 00:00:00 2001 From: Jens Steube Date: Sun, 22 Jul 2018 12:20:20 +0200 Subject: [PATCH] Rename some hashcat specific OpenCL functions to avoid conflicts with existing OpenCL functions from OpenCL runtime --- OpenCL/inc_common.cl | 56416 ++++++++++++++++---------------- OpenCL/inc_hash_functions.cl | 32 +- OpenCL/inc_rp_optimized.cl | 616 +- OpenCL/inc_types.cl | 55 +- OpenCL/inc_vendor.cl | 4 + OpenCL/m00500-optimized.cl | 52 +- OpenCL/m01600-optimized.cl | 52 +- OpenCL/m03200-pure.cl | 8 +- OpenCL/m05800-optimized.cl | 24 +- OpenCL/m05800-pure.cl | 24 +- OpenCL/m06300-optimized.cl | 52 +- OpenCL/m07400-optimized.cl | 80 +- OpenCL/m09000-pure.cl | 8 +- OpenCL/m10700-optimized.cl | 36 +- OpenCL/m11600-pure.cl | 8 +- OpenCL/m13800_a0-optimized.cl | 68 +- OpenCL/m13800_a1-optimized.cl | 68 +- OpenCL/m13800_a3-optimized.cl | 68 +- 18 files changed, 28840 insertions(+), 28831 deletions(-) diff --git a/OpenCL/inc_common.cl b/OpenCL/inc_common.cl index 3b099aad2..3b3e9c40c 100644 --- a/OpenCL/inc_common.cl +++ b/OpenCL/inc_common.cl @@ -188,25 +188,25 @@ DECLSPEC void make_utf16be (const u32x *in, u32x *out1, u32x *out2) { #if defined IS_NV - out2[3] = __byte_perm (in[3], 0, 0x3727); - out2[2] = __byte_perm (in[3], 0, 0x1707); - out2[1] = __byte_perm (in[2], 0, 0x3727); - out2[0] = __byte_perm (in[2], 0, 0x1707); - out1[3] = __byte_perm (in[1], 0, 0x3727); - out1[2] = __byte_perm (in[1], 0, 0x1707); - out1[1] = __byte_perm (in[0], 0, 0x3727); - out1[0] = __byte_perm (in[0], 0, 0x1707); + out2[3] = hc_byte_perm (in[3], 0, 0x3727); + out2[2] = hc_byte_perm (in[3], 0, 0x1707); + out2[1] = hc_byte_perm (in[2], 0, 0x3727); + out2[0] = hc_byte_perm (in[2], 0, 0x1707); + out1[3] = hc_byte_perm (in[1], 0, 0x3727); + out1[2] = hc_byte_perm (in[1], 0, 0x1707); + out1[1] = hc_byte_perm (in[0], 0, 0x3727); + out1[0] = hc_byte_perm (in[0], 0, 0x1707); #elif defined IS_AMD && AMD_GCN >= 3 - out2[3] = __byte_perm (in[3], 0, 0x03070207); - out2[2] = __byte_perm (in[3], 0, 0x01070007); - out2[1] = __byte_perm (in[2], 0, 0x03070207); - out2[0] = __byte_perm (in[2], 0, 0x01070007); - out1[3] = __byte_perm (in[1], 0, 0x03070207); - out1[2] = __byte_perm (in[1], 0, 0x01070007); - out1[1] = __byte_perm (in[0], 0, 0x03070207); - out1[0] = __byte_perm (in[0], 0, 0x01070007); + out2[3] = hc_byte_perm (in[3], 0, 0x03070207); + out2[2] = hc_byte_perm (in[3], 0, 0x01070007); + out2[1] = hc_byte_perm (in[2], 0, 0x03070207); + out2[0] = hc_byte_perm (in[2], 0, 0x01070007); + out1[3] = hc_byte_perm (in[1], 0, 0x03070207); + out1[2] = hc_byte_perm (in[1], 0, 0x01070007); + out1[1] = hc_byte_perm (in[0], 0, 0x03070207); + out1[0] = hc_byte_perm (in[0], 0, 0x01070007); #else @@ -226,25 +226,25 @@ DECLSPEC void make_utf16beN (const u32x *in, u32x *out1, u32x *out2) { #if defined IS_NV - out2[3] = __byte_perm (in[3], 0, 0x1707); - out2[2] = __byte_perm (in[3], 0, 0x3727); - out2[1] = __byte_perm (in[2], 0, 0x1707); - out2[0] = __byte_perm (in[2], 0, 0x3727); - out1[3] = __byte_perm (in[1], 0, 0x1707); - out1[2] = __byte_perm (in[1], 0, 0x3727); - out1[1] = __byte_perm (in[0], 0, 0x1707); - out1[0] = __byte_perm (in[0], 0, 0x3727); + out2[3] = hc_byte_perm (in[3], 0, 0x1707); + out2[2] = hc_byte_perm (in[3], 0, 0x3727); + out2[1] = hc_byte_perm (in[2], 0, 0x1707); + out2[0] = hc_byte_perm (in[2], 0, 0x3727); + out1[3] = hc_byte_perm (in[1], 0, 0x1707); + out1[2] = hc_byte_perm (in[1], 0, 0x3727); + out1[1] = hc_byte_perm (in[0], 0, 0x1707); + out1[0] = hc_byte_perm (in[0], 0, 0x3727); #elif defined IS_AMD && AMD_GCN >= 3 - out2[3] = __byte_perm (in[3], 0, 0x01070007); - out2[2] = __byte_perm (in[3], 0, 0x03070207); - out2[1] = __byte_perm (in[2], 0, 0x01070007); - out2[0] = __byte_perm (in[2], 0, 0x03070207); - out1[3] = __byte_perm (in[1], 0, 0x01070007); - out1[2] = __byte_perm (in[1], 0, 0x03070207); - out1[1] = __byte_perm (in[0], 0, 0x01070007); - out1[0] = __byte_perm (in[0], 0, 0x03070207); + out2[3] = hc_byte_perm (in[3], 0, 0x01070007); + out2[2] = hc_byte_perm (in[3], 0, 0x03070207); + out2[1] = hc_byte_perm (in[2], 0, 0x01070007); + out2[0] = hc_byte_perm (in[2], 0, 0x03070207); + out1[3] = hc_byte_perm (in[1], 0, 0x01070007); + out1[2] = hc_byte_perm (in[1], 0, 0x03070207); + out1[1] = hc_byte_perm (in[0], 0, 0x01070007); + out1[0] = hc_byte_perm (in[0], 0, 0x03070207); #else @@ -264,25 +264,25 @@ DECLSPEC void make_utf16le (const u32x *in, u32x *out1, u32x *out2) { #if defined IS_NV - out2[3] = __byte_perm (in[3], 0, 0x7372); - out2[2] = __byte_perm (in[3], 0, 0x7170); - out2[1] = __byte_perm (in[2], 0, 0x7372); - out2[0] = __byte_perm (in[2], 0, 0x7170); - out1[3] = __byte_perm (in[1], 0, 0x7372); - out1[2] = __byte_perm (in[1], 0, 0x7170); - out1[1] = __byte_perm (in[0], 0, 0x7372); - out1[0] = __byte_perm (in[0], 0, 0x7170); + out2[3] = hc_byte_perm (in[3], 0, 0x7372); + out2[2] = hc_byte_perm (in[3], 0, 0x7170); + out2[1] = hc_byte_perm (in[2], 0, 0x7372); + out2[0] = hc_byte_perm (in[2], 0, 0x7170); + out1[3] = hc_byte_perm (in[1], 0, 0x7372); + out1[2] = hc_byte_perm (in[1], 0, 0x7170); + out1[1] = hc_byte_perm (in[0], 0, 0x7372); + out1[0] = hc_byte_perm (in[0], 0, 0x7170); #elif defined IS_AMD && AMD_GCN >= 3 - out2[3] = __byte_perm (in[3], 0, 0x07030702); - out2[2] = __byte_perm (in[3], 0, 0x07010700); - out2[1] = __byte_perm (in[2], 0, 0x07030702); - out2[0] = __byte_perm (in[2], 0, 0x07010700); - out1[3] = __byte_perm (in[1], 0, 0x07030702); - out1[2] = __byte_perm (in[1], 0, 0x07010700); - out1[1] = __byte_perm (in[0], 0, 0x07030702); - out1[0] = __byte_perm (in[0], 0, 0x07010700); + out2[3] = hc_byte_perm (in[3], 0, 0x07030702); + out2[2] = hc_byte_perm (in[3], 0, 0x07010700); + out2[1] = hc_byte_perm (in[2], 0, 0x07030702); + out2[0] = hc_byte_perm (in[2], 0, 0x07010700); + out1[3] = hc_byte_perm (in[1], 0, 0x07030702); + out1[2] = hc_byte_perm (in[1], 0, 0x07010700); + out1[1] = hc_byte_perm (in[0], 0, 0x07030702); + out1[0] = hc_byte_perm (in[0], 0, 0x07010700); #else @@ -302,25 +302,25 @@ DECLSPEC void make_utf16leN (const u32x *in, u32x *out1, u32x *out2) { #if defined IS_NV - out2[3] = __byte_perm (in[3], 0, 0x7170); - out2[2] = __byte_perm (in[3], 0, 0x7372); - out2[1] = __byte_perm (in[2], 0, 0x7170); - out2[0] = __byte_perm (in[2], 0, 0x7372); - out1[3] = __byte_perm (in[1], 0, 0x7170); - out1[2] = __byte_perm (in[1], 0, 0x7372); - out1[1] = __byte_perm (in[0], 0, 0x7170); - out1[0] = __byte_perm (in[0], 0, 0x7372); + out2[3] = hc_byte_perm (in[3], 0, 0x7170); + out2[2] = hc_byte_perm (in[3], 0, 0x7372); + out2[1] = hc_byte_perm (in[2], 0, 0x7170); + out2[0] = hc_byte_perm (in[2], 0, 0x7372); + out1[3] = hc_byte_perm (in[1], 0, 0x7170); + out1[2] = hc_byte_perm (in[1], 0, 0x7372); + out1[1] = hc_byte_perm (in[0], 0, 0x7170); + out1[0] = hc_byte_perm (in[0], 0, 0x7372); #elif defined IS_AMD && AMD_GCN >= 3 - out2[3] = __byte_perm (in[3], 0, 0x07010700); - out2[2] = __byte_perm (in[3], 0, 0x07030702); - out2[1] = __byte_perm (in[2], 0, 0x07010700); - out2[0] = __byte_perm (in[2], 0, 0x07030702); - out1[3] = __byte_perm (in[1], 0, 0x07010700); - out1[2] = __byte_perm (in[1], 0, 0x07030702); - out1[1] = __byte_perm (in[0], 0, 0x07010700); - out1[0] = __byte_perm (in[0], 0, 0x07030702); + out2[3] = hc_byte_perm (in[3], 0, 0x07010700); + out2[2] = hc_byte_perm (in[3], 0, 0x07030702); + out2[1] = hc_byte_perm (in[2], 0, 0x07010700); + out2[0] = hc_byte_perm (in[2], 0, 0x07030702); + out1[3] = hc_byte_perm (in[1], 0, 0x07010700); + out1[2] = hc_byte_perm (in[1], 0, 0x07030702); + out1[1] = hc_byte_perm (in[0], 0, 0x07010700); + out1[0] = hc_byte_perm (in[0], 0, 0x07030702); #else @@ -340,17 +340,17 @@ DECLSPEC void undo_utf16be (const u32x *in1, const u32x *in2, u32x *out) { #if defined IS_NV - out[0] = __byte_perm (in1[0], in1[1], 0x4602); - out[1] = __byte_perm (in1[2], in1[3], 0x4602); - out[2] = __byte_perm (in2[0], in2[1], 0x4602); - out[3] = __byte_perm (in2[2], in2[3], 0x4602); + out[0] = hc_byte_perm (in1[0], in1[1], 0x4602); + out[1] = hc_byte_perm (in1[2], in1[3], 0x4602); + out[2] = hc_byte_perm (in2[0], in2[1], 0x4602); + out[3] = hc_byte_perm (in2[2], in2[3], 0x4602); #elif defined IS_AMD && AMD_GCN >= 3 - out[0] = __byte_perm (in1[0], in1[1], 0x04060002); - out[1] = __byte_perm (in1[2], in1[3], 0x04060002); - out[2] = __byte_perm (in2[0], in2[1], 0x04060002); - out[3] = __byte_perm (in2[2], in2[3], 0x04060002); + out[0] = hc_byte_perm (in1[0], in1[1], 0x04060002); + out[1] = hc_byte_perm (in1[2], in1[3], 0x04060002); + out[2] = hc_byte_perm (in2[0], in2[1], 0x04060002); + out[3] = hc_byte_perm (in2[2], in2[3], 0x04060002); #else @@ -370,17 +370,17 @@ DECLSPEC void undo_utf16le (const u32x *in1, const u32x *in2, u32x *out) { #if defined IS_NV - out[0] = __byte_perm (in1[0], in1[1], 0x6420); - out[1] = __byte_perm (in1[2], in1[3], 0x6420); - out[2] = __byte_perm (in2[0], in2[1], 0x6420); - out[3] = __byte_perm (in2[2], in2[3], 0x6420); + out[0] = hc_byte_perm (in1[0], in1[1], 0x6420); + out[1] = hc_byte_perm (in1[2], in1[3], 0x6420); + out[2] = hc_byte_perm (in2[0], in2[1], 0x6420); + out[3] = hc_byte_perm (in2[2], in2[3], 0x6420); #elif defined IS_AMD && AMD_GCN >= 3 - out[0] = __byte_perm (in1[0], in1[1], 0x06040200); - out[1] = __byte_perm (in1[2], in1[3], 0x06040200); - out[2] = __byte_perm (in2[0], in2[1], 0x06040200); - out[3] = __byte_perm (in2[2], in2[3], 0x06040200); + out[0] = hc_byte_perm (in1[0], in1[1], 0x06040200); + out[1] = hc_byte_perm (in1[2], in1[3], 0x06040200); + out[2] = hc_byte_perm (in2[0], in2[1], 0x06040200); + out[3] = hc_byte_perm (in2[2], in2[3], 0x06040200); #else @@ -537,79 +537,79 @@ DECLSPEC void switch_buffer_by_offset_le (u32x *w0, u32x *w1, u32x *w2, u32x *w3 switch (offset_switch) { case 0: - w3[3] = amd_bytealign (w3[2], w3[3], offset); - w3[2] = amd_bytealign (w3[1], w3[2], offset); - w3[1] = amd_bytealign (w3[0], w3[1], offset); - w3[0] = amd_bytealign (w2[3], w3[0], offset); - w2[3] = amd_bytealign (w2[2], w2[3], offset); - w2[2] = amd_bytealign (w2[1], w2[2], offset); - w2[1] = amd_bytealign (w2[0], w2[1], offset); - w2[0] = amd_bytealign (w1[3], w2[0], offset); - w1[3] = amd_bytealign (w1[2], w1[3], offset); - w1[2] = amd_bytealign (w1[1], w1[2], offset); - w1[1] = amd_bytealign (w1[0], w1[1], offset); - w1[0] = amd_bytealign (w0[3], w1[0], offset); - w0[3] = amd_bytealign (w0[2], w0[3], offset); - w0[2] = amd_bytealign (w0[1], w0[2], offset); - w0[1] = amd_bytealign (w0[0], w0[1], offset); - w0[0] = amd_bytealign ( 0, w0[0], offset); + w3[3] = hc_bytealign (w3[2], w3[3], offset); + w3[2] = hc_bytealign (w3[1], w3[2], offset); + w3[1] = hc_bytealign (w3[0], w3[1], offset); + w3[0] = hc_bytealign (w2[3], w3[0], offset); + w2[3] = hc_bytealign (w2[2], w2[3], offset); + w2[2] = hc_bytealign (w2[1], w2[2], offset); + w2[1] = hc_bytealign (w2[0], w2[1], offset); + w2[0] = hc_bytealign (w1[3], w2[0], offset); + w1[3] = hc_bytealign (w1[2], w1[3], offset); + w1[2] = hc_bytealign (w1[1], w1[2], offset); + w1[1] = hc_bytealign (w1[0], w1[1], offset); + w1[0] = hc_bytealign (w0[3], w1[0], offset); + w0[3] = hc_bytealign (w0[2], w0[3], offset); + w0[2] = hc_bytealign (w0[1], w0[2], offset); + w0[1] = hc_bytealign (w0[0], w0[1], offset); + w0[0] = hc_bytealign ( 0, w0[0], offset); break; case 1: - w3[3] = amd_bytealign (w3[1], w3[2], offset); - w3[2] = amd_bytealign (w3[0], w3[1], offset); - w3[1] = amd_bytealign (w2[3], w3[0], offset); - w3[0] = amd_bytealign (w2[2], w2[3], offset); - w2[3] = amd_bytealign (w2[1], w2[2], offset); - w2[2] = amd_bytealign (w2[0], w2[1], offset); - w2[1] = amd_bytealign (w1[3], w2[0], offset); - w2[0] = amd_bytealign (w1[2], w1[3], offset); - w1[3] = amd_bytealign (w1[1], w1[2], offset); - w1[2] = amd_bytealign (w1[0], w1[1], offset); - w1[1] = amd_bytealign (w0[3], w1[0], offset); - w1[0] = amd_bytealign (w0[2], w0[3], offset); - w0[3] = amd_bytealign (w0[1], w0[2], offset); - w0[2] = amd_bytealign (w0[0], w0[1], offset); - w0[1] = amd_bytealign ( 0, w0[0], offset); + w3[3] = hc_bytealign (w3[1], w3[2], offset); + w3[2] = hc_bytealign (w3[0], w3[1], offset); + w3[1] = hc_bytealign (w2[3], w3[0], offset); + w3[0] = hc_bytealign (w2[2], w2[3], offset); + w2[3] = hc_bytealign (w2[1], w2[2], offset); + w2[2] = hc_bytealign (w2[0], w2[1], offset); + w2[1] = hc_bytealign (w1[3], w2[0], offset); + w2[0] = hc_bytealign (w1[2], w1[3], offset); + w1[3] = hc_bytealign (w1[1], w1[2], offset); + w1[2] = hc_bytealign (w1[0], w1[1], offset); + w1[1] = hc_bytealign (w0[3], w1[0], offset); + w1[0] = hc_bytealign (w0[2], w0[3], offset); + w0[3] = hc_bytealign (w0[1], w0[2], offset); + w0[2] = hc_bytealign (w0[0], w0[1], offset); + w0[1] = hc_bytealign ( 0, w0[0], offset); w0[0] = 0; break; case 2: - w3[3] = amd_bytealign (w3[0], w3[1], offset); - w3[2] = amd_bytealign (w2[3], w3[0], offset); - w3[1] = amd_bytealign (w2[2], w2[3], offset); - w3[0] = amd_bytealign (w2[1], w2[2], offset); - w2[3] = amd_bytealign (w2[0], w2[1], offset); - w2[2] = amd_bytealign (w1[3], w2[0], offset); - w2[1] = amd_bytealign (w1[2], w1[3], offset); - w2[0] = amd_bytealign (w1[1], w1[2], offset); - w1[3] = amd_bytealign (w1[0], w1[1], offset); - w1[2] = amd_bytealign (w0[3], w1[0], offset); - w1[1] = amd_bytealign (w0[2], w0[3], offset); - w1[0] = amd_bytealign (w0[1], w0[2], offset); - w0[3] = amd_bytealign (w0[0], w0[1], offset); - w0[2] = amd_bytealign ( 0, w0[0], offset); + w3[3] = hc_bytealign (w3[0], w3[1], offset); + w3[2] = hc_bytealign (w2[3], w3[0], offset); + w3[1] = hc_bytealign (w2[2], w2[3], offset); + w3[0] = hc_bytealign (w2[1], w2[2], offset); + w2[3] = hc_bytealign (w2[0], w2[1], offset); + w2[2] = hc_bytealign (w1[3], w2[0], offset); + w2[1] = hc_bytealign (w1[2], w1[3], offset); + w2[0] = hc_bytealign (w1[1], w1[2], offset); + w1[3] = hc_bytealign (w1[0], w1[1], offset); + w1[2] = hc_bytealign (w0[3], w1[0], offset); + w1[1] = hc_bytealign (w0[2], w0[3], offset); + w1[0] = hc_bytealign (w0[1], w0[2], offset); + w0[3] = hc_bytealign (w0[0], w0[1], offset); + w0[2] = hc_bytealign ( 0, w0[0], offset); w0[1] = 0; w0[0] = 0; break; case 3: - w3[3] = amd_bytealign (w2[3], w3[0], offset); - w3[2] = amd_bytealign (w2[2], w2[3], offset); - w3[1] = amd_bytealign (w2[1], w2[2], offset); - w3[0] = amd_bytealign (w2[0], w2[1], offset); - w2[3] = amd_bytealign (w1[3], w2[0], offset); - w2[2] = amd_bytealign (w1[2], w1[3], offset); - w2[1] = amd_bytealign (w1[1], w1[2], offset); - w2[0] = amd_bytealign (w1[0], w1[1], offset); - w1[3] = amd_bytealign (w0[3], w1[0], offset); - w1[2] = amd_bytealign (w0[2], w0[3], offset); - w1[1] = amd_bytealign (w0[1], w0[2], offset); - w1[0] = amd_bytealign (w0[0], w0[1], offset); - w0[3] = amd_bytealign ( 0, w0[0], offset); + w3[3] = hc_bytealign (w2[3], w3[0], offset); + w3[2] = hc_bytealign (w2[2], w2[3], offset); + w3[1] = hc_bytealign (w2[1], w2[2], offset); + w3[0] = hc_bytealign (w2[0], w2[1], offset); + w2[3] = hc_bytealign (w1[3], w2[0], offset); + w2[2] = hc_bytealign (w1[2], w1[3], offset); + w2[1] = hc_bytealign (w1[1], w1[2], offset); + w2[0] = hc_bytealign (w1[0], w1[1], offset); + w1[3] = hc_bytealign (w0[3], w1[0], offset); + w1[2] = hc_bytealign (w0[2], w0[3], offset); + w1[1] = hc_bytealign (w0[1], w0[2], offset); + w1[0] = hc_bytealign (w0[0], w0[1], offset); + w0[3] = hc_bytealign ( 0, w0[0], offset); w0[2] = 0; w0[1] = 0; w0[0] = 0; @@ -617,18 +617,18 @@ DECLSPEC void switch_buffer_by_offset_le (u32x *w0, u32x *w1, u32x *w2, u32x *w3 break; case 4: - w3[3] = amd_bytealign (w2[2], w2[3], offset); - w3[2] = amd_bytealign (w2[1], w2[2], offset); - w3[1] = amd_bytealign (w2[0], w2[1], offset); - w3[0] = amd_bytealign (w1[3], w2[0], offset); - w2[3] = amd_bytealign (w1[2], w1[3], offset); - w2[2] = amd_bytealign (w1[1], w1[2], offset); - w2[1] = amd_bytealign (w1[0], w1[1], offset); - w2[0] = amd_bytealign (w0[3], w1[0], offset); - w1[3] = amd_bytealign (w0[2], w0[3], offset); - w1[2] = amd_bytealign (w0[1], w0[2], offset); - w1[1] = amd_bytealign (w0[0], w0[1], offset); - w1[0] = amd_bytealign ( 0, w0[0], offset); + w3[3] = hc_bytealign (w2[2], w2[3], offset); + w3[2] = hc_bytealign (w2[1], w2[2], offset); + w3[1] = hc_bytealign (w2[0], w2[1], offset); + w3[0] = hc_bytealign (w1[3], w2[0], offset); + w2[3] = hc_bytealign (w1[2], w1[3], offset); + w2[2] = hc_bytealign (w1[1], w1[2], offset); + w2[1] = hc_bytealign (w1[0], w1[1], offset); + w2[0] = hc_bytealign (w0[3], w1[0], offset); + w1[3] = hc_bytealign (w0[2], w0[3], offset); + w1[2] = hc_bytealign (w0[1], w0[2], offset); + w1[1] = hc_bytealign (w0[0], w0[1], offset); + w1[0] = hc_bytealign ( 0, w0[0], offset); w0[3] = 0; w0[2] = 0; w0[1] = 0; @@ -637,17 +637,17 @@ DECLSPEC void switch_buffer_by_offset_le (u32x *w0, u32x *w1, u32x *w2, u32x *w3 break; case 5: - w3[3] = amd_bytealign (w2[1], w2[2], offset); - w3[2] = amd_bytealign (w2[0], w2[1], offset); - w3[1] = amd_bytealign (w1[3], w2[0], offset); - w3[0] = amd_bytealign (w1[2], w1[3], offset); - w2[3] = amd_bytealign (w1[1], w1[2], offset); - w2[2] = amd_bytealign (w1[0], w1[1], offset); - w2[1] = amd_bytealign (w0[3], w1[0], offset); - w2[0] = amd_bytealign (w0[2], w0[3], offset); - w1[3] = amd_bytealign (w0[1], w0[2], offset); - w1[2] = amd_bytealign (w0[0], w0[1], offset); - w1[1] = amd_bytealign ( 0, w0[0], offset); + w3[3] = hc_bytealign (w2[1], w2[2], offset); + w3[2] = hc_bytealign (w2[0], w2[1], offset); + w3[1] = hc_bytealign (w1[3], w2[0], offset); + w3[0] = hc_bytealign (w1[2], w1[3], offset); + w2[3] = hc_bytealign (w1[1], w1[2], offset); + w2[2] = hc_bytealign (w1[0], w1[1], offset); + w2[1] = hc_bytealign (w0[3], w1[0], offset); + w2[0] = hc_bytealign (w0[2], w0[3], offset); + w1[3] = hc_bytealign (w0[1], w0[2], offset); + w1[2] = hc_bytealign (w0[0], w0[1], offset); + w1[1] = hc_bytealign ( 0, w0[0], offset); w1[0] = 0; w0[3] = 0; w0[2] = 0; @@ -657,16 +657,16 @@ DECLSPEC void switch_buffer_by_offset_le (u32x *w0, u32x *w1, u32x *w2, u32x *w3 break; case 6: - w3[3] = amd_bytealign (w2[0], w2[1], offset); - w3[2] = amd_bytealign (w1[3], w2[0], offset); - w3[1] = amd_bytealign (w1[2], w1[3], offset); - w3[0] = amd_bytealign (w1[1], w1[2], offset); - w2[3] = amd_bytealign (w1[0], w1[1], offset); - w2[2] = amd_bytealign (w0[3], w1[0], offset); - w2[1] = amd_bytealign (w0[2], w0[3], offset); - w2[0] = amd_bytealign (w0[1], w0[2], offset); - w1[3] = amd_bytealign (w0[0], w0[1], offset); - w1[2] = amd_bytealign ( 0, w0[0], offset); + w3[3] = hc_bytealign (w2[0], w2[1], offset); + w3[2] = hc_bytealign (w1[3], w2[0], offset); + w3[1] = hc_bytealign (w1[2], w1[3], offset); + w3[0] = hc_bytealign (w1[1], w1[2], offset); + w2[3] = hc_bytealign (w1[0], w1[1], offset); + w2[2] = hc_bytealign (w0[3], w1[0], offset); + w2[1] = hc_bytealign (w0[2], w0[3], offset); + w2[0] = hc_bytealign (w0[1], w0[2], offset); + w1[3] = hc_bytealign (w0[0], w0[1], offset); + w1[2] = hc_bytealign ( 0, w0[0], offset); w1[1] = 0; w1[0] = 0; w0[3] = 0; @@ -677,15 +677,15 @@ DECLSPEC void switch_buffer_by_offset_le (u32x *w0, u32x *w1, u32x *w2, u32x *w3 break; case 7: - w3[3] = amd_bytealign (w1[3], w2[0], offset); - w3[2] = amd_bytealign (w1[2], w1[3], offset); - w3[1] = amd_bytealign (w1[1], w1[2], offset); - w3[0] = amd_bytealign (w1[0], w1[1], offset); - w2[3] = amd_bytealign (w0[3], w1[0], offset); - w2[2] = amd_bytealign (w0[2], w0[3], offset); - w2[1] = amd_bytealign (w0[1], w0[2], offset); - w2[0] = amd_bytealign (w0[0], w0[1], offset); - w1[3] = amd_bytealign ( 0, w0[0], offset); + w3[3] = hc_bytealign (w1[3], w2[0], offset); + w3[2] = hc_bytealign (w1[2], w1[3], offset); + w3[1] = hc_bytealign (w1[1], w1[2], offset); + w3[0] = hc_bytealign (w1[0], w1[1], offset); + w2[3] = hc_bytealign (w0[3], w1[0], offset); + w2[2] = hc_bytealign (w0[2], w0[3], offset); + w2[1] = hc_bytealign (w0[1], w0[2], offset); + w2[0] = hc_bytealign (w0[0], w0[1], offset); + w1[3] = hc_bytealign ( 0, w0[0], offset); w1[2] = 0; w1[1] = 0; w1[0] = 0; @@ -697,14 +697,14 @@ DECLSPEC void switch_buffer_by_offset_le (u32x *w0, u32x *w1, u32x *w2, u32x *w3 break; case 8: - w3[3] = amd_bytealign (w1[2], w1[3], offset); - w3[2] = amd_bytealign (w1[1], w1[2], offset); - w3[1] = amd_bytealign (w1[0], w1[1], offset); - w3[0] = amd_bytealign (w0[3], w1[0], offset); - w2[3] = amd_bytealign (w0[2], w0[3], offset); - w2[2] = amd_bytealign (w0[1], w0[2], offset); - w2[1] = amd_bytealign (w0[0], w0[1], offset); - w2[0] = amd_bytealign ( 0, w0[0], offset); + w3[3] = hc_bytealign (w1[2], w1[3], offset); + w3[2] = hc_bytealign (w1[1], w1[2], offset); + w3[1] = hc_bytealign (w1[0], w1[1], offset); + w3[0] = hc_bytealign (w0[3], w1[0], offset); + w2[3] = hc_bytealign (w0[2], w0[3], offset); + w2[2] = hc_bytealign (w0[1], w0[2], offset); + w2[1] = hc_bytealign (w0[0], w0[1], offset); + w2[0] = hc_bytealign ( 0, w0[0], offset); w1[3] = 0; w1[2] = 0; w1[1] = 0; @@ -717,13 +717,13 @@ DECLSPEC void switch_buffer_by_offset_le (u32x *w0, u32x *w1, u32x *w2, u32x *w3 break; case 9: - w3[3] = amd_bytealign (w1[1], w1[2], offset); - w3[2] = amd_bytealign (w1[0], w1[1], offset); - w3[1] = amd_bytealign (w0[3], w1[0], offset); - w3[0] = amd_bytealign (w0[2], w0[3], offset); - w2[3] = amd_bytealign (w0[1], w0[2], offset); - w2[2] = amd_bytealign (w0[0], w0[1], offset); - w2[1] = amd_bytealign ( 0, w0[0], offset); + w3[3] = hc_bytealign (w1[1], w1[2], offset); + w3[2] = hc_bytealign (w1[0], w1[1], offset); + w3[1] = hc_bytealign (w0[3], w1[0], offset); + w3[0] = hc_bytealign (w0[2], w0[3], offset); + w2[3] = hc_bytealign (w0[1], w0[2], offset); + w2[2] = hc_bytealign (w0[0], w0[1], offset); + w2[1] = hc_bytealign ( 0, w0[0], offset); w2[0] = 0; w1[3] = 0; w1[2] = 0; @@ -737,12 +737,12 @@ DECLSPEC void switch_buffer_by_offset_le (u32x *w0, u32x *w1, u32x *w2, u32x *w3 break; case 10: - w3[3] = amd_bytealign (w1[0], w1[1], offset); - w3[2] = amd_bytealign (w0[3], w1[0], offset); - w3[1] = amd_bytealign (w0[2], w0[3], offset); - w3[0] = amd_bytealign (w0[1], w0[2], offset); - w2[3] = amd_bytealign (w0[0], w0[1], offset); - w2[2] = amd_bytealign ( 0, w0[0], offset); + w3[3] = hc_bytealign (w1[0], w1[1], offset); + w3[2] = hc_bytealign (w0[3], w1[0], offset); + w3[1] = hc_bytealign (w0[2], w0[3], offset); + w3[0] = hc_bytealign (w0[1], w0[2], offset); + w2[3] = hc_bytealign (w0[0], w0[1], offset); + w2[2] = hc_bytealign ( 0, w0[0], offset); w2[1] = 0; w2[0] = 0; w1[3] = 0; @@ -757,11 +757,11 @@ DECLSPEC void switch_buffer_by_offset_le (u32x *w0, u32x *w1, u32x *w2, u32x *w3 break; case 11: - w3[3] = amd_bytealign (w0[3], w1[0], offset); - w3[2] = amd_bytealign (w0[2], w0[3], offset); - w3[1] = amd_bytealign (w0[1], w0[2], offset); - w3[0] = amd_bytealign (w0[0], w0[1], offset); - w2[3] = amd_bytealign ( 0, w0[0], offset); + w3[3] = hc_bytealign (w0[3], w1[0], offset); + w3[2] = hc_bytealign (w0[2], w0[3], offset); + w3[1] = hc_bytealign (w0[1], w0[2], offset); + w3[0] = hc_bytealign (w0[0], w0[1], offset); + w2[3] = hc_bytealign ( 0, w0[0], offset); w2[2] = 0; w2[1] = 0; w2[0] = 0; @@ -777,10 +777,10 @@ DECLSPEC void switch_buffer_by_offset_le (u32x *w0, u32x *w1, u32x *w2, u32x *w3 break; case 12: - w3[3] = amd_bytealign (w0[2], w0[3], offset); - w3[2] = amd_bytealign (w0[1], w0[2], offset); - w3[1] = amd_bytealign (w0[0], w0[1], offset); - w3[0] = amd_bytealign ( 0, w0[0], offset); + w3[3] = hc_bytealign (w0[2], w0[3], offset); + w3[2] = hc_bytealign (w0[1], w0[2], offset); + w3[1] = hc_bytealign (w0[0], w0[1], offset); + w3[0] = hc_bytealign ( 0, w0[0], offset); w2[3] = 0; w2[2] = 0; w2[1] = 0; @@ -797,9 +797,9 @@ DECLSPEC void switch_buffer_by_offset_le (u32x *w0, u32x *w1, u32x *w2, u32x *w3 break; case 13: - w3[3] = amd_bytealign (w0[1], w0[2], offset); - w3[2] = amd_bytealign (w0[0], w0[1], offset); - w3[1] = amd_bytealign ( 0, w0[0], offset); + w3[3] = hc_bytealign (w0[1], w0[2], offset); + w3[2] = hc_bytealign (w0[0], w0[1], offset); + w3[1] = hc_bytealign ( 0, w0[0], offset); w3[0] = 0; w2[3] = 0; w2[2] = 0; @@ -817,8 +817,8 @@ DECLSPEC void switch_buffer_by_offset_le (u32x *w0, u32x *w1, u32x *w2, u32x *w3 break; case 14: - w3[3] = amd_bytealign (w0[0], w0[1], offset); - w3[2] = amd_bytealign ( 0, w0[0], offset); + w3[3] = hc_bytealign (w0[0], w0[1], offset); + w3[2] = hc_bytealign ( 0, w0[0], offset); w3[1] = 0; w3[0] = 0; w2[3] = 0; @@ -837,7 +837,7 @@ DECLSPEC void switch_buffer_by_offset_le (u32x *w0, u32x *w1, u32x *w2, u32x *w3 break; case 15: - w3[3] = amd_bytealign ( 0, w0[0], offset); + w3[3] = hc_bytealign ( 0, w0[0], offset); w3[2] = 0; w3[1] = 0; w3[0] = 0; @@ -888,79 +888,79 @@ DECLSPEC void switch_buffer_by_offset_le (u32x *w0, u32x *w1, u32x *w2, u32x *w3 switch (offset_switch) { case 0: - w3[3] = __byte_perm (w3[2], w3[3], selector); - w3[2] = __byte_perm (w3[1], w3[2], selector); - w3[1] = __byte_perm (w3[0], w3[1], selector); - w3[0] = __byte_perm (w2[3], w3[0], selector); - w2[3] = __byte_perm (w2[2], w2[3], selector); - w2[2] = __byte_perm (w2[1], w2[2], selector); - w2[1] = __byte_perm (w2[0], w2[1], selector); - w2[0] = __byte_perm (w1[3], w2[0], selector); - w1[3] = __byte_perm (w1[2], w1[3], selector); - w1[2] = __byte_perm (w1[1], w1[2], selector); - w1[1] = __byte_perm (w1[0], w1[1], selector); - w1[0] = __byte_perm (w0[3], w1[0], selector); - w0[3] = __byte_perm (w0[2], w0[3], selector); - w0[2] = __byte_perm (w0[1], w0[2], selector); - w0[1] = __byte_perm (w0[0], w0[1], selector); - w0[0] = __byte_perm ( 0, w0[0], selector); + w3[3] = hc_byte_perm (w3[2], w3[3], selector); + w3[2] = hc_byte_perm (w3[1], w3[2], selector); + w3[1] = hc_byte_perm (w3[0], w3[1], selector); + w3[0] = hc_byte_perm (w2[3], w3[0], selector); + w2[3] = hc_byte_perm (w2[2], w2[3], selector); + w2[2] = hc_byte_perm (w2[1], w2[2], selector); + w2[1] = hc_byte_perm (w2[0], w2[1], selector); + w2[0] = hc_byte_perm (w1[3], w2[0], selector); + w1[3] = hc_byte_perm (w1[2], w1[3], selector); + w1[2] = hc_byte_perm (w1[1], w1[2], selector); + w1[1] = hc_byte_perm (w1[0], w1[1], selector); + w1[0] = hc_byte_perm (w0[3], w1[0], selector); + w0[3] = hc_byte_perm (w0[2], w0[3], selector); + w0[2] = hc_byte_perm (w0[1], w0[2], selector); + w0[1] = hc_byte_perm (w0[0], w0[1], selector); + w0[0] = hc_byte_perm ( 0, w0[0], selector); break; case 1: - w3[3] = __byte_perm (w3[1], w3[2], selector); - w3[2] = __byte_perm (w3[0], w3[1], selector); - w3[1] = __byte_perm (w2[3], w3[0], selector); - w3[0] = __byte_perm (w2[2], w2[3], selector); - w2[3] = __byte_perm (w2[1], w2[2], selector); - w2[2] = __byte_perm (w2[0], w2[1], selector); - w2[1] = __byte_perm (w1[3], w2[0], selector); - w2[0] = __byte_perm (w1[2], w1[3], selector); - w1[3] = __byte_perm (w1[1], w1[2], selector); - w1[2] = __byte_perm (w1[0], w1[1], selector); - w1[1] = __byte_perm (w0[3], w1[0], selector); - w1[0] = __byte_perm (w0[2], w0[3], selector); - w0[3] = __byte_perm (w0[1], w0[2], selector); - w0[2] = __byte_perm (w0[0], w0[1], selector); - w0[1] = __byte_perm ( 0, w0[0], selector); + w3[3] = hc_byte_perm (w3[1], w3[2], selector); + w3[2] = hc_byte_perm (w3[0], w3[1], selector); + w3[1] = hc_byte_perm (w2[3], w3[0], selector); + w3[0] = hc_byte_perm (w2[2], w2[3], selector); + w2[3] = hc_byte_perm (w2[1], w2[2], selector); + w2[2] = hc_byte_perm (w2[0], w2[1], selector); + w2[1] = hc_byte_perm (w1[3], w2[0], selector); + w2[0] = hc_byte_perm (w1[2], w1[3], selector); + w1[3] = hc_byte_perm (w1[1], w1[2], selector); + w1[2] = hc_byte_perm (w1[0], w1[1], selector); + w1[1] = hc_byte_perm (w0[3], w1[0], selector); + w1[0] = hc_byte_perm (w0[2], w0[3], selector); + w0[3] = hc_byte_perm (w0[1], w0[2], selector); + w0[2] = hc_byte_perm (w0[0], w0[1], selector); + w0[1] = hc_byte_perm ( 0, w0[0], selector); w0[0] = 0; break; case 2: - w3[3] = __byte_perm (w3[0], w3[1], selector); - w3[2] = __byte_perm (w2[3], w3[0], selector); - w3[1] = __byte_perm (w2[2], w2[3], selector); - w3[0] = __byte_perm (w2[1], w2[2], selector); - w2[3] = __byte_perm (w2[0], w2[1], selector); - w2[2] = __byte_perm (w1[3], w2[0], selector); - w2[1] = __byte_perm (w1[2], w1[3], selector); - w2[0] = __byte_perm (w1[1], w1[2], selector); - w1[3] = __byte_perm (w1[0], w1[1], selector); - w1[2] = __byte_perm (w0[3], w1[0], selector); - w1[1] = __byte_perm (w0[2], w0[3], selector); - w1[0] = __byte_perm (w0[1], w0[2], selector); - w0[3] = __byte_perm (w0[0], w0[1], selector); - w0[2] = __byte_perm ( 0, w0[0], selector); + w3[3] = hc_byte_perm (w3[0], w3[1], selector); + w3[2] = hc_byte_perm (w2[3], w3[0], selector); + w3[1] = hc_byte_perm (w2[2], w2[3], selector); + w3[0] = hc_byte_perm (w2[1], w2[2], selector); + w2[3] = hc_byte_perm (w2[0], w2[1], selector); + w2[2] = hc_byte_perm (w1[3], w2[0], selector); + w2[1] = hc_byte_perm (w1[2], w1[3], selector); + w2[0] = hc_byte_perm (w1[1], w1[2], selector); + w1[3] = hc_byte_perm (w1[0], w1[1], selector); + w1[2] = hc_byte_perm (w0[3], w1[0], selector); + w1[1] = hc_byte_perm (w0[2], w0[3], selector); + w1[0] = hc_byte_perm (w0[1], w0[2], selector); + w0[3] = hc_byte_perm (w0[0], w0[1], selector); + w0[2] = hc_byte_perm ( 0, w0[0], selector); w0[1] = 0; w0[0] = 0; break; case 3: - w3[3] = __byte_perm (w2[3], w3[0], selector); - w3[2] = __byte_perm (w2[2], w2[3], selector); - w3[1] = __byte_perm (w2[1], w2[2], selector); - w3[0] = __byte_perm (w2[0], w2[1], selector); - w2[3] = __byte_perm (w1[3], w2[0], selector); - w2[2] = __byte_perm (w1[2], w1[3], selector); - w2[1] = __byte_perm (w1[1], w1[2], selector); - w2[0] = __byte_perm (w1[0], w1[1], selector); - w1[3] = __byte_perm (w0[3], w1[0], selector); - w1[2] = __byte_perm (w0[2], w0[3], selector); - w1[1] = __byte_perm (w0[1], w0[2], selector); - w1[0] = __byte_perm (w0[0], w0[1], selector); - w0[3] = __byte_perm ( 0, w0[0], selector); + w3[3] = hc_byte_perm (w2[3], w3[0], selector); + w3[2] = hc_byte_perm (w2[2], w2[3], selector); + w3[1] = hc_byte_perm (w2[1], w2[2], selector); + w3[0] = hc_byte_perm (w2[0], w2[1], selector); + w2[3] = hc_byte_perm (w1[3], w2[0], selector); + w2[2] = hc_byte_perm (w1[2], w1[3], selector); + w2[1] = hc_byte_perm (w1[1], w1[2], selector); + w2[0] = hc_byte_perm (w1[0], w1[1], selector); + w1[3] = hc_byte_perm (w0[3], w1[0], selector); + w1[2] = hc_byte_perm (w0[2], w0[3], selector); + w1[1] = hc_byte_perm (w0[1], w0[2], selector); + w1[0] = hc_byte_perm (w0[0], w0[1], selector); + w0[3] = hc_byte_perm ( 0, w0[0], selector); w0[2] = 0; w0[1] = 0; w0[0] = 0; @@ -968,18 +968,18 @@ DECLSPEC void switch_buffer_by_offset_le (u32x *w0, u32x *w1, u32x *w2, u32x *w3 break; case 4: - w3[3] = __byte_perm (w2[2], w2[3], selector); - w3[2] = __byte_perm (w2[1], w2[2], selector); - w3[1] = __byte_perm (w2[0], w2[1], selector); - w3[0] = __byte_perm (w1[3], w2[0], selector); - w2[3] = __byte_perm (w1[2], w1[3], selector); - w2[2] = __byte_perm (w1[1], w1[2], selector); - w2[1] = __byte_perm (w1[0], w1[1], selector); - w2[0] = __byte_perm (w0[3], w1[0], selector); - w1[3] = __byte_perm (w0[2], w0[3], selector); - w1[2] = __byte_perm (w0[1], w0[2], selector); - w1[1] = __byte_perm (w0[0], w0[1], selector); - w1[0] = __byte_perm ( 0, w0[0], selector); + w3[3] = hc_byte_perm (w2[2], w2[3], selector); + w3[2] = hc_byte_perm (w2[1], w2[2], selector); + w3[1] = hc_byte_perm (w2[0], w2[1], selector); + w3[0] = hc_byte_perm (w1[3], w2[0], selector); + w2[3] = hc_byte_perm (w1[2], w1[3], selector); + w2[2] = hc_byte_perm (w1[1], w1[2], selector); + w2[1] = hc_byte_perm (w1[0], w1[1], selector); + w2[0] = hc_byte_perm (w0[3], w1[0], selector); + w1[3] = hc_byte_perm (w0[2], w0[3], selector); + w1[2] = hc_byte_perm (w0[1], w0[2], selector); + w1[1] = hc_byte_perm (w0[0], w0[1], selector); + w1[0] = hc_byte_perm ( 0, w0[0], selector); w0[3] = 0; w0[2] = 0; w0[1] = 0; @@ -988,17 +988,17 @@ DECLSPEC void switch_buffer_by_offset_le (u32x *w0, u32x *w1, u32x *w2, u32x *w3 break; case 5: - w3[3] = __byte_perm (w2[1], w2[2], selector); - w3[2] = __byte_perm (w2[0], w2[1], selector); - w3[1] = __byte_perm (w1[3], w2[0], selector); - w3[0] = __byte_perm (w1[2], w1[3], selector); - w2[3] = __byte_perm (w1[1], w1[2], selector); - w2[2] = __byte_perm (w1[0], w1[1], selector); - w2[1] = __byte_perm (w0[3], w1[0], selector); - w2[0] = __byte_perm (w0[2], w0[3], selector); - w1[3] = __byte_perm (w0[1], w0[2], selector); - w1[2] = __byte_perm (w0[0], w0[1], selector); - w1[1] = __byte_perm ( 0, w0[0], selector); + w3[3] = hc_byte_perm (w2[1], w2[2], selector); + w3[2] = hc_byte_perm (w2[0], w2[1], selector); + w3[1] = hc_byte_perm (w1[3], w2[0], selector); + w3[0] = hc_byte_perm (w1[2], w1[3], selector); + w2[3] = hc_byte_perm (w1[1], w1[2], selector); + w2[2] = hc_byte_perm (w1[0], w1[1], selector); + w2[1] = hc_byte_perm (w0[3], w1[0], selector); + w2[0] = hc_byte_perm (w0[2], w0[3], selector); + w1[3] = hc_byte_perm (w0[1], w0[2], selector); + w1[2] = hc_byte_perm (w0[0], w0[1], selector); + w1[1] = hc_byte_perm ( 0, w0[0], selector); w1[0] = 0; w0[3] = 0; w0[2] = 0; @@ -1008,16 +1008,16 @@ DECLSPEC void switch_buffer_by_offset_le (u32x *w0, u32x *w1, u32x *w2, u32x *w3 break; case 6: - w3[3] = __byte_perm (w2[0], w2[1], selector); - w3[2] = __byte_perm (w1[3], w2[0], selector); - w3[1] = __byte_perm (w1[2], w1[3], selector); - w3[0] = __byte_perm (w1[1], w1[2], selector); - w2[3] = __byte_perm (w1[0], w1[1], selector); - w2[2] = __byte_perm (w0[3], w1[0], selector); - w2[1] = __byte_perm (w0[2], w0[3], selector); - w2[0] = __byte_perm (w0[1], w0[2], selector); - w1[3] = __byte_perm (w0[0], w0[1], selector); - w1[2] = __byte_perm ( 0, w0[0], selector); + w3[3] = hc_byte_perm (w2[0], w2[1], selector); + w3[2] = hc_byte_perm (w1[3], w2[0], selector); + w3[1] = hc_byte_perm (w1[2], w1[3], selector); + w3[0] = hc_byte_perm (w1[1], w1[2], selector); + w2[3] = hc_byte_perm (w1[0], w1[1], selector); + w2[2] = hc_byte_perm (w0[3], w1[0], selector); + w2[1] = hc_byte_perm (w0[2], w0[3], selector); + w2[0] = hc_byte_perm (w0[1], w0[2], selector); + w1[3] = hc_byte_perm (w0[0], w0[1], selector); + w1[2] = hc_byte_perm ( 0, w0[0], selector); w1[1] = 0; w1[0] = 0; w0[3] = 0; @@ -1028,15 +1028,15 @@ DECLSPEC void switch_buffer_by_offset_le (u32x *w0, u32x *w1, u32x *w2, u32x *w3 break; case 7: - w3[3] = __byte_perm (w1[3], w2[0], selector); - w3[2] = __byte_perm (w1[2], w1[3], selector); - w3[1] = __byte_perm (w1[1], w1[2], selector); - w3[0] = __byte_perm (w1[0], w1[1], selector); - w2[3] = __byte_perm (w0[3], w1[0], selector); - w2[2] = __byte_perm (w0[2], w0[3], selector); - w2[1] = __byte_perm (w0[1], w0[2], selector); - w2[0] = __byte_perm (w0[0], w0[1], selector); - w1[3] = __byte_perm ( 0, w0[0], selector); + w3[3] = hc_byte_perm (w1[3], w2[0], selector); + w3[2] = hc_byte_perm (w1[2], w1[3], selector); + w3[1] = hc_byte_perm (w1[1], w1[2], selector); + w3[0] = hc_byte_perm (w1[0], w1[1], selector); + w2[3] = hc_byte_perm (w0[3], w1[0], selector); + w2[2] = hc_byte_perm (w0[2], w0[3], selector); + w2[1] = hc_byte_perm (w0[1], w0[2], selector); + w2[0] = hc_byte_perm (w0[0], w0[1], selector); + w1[3] = hc_byte_perm ( 0, w0[0], selector); w1[2] = 0; w1[1] = 0; w1[0] = 0; @@ -1048,14 +1048,14 @@ DECLSPEC void switch_buffer_by_offset_le (u32x *w0, u32x *w1, u32x *w2, u32x *w3 break; case 8: - w3[3] = __byte_perm (w1[2], w1[3], selector); - w3[2] = __byte_perm (w1[1], w1[2], selector); - w3[1] = __byte_perm (w1[0], w1[1], selector); - w3[0] = __byte_perm (w0[3], w1[0], selector); - w2[3] = __byte_perm (w0[2], w0[3], selector); - w2[2] = __byte_perm (w0[1], w0[2], selector); - w2[1] = __byte_perm (w0[0], w0[1], selector); - w2[0] = __byte_perm ( 0, w0[0], selector); + w3[3] = hc_byte_perm (w1[2], w1[3], selector); + w3[2] = hc_byte_perm (w1[1], w1[2], selector); + w3[1] = hc_byte_perm (w1[0], w1[1], selector); + w3[0] = hc_byte_perm (w0[3], w1[0], selector); + w2[3] = hc_byte_perm (w0[2], w0[3], selector); + w2[2] = hc_byte_perm (w0[1], w0[2], selector); + w2[1] = hc_byte_perm (w0[0], w0[1], selector); + w2[0] = hc_byte_perm ( 0, w0[0], selector); w1[3] = 0; w1[2] = 0; w1[1] = 0; @@ -1068,13 +1068,13 @@ DECLSPEC void switch_buffer_by_offset_le (u32x *w0, u32x *w1, u32x *w2, u32x *w3 break; case 9: - w3[3] = __byte_perm (w1[1], w1[2], selector); - w3[2] = __byte_perm (w1[0], w1[1], selector); - w3[1] = __byte_perm (w0[3], w1[0], selector); - w3[0] = __byte_perm (w0[2], w0[3], selector); - w2[3] = __byte_perm (w0[1], w0[2], selector); - w2[2] = __byte_perm (w0[0], w0[1], selector); - w2[1] = __byte_perm ( 0, w0[0], selector); + w3[3] = hc_byte_perm (w1[1], w1[2], selector); + w3[2] = hc_byte_perm (w1[0], w1[1], selector); + w3[1] = hc_byte_perm (w0[3], w1[0], selector); + w3[0] = hc_byte_perm (w0[2], w0[3], selector); + w2[3] = hc_byte_perm (w0[1], w0[2], selector); + w2[2] = hc_byte_perm (w0[0], w0[1], selector); + w2[1] = hc_byte_perm ( 0, w0[0], selector); w2[0] = 0; w1[3] = 0; w1[2] = 0; @@ -1088,12 +1088,12 @@ DECLSPEC void switch_buffer_by_offset_le (u32x *w0, u32x *w1, u32x *w2, u32x *w3 break; case 10: - w3[3] = __byte_perm (w1[0], w1[1], selector); - w3[2] = __byte_perm (w0[3], w1[0], selector); - w3[1] = __byte_perm (w0[2], w0[3], selector); - w3[0] = __byte_perm (w0[1], w0[2], selector); - w2[3] = __byte_perm (w0[0], w0[1], selector); - w2[2] = __byte_perm ( 0, w0[0], selector); + w3[3] = hc_byte_perm (w1[0], w1[1], selector); + w3[2] = hc_byte_perm (w0[3], w1[0], selector); + w3[1] = hc_byte_perm (w0[2], w0[3], selector); + w3[0] = hc_byte_perm (w0[1], w0[2], selector); + w2[3] = hc_byte_perm (w0[0], w0[1], selector); + w2[2] = hc_byte_perm ( 0, w0[0], selector); w2[1] = 0; w2[0] = 0; w1[3] = 0; @@ -1108,11 +1108,11 @@ DECLSPEC void switch_buffer_by_offset_le (u32x *w0, u32x *w1, u32x *w2, u32x *w3 break; case 11: - w3[3] = __byte_perm (w0[3], w1[0], selector); - w3[2] = __byte_perm (w0[2], w0[3], selector); - w3[1] = __byte_perm (w0[1], w0[2], selector); - w3[0] = __byte_perm (w0[0], w0[1], selector); - w2[3] = __byte_perm ( 0, w0[0], selector); + w3[3] = hc_byte_perm (w0[3], w1[0], selector); + w3[2] = hc_byte_perm (w0[2], w0[3], selector); + w3[1] = hc_byte_perm (w0[1], w0[2], selector); + w3[0] = hc_byte_perm (w0[0], w0[1], selector); + w2[3] = hc_byte_perm ( 0, w0[0], selector); w2[2] = 0; w2[1] = 0; w2[0] = 0; @@ -1128,10 +1128,10 @@ DECLSPEC void switch_buffer_by_offset_le (u32x *w0, u32x *w1, u32x *w2, u32x *w3 break; case 12: - w3[3] = __byte_perm (w0[2], w0[3], selector); - w3[2] = __byte_perm (w0[1], w0[2], selector); - w3[1] = __byte_perm (w0[0], w0[1], selector); - w3[0] = __byte_perm ( 0, w0[0], selector); + w3[3] = hc_byte_perm (w0[2], w0[3], selector); + w3[2] = hc_byte_perm (w0[1], w0[2], selector); + w3[1] = hc_byte_perm (w0[0], w0[1], selector); + w3[0] = hc_byte_perm ( 0, w0[0], selector); w2[3] = 0; w2[2] = 0; w2[1] = 0; @@ -1148,9 +1148,9 @@ DECLSPEC void switch_buffer_by_offset_le (u32x *w0, u32x *w1, u32x *w2, u32x *w3 break; case 13: - w3[3] = __byte_perm (w0[1], w0[2], selector); - w3[2] = __byte_perm (w0[0], w0[1], selector); - w3[1] = __byte_perm ( 0, w0[0], selector); + w3[3] = hc_byte_perm (w0[1], w0[2], selector); + w3[2] = hc_byte_perm (w0[0], w0[1], selector); + w3[1] = hc_byte_perm ( 0, w0[0], selector); w3[0] = 0; w2[3] = 0; w2[2] = 0; @@ -1168,8 +1168,8 @@ DECLSPEC void switch_buffer_by_offset_le (u32x *w0, u32x *w1, u32x *w2, u32x *w3 break; case 14: - w3[3] = __byte_perm (w0[0], w0[1], selector); - w3[2] = __byte_perm ( 0, w0[0], selector); + w3[3] = hc_byte_perm (w0[0], w0[1], selector); + w3[2] = hc_byte_perm ( 0, w0[0], selector); w3[1] = 0; w3[0] = 0; w2[3] = 0; @@ -1188,7 +1188,7 @@ DECLSPEC void switch_buffer_by_offset_le (u32x *w0, u32x *w1, u32x *w2, u32x *w3 break; case 15: - w3[3] = __byte_perm ( 0, w0[0], selector); + w3[3] = hc_byte_perm ( 0, w0[0], selector); w3[2] = 0; w3[1] = 0; w3[0] = 0; @@ -1240,89 +1240,89 @@ DECLSPEC void switch_buffer_by_offset_carry_le (u32x *w0, u32x *w1, u32x *w2, u3 switch (offset_switch) { case 0: - c0[0] = amd_bytealign (w3[3], 0, offset); - w3[3] = amd_bytealign (w3[2], w3[3], offset); - w3[2] = amd_bytealign (w3[1], w3[2], offset); - w3[1] = amd_bytealign (w3[0], w3[1], offset); - w3[0] = amd_bytealign (w2[3], w3[0], offset); - w2[3] = amd_bytealign (w2[2], w2[3], offset); - w2[2] = amd_bytealign (w2[1], w2[2], offset); - w2[1] = amd_bytealign (w2[0], w2[1], offset); - w2[0] = amd_bytealign (w1[3], w2[0], offset); - w1[3] = amd_bytealign (w1[2], w1[3], offset); - w1[2] = amd_bytealign (w1[1], w1[2], offset); - w1[1] = amd_bytealign (w1[0], w1[1], offset); - w1[0] = amd_bytealign (w0[3], w1[0], offset); - w0[3] = amd_bytealign (w0[2], w0[3], offset); - w0[2] = amd_bytealign (w0[1], w0[2], offset); - w0[1] = amd_bytealign (w0[0], w0[1], offset); - w0[0] = amd_bytealign ( 0, w0[0], offset); + c0[0] = hc_bytealign (w3[3], 0, offset); + w3[3] = hc_bytealign (w3[2], w3[3], offset); + w3[2] = hc_bytealign (w3[1], w3[2], offset); + w3[1] = hc_bytealign (w3[0], w3[1], offset); + w3[0] = hc_bytealign (w2[3], w3[0], offset); + w2[3] = hc_bytealign (w2[2], w2[3], offset); + w2[2] = hc_bytealign (w2[1], w2[2], offset); + w2[1] = hc_bytealign (w2[0], w2[1], offset); + w2[0] = hc_bytealign (w1[3], w2[0], offset); + w1[3] = hc_bytealign (w1[2], w1[3], offset); + w1[2] = hc_bytealign (w1[1], w1[2], offset); + w1[1] = hc_bytealign (w1[0], w1[1], offset); + w1[0] = hc_bytealign (w0[3], w1[0], offset); + w0[3] = hc_bytealign (w0[2], w0[3], offset); + w0[2] = hc_bytealign (w0[1], w0[2], offset); + w0[1] = hc_bytealign (w0[0], w0[1], offset); + w0[0] = hc_bytealign ( 0, w0[0], offset); break; case 1: - c0[1] = amd_bytealign (w3[3], 0, offset); - c0[0] = amd_bytealign (w3[2], w3[3], offset); - w3[3] = amd_bytealign (w3[1], w3[2], offset); - w3[2] = amd_bytealign (w3[0], w3[1], offset); - w3[1] = amd_bytealign (w2[3], w3[0], offset); - w3[0] = amd_bytealign (w2[2], w2[3], offset); - w2[3] = amd_bytealign (w2[1], w2[2], offset); - w2[2] = amd_bytealign (w2[0], w2[1], offset); - w2[1] = amd_bytealign (w1[3], w2[0], offset); - w2[0] = amd_bytealign (w1[2], w1[3], offset); - w1[3] = amd_bytealign (w1[1], w1[2], offset); - w1[2] = amd_bytealign (w1[0], w1[1], offset); - w1[1] = amd_bytealign (w0[3], w1[0], offset); - w1[0] = amd_bytealign (w0[2], w0[3], offset); - w0[3] = amd_bytealign (w0[1], w0[2], offset); - w0[2] = amd_bytealign (w0[0], w0[1], offset); - w0[1] = amd_bytealign ( 0, w0[0], offset); + c0[1] = hc_bytealign (w3[3], 0, offset); + c0[0] = hc_bytealign (w3[2], w3[3], offset); + w3[3] = hc_bytealign (w3[1], w3[2], offset); + w3[2] = hc_bytealign (w3[0], w3[1], offset); + w3[1] = hc_bytealign (w2[3], w3[0], offset); + w3[0] = hc_bytealign (w2[2], w2[3], offset); + w2[3] = hc_bytealign (w2[1], w2[2], offset); + w2[2] = hc_bytealign (w2[0], w2[1], offset); + w2[1] = hc_bytealign (w1[3], w2[0], offset); + w2[0] = hc_bytealign (w1[2], w1[3], offset); + w1[3] = hc_bytealign (w1[1], w1[2], offset); + w1[2] = hc_bytealign (w1[0], w1[1], offset); + w1[1] = hc_bytealign (w0[3], w1[0], offset); + w1[0] = hc_bytealign (w0[2], w0[3], offset); + w0[3] = hc_bytealign (w0[1], w0[2], offset); + w0[2] = hc_bytealign (w0[0], w0[1], offset); + w0[1] = hc_bytealign ( 0, w0[0], offset); w0[0] = 0; break; case 2: - c0[2] = amd_bytealign (w3[3], 0, offset); - c0[1] = amd_bytealign (w3[2], w3[3], offset); - c0[0] = amd_bytealign (w3[1], w3[2], offset); - w3[3] = amd_bytealign (w3[0], w3[1], offset); - w3[2] = amd_bytealign (w2[3], w3[0], offset); - w3[1] = amd_bytealign (w2[2], w2[3], offset); - w3[0] = amd_bytealign (w2[1], w2[2], offset); - w2[3] = amd_bytealign (w2[0], w2[1], offset); - w2[2] = amd_bytealign (w1[3], w2[0], offset); - w2[1] = amd_bytealign (w1[2], w1[3], offset); - w2[0] = amd_bytealign (w1[1], w1[2], offset); - w1[3] = amd_bytealign (w1[0], w1[1], offset); - w1[2] = amd_bytealign (w0[3], w1[0], offset); - w1[1] = amd_bytealign (w0[2], w0[3], offset); - w1[0] = amd_bytealign (w0[1], w0[2], offset); - w0[3] = amd_bytealign (w0[0], w0[1], offset); - w0[2] = amd_bytealign ( 0, w0[0], offset); + c0[2] = hc_bytealign (w3[3], 0, offset); + c0[1] = hc_bytealign (w3[2], w3[3], offset); + c0[0] = hc_bytealign (w3[1], w3[2], offset); + w3[3] = hc_bytealign (w3[0], w3[1], offset); + w3[2] = hc_bytealign (w2[3], w3[0], offset); + w3[1] = hc_bytealign (w2[2], w2[3], offset); + w3[0] = hc_bytealign (w2[1], w2[2], offset); + w2[3] = hc_bytealign (w2[0], w2[1], offset); + w2[2] = hc_bytealign (w1[3], w2[0], offset); + w2[1] = hc_bytealign (w1[2], w1[3], offset); + w2[0] = hc_bytealign (w1[1], w1[2], offset); + w1[3] = hc_bytealign (w1[0], w1[1], offset); + w1[2] = hc_bytealign (w0[3], w1[0], offset); + w1[1] = hc_bytealign (w0[2], w0[3], offset); + w1[0] = hc_bytealign (w0[1], w0[2], offset); + w0[3] = hc_bytealign (w0[0], w0[1], offset); + w0[2] = hc_bytealign ( 0, w0[0], offset); w0[1] = 0; w0[0] = 0; break; case 3: - c0[3] = amd_bytealign (w3[3], 0, offset); - c0[2] = amd_bytealign (w3[2], w3[3], offset); - c0[1] = amd_bytealign (w3[1], w3[2], offset); - c0[0] = amd_bytealign (w3[0], w3[1], offset); - w3[3] = amd_bytealign (w2[3], w3[0], offset); - w3[2] = amd_bytealign (w2[2], w2[3], offset); - w3[1] = amd_bytealign (w2[1], w2[2], offset); - w3[0] = amd_bytealign (w2[0], w2[1], offset); - w2[3] = amd_bytealign (w1[3], w2[0], offset); - w2[2] = amd_bytealign (w1[2], w1[3], offset); - w2[1] = amd_bytealign (w1[1], w1[2], offset); - w2[0] = amd_bytealign (w1[0], w1[1], offset); - w1[3] = amd_bytealign (w0[3], w1[0], offset); - w1[2] = amd_bytealign (w0[2], w0[3], offset); - w1[1] = amd_bytealign (w0[1], w0[2], offset); - w1[0] = amd_bytealign (w0[0], w0[1], offset); - w0[3] = amd_bytealign ( 0, w0[0], offset); + c0[3] = hc_bytealign (w3[3], 0, offset); + c0[2] = hc_bytealign (w3[2], w3[3], offset); + c0[1] = hc_bytealign (w3[1], w3[2], offset); + c0[0] = hc_bytealign (w3[0], w3[1], offset); + w3[3] = hc_bytealign (w2[3], w3[0], offset); + w3[2] = hc_bytealign (w2[2], w2[3], offset); + w3[1] = hc_bytealign (w2[1], w2[2], offset); + w3[0] = hc_bytealign (w2[0], w2[1], offset); + w2[3] = hc_bytealign (w1[3], w2[0], offset); + w2[2] = hc_bytealign (w1[2], w1[3], offset); + w2[1] = hc_bytealign (w1[1], w1[2], offset); + w2[0] = hc_bytealign (w1[0], w1[1], offset); + w1[3] = hc_bytealign (w0[3], w1[0], offset); + w1[2] = hc_bytealign (w0[2], w0[3], offset); + w1[1] = hc_bytealign (w0[1], w0[2], offset); + w1[0] = hc_bytealign (w0[0], w0[1], offset); + w0[3] = hc_bytealign ( 0, w0[0], offset); w0[2] = 0; w0[1] = 0; w0[0] = 0; @@ -1330,23 +1330,23 @@ DECLSPEC void switch_buffer_by_offset_carry_le (u32x *w0, u32x *w1, u32x *w2, u3 break; case 4: - c1[0] = amd_bytealign (w3[3], 0, offset); - c0[3] = amd_bytealign (w3[2], w3[3], offset); - c0[2] = amd_bytealign (w3[1], w3[2], offset); - c0[1] = amd_bytealign (w3[0], w3[1], offset); - c0[0] = amd_bytealign (w2[3], w3[0], offset); - w3[3] = amd_bytealign (w2[2], w2[3], offset); - w3[2] = amd_bytealign (w2[1], w2[2], offset); - w3[1] = amd_bytealign (w2[0], w2[1], offset); - w3[0] = amd_bytealign (w1[3], w2[0], offset); - w2[3] = amd_bytealign (w1[2], w1[3], offset); - w2[2] = amd_bytealign (w1[1], w1[2], offset); - w2[1] = amd_bytealign (w1[0], w1[1], offset); - w2[0] = amd_bytealign (w0[3], w1[0], offset); - w1[3] = amd_bytealign (w0[2], w0[3], offset); - w1[2] = amd_bytealign (w0[1], w0[2], offset); - w1[1] = amd_bytealign (w0[0], w0[1], offset); - w1[0] = amd_bytealign ( 0, w0[0], offset); + c1[0] = hc_bytealign (w3[3], 0, offset); + c0[3] = hc_bytealign (w3[2], w3[3], offset); + c0[2] = hc_bytealign (w3[1], w3[2], offset); + c0[1] = hc_bytealign (w3[0], w3[1], offset); + c0[0] = hc_bytealign (w2[3], w3[0], offset); + w3[3] = hc_bytealign (w2[2], w2[3], offset); + w3[2] = hc_bytealign (w2[1], w2[2], offset); + w3[1] = hc_bytealign (w2[0], w2[1], offset); + w3[0] = hc_bytealign (w1[3], w2[0], offset); + w2[3] = hc_bytealign (w1[2], w1[3], offset); + w2[2] = hc_bytealign (w1[1], w1[2], offset); + w2[1] = hc_bytealign (w1[0], w1[1], offset); + w2[0] = hc_bytealign (w0[3], w1[0], offset); + w1[3] = hc_bytealign (w0[2], w0[3], offset); + w1[2] = hc_bytealign (w0[1], w0[2], offset); + w1[1] = hc_bytealign (w0[0], w0[1], offset); + w1[0] = hc_bytealign ( 0, w0[0], offset); w0[3] = 0; w0[2] = 0; w0[1] = 0; @@ -1355,23 +1355,23 @@ DECLSPEC void switch_buffer_by_offset_carry_le (u32x *w0, u32x *w1, u32x *w2, u3 break; case 5: - c1[1] = amd_bytealign (w3[3], 0, offset); - c1[0] = amd_bytealign (w3[2], w3[3], offset); - c0[3] = amd_bytealign (w3[1], w3[2], offset); - c0[2] = amd_bytealign (w3[0], w3[1], offset); - c0[1] = amd_bytealign (w2[3], w3[0], offset); - c0[0] = amd_bytealign (w2[2], w2[3], offset); - w3[3] = amd_bytealign (w2[1], w2[2], offset); - w3[2] = amd_bytealign (w2[0], w2[1], offset); - w3[1] = amd_bytealign (w1[3], w2[0], offset); - w3[0] = amd_bytealign (w1[2], w1[3], offset); - w2[3] = amd_bytealign (w1[1], w1[2], offset); - w2[2] = amd_bytealign (w1[0], w1[1], offset); - w2[1] = amd_bytealign (w0[3], w1[0], offset); - w2[0] = amd_bytealign (w0[2], w0[3], offset); - w1[3] = amd_bytealign (w0[1], w0[2], offset); - w1[2] = amd_bytealign (w0[0], w0[1], offset); - w1[1] = amd_bytealign ( 0, w0[0], offset); + c1[1] = hc_bytealign (w3[3], 0, offset); + c1[0] = hc_bytealign (w3[2], w3[3], offset); + c0[3] = hc_bytealign (w3[1], w3[2], offset); + c0[2] = hc_bytealign (w3[0], w3[1], offset); + c0[1] = hc_bytealign (w2[3], w3[0], offset); + c0[0] = hc_bytealign (w2[2], w2[3], offset); + w3[3] = hc_bytealign (w2[1], w2[2], offset); + w3[2] = hc_bytealign (w2[0], w2[1], offset); + w3[1] = hc_bytealign (w1[3], w2[0], offset); + w3[0] = hc_bytealign (w1[2], w1[3], offset); + w2[3] = hc_bytealign (w1[1], w1[2], offset); + w2[2] = hc_bytealign (w1[0], w1[1], offset); + w2[1] = hc_bytealign (w0[3], w1[0], offset); + w2[0] = hc_bytealign (w0[2], w0[3], offset); + w1[3] = hc_bytealign (w0[1], w0[2], offset); + w1[2] = hc_bytealign (w0[0], w0[1], offset); + w1[1] = hc_bytealign ( 0, w0[0], offset); w1[0] = 0; w0[3] = 0; w0[2] = 0; @@ -1381,23 +1381,23 @@ DECLSPEC void switch_buffer_by_offset_carry_le (u32x *w0, u32x *w1, u32x *w2, u3 break; case 6: - c1[2] = amd_bytealign (w3[3], 0, offset); - c1[1] = amd_bytealign (w3[2], w3[3], offset); - c1[0] = amd_bytealign (w3[1], w3[2], offset); - c0[3] = amd_bytealign (w3[0], w3[1], offset); - c0[2] = amd_bytealign (w2[3], w3[0], offset); - c0[1] = amd_bytealign (w2[2], w2[3], offset); - c0[0] = amd_bytealign (w2[1], w2[2], offset); - w3[3] = amd_bytealign (w2[0], w2[1], offset); - w3[2] = amd_bytealign (w1[3], w2[0], offset); - w3[1] = amd_bytealign (w1[2], w1[3], offset); - w3[0] = amd_bytealign (w1[1], w1[2], offset); - w2[3] = amd_bytealign (w1[0], w1[1], offset); - w2[2] = amd_bytealign (w0[3], w1[0], offset); - w2[1] = amd_bytealign (w0[2], w0[3], offset); - w2[0] = amd_bytealign (w0[1], w0[2], offset); - w1[3] = amd_bytealign (w0[0], w0[1], offset); - w1[2] = amd_bytealign ( 0, w0[0], offset); + c1[2] = hc_bytealign (w3[3], 0, offset); + c1[1] = hc_bytealign (w3[2], w3[3], offset); + c1[0] = hc_bytealign (w3[1], w3[2], offset); + c0[3] = hc_bytealign (w3[0], w3[1], offset); + c0[2] = hc_bytealign (w2[3], w3[0], offset); + c0[1] = hc_bytealign (w2[2], w2[3], offset); + c0[0] = hc_bytealign (w2[1], w2[2], offset); + w3[3] = hc_bytealign (w2[0], w2[1], offset); + w3[2] = hc_bytealign (w1[3], w2[0], offset); + w3[1] = hc_bytealign (w1[2], w1[3], offset); + w3[0] = hc_bytealign (w1[1], w1[2], offset); + w2[3] = hc_bytealign (w1[0], w1[1], offset); + w2[2] = hc_bytealign (w0[3], w1[0], offset); + w2[1] = hc_bytealign (w0[2], w0[3], offset); + w2[0] = hc_bytealign (w0[1], w0[2], offset); + w1[3] = hc_bytealign (w0[0], w0[1], offset); + w1[2] = hc_bytealign ( 0, w0[0], offset); w1[1] = 0; w1[0] = 0; w0[3] = 0; @@ -1408,23 +1408,23 @@ DECLSPEC void switch_buffer_by_offset_carry_le (u32x *w0, u32x *w1, u32x *w2, u3 break; case 7: - c1[3] = amd_bytealign (w3[3], 0, offset); - c1[2] = amd_bytealign (w3[2], w3[3], offset); - c1[1] = amd_bytealign (w3[1], w3[2], offset); - c1[0] = amd_bytealign (w3[0], w3[1], offset); - c0[3] = amd_bytealign (w2[3], w3[0], offset); - c0[2] = amd_bytealign (w2[2], w2[3], offset); - c0[1] = amd_bytealign (w2[1], w2[2], offset); - c0[0] = amd_bytealign (w2[0], w2[1], offset); - w3[3] = amd_bytealign (w1[3], w2[0], offset); - w3[2] = amd_bytealign (w1[2], w1[3], offset); - w3[1] = amd_bytealign (w1[1], w1[2], offset); - w3[0] = amd_bytealign (w1[0], w1[1], offset); - w2[3] = amd_bytealign (w0[3], w1[0], offset); - w2[2] = amd_bytealign (w0[2], w0[3], offset); - w2[1] = amd_bytealign (w0[1], w0[2], offset); - w2[0] = amd_bytealign (w0[0], w0[1], offset); - w1[3] = amd_bytealign ( 0, w0[0], offset); + c1[3] = hc_bytealign (w3[3], 0, offset); + c1[2] = hc_bytealign (w3[2], w3[3], offset); + c1[1] = hc_bytealign (w3[1], w3[2], offset); + c1[0] = hc_bytealign (w3[0], w3[1], offset); + c0[3] = hc_bytealign (w2[3], w3[0], offset); + c0[2] = hc_bytealign (w2[2], w2[3], offset); + c0[1] = hc_bytealign (w2[1], w2[2], offset); + c0[0] = hc_bytealign (w2[0], w2[1], offset); + w3[3] = hc_bytealign (w1[3], w2[0], offset); + w3[2] = hc_bytealign (w1[2], w1[3], offset); + w3[1] = hc_bytealign (w1[1], w1[2], offset); + w3[0] = hc_bytealign (w1[0], w1[1], offset); + w2[3] = hc_bytealign (w0[3], w1[0], offset); + w2[2] = hc_bytealign (w0[2], w0[3], offset); + w2[1] = hc_bytealign (w0[1], w0[2], offset); + w2[0] = hc_bytealign (w0[0], w0[1], offset); + w1[3] = hc_bytealign ( 0, w0[0], offset); w1[2] = 0; w1[1] = 0; w1[0] = 0; @@ -1436,23 +1436,23 @@ DECLSPEC void switch_buffer_by_offset_carry_le (u32x *w0, u32x *w1, u32x *w2, u3 break; case 8: - c2[0] = amd_bytealign (w3[3], 0, offset); - c1[3] = amd_bytealign (w3[2], w3[3], offset); - c1[2] = amd_bytealign (w3[1], w3[2], offset); - c1[1] = amd_bytealign (w3[0], w3[1], offset); - c1[0] = amd_bytealign (w2[3], w3[0], offset); - c0[3] = amd_bytealign (w2[2], w2[3], offset); - c0[2] = amd_bytealign (w2[1], w2[2], offset); - c0[1] = amd_bytealign (w2[0], w2[1], offset); - c0[0] = amd_bytealign (w1[3], w2[0], offset); - w3[3] = amd_bytealign (w1[2], w1[3], offset); - w3[2] = amd_bytealign (w1[1], w1[2], offset); - w3[1] = amd_bytealign (w1[0], w1[1], offset); - w3[0] = amd_bytealign (w0[3], w1[0], offset); - w2[3] = amd_bytealign (w0[2], w0[3], offset); - w2[2] = amd_bytealign (w0[1], w0[2], offset); - w2[1] = amd_bytealign (w0[0], w0[1], offset); - w2[0] = amd_bytealign ( 0, w0[0], offset); + c2[0] = hc_bytealign (w3[3], 0, offset); + c1[3] = hc_bytealign (w3[2], w3[3], offset); + c1[2] = hc_bytealign (w3[1], w3[2], offset); + c1[1] = hc_bytealign (w3[0], w3[1], offset); + c1[0] = hc_bytealign (w2[3], w3[0], offset); + c0[3] = hc_bytealign (w2[2], w2[3], offset); + c0[2] = hc_bytealign (w2[1], w2[2], offset); + c0[1] = hc_bytealign (w2[0], w2[1], offset); + c0[0] = hc_bytealign (w1[3], w2[0], offset); + w3[3] = hc_bytealign (w1[2], w1[3], offset); + w3[2] = hc_bytealign (w1[1], w1[2], offset); + w3[1] = hc_bytealign (w1[0], w1[1], offset); + w3[0] = hc_bytealign (w0[3], w1[0], offset); + w2[3] = hc_bytealign (w0[2], w0[3], offset); + w2[2] = hc_bytealign (w0[1], w0[2], offset); + w2[1] = hc_bytealign (w0[0], w0[1], offset); + w2[0] = hc_bytealign ( 0, w0[0], offset); w1[3] = 0; w1[2] = 0; w1[1] = 0; @@ -1465,23 +1465,23 @@ DECLSPEC void switch_buffer_by_offset_carry_le (u32x *w0, u32x *w1, u32x *w2, u3 break; case 9: - c2[1] = amd_bytealign (w3[3], 0, offset); - c2[0] = amd_bytealign (w3[2], w3[3], offset); - c1[3] = amd_bytealign (w3[1], w3[2], offset); - c1[2] = amd_bytealign (w3[0], w3[1], offset); - c1[1] = amd_bytealign (w2[3], w3[0], offset); - c1[0] = amd_bytealign (w2[2], w2[3], offset); - c0[3] = amd_bytealign (w2[1], w2[2], offset); - c0[2] = amd_bytealign (w2[0], w2[1], offset); - c0[1] = amd_bytealign (w1[3], w2[0], offset); - c0[0] = amd_bytealign (w1[2], w1[3], offset); - w3[3] = amd_bytealign (w1[1], w1[2], offset); - w3[2] = amd_bytealign (w1[0], w1[1], offset); - w3[1] = amd_bytealign (w0[3], w1[0], offset); - w3[0] = amd_bytealign (w0[2], w0[3], offset); - w2[3] = amd_bytealign (w0[1], w0[2], offset); - w2[2] = amd_bytealign (w0[0], w0[1], offset); - w2[1] = amd_bytealign ( 0, w0[0], offset); + c2[1] = hc_bytealign (w3[3], 0, offset); + c2[0] = hc_bytealign (w3[2], w3[3], offset); + c1[3] = hc_bytealign (w3[1], w3[2], offset); + c1[2] = hc_bytealign (w3[0], w3[1], offset); + c1[1] = hc_bytealign (w2[3], w3[0], offset); + c1[0] = hc_bytealign (w2[2], w2[3], offset); + c0[3] = hc_bytealign (w2[1], w2[2], offset); + c0[2] = hc_bytealign (w2[0], w2[1], offset); + c0[1] = hc_bytealign (w1[3], w2[0], offset); + c0[0] = hc_bytealign (w1[2], w1[3], offset); + w3[3] = hc_bytealign (w1[1], w1[2], offset); + w3[2] = hc_bytealign (w1[0], w1[1], offset); + w3[1] = hc_bytealign (w0[3], w1[0], offset); + w3[0] = hc_bytealign (w0[2], w0[3], offset); + w2[3] = hc_bytealign (w0[1], w0[2], offset); + w2[2] = hc_bytealign (w0[0], w0[1], offset); + w2[1] = hc_bytealign ( 0, w0[0], offset); w2[0] = 0; w1[3] = 0; w1[2] = 0; @@ -1495,23 +1495,23 @@ DECLSPEC void switch_buffer_by_offset_carry_le (u32x *w0, u32x *w1, u32x *w2, u3 break; case 10: - c2[2] = amd_bytealign (w3[3], 0, offset); - c2[1] = amd_bytealign (w3[2], w3[3], offset); - c2[0] = amd_bytealign (w3[1], w3[2], offset); - c1[3] = amd_bytealign (w3[0], w3[1], offset); - c1[2] = amd_bytealign (w2[3], w3[0], offset); - c1[1] = amd_bytealign (w2[2], w2[3], offset); - c1[0] = amd_bytealign (w2[1], w2[2], offset); - c0[3] = amd_bytealign (w2[0], w2[1], offset); - c0[2] = amd_bytealign (w1[3], w2[0], offset); - c0[1] = amd_bytealign (w1[2], w1[3], offset); - c0[0] = amd_bytealign (w1[1], w1[2], offset); - w3[3] = amd_bytealign (w1[0], w1[1], offset); - w3[2] = amd_bytealign (w0[3], w1[0], offset); - w3[1] = amd_bytealign (w0[2], w0[3], offset); - w3[0] = amd_bytealign (w0[1], w0[2], offset); - w2[3] = amd_bytealign (w0[0], w0[1], offset); - w2[2] = amd_bytealign ( 0, w0[0], offset); + c2[2] = hc_bytealign (w3[3], 0, offset); + c2[1] = hc_bytealign (w3[2], w3[3], offset); + c2[0] = hc_bytealign (w3[1], w3[2], offset); + c1[3] = hc_bytealign (w3[0], w3[1], offset); + c1[2] = hc_bytealign (w2[3], w3[0], offset); + c1[1] = hc_bytealign (w2[2], w2[3], offset); + c1[0] = hc_bytealign (w2[1], w2[2], offset); + c0[3] = hc_bytealign (w2[0], w2[1], offset); + c0[2] = hc_bytealign (w1[3], w2[0], offset); + c0[1] = hc_bytealign (w1[2], w1[3], offset); + c0[0] = hc_bytealign (w1[1], w1[2], offset); + w3[3] = hc_bytealign (w1[0], w1[1], offset); + w3[2] = hc_bytealign (w0[3], w1[0], offset); + w3[1] = hc_bytealign (w0[2], w0[3], offset); + w3[0] = hc_bytealign (w0[1], w0[2], offset); + w2[3] = hc_bytealign (w0[0], w0[1], offset); + w2[2] = hc_bytealign ( 0, w0[0], offset); w2[1] = 0; w2[0] = 0; w1[3] = 0; @@ -1526,23 +1526,23 @@ DECLSPEC void switch_buffer_by_offset_carry_le (u32x *w0, u32x *w1, u32x *w2, u3 break; case 11: - c2[3] = amd_bytealign (w3[3], 0, offset); - c2[2] = amd_bytealign (w3[2], w3[3], offset); - c2[1] = amd_bytealign (w3[1], w3[2], offset); - c2[0] = amd_bytealign (w3[0], w3[1], offset); - c1[3] = amd_bytealign (w2[3], w3[0], offset); - c1[2] = amd_bytealign (w2[2], w2[3], offset); - c1[1] = amd_bytealign (w2[1], w2[2], offset); - c1[0] = amd_bytealign (w2[0], w2[1], offset); - c0[3] = amd_bytealign (w1[3], w2[0], offset); - c0[2] = amd_bytealign (w1[2], w1[3], offset); - c0[1] = amd_bytealign (w1[1], w1[2], offset); - c0[0] = amd_bytealign (w1[0], w1[1], offset); - w3[3] = amd_bytealign (w0[3], w1[0], offset); - w3[2] = amd_bytealign (w0[2], w0[3], offset); - w3[1] = amd_bytealign (w0[1], w0[2], offset); - w3[0] = amd_bytealign (w0[0], w0[1], offset); - w2[3] = amd_bytealign ( 0, w0[0], offset); + c2[3] = hc_bytealign (w3[3], 0, offset); + c2[2] = hc_bytealign (w3[2], w3[3], offset); + c2[1] = hc_bytealign (w3[1], w3[2], offset); + c2[0] = hc_bytealign (w3[0], w3[1], offset); + c1[3] = hc_bytealign (w2[3], w3[0], offset); + c1[2] = hc_bytealign (w2[2], w2[3], offset); + c1[1] = hc_bytealign (w2[1], w2[2], offset); + c1[0] = hc_bytealign (w2[0], w2[1], offset); + c0[3] = hc_bytealign (w1[3], w2[0], offset); + c0[2] = hc_bytealign (w1[2], w1[3], offset); + c0[1] = hc_bytealign (w1[1], w1[2], offset); + c0[0] = hc_bytealign (w1[0], w1[1], offset); + w3[3] = hc_bytealign (w0[3], w1[0], offset); + w3[2] = hc_bytealign (w0[2], w0[3], offset); + w3[1] = hc_bytealign (w0[1], w0[2], offset); + w3[0] = hc_bytealign (w0[0], w0[1], offset); + w2[3] = hc_bytealign ( 0, w0[0], offset); w2[2] = 0; w2[1] = 0; w2[0] = 0; @@ -1558,23 +1558,23 @@ DECLSPEC void switch_buffer_by_offset_carry_le (u32x *w0, u32x *w1, u32x *w2, u3 break; case 12: - c3[0] = amd_bytealign (w3[3], 0, offset); - c2[3] = amd_bytealign (w3[2], w3[3], offset); - c2[2] = amd_bytealign (w3[1], w3[2], offset); - c2[1] = amd_bytealign (w3[0], w3[1], offset); - c2[0] = amd_bytealign (w2[3], w3[0], offset); - c1[3] = amd_bytealign (w2[2], w2[3], offset); - c1[2] = amd_bytealign (w2[1], w2[2], offset); - c1[1] = amd_bytealign (w2[0], w2[1], offset); - c1[0] = amd_bytealign (w1[3], w2[0], offset); - c0[3] = amd_bytealign (w1[2], w1[3], offset); - c0[2] = amd_bytealign (w1[1], w1[2], offset); - c0[1] = amd_bytealign (w1[0], w1[1], offset); - c0[0] = amd_bytealign (w0[3], w1[0], offset); - w3[3] = amd_bytealign (w0[2], w0[3], offset); - w3[2] = amd_bytealign (w0[1], w0[2], offset); - w3[1] = amd_bytealign (w0[0], w0[1], offset); - w3[0] = amd_bytealign ( 0, w0[0], offset); + c3[0] = hc_bytealign (w3[3], 0, offset); + c2[3] = hc_bytealign (w3[2], w3[3], offset); + c2[2] = hc_bytealign (w3[1], w3[2], offset); + c2[1] = hc_bytealign (w3[0], w3[1], offset); + c2[0] = hc_bytealign (w2[3], w3[0], offset); + c1[3] = hc_bytealign (w2[2], w2[3], offset); + c1[2] = hc_bytealign (w2[1], w2[2], offset); + c1[1] = hc_bytealign (w2[0], w2[1], offset); + c1[0] = hc_bytealign (w1[3], w2[0], offset); + c0[3] = hc_bytealign (w1[2], w1[3], offset); + c0[2] = hc_bytealign (w1[1], w1[2], offset); + c0[1] = hc_bytealign (w1[0], w1[1], offset); + c0[0] = hc_bytealign (w0[3], w1[0], offset); + w3[3] = hc_bytealign (w0[2], w0[3], offset); + w3[2] = hc_bytealign (w0[1], w0[2], offset); + w3[1] = hc_bytealign (w0[0], w0[1], offset); + w3[0] = hc_bytealign ( 0, w0[0], offset); w2[3] = 0; w2[2] = 0; w2[1] = 0; @@ -1591,23 +1591,23 @@ DECLSPEC void switch_buffer_by_offset_carry_le (u32x *w0, u32x *w1, u32x *w2, u3 break; case 13: - c3[1] = amd_bytealign (w3[3], 0, offset); - c3[0] = amd_bytealign (w3[2], w3[3], offset); - c2[3] = amd_bytealign (w3[1], w3[2], offset); - c2[2] = amd_bytealign (w3[0], w3[1], offset); - c2[1] = amd_bytealign (w2[3], w3[0], offset); - c2[0] = amd_bytealign (w2[2], w2[3], offset); - c1[3] = amd_bytealign (w2[1], w2[2], offset); - c1[2] = amd_bytealign (w2[0], w2[1], offset); - c1[1] = amd_bytealign (w1[3], w2[0], offset); - c1[0] = amd_bytealign (w1[2], w1[3], offset); - c0[3] = amd_bytealign (w1[1], w1[2], offset); - c0[2] = amd_bytealign (w1[0], w1[1], offset); - c0[1] = amd_bytealign (w0[3], w1[0], offset); - c0[0] = amd_bytealign (w0[2], w0[3], offset); - w3[3] = amd_bytealign (w0[1], w0[2], offset); - w3[2] = amd_bytealign (w0[0], w0[1], offset); - w3[1] = amd_bytealign ( 0, w0[0], offset); + c3[1] = hc_bytealign (w3[3], 0, offset); + c3[0] = hc_bytealign (w3[2], w3[3], offset); + c2[3] = hc_bytealign (w3[1], w3[2], offset); + c2[2] = hc_bytealign (w3[0], w3[1], offset); + c2[1] = hc_bytealign (w2[3], w3[0], offset); + c2[0] = hc_bytealign (w2[2], w2[3], offset); + c1[3] = hc_bytealign (w2[1], w2[2], offset); + c1[2] = hc_bytealign (w2[0], w2[1], offset); + c1[1] = hc_bytealign (w1[3], w2[0], offset); + c1[0] = hc_bytealign (w1[2], w1[3], offset); + c0[3] = hc_bytealign (w1[1], w1[2], offset); + c0[2] = hc_bytealign (w1[0], w1[1], offset); + c0[1] = hc_bytealign (w0[3], w1[0], offset); + c0[0] = hc_bytealign (w0[2], w0[3], offset); + w3[3] = hc_bytealign (w0[1], w0[2], offset); + w3[2] = hc_bytealign (w0[0], w0[1], offset); + w3[1] = hc_bytealign ( 0, w0[0], offset); w3[0] = 0; w2[3] = 0; w2[2] = 0; @@ -1625,23 +1625,23 @@ DECLSPEC void switch_buffer_by_offset_carry_le (u32x *w0, u32x *w1, u32x *w2, u3 break; case 14: - c3[2] = amd_bytealign (w3[3], 0, offset); - c3[1] = amd_bytealign (w3[2], w3[3], offset); - c3[0] = amd_bytealign (w3[1], w3[2], offset); - c2[3] = amd_bytealign (w3[0], w3[1], offset); - c2[2] = amd_bytealign (w2[3], w3[0], offset); - c2[1] = amd_bytealign (w2[2], w2[3], offset); - c2[0] = amd_bytealign (w2[1], w2[2], offset); - c1[3] = amd_bytealign (w2[0], w2[1], offset); - c1[2] = amd_bytealign (w1[3], w2[0], offset); - c1[1] = amd_bytealign (w1[2], w1[3], offset); - c1[0] = amd_bytealign (w1[1], w1[2], offset); - c0[3] = amd_bytealign (w1[0], w1[1], offset); - c0[2] = amd_bytealign (w0[3], w1[0], offset); - c0[1] = amd_bytealign (w0[2], w0[3], offset); - c0[0] = amd_bytealign (w0[1], w0[2], offset); - w3[3] = amd_bytealign (w0[0], w0[1], offset); - w3[2] = amd_bytealign ( 0, w0[0], offset); + c3[2] = hc_bytealign (w3[3], 0, offset); + c3[1] = hc_bytealign (w3[2], w3[3], offset); + c3[0] = hc_bytealign (w3[1], w3[2], offset); + c2[3] = hc_bytealign (w3[0], w3[1], offset); + c2[2] = hc_bytealign (w2[3], w3[0], offset); + c2[1] = hc_bytealign (w2[2], w2[3], offset); + c2[0] = hc_bytealign (w2[1], w2[2], offset); + c1[3] = hc_bytealign (w2[0], w2[1], offset); + c1[2] = hc_bytealign (w1[3], w2[0], offset); + c1[1] = hc_bytealign (w1[2], w1[3], offset); + c1[0] = hc_bytealign (w1[1], w1[2], offset); + c0[3] = hc_bytealign (w1[0], w1[1], offset); + c0[2] = hc_bytealign (w0[3], w1[0], offset); + c0[1] = hc_bytealign (w0[2], w0[3], offset); + c0[0] = hc_bytealign (w0[1], w0[2], offset); + w3[3] = hc_bytealign (w0[0], w0[1], offset); + w3[2] = hc_bytealign ( 0, w0[0], offset); w3[1] = 0; w3[0] = 0; w2[3] = 0; @@ -1660,23 +1660,23 @@ DECLSPEC void switch_buffer_by_offset_carry_le (u32x *w0, u32x *w1, u32x *w2, u3 break; case 15: - c3[3] = amd_bytealign (w3[3], 0, offset); - c3[2] = amd_bytealign (w3[2], w3[3], offset); - c3[1] = amd_bytealign (w3[1], w3[2], offset); - c3[0] = amd_bytealign (w3[0], w3[1], offset); - c2[3] = amd_bytealign (w2[3], w3[0], offset); - c2[2] = amd_bytealign (w2[2], w2[3], offset); - c2[1] = amd_bytealign (w2[1], w2[2], offset); - c2[0] = amd_bytealign (w2[0], w2[1], offset); - c1[3] = amd_bytealign (w1[3], w2[0], offset); - c1[2] = amd_bytealign (w1[2], w1[3], offset); - c1[1] = amd_bytealign (w1[1], w1[2], offset); - c1[0] = amd_bytealign (w1[0], w1[1], offset); - c0[3] = amd_bytealign (w0[3], w1[0], offset); - c0[2] = amd_bytealign (w0[2], w0[3], offset); - c0[1] = amd_bytealign (w0[1], w0[2], offset); - c0[0] = amd_bytealign (w0[0], w0[1], offset); - w3[3] = amd_bytealign ( 0, w0[0], offset); + c3[3] = hc_bytealign (w3[3], 0, offset); + c3[2] = hc_bytealign (w3[2], w3[3], offset); + c3[1] = hc_bytealign (w3[1], w3[2], offset); + c3[0] = hc_bytealign (w3[0], w3[1], offset); + c2[3] = hc_bytealign (w2[3], w3[0], offset); + c2[2] = hc_bytealign (w2[2], w2[3], offset); + c2[1] = hc_bytealign (w2[1], w2[2], offset); + c2[0] = hc_bytealign (w2[0], w2[1], offset); + c1[3] = hc_bytealign (w1[3], w2[0], offset); + c1[2] = hc_bytealign (w1[2], w1[3], offset); + c1[1] = hc_bytealign (w1[1], w1[2], offset); + c1[0] = hc_bytealign (w1[0], w1[1], offset); + c0[3] = hc_bytealign (w0[3], w1[0], offset); + c0[2] = hc_bytealign (w0[2], w0[3], offset); + c0[1] = hc_bytealign (w0[1], w0[2], offset); + c0[0] = hc_bytealign (w0[0], w0[1], offset); + w3[3] = hc_bytealign ( 0, w0[0], offset); w3[2] = 0; w3[1] = 0; w3[0] = 0; @@ -1735,23 +1735,23 @@ DECLSPEC void switch_buffer_by_offset_carry_le (u32x *w0, u32x *w1, u32x *w2, u3 switch (offset_switch) { case 0: - c0[0] = amd_bytealign ( 0, w3[3], offset_minus_4); - w3[3] = amd_bytealign (w3[3], w3[2], offset_minus_4); - w3[2] = amd_bytealign (w3[2], w3[1], offset_minus_4); - w3[1] = amd_bytealign (w3[1], w3[0], offset_minus_4); - w3[0] = amd_bytealign (w3[0], w2[3], offset_minus_4); - w2[3] = amd_bytealign (w2[3], w2[2], offset_minus_4); - w2[2] = amd_bytealign (w2[2], w2[1], offset_minus_4); - w2[1] = amd_bytealign (w2[1], w2[0], offset_minus_4); - w2[0] = amd_bytealign (w2[0], w1[3], offset_minus_4); - w1[3] = amd_bytealign (w1[3], w1[2], offset_minus_4); - w1[2] = amd_bytealign (w1[2], w1[1], offset_minus_4); - w1[1] = amd_bytealign (w1[1], w1[0], offset_minus_4); - w1[0] = amd_bytealign (w1[0], w0[3], offset_minus_4); - w0[3] = amd_bytealign (w0[3], w0[2], offset_minus_4); - w0[2] = amd_bytealign (w0[2], w0[1], offset_minus_4); - w0[1] = amd_bytealign (w0[1], w0[0], offset_minus_4); - w0[0] = amd_bytealign (w0[0], 0, offset_minus_4); + c0[0] = hc_bytealign ( 0, w3[3], offset_minus_4); + w3[3] = hc_bytealign (w3[3], w3[2], offset_minus_4); + w3[2] = hc_bytealign (w3[2], w3[1], offset_minus_4); + w3[1] = hc_bytealign (w3[1], w3[0], offset_minus_4); + w3[0] = hc_bytealign (w3[0], w2[3], offset_minus_4); + w2[3] = hc_bytealign (w2[3], w2[2], offset_minus_4); + w2[2] = hc_bytealign (w2[2], w2[1], offset_minus_4); + w2[1] = hc_bytealign (w2[1], w2[0], offset_minus_4); + w2[0] = hc_bytealign (w2[0], w1[3], offset_minus_4); + w1[3] = hc_bytealign (w1[3], w1[2], offset_minus_4); + w1[2] = hc_bytealign (w1[2], w1[1], offset_minus_4); + w1[1] = hc_bytealign (w1[1], w1[0], offset_minus_4); + w1[0] = hc_bytealign (w1[0], w0[3], offset_minus_4); + w0[3] = hc_bytealign (w0[3], w0[2], offset_minus_4); + w0[2] = hc_bytealign (w0[2], w0[1], offset_minus_4); + w0[1] = hc_bytealign (w0[1], w0[0], offset_minus_4); + w0[0] = hc_bytealign (w0[0], 0, offset_minus_4); if (offset_mod_4 == 0) { @@ -1777,23 +1777,23 @@ DECLSPEC void switch_buffer_by_offset_carry_le (u32x *w0, u32x *w1, u32x *w2, u3 break; case 1: - c0[1] = amd_bytealign ( 0, w3[3], offset_minus_4); - c0[0] = amd_bytealign (w3[3], w3[2], offset_minus_4); - w3[3] = amd_bytealign (w3[2], w3[1], offset_minus_4); - w3[2] = amd_bytealign (w3[1], w3[0], offset_minus_4); - w3[1] = amd_bytealign (w3[0], w2[3], offset_minus_4); - w3[0] = amd_bytealign (w2[3], w2[2], offset_minus_4); - w2[3] = amd_bytealign (w2[2], w2[1], offset_minus_4); - w2[2] = amd_bytealign (w2[1], w2[0], offset_minus_4); - w2[1] = amd_bytealign (w2[0], w1[3], offset_minus_4); - w2[0] = amd_bytealign (w1[3], w1[2], offset_minus_4); - w1[3] = amd_bytealign (w1[2], w1[1], offset_minus_4); - w1[2] = amd_bytealign (w1[1], w1[0], offset_minus_4); - w1[1] = amd_bytealign (w1[0], w0[3], offset_minus_4); - w1[0] = amd_bytealign (w0[3], w0[2], offset_minus_4); - w0[3] = amd_bytealign (w0[2], w0[1], offset_minus_4); - w0[2] = amd_bytealign (w0[1], w0[0], offset_minus_4); - w0[1] = amd_bytealign (w0[0], 0, offset_minus_4); + c0[1] = hc_bytealign ( 0, w3[3], offset_minus_4); + c0[0] = hc_bytealign (w3[3], w3[2], offset_minus_4); + w3[3] = hc_bytealign (w3[2], w3[1], offset_minus_4); + w3[2] = hc_bytealign (w3[1], w3[0], offset_minus_4); + w3[1] = hc_bytealign (w3[0], w2[3], offset_minus_4); + w3[0] = hc_bytealign (w2[3], w2[2], offset_minus_4); + w2[3] = hc_bytealign (w2[2], w2[1], offset_minus_4); + w2[2] = hc_bytealign (w2[1], w2[0], offset_minus_4); + w2[1] = hc_bytealign (w2[0], w1[3], offset_minus_4); + w2[0] = hc_bytealign (w1[3], w1[2], offset_minus_4); + w1[3] = hc_bytealign (w1[2], w1[1], offset_minus_4); + w1[2] = hc_bytealign (w1[1], w1[0], offset_minus_4); + w1[1] = hc_bytealign (w1[0], w0[3], offset_minus_4); + w1[0] = hc_bytealign (w0[3], w0[2], offset_minus_4); + w0[3] = hc_bytealign (w0[2], w0[1], offset_minus_4); + w0[2] = hc_bytealign (w0[1], w0[0], offset_minus_4); + w0[1] = hc_bytealign (w0[0], 0, offset_minus_4); w0[0] = 0; if (offset_mod_4 == 0) @@ -1820,23 +1820,23 @@ DECLSPEC void switch_buffer_by_offset_carry_le (u32x *w0, u32x *w1, u32x *w2, u3 break; case 2: - c0[2] = amd_bytealign ( 0, w3[3], offset_minus_4); - c0[1] = amd_bytealign (w3[3], w3[2], offset_minus_4); - c0[0] = amd_bytealign (w3[2], w3[1], offset_minus_4); - w3[3] = amd_bytealign (w3[1], w3[0], offset_minus_4); - w3[2] = amd_bytealign (w3[0], w2[3], offset_minus_4); - w3[1] = amd_bytealign (w2[3], w2[2], offset_minus_4); - w3[0] = amd_bytealign (w2[2], w2[1], offset_minus_4); - w2[3] = amd_bytealign (w2[1], w2[0], offset_minus_4); - w2[2] = amd_bytealign (w2[0], w1[3], offset_minus_4); - w2[1] = amd_bytealign (w1[3], w1[2], offset_minus_4); - w2[0] = amd_bytealign (w1[2], w1[1], offset_minus_4); - w1[3] = amd_bytealign (w1[1], w1[0], offset_minus_4); - w1[2] = amd_bytealign (w1[0], w0[3], offset_minus_4); - w1[1] = amd_bytealign (w0[3], w0[2], offset_minus_4); - w1[0] = amd_bytealign (w0[2], w0[1], offset_minus_4); - w0[3] = amd_bytealign (w0[1], w0[0], offset_minus_4); - w0[2] = amd_bytealign (w0[0], 0, offset_minus_4); + c0[2] = hc_bytealign ( 0, w3[3], offset_minus_4); + c0[1] = hc_bytealign (w3[3], w3[2], offset_minus_4); + c0[0] = hc_bytealign (w3[2], w3[1], offset_minus_4); + w3[3] = hc_bytealign (w3[1], w3[0], offset_minus_4); + w3[2] = hc_bytealign (w3[0], w2[3], offset_minus_4); + w3[1] = hc_bytealign (w2[3], w2[2], offset_minus_4); + w3[0] = hc_bytealign (w2[2], w2[1], offset_minus_4); + w2[3] = hc_bytealign (w2[1], w2[0], offset_minus_4); + w2[2] = hc_bytealign (w2[0], w1[3], offset_minus_4); + w2[1] = hc_bytealign (w1[3], w1[2], offset_minus_4); + w2[0] = hc_bytealign (w1[2], w1[1], offset_minus_4); + w1[3] = hc_bytealign (w1[1], w1[0], offset_minus_4); + w1[2] = hc_bytealign (w1[0], w0[3], offset_minus_4); + w1[1] = hc_bytealign (w0[3], w0[2], offset_minus_4); + w1[0] = hc_bytealign (w0[2], w0[1], offset_minus_4); + w0[3] = hc_bytealign (w0[1], w0[0], offset_minus_4); + w0[2] = hc_bytealign (w0[0], 0, offset_minus_4); w0[1] = 0; w0[0] = 0; @@ -1864,23 +1864,23 @@ DECLSPEC void switch_buffer_by_offset_carry_le (u32x *w0, u32x *w1, u32x *w2, u3 break; case 3: - c0[3] = amd_bytealign ( 0, w3[3], offset_minus_4); - c0[2] = amd_bytealign (w3[3], w3[2], offset_minus_4); - c0[1] = amd_bytealign (w3[2], w3[1], offset_minus_4); - c0[0] = amd_bytealign (w3[1], w3[0], offset_minus_4); - w3[3] = amd_bytealign (w3[0], w2[3], offset_minus_4); - w3[2] = amd_bytealign (w2[3], w2[2], offset_minus_4); - w3[1] = amd_bytealign (w2[2], w2[1], offset_minus_4); - w3[0] = amd_bytealign (w2[1], w2[0], offset_minus_4); - w2[3] = amd_bytealign (w2[0], w1[3], offset_minus_4); - w2[2] = amd_bytealign (w1[3], w1[2], offset_minus_4); - w2[1] = amd_bytealign (w1[2], w1[1], offset_minus_4); - w2[0] = amd_bytealign (w1[1], w1[0], offset_minus_4); - w1[3] = amd_bytealign (w1[0], w0[3], offset_minus_4); - w1[2] = amd_bytealign (w0[3], w0[2], offset_minus_4); - w1[1] = amd_bytealign (w0[2], w0[1], offset_minus_4); - w1[0] = amd_bytealign (w0[1], w0[0], offset_minus_4); - w0[3] = amd_bytealign (w0[0], 0, offset_minus_4); + c0[3] = hc_bytealign ( 0, w3[3], offset_minus_4); + c0[2] = hc_bytealign (w3[3], w3[2], offset_minus_4); + c0[1] = hc_bytealign (w3[2], w3[1], offset_minus_4); + c0[0] = hc_bytealign (w3[1], w3[0], offset_minus_4); + w3[3] = hc_bytealign (w3[0], w2[3], offset_minus_4); + w3[2] = hc_bytealign (w2[3], w2[2], offset_minus_4); + w3[1] = hc_bytealign (w2[2], w2[1], offset_minus_4); + w3[0] = hc_bytealign (w2[1], w2[0], offset_minus_4); + w2[3] = hc_bytealign (w2[0], w1[3], offset_minus_4); + w2[2] = hc_bytealign (w1[3], w1[2], offset_minus_4); + w2[1] = hc_bytealign (w1[2], w1[1], offset_minus_4); + w2[0] = hc_bytealign (w1[1], w1[0], offset_minus_4); + w1[3] = hc_bytealign (w1[0], w0[3], offset_minus_4); + w1[2] = hc_bytealign (w0[3], w0[2], offset_minus_4); + w1[1] = hc_bytealign (w0[2], w0[1], offset_minus_4); + w1[0] = hc_bytealign (w0[1], w0[0], offset_minus_4); + w0[3] = hc_bytealign (w0[0], 0, offset_minus_4); w0[2] = 0; w0[1] = 0; w0[0] = 0; @@ -1909,23 +1909,23 @@ DECLSPEC void switch_buffer_by_offset_carry_le (u32x *w0, u32x *w1, u32x *w2, u3 break; case 4: - c1[0] = amd_bytealign ( 0, w3[3], offset_minus_4); - c0[3] = amd_bytealign (w3[3], w3[2], offset_minus_4); - c0[2] = amd_bytealign (w3[2], w3[1], offset_minus_4); - c0[1] = amd_bytealign (w3[1], w3[0], offset_minus_4); - c0[0] = amd_bytealign (w3[0], w2[3], offset_minus_4); - w3[3] = amd_bytealign (w2[3], w2[2], offset_minus_4); - w3[2] = amd_bytealign (w2[2], w2[1], offset_minus_4); - w3[1] = amd_bytealign (w2[1], w2[0], offset_minus_4); - w3[0] = amd_bytealign (w2[0], w1[3], offset_minus_4); - w2[3] = amd_bytealign (w1[3], w1[2], offset_minus_4); - w2[2] = amd_bytealign (w1[2], w1[1], offset_minus_4); - w2[1] = amd_bytealign (w1[1], w1[0], offset_minus_4); - w2[0] = amd_bytealign (w1[0], w0[3], offset_minus_4); - w1[3] = amd_bytealign (w0[3], w0[2], offset_minus_4); - w1[2] = amd_bytealign (w0[2], w0[1], offset_minus_4); - w1[1] = amd_bytealign (w0[1], w0[0], offset_minus_4); - w1[0] = amd_bytealign (w0[0], 0, offset_minus_4); + c1[0] = hc_bytealign ( 0, w3[3], offset_minus_4); + c0[3] = hc_bytealign (w3[3], w3[2], offset_minus_4); + c0[2] = hc_bytealign (w3[2], w3[1], offset_minus_4); + c0[1] = hc_bytealign (w3[1], w3[0], offset_minus_4); + c0[0] = hc_bytealign (w3[0], w2[3], offset_minus_4); + w3[3] = hc_bytealign (w2[3], w2[2], offset_minus_4); + w3[2] = hc_bytealign (w2[2], w2[1], offset_minus_4); + w3[1] = hc_bytealign (w2[1], w2[0], offset_minus_4); + w3[0] = hc_bytealign (w2[0], w1[3], offset_minus_4); + w2[3] = hc_bytealign (w1[3], w1[2], offset_minus_4); + w2[2] = hc_bytealign (w1[2], w1[1], offset_minus_4); + w2[1] = hc_bytealign (w1[1], w1[0], offset_minus_4); + w2[0] = hc_bytealign (w1[0], w0[3], offset_minus_4); + w1[3] = hc_bytealign (w0[3], w0[2], offset_minus_4); + w1[2] = hc_bytealign (w0[2], w0[1], offset_minus_4); + w1[1] = hc_bytealign (w0[1], w0[0], offset_minus_4); + w1[0] = hc_bytealign (w0[0], 0, offset_minus_4); w0[3] = 0; w0[2] = 0; w0[1] = 0; @@ -1955,23 +1955,23 @@ DECLSPEC void switch_buffer_by_offset_carry_le (u32x *w0, u32x *w1, u32x *w2, u3 break; case 5: - c1[1] = amd_bytealign ( 0, w3[3], offset_minus_4); - c1[0] = amd_bytealign (w3[3], w3[2], offset_minus_4); - c0[3] = amd_bytealign (w3[2], w3[1], offset_minus_4); - c0[2] = amd_bytealign (w3[1], w3[0], offset_minus_4); - c0[1] = amd_bytealign (w3[0], w2[3], offset_minus_4); - c0[0] = amd_bytealign (w2[3], w2[2], offset_minus_4); - w3[3] = amd_bytealign (w2[2], w2[1], offset_minus_4); - w3[2] = amd_bytealign (w2[1], w2[0], offset_minus_4); - w3[1] = amd_bytealign (w2[0], w1[3], offset_minus_4); - w3[0] = amd_bytealign (w1[3], w1[2], offset_minus_4); - w2[3] = amd_bytealign (w1[2], w1[1], offset_minus_4); - w2[2] = amd_bytealign (w1[1], w1[0], offset_minus_4); - w2[1] = amd_bytealign (w1[0], w0[3], offset_minus_4); - w2[0] = amd_bytealign (w0[3], w0[2], offset_minus_4); - w1[3] = amd_bytealign (w0[2], w0[1], offset_minus_4); - w1[2] = amd_bytealign (w0[1], w0[0], offset_minus_4); - w1[1] = amd_bytealign (w0[0], 0, offset_minus_4); + c1[1] = hc_bytealign ( 0, w3[3], offset_minus_4); + c1[0] = hc_bytealign (w3[3], w3[2], offset_minus_4); + c0[3] = hc_bytealign (w3[2], w3[1], offset_minus_4); + c0[2] = hc_bytealign (w3[1], w3[0], offset_minus_4); + c0[1] = hc_bytealign (w3[0], w2[3], offset_minus_4); + c0[0] = hc_bytealign (w2[3], w2[2], offset_minus_4); + w3[3] = hc_bytealign (w2[2], w2[1], offset_minus_4); + w3[2] = hc_bytealign (w2[1], w2[0], offset_minus_4); + w3[1] = hc_bytealign (w2[0], w1[3], offset_minus_4); + w3[0] = hc_bytealign (w1[3], w1[2], offset_minus_4); + w2[3] = hc_bytealign (w1[2], w1[1], offset_minus_4); + w2[2] = hc_bytealign (w1[1], w1[0], offset_minus_4); + w2[1] = hc_bytealign (w1[0], w0[3], offset_minus_4); + w2[0] = hc_bytealign (w0[3], w0[2], offset_minus_4); + w1[3] = hc_bytealign (w0[2], w0[1], offset_minus_4); + w1[2] = hc_bytealign (w0[1], w0[0], offset_minus_4); + w1[1] = hc_bytealign (w0[0], 0, offset_minus_4); w1[0] = 0; w0[3] = 0; w0[2] = 0; @@ -2002,23 +2002,23 @@ DECLSPEC void switch_buffer_by_offset_carry_le (u32x *w0, u32x *w1, u32x *w2, u3 break; case 6: - c1[2] = amd_bytealign ( 0, w3[3], offset_minus_4); - c1[1] = amd_bytealign (w3[3], w3[2], offset_minus_4); - c1[0] = amd_bytealign (w3[2], w3[1], offset_minus_4); - c0[3] = amd_bytealign (w3[1], w3[0], offset_minus_4); - c0[2] = amd_bytealign (w3[0], w2[3], offset_minus_4); - c0[1] = amd_bytealign (w2[3], w2[2], offset_minus_4); - c0[0] = amd_bytealign (w2[2], w2[1], offset_minus_4); - w3[3] = amd_bytealign (w2[1], w2[0], offset_minus_4); - w3[2] = amd_bytealign (w2[0], w1[3], offset_minus_4); - w3[1] = amd_bytealign (w1[3], w1[2], offset_minus_4); - w3[0] = amd_bytealign (w1[2], w1[1], offset_minus_4); - w2[3] = amd_bytealign (w1[1], w1[0], offset_minus_4); - w2[2] = amd_bytealign (w1[0], w0[3], offset_minus_4); - w2[1] = amd_bytealign (w0[3], w0[2], offset_minus_4); - w2[0] = amd_bytealign (w0[2], w0[1], offset_minus_4); - w1[3] = amd_bytealign (w0[1], w0[0], offset_minus_4); - w1[2] = amd_bytealign (w0[0], 0, offset_minus_4); + c1[2] = hc_bytealign ( 0, w3[3], offset_minus_4); + c1[1] = hc_bytealign (w3[3], w3[2], offset_minus_4); + c1[0] = hc_bytealign (w3[2], w3[1], offset_minus_4); + c0[3] = hc_bytealign (w3[1], w3[0], offset_minus_4); + c0[2] = hc_bytealign (w3[0], w2[3], offset_minus_4); + c0[1] = hc_bytealign (w2[3], w2[2], offset_minus_4); + c0[0] = hc_bytealign (w2[2], w2[1], offset_minus_4); + w3[3] = hc_bytealign (w2[1], w2[0], offset_minus_4); + w3[2] = hc_bytealign (w2[0], w1[3], offset_minus_4); + w3[1] = hc_bytealign (w1[3], w1[2], offset_minus_4); + w3[0] = hc_bytealign (w1[2], w1[1], offset_minus_4); + w2[3] = hc_bytealign (w1[1], w1[0], offset_minus_4); + w2[2] = hc_bytealign (w1[0], w0[3], offset_minus_4); + w2[1] = hc_bytealign (w0[3], w0[2], offset_minus_4); + w2[0] = hc_bytealign (w0[2], w0[1], offset_minus_4); + w1[3] = hc_bytealign (w0[1], w0[0], offset_minus_4); + w1[2] = hc_bytealign (w0[0], 0, offset_minus_4); w1[1] = 0; w1[0] = 0; w0[3] = 0; @@ -2050,23 +2050,23 @@ DECLSPEC void switch_buffer_by_offset_carry_le (u32x *w0, u32x *w1, u32x *w2, u3 break; case 7: - c1[3] = amd_bytealign ( 0, w3[3], offset_minus_4); - c1[2] = amd_bytealign (w3[3], w3[2], offset_minus_4); - c1[1] = amd_bytealign (w3[2], w3[1], offset_minus_4); - c1[0] = amd_bytealign (w3[1], w3[0], offset_minus_4); - c0[3] = amd_bytealign (w3[0], w2[3], offset_minus_4); - c0[2] = amd_bytealign (w2[3], w2[2], offset_minus_4); - c0[1] = amd_bytealign (w2[2], w2[1], offset_minus_4); - c0[0] = amd_bytealign (w2[1], w2[0], offset_minus_4); - w3[3] = amd_bytealign (w2[0], w1[3], offset_minus_4); - w3[2] = amd_bytealign (w1[3], w1[2], offset_minus_4); - w3[1] = amd_bytealign (w1[2], w1[1], offset_minus_4); - w3[0] = amd_bytealign (w1[1], w1[0], offset_minus_4); - w2[3] = amd_bytealign (w1[0], w0[3], offset_minus_4); - w2[2] = amd_bytealign (w0[3], w0[2], offset_minus_4); - w2[1] = amd_bytealign (w0[2], w0[1], offset_minus_4); - w2[0] = amd_bytealign (w0[1], w0[0], offset_minus_4); - w1[3] = amd_bytealign (w0[0], 0, offset_minus_4); + c1[3] = hc_bytealign ( 0, w3[3], offset_minus_4); + c1[2] = hc_bytealign (w3[3], w3[2], offset_minus_4); + c1[1] = hc_bytealign (w3[2], w3[1], offset_minus_4); + c1[0] = hc_bytealign (w3[1], w3[0], offset_minus_4); + c0[3] = hc_bytealign (w3[0], w2[3], offset_minus_4); + c0[2] = hc_bytealign (w2[3], w2[2], offset_minus_4); + c0[1] = hc_bytealign (w2[2], w2[1], offset_minus_4); + c0[0] = hc_bytealign (w2[1], w2[0], offset_minus_4); + w3[3] = hc_bytealign (w2[0], w1[3], offset_minus_4); + w3[2] = hc_bytealign (w1[3], w1[2], offset_minus_4); + w3[1] = hc_bytealign (w1[2], w1[1], offset_minus_4); + w3[0] = hc_bytealign (w1[1], w1[0], offset_minus_4); + w2[3] = hc_bytealign (w1[0], w0[3], offset_minus_4); + w2[2] = hc_bytealign (w0[3], w0[2], offset_minus_4); + w2[1] = hc_bytealign (w0[2], w0[1], offset_minus_4); + w2[0] = hc_bytealign (w0[1], w0[0], offset_minus_4); + w1[3] = hc_bytealign (w0[0], 0, offset_minus_4); w1[2] = 0; w1[1] = 0; w1[0] = 0; @@ -2099,23 +2099,23 @@ DECLSPEC void switch_buffer_by_offset_carry_le (u32x *w0, u32x *w1, u32x *w2, u3 break; case 8: - c2[0] = amd_bytealign ( 0, w3[3], offset_minus_4); - c1[3] = amd_bytealign (w3[3], w3[2], offset_minus_4); - c1[2] = amd_bytealign (w3[2], w3[1], offset_minus_4); - c1[1] = amd_bytealign (w3[1], w3[0], offset_minus_4); - c1[0] = amd_bytealign (w3[0], w2[3], offset_minus_4); - c0[3] = amd_bytealign (w2[3], w2[2], offset_minus_4); - c0[2] = amd_bytealign (w2[2], w2[1], offset_minus_4); - c0[1] = amd_bytealign (w2[1], w2[0], offset_minus_4); - c0[0] = amd_bytealign (w2[0], w1[3], offset_minus_4); - w3[3] = amd_bytealign (w1[3], w1[2], offset_minus_4); - w3[2] = amd_bytealign (w1[2], w1[1], offset_minus_4); - w3[1] = amd_bytealign (w1[1], w1[0], offset_minus_4); - w3[0] = amd_bytealign (w1[0], w0[3], offset_minus_4); - w2[3] = amd_bytealign (w0[3], w0[2], offset_minus_4); - w2[2] = amd_bytealign (w0[2], w0[1], offset_minus_4); - w2[1] = amd_bytealign (w0[1], w0[0], offset_minus_4); - w2[0] = amd_bytealign (w0[0], 0, offset_minus_4); + c2[0] = hc_bytealign ( 0, w3[3], offset_minus_4); + c1[3] = hc_bytealign (w3[3], w3[2], offset_minus_4); + c1[2] = hc_bytealign (w3[2], w3[1], offset_minus_4); + c1[1] = hc_bytealign (w3[1], w3[0], offset_minus_4); + c1[0] = hc_bytealign (w3[0], w2[3], offset_minus_4); + c0[3] = hc_bytealign (w2[3], w2[2], offset_minus_4); + c0[2] = hc_bytealign (w2[2], w2[1], offset_minus_4); + c0[1] = hc_bytealign (w2[1], w2[0], offset_minus_4); + c0[0] = hc_bytealign (w2[0], w1[3], offset_minus_4); + w3[3] = hc_bytealign (w1[3], w1[2], offset_minus_4); + w3[2] = hc_bytealign (w1[2], w1[1], offset_minus_4); + w3[1] = hc_bytealign (w1[1], w1[0], offset_minus_4); + w3[0] = hc_bytealign (w1[0], w0[3], offset_minus_4); + w2[3] = hc_bytealign (w0[3], w0[2], offset_minus_4); + w2[2] = hc_bytealign (w0[2], w0[1], offset_minus_4); + w2[1] = hc_bytealign (w0[1], w0[0], offset_minus_4); + w2[0] = hc_bytealign (w0[0], 0, offset_minus_4); w1[3] = 0; w1[2] = 0; w1[1] = 0; @@ -2149,23 +2149,23 @@ DECLSPEC void switch_buffer_by_offset_carry_le (u32x *w0, u32x *w1, u32x *w2, u3 break; case 9: - c2[1] = amd_bytealign ( 0, w3[3], offset_minus_4); - c2[0] = amd_bytealign (w3[3], w3[2], offset_minus_4); - c1[3] = amd_bytealign (w3[2], w3[1], offset_minus_4); - c1[2] = amd_bytealign (w3[1], w3[0], offset_minus_4); - c1[1] = amd_bytealign (w3[0], w2[3], offset_minus_4); - c1[0] = amd_bytealign (w2[3], w2[2], offset_minus_4); - c0[3] = amd_bytealign (w2[2], w2[1], offset_minus_4); - c0[2] = amd_bytealign (w2[1], w2[0], offset_minus_4); - c0[1] = amd_bytealign (w2[0], w1[3], offset_minus_4); - c0[0] = amd_bytealign (w1[3], w1[2], offset_minus_4); - w3[3] = amd_bytealign (w1[2], w1[1], offset_minus_4); - w3[2] = amd_bytealign (w1[1], w1[0], offset_minus_4); - w3[1] = amd_bytealign (w1[0], w0[3], offset_minus_4); - w3[0] = amd_bytealign (w0[3], w0[2], offset_minus_4); - w2[3] = amd_bytealign (w0[2], w0[1], offset_minus_4); - w2[2] = amd_bytealign (w0[1], w0[0], offset_minus_4); - w2[1] = amd_bytealign (w0[0], 0, offset_minus_4); + c2[1] = hc_bytealign ( 0, w3[3], offset_minus_4); + c2[0] = hc_bytealign (w3[3], w3[2], offset_minus_4); + c1[3] = hc_bytealign (w3[2], w3[1], offset_minus_4); + c1[2] = hc_bytealign (w3[1], w3[0], offset_minus_4); + c1[1] = hc_bytealign (w3[0], w2[3], offset_minus_4); + c1[0] = hc_bytealign (w2[3], w2[2], offset_minus_4); + c0[3] = hc_bytealign (w2[2], w2[1], offset_minus_4); + c0[2] = hc_bytealign (w2[1], w2[0], offset_minus_4); + c0[1] = hc_bytealign (w2[0], w1[3], offset_minus_4); + c0[0] = hc_bytealign (w1[3], w1[2], offset_minus_4); + w3[3] = hc_bytealign (w1[2], w1[1], offset_minus_4); + w3[2] = hc_bytealign (w1[1], w1[0], offset_minus_4); + w3[1] = hc_bytealign (w1[0], w0[3], offset_minus_4); + w3[0] = hc_bytealign (w0[3], w0[2], offset_minus_4); + w2[3] = hc_bytealign (w0[2], w0[1], offset_minus_4); + w2[2] = hc_bytealign (w0[1], w0[0], offset_minus_4); + w2[1] = hc_bytealign (w0[0], 0, offset_minus_4); w2[0] = 0; w1[3] = 0; w1[2] = 0; @@ -2200,23 +2200,23 @@ DECLSPEC void switch_buffer_by_offset_carry_le (u32x *w0, u32x *w1, u32x *w2, u3 break; case 10: - c2[2] = amd_bytealign ( 0, w3[3], offset_minus_4); - c2[1] = amd_bytealign (w3[3], w3[2], offset_minus_4); - c2[0] = amd_bytealign (w3[2], w3[1], offset_minus_4); - c1[3] = amd_bytealign (w3[1], w3[0], offset_minus_4); - c1[2] = amd_bytealign (w3[0], w2[3], offset_minus_4); - c1[1] = amd_bytealign (w2[3], w2[2], offset_minus_4); - c1[0] = amd_bytealign (w2[2], w2[1], offset_minus_4); - c0[3] = amd_bytealign (w2[1], w2[0], offset_minus_4); - c0[2] = amd_bytealign (w2[0], w1[3], offset_minus_4); - c0[1] = amd_bytealign (w1[3], w1[2], offset_minus_4); - c0[0] = amd_bytealign (w1[2], w1[1], offset_minus_4); - w3[3] = amd_bytealign (w1[1], w1[0], offset_minus_4); - w3[2] = amd_bytealign (w1[0], w0[3], offset_minus_4); - w3[1] = amd_bytealign (w0[3], w0[2], offset_minus_4); - w3[0] = amd_bytealign (w0[2], w0[1], offset_minus_4); - w2[3] = amd_bytealign (w0[1], w0[0], offset_minus_4); - w2[2] = amd_bytealign (w0[0], 0, offset_minus_4); + c2[2] = hc_bytealign ( 0, w3[3], offset_minus_4); + c2[1] = hc_bytealign (w3[3], w3[2], offset_minus_4); + c2[0] = hc_bytealign (w3[2], w3[1], offset_minus_4); + c1[3] = hc_bytealign (w3[1], w3[0], offset_minus_4); + c1[2] = hc_bytealign (w3[0], w2[3], offset_minus_4); + c1[1] = hc_bytealign (w2[3], w2[2], offset_minus_4); + c1[0] = hc_bytealign (w2[2], w2[1], offset_minus_4); + c0[3] = hc_bytealign (w2[1], w2[0], offset_minus_4); + c0[2] = hc_bytealign (w2[0], w1[3], offset_minus_4); + c0[1] = hc_bytealign (w1[3], w1[2], offset_minus_4); + c0[0] = hc_bytealign (w1[2], w1[1], offset_minus_4); + w3[3] = hc_bytealign (w1[1], w1[0], offset_minus_4); + w3[2] = hc_bytealign (w1[0], w0[3], offset_minus_4); + w3[1] = hc_bytealign (w0[3], w0[2], offset_minus_4); + w3[0] = hc_bytealign (w0[2], w0[1], offset_minus_4); + w2[3] = hc_bytealign (w0[1], w0[0], offset_minus_4); + w2[2] = hc_bytealign (w0[0], 0, offset_minus_4); w2[1] = 0; w2[0] = 0; w1[3] = 0; @@ -2252,23 +2252,23 @@ DECLSPEC void switch_buffer_by_offset_carry_le (u32x *w0, u32x *w1, u32x *w2, u3 break; case 11: - c2[3] = amd_bytealign ( 0, w3[3], offset_minus_4); - c2[2] = amd_bytealign (w3[3], w3[2], offset_minus_4); - c2[1] = amd_bytealign (w3[2], w3[1], offset_minus_4); - c2[0] = amd_bytealign (w3[1], w3[0], offset_minus_4); - c1[3] = amd_bytealign (w3[0], w2[3], offset_minus_4); - c1[2] = amd_bytealign (w2[3], w2[2], offset_minus_4); - c1[1] = amd_bytealign (w2[2], w2[1], offset_minus_4); - c1[0] = amd_bytealign (w2[1], w2[0], offset_minus_4); - c0[3] = amd_bytealign (w2[0], w1[3], offset_minus_4); - c0[2] = amd_bytealign (w1[3], w1[2], offset_minus_4); - c0[1] = amd_bytealign (w1[2], w1[1], offset_minus_4); - c0[0] = amd_bytealign (w1[1], w1[0], offset_minus_4); - w3[3] = amd_bytealign (w1[0], w0[3], offset_minus_4); - w3[2] = amd_bytealign (w0[3], w0[2], offset_minus_4); - w3[1] = amd_bytealign (w0[2], w0[1], offset_minus_4); - w3[0] = amd_bytealign (w0[1], w0[0], offset_minus_4); - w2[3] = amd_bytealign (w0[0], 0, offset_minus_4); + c2[3] = hc_bytealign ( 0, w3[3], offset_minus_4); + c2[2] = hc_bytealign (w3[3], w3[2], offset_minus_4); + c2[1] = hc_bytealign (w3[2], w3[1], offset_minus_4); + c2[0] = hc_bytealign (w3[1], w3[0], offset_minus_4); + c1[3] = hc_bytealign (w3[0], w2[3], offset_minus_4); + c1[2] = hc_bytealign (w2[3], w2[2], offset_minus_4); + c1[1] = hc_bytealign (w2[2], w2[1], offset_minus_4); + c1[0] = hc_bytealign (w2[1], w2[0], offset_minus_4); + c0[3] = hc_bytealign (w2[0], w1[3], offset_minus_4); + c0[2] = hc_bytealign (w1[3], w1[2], offset_minus_4); + c0[1] = hc_bytealign (w1[2], w1[1], offset_minus_4); + c0[0] = hc_bytealign (w1[1], w1[0], offset_minus_4); + w3[3] = hc_bytealign (w1[0], w0[3], offset_minus_4); + w3[2] = hc_bytealign (w0[3], w0[2], offset_minus_4); + w3[1] = hc_bytealign (w0[2], w0[1], offset_minus_4); + w3[0] = hc_bytealign (w0[1], w0[0], offset_minus_4); + w2[3] = hc_bytealign (w0[0], 0, offset_minus_4); w2[2] = 0; w2[1] = 0; w2[0] = 0; @@ -2305,23 +2305,23 @@ DECLSPEC void switch_buffer_by_offset_carry_le (u32x *w0, u32x *w1, u32x *w2, u3 break; case 12: - c3[0] = amd_bytealign ( 0, w3[3], offset_minus_4); - c2[3] = amd_bytealign (w3[3], w3[2], offset_minus_4); - c2[2] = amd_bytealign (w3[2], w3[1], offset_minus_4); - c2[1] = amd_bytealign (w3[1], w3[0], offset_minus_4); - c2[0] = amd_bytealign (w3[0], w2[3], offset_minus_4); - c1[3] = amd_bytealign (w2[3], w2[2], offset_minus_4); - c1[2] = amd_bytealign (w2[2], w2[1], offset_minus_4); - c1[1] = amd_bytealign (w2[1], w2[0], offset_minus_4); - c1[0] = amd_bytealign (w2[0], w1[3], offset_minus_4); - c0[3] = amd_bytealign (w1[3], w1[2], offset_minus_4); - c0[2] = amd_bytealign (w1[2], w1[1], offset_minus_4); - c0[1] = amd_bytealign (w1[1], w1[0], offset_minus_4); - c0[0] = amd_bytealign (w1[0], w0[3], offset_minus_4); - w3[3] = amd_bytealign (w0[3], w0[2], offset_minus_4); - w3[2] = amd_bytealign (w0[2], w0[1], offset_minus_4); - w3[1] = amd_bytealign (w0[1], w0[0], offset_minus_4); - w3[0] = amd_bytealign (w0[0], 0, offset_minus_4); + c3[0] = hc_bytealign ( 0, w3[3], offset_minus_4); + c2[3] = hc_bytealign (w3[3], w3[2], offset_minus_4); + c2[2] = hc_bytealign (w3[2], w3[1], offset_minus_4); + c2[1] = hc_bytealign (w3[1], w3[0], offset_minus_4); + c2[0] = hc_bytealign (w3[0], w2[3], offset_minus_4); + c1[3] = hc_bytealign (w2[3], w2[2], offset_minus_4); + c1[2] = hc_bytealign (w2[2], w2[1], offset_minus_4); + c1[1] = hc_bytealign (w2[1], w2[0], offset_minus_4); + c1[0] = hc_bytealign (w2[0], w1[3], offset_minus_4); + c0[3] = hc_bytealign (w1[3], w1[2], offset_minus_4); + c0[2] = hc_bytealign (w1[2], w1[1], offset_minus_4); + c0[1] = hc_bytealign (w1[1], w1[0], offset_minus_4); + c0[0] = hc_bytealign (w1[0], w0[3], offset_minus_4); + w3[3] = hc_bytealign (w0[3], w0[2], offset_minus_4); + w3[2] = hc_bytealign (w0[2], w0[1], offset_minus_4); + w3[1] = hc_bytealign (w0[1], w0[0], offset_minus_4); + w3[0] = hc_bytealign (w0[0], 0, offset_minus_4); w2[3] = 0; w2[2] = 0; w2[1] = 0; @@ -2359,23 +2359,23 @@ DECLSPEC void switch_buffer_by_offset_carry_le (u32x *w0, u32x *w1, u32x *w2, u3 break; case 13: - c3[1] = amd_bytealign ( 0, w3[3], offset_minus_4); - c3[0] = amd_bytealign (w3[3], w3[2], offset_minus_4); - c2[3] = amd_bytealign (w3[2], w3[1], offset_minus_4); - c2[2] = amd_bytealign (w3[1], w3[0], offset_minus_4); - c2[1] = amd_bytealign (w3[0], w2[3], offset_minus_4); - c2[0] = amd_bytealign (w2[3], w2[2], offset_minus_4); - c1[3] = amd_bytealign (w2[2], w2[1], offset_minus_4); - c1[2] = amd_bytealign (w2[1], w2[0], offset_minus_4); - c1[1] = amd_bytealign (w2[0], w1[3], offset_minus_4); - c1[0] = amd_bytealign (w1[3], w1[2], offset_minus_4); - c0[3] = amd_bytealign (w1[2], w1[1], offset_minus_4); - c0[2] = amd_bytealign (w1[1], w1[0], offset_minus_4); - c0[1] = amd_bytealign (w1[0], w0[3], offset_minus_4); - c0[0] = amd_bytealign (w0[3], w0[2], offset_minus_4); - w3[3] = amd_bytealign (w0[2], w0[1], offset_minus_4); - w3[2] = amd_bytealign (w0[1], w0[0], offset_minus_4); - w3[1] = amd_bytealign (w0[0], 0, offset_minus_4); + c3[1] = hc_bytealign ( 0, w3[3], offset_minus_4); + c3[0] = hc_bytealign (w3[3], w3[2], offset_minus_4); + c2[3] = hc_bytealign (w3[2], w3[1], offset_minus_4); + c2[2] = hc_bytealign (w3[1], w3[0], offset_minus_4); + c2[1] = hc_bytealign (w3[0], w2[3], offset_minus_4); + c2[0] = hc_bytealign (w2[3], w2[2], offset_minus_4); + c1[3] = hc_bytealign (w2[2], w2[1], offset_minus_4); + c1[2] = hc_bytealign (w2[1], w2[0], offset_minus_4); + c1[1] = hc_bytealign (w2[0], w1[3], offset_minus_4); + c1[0] = hc_bytealign (w1[3], w1[2], offset_minus_4); + c0[3] = hc_bytealign (w1[2], w1[1], offset_minus_4); + c0[2] = hc_bytealign (w1[1], w1[0], offset_minus_4); + c0[1] = hc_bytealign (w1[0], w0[3], offset_minus_4); + c0[0] = hc_bytealign (w0[3], w0[2], offset_minus_4); + w3[3] = hc_bytealign (w0[2], w0[1], offset_minus_4); + w3[2] = hc_bytealign (w0[1], w0[0], offset_minus_4); + w3[1] = hc_bytealign (w0[0], 0, offset_minus_4); w3[0] = 0; w2[3] = 0; w2[2] = 0; @@ -2414,23 +2414,23 @@ DECLSPEC void switch_buffer_by_offset_carry_le (u32x *w0, u32x *w1, u32x *w2, u3 break; case 14: - c3[2] = amd_bytealign ( 0, w3[3], offset_minus_4); - c3[1] = amd_bytealign (w3[3], w3[2], offset_minus_4); - c3[0] = amd_bytealign (w3[2], w3[1], offset_minus_4); - c2[3] = amd_bytealign (w3[1], w3[0], offset_minus_4); - c2[2] = amd_bytealign (w3[0], w2[3], offset_minus_4); - c2[1] = amd_bytealign (w2[3], w2[2], offset_minus_4); - c2[0] = amd_bytealign (w2[2], w2[1], offset_minus_4); - c1[3] = amd_bytealign (w2[1], w2[0], offset_minus_4); - c1[2] = amd_bytealign (w2[0], w1[3], offset_minus_4); - c1[1] = amd_bytealign (w1[3], w1[2], offset_minus_4); - c1[0] = amd_bytealign (w1[2], w1[1], offset_minus_4); - c0[3] = amd_bytealign (w1[1], w1[0], offset_minus_4); - c0[2] = amd_bytealign (w1[0], w0[3], offset_minus_4); - c0[1] = amd_bytealign (w0[3], w0[2], offset_minus_4); - c0[0] = amd_bytealign (w0[2], w0[1], offset_minus_4); - w3[3] = amd_bytealign (w0[1], w0[0], offset_minus_4); - w3[2] = amd_bytealign (w0[0], 0, offset_minus_4); + c3[2] = hc_bytealign ( 0, w3[3], offset_minus_4); + c3[1] = hc_bytealign (w3[3], w3[2], offset_minus_4); + c3[0] = hc_bytealign (w3[2], w3[1], offset_minus_4); + c2[3] = hc_bytealign (w3[1], w3[0], offset_minus_4); + c2[2] = hc_bytealign (w3[0], w2[3], offset_minus_4); + c2[1] = hc_bytealign (w2[3], w2[2], offset_minus_4); + c2[0] = hc_bytealign (w2[2], w2[1], offset_minus_4); + c1[3] = hc_bytealign (w2[1], w2[0], offset_minus_4); + c1[2] = hc_bytealign (w2[0], w1[3], offset_minus_4); + c1[1] = hc_bytealign (w1[3], w1[2], offset_minus_4); + c1[0] = hc_bytealign (w1[2], w1[1], offset_minus_4); + c0[3] = hc_bytealign (w1[1], w1[0], offset_minus_4); + c0[2] = hc_bytealign (w1[0], w0[3], offset_minus_4); + c0[1] = hc_bytealign (w0[3], w0[2], offset_minus_4); + c0[0] = hc_bytealign (w0[2], w0[1], offset_minus_4); + w3[3] = hc_bytealign (w0[1], w0[0], offset_minus_4); + w3[2] = hc_bytealign (w0[0], 0, offset_minus_4); w3[1] = 0; w3[0] = 0; w2[3] = 0; @@ -2470,23 +2470,23 @@ DECLSPEC void switch_buffer_by_offset_carry_le (u32x *w0, u32x *w1, u32x *w2, u3 break; case 15: - c3[3] = amd_bytealign ( 0, w3[3], offset_minus_4); - c3[2] = amd_bytealign (w3[3], w3[2], offset_minus_4); - c3[1] = amd_bytealign (w3[2], w3[1], offset_minus_4); - c3[0] = amd_bytealign (w3[1], w3[0], offset_minus_4); - c2[3] = amd_bytealign (w3[0], w2[3], offset_minus_4); - c2[2] = amd_bytealign (w2[3], w2[2], offset_minus_4); - c2[1] = amd_bytealign (w2[2], w2[1], offset_minus_4); - c2[0] = amd_bytealign (w2[1], w2[0], offset_minus_4); - c1[3] = amd_bytealign (w2[0], w1[3], offset_minus_4); - c1[2] = amd_bytealign (w1[3], w1[2], offset_minus_4); - c1[1] = amd_bytealign (w1[2], w1[1], offset_minus_4); - c1[0] = amd_bytealign (w1[1], w1[0], offset_minus_4); - c0[3] = amd_bytealign (w1[0], w0[3], offset_minus_4); - c0[2] = amd_bytealign (w0[3], w0[2], offset_minus_4); - c0[1] = amd_bytealign (w0[2], w0[1], offset_minus_4); - c0[0] = amd_bytealign (w0[1], w0[0], offset_minus_4); - w3[3] = amd_bytealign (w0[0], 0, offset_minus_4); + c3[3] = hc_bytealign ( 0, w3[3], offset_minus_4); + c3[2] = hc_bytealign (w3[3], w3[2], offset_minus_4); + c3[1] = hc_bytealign (w3[2], w3[1], offset_minus_4); + c3[0] = hc_bytealign (w3[1], w3[0], offset_minus_4); + c2[3] = hc_bytealign (w3[0], w2[3], offset_minus_4); + c2[2] = hc_bytealign (w2[3], w2[2], offset_minus_4); + c2[1] = hc_bytealign (w2[2], w2[1], offset_minus_4); + c2[0] = hc_bytealign (w2[1], w2[0], offset_minus_4); + c1[3] = hc_bytealign (w2[0], w1[3], offset_minus_4); + c1[2] = hc_bytealign (w1[3], w1[2], offset_minus_4); + c1[1] = hc_bytealign (w1[2], w1[1], offset_minus_4); + c1[0] = hc_bytealign (w1[1], w1[0], offset_minus_4); + c0[3] = hc_bytealign (w1[0], w0[3], offset_minus_4); + c0[2] = hc_bytealign (w0[3], w0[2], offset_minus_4); + c0[1] = hc_bytealign (w0[2], w0[1], offset_minus_4); + c0[0] = hc_bytealign (w0[1], w0[0], offset_minus_4); + w3[3] = hc_bytealign (w0[0], 0, offset_minus_4); w3[2] = 0; w3[1] = 0; w3[0] = 0; @@ -2538,79 +2538,79 @@ DECLSPEC void switch_buffer_by_offset_be (u32x *w0, u32x *w1, u32x *w2, u32x *w3 switch (offset_switch) { case 0: - w3[3] = amd_bytealign (w3[2], w3[3], offset); - w3[2] = amd_bytealign (w3[1], w3[2], offset); - w3[1] = amd_bytealign (w3[0], w3[1], offset); - w3[0] = amd_bytealign (w2[3], w3[0], offset); - w2[3] = amd_bytealign (w2[2], w2[3], offset); - w2[2] = amd_bytealign (w2[1], w2[2], offset); - w2[1] = amd_bytealign (w2[0], w2[1], offset); - w2[0] = amd_bytealign (w1[3], w2[0], offset); - w1[3] = amd_bytealign (w1[2], w1[3], offset); - w1[2] = amd_bytealign (w1[1], w1[2], offset); - w1[1] = amd_bytealign (w1[0], w1[1], offset); - w1[0] = amd_bytealign (w0[3], w1[0], offset); - w0[3] = amd_bytealign (w0[2], w0[3], offset); - w0[2] = amd_bytealign (w0[1], w0[2], offset); - w0[1] = amd_bytealign (w0[0], w0[1], offset); - w0[0] = amd_bytealign ( 0, w0[0], offset); + w3[3] = hc_bytealign (w3[2], w3[3], offset); + w3[2] = hc_bytealign (w3[1], w3[2], offset); + w3[1] = hc_bytealign (w3[0], w3[1], offset); + w3[0] = hc_bytealign (w2[3], w3[0], offset); + w2[3] = hc_bytealign (w2[2], w2[3], offset); + w2[2] = hc_bytealign (w2[1], w2[2], offset); + w2[1] = hc_bytealign (w2[0], w2[1], offset); + w2[0] = hc_bytealign (w1[3], w2[0], offset); + w1[3] = hc_bytealign (w1[2], w1[3], offset); + w1[2] = hc_bytealign (w1[1], w1[2], offset); + w1[1] = hc_bytealign (w1[0], w1[1], offset); + w1[0] = hc_bytealign (w0[3], w1[0], offset); + w0[3] = hc_bytealign (w0[2], w0[3], offset); + w0[2] = hc_bytealign (w0[1], w0[2], offset); + w0[1] = hc_bytealign (w0[0], w0[1], offset); + w0[0] = hc_bytealign ( 0, w0[0], offset); break; case 1: - w3[3] = amd_bytealign (w3[1], w3[2], offset); - w3[2] = amd_bytealign (w3[0], w3[1], offset); - w3[1] = amd_bytealign (w2[3], w3[0], offset); - w3[0] = amd_bytealign (w2[2], w2[3], offset); - w2[3] = amd_bytealign (w2[1], w2[2], offset); - w2[2] = amd_bytealign (w2[0], w2[1], offset); - w2[1] = amd_bytealign (w1[3], w2[0], offset); - w2[0] = amd_bytealign (w1[2], w1[3], offset); - w1[3] = amd_bytealign (w1[1], w1[2], offset); - w1[2] = amd_bytealign (w1[0], w1[1], offset); - w1[1] = amd_bytealign (w0[3], w1[0], offset); - w1[0] = amd_bytealign (w0[2], w0[3], offset); - w0[3] = amd_bytealign (w0[1], w0[2], offset); - w0[2] = amd_bytealign (w0[0], w0[1], offset); - w0[1] = amd_bytealign ( 0, w0[0], offset); + w3[3] = hc_bytealign (w3[1], w3[2], offset); + w3[2] = hc_bytealign (w3[0], w3[1], offset); + w3[1] = hc_bytealign (w2[3], w3[0], offset); + w3[0] = hc_bytealign (w2[2], w2[3], offset); + w2[3] = hc_bytealign (w2[1], w2[2], offset); + w2[2] = hc_bytealign (w2[0], w2[1], offset); + w2[1] = hc_bytealign (w1[3], w2[0], offset); + w2[0] = hc_bytealign (w1[2], w1[3], offset); + w1[3] = hc_bytealign (w1[1], w1[2], offset); + w1[2] = hc_bytealign (w1[0], w1[1], offset); + w1[1] = hc_bytealign (w0[3], w1[0], offset); + w1[0] = hc_bytealign (w0[2], w0[3], offset); + w0[3] = hc_bytealign (w0[1], w0[2], offset); + w0[2] = hc_bytealign (w0[0], w0[1], offset); + w0[1] = hc_bytealign ( 0, w0[0], offset); w0[0] = 0; break; case 2: - w3[3] = amd_bytealign (w3[0], w3[1], offset); - w3[2] = amd_bytealign (w2[3], w3[0], offset); - w3[1] = amd_bytealign (w2[2], w2[3], offset); - w3[0] = amd_bytealign (w2[1], w2[2], offset); - w2[3] = amd_bytealign (w2[0], w2[1], offset); - w2[2] = amd_bytealign (w1[3], w2[0], offset); - w2[1] = amd_bytealign (w1[2], w1[3], offset); - w2[0] = amd_bytealign (w1[1], w1[2], offset); - w1[3] = amd_bytealign (w1[0], w1[1], offset); - w1[2] = amd_bytealign (w0[3], w1[0], offset); - w1[1] = amd_bytealign (w0[2], w0[3], offset); - w1[0] = amd_bytealign (w0[1], w0[2], offset); - w0[3] = amd_bytealign (w0[0], w0[1], offset); - w0[2] = amd_bytealign ( 0, w0[0], offset); + w3[3] = hc_bytealign (w3[0], w3[1], offset); + w3[2] = hc_bytealign (w2[3], w3[0], offset); + w3[1] = hc_bytealign (w2[2], w2[3], offset); + w3[0] = hc_bytealign (w2[1], w2[2], offset); + w2[3] = hc_bytealign (w2[0], w2[1], offset); + w2[2] = hc_bytealign (w1[3], w2[0], offset); + w2[1] = hc_bytealign (w1[2], w1[3], offset); + w2[0] = hc_bytealign (w1[1], w1[2], offset); + w1[3] = hc_bytealign (w1[0], w1[1], offset); + w1[2] = hc_bytealign (w0[3], w1[0], offset); + w1[1] = hc_bytealign (w0[2], w0[3], offset); + w1[0] = hc_bytealign (w0[1], w0[2], offset); + w0[3] = hc_bytealign (w0[0], w0[1], offset); + w0[2] = hc_bytealign ( 0, w0[0], offset); w0[1] = 0; w0[0] = 0; break; case 3: - w3[3] = amd_bytealign (w2[3], w3[0], offset); - w3[2] = amd_bytealign (w2[2], w2[3], offset); - w3[1] = amd_bytealign (w2[1], w2[2], offset); - w3[0] = amd_bytealign (w2[0], w2[1], offset); - w2[3] = amd_bytealign (w1[3], w2[0], offset); - w2[2] = amd_bytealign (w1[2], w1[3], offset); - w2[1] = amd_bytealign (w1[1], w1[2], offset); - w2[0] = amd_bytealign (w1[0], w1[1], offset); - w1[3] = amd_bytealign (w0[3], w1[0], offset); - w1[2] = amd_bytealign (w0[2], w0[3], offset); - w1[1] = amd_bytealign (w0[1], w0[2], offset); - w1[0] = amd_bytealign (w0[0], w0[1], offset); - w0[3] = amd_bytealign ( 0, w0[0], offset); + w3[3] = hc_bytealign (w2[3], w3[0], offset); + w3[2] = hc_bytealign (w2[2], w2[3], offset); + w3[1] = hc_bytealign (w2[1], w2[2], offset); + w3[0] = hc_bytealign (w2[0], w2[1], offset); + w2[3] = hc_bytealign (w1[3], w2[0], offset); + w2[2] = hc_bytealign (w1[2], w1[3], offset); + w2[1] = hc_bytealign (w1[1], w1[2], offset); + w2[0] = hc_bytealign (w1[0], w1[1], offset); + w1[3] = hc_bytealign (w0[3], w1[0], offset); + w1[2] = hc_bytealign (w0[2], w0[3], offset); + w1[1] = hc_bytealign (w0[1], w0[2], offset); + w1[0] = hc_bytealign (w0[0], w0[1], offset); + w0[3] = hc_bytealign ( 0, w0[0], offset); w0[2] = 0; w0[1] = 0; w0[0] = 0; @@ -2618,18 +2618,18 @@ DECLSPEC void switch_buffer_by_offset_be (u32x *w0, u32x *w1, u32x *w2, u32x *w3 break; case 4: - w3[3] = amd_bytealign (w2[2], w2[3], offset); - w3[2] = amd_bytealign (w2[1], w2[2], offset); - w3[1] = amd_bytealign (w2[0], w2[1], offset); - w3[0] = amd_bytealign (w1[3], w2[0], offset); - w2[3] = amd_bytealign (w1[2], w1[3], offset); - w2[2] = amd_bytealign (w1[1], w1[2], offset); - w2[1] = amd_bytealign (w1[0], w1[1], offset); - w2[0] = amd_bytealign (w0[3], w1[0], offset); - w1[3] = amd_bytealign (w0[2], w0[3], offset); - w1[2] = amd_bytealign (w0[1], w0[2], offset); - w1[1] = amd_bytealign (w0[0], w0[1], offset); - w1[0] = amd_bytealign ( 0, w0[0], offset); + w3[3] = hc_bytealign (w2[2], w2[3], offset); + w3[2] = hc_bytealign (w2[1], w2[2], offset); + w3[1] = hc_bytealign (w2[0], w2[1], offset); + w3[0] = hc_bytealign (w1[3], w2[0], offset); + w2[3] = hc_bytealign (w1[2], w1[3], offset); + w2[2] = hc_bytealign (w1[1], w1[2], offset); + w2[1] = hc_bytealign (w1[0], w1[1], offset); + w2[0] = hc_bytealign (w0[3], w1[0], offset); + w1[3] = hc_bytealign (w0[2], w0[3], offset); + w1[2] = hc_bytealign (w0[1], w0[2], offset); + w1[1] = hc_bytealign (w0[0], w0[1], offset); + w1[0] = hc_bytealign ( 0, w0[0], offset); w0[3] = 0; w0[2] = 0; w0[1] = 0; @@ -2638,17 +2638,17 @@ DECLSPEC void switch_buffer_by_offset_be (u32x *w0, u32x *w1, u32x *w2, u32x *w3 break; case 5: - w3[3] = amd_bytealign (w2[1], w2[2], offset); - w3[2] = amd_bytealign (w2[0], w2[1], offset); - w3[1] = amd_bytealign (w1[3], w2[0], offset); - w3[0] = amd_bytealign (w1[2], w1[3], offset); - w2[3] = amd_bytealign (w1[1], w1[2], offset); - w2[2] = amd_bytealign (w1[0], w1[1], offset); - w2[1] = amd_bytealign (w0[3], w1[0], offset); - w2[0] = amd_bytealign (w0[2], w0[3], offset); - w1[3] = amd_bytealign (w0[1], w0[2], offset); - w1[2] = amd_bytealign (w0[0], w0[1], offset); - w1[1] = amd_bytealign ( 0, w0[0], offset); + w3[3] = hc_bytealign (w2[1], w2[2], offset); + w3[2] = hc_bytealign (w2[0], w2[1], offset); + w3[1] = hc_bytealign (w1[3], w2[0], offset); + w3[0] = hc_bytealign (w1[2], w1[3], offset); + w2[3] = hc_bytealign (w1[1], w1[2], offset); + w2[2] = hc_bytealign (w1[0], w1[1], offset); + w2[1] = hc_bytealign (w0[3], w1[0], offset); + w2[0] = hc_bytealign (w0[2], w0[3], offset); + w1[3] = hc_bytealign (w0[1], w0[2], offset); + w1[2] = hc_bytealign (w0[0], w0[1], offset); + w1[1] = hc_bytealign ( 0, w0[0], offset); w1[0] = 0; w0[3] = 0; w0[2] = 0; @@ -2658,16 +2658,16 @@ DECLSPEC void switch_buffer_by_offset_be (u32x *w0, u32x *w1, u32x *w2, u32x *w3 break; case 6: - w3[3] = amd_bytealign (w2[0], w2[1], offset); - w3[2] = amd_bytealign (w1[3], w2[0], offset); - w3[1] = amd_bytealign (w1[2], w1[3], offset); - w3[0] = amd_bytealign (w1[1], w1[2], offset); - w2[3] = amd_bytealign (w1[0], w1[1], offset); - w2[2] = amd_bytealign (w0[3], w1[0], offset); - w2[1] = amd_bytealign (w0[2], w0[3], offset); - w2[0] = amd_bytealign (w0[1], w0[2], offset); - w1[3] = amd_bytealign (w0[0], w0[1], offset); - w1[2] = amd_bytealign ( 0, w0[0], offset); + w3[3] = hc_bytealign (w2[0], w2[1], offset); + w3[2] = hc_bytealign (w1[3], w2[0], offset); + w3[1] = hc_bytealign (w1[2], w1[3], offset); + w3[0] = hc_bytealign (w1[1], w1[2], offset); + w2[3] = hc_bytealign (w1[0], w1[1], offset); + w2[2] = hc_bytealign (w0[3], w1[0], offset); + w2[1] = hc_bytealign (w0[2], w0[3], offset); + w2[0] = hc_bytealign (w0[1], w0[2], offset); + w1[3] = hc_bytealign (w0[0], w0[1], offset); + w1[2] = hc_bytealign ( 0, w0[0], offset); w1[1] = 0; w1[0] = 0; w0[3] = 0; @@ -2678,15 +2678,15 @@ DECLSPEC void switch_buffer_by_offset_be (u32x *w0, u32x *w1, u32x *w2, u32x *w3 break; case 7: - w3[3] = amd_bytealign (w1[3], w2[0], offset); - w3[2] = amd_bytealign (w1[2], w1[3], offset); - w3[1] = amd_bytealign (w1[1], w1[2], offset); - w3[0] = amd_bytealign (w1[0], w1[1], offset); - w2[3] = amd_bytealign (w0[3], w1[0], offset); - w2[2] = amd_bytealign (w0[2], w0[3], offset); - w2[1] = amd_bytealign (w0[1], w0[2], offset); - w2[0] = amd_bytealign (w0[0], w0[1], offset); - w1[3] = amd_bytealign ( 0, w0[0], offset); + w3[3] = hc_bytealign (w1[3], w2[0], offset); + w3[2] = hc_bytealign (w1[2], w1[3], offset); + w3[1] = hc_bytealign (w1[1], w1[2], offset); + w3[0] = hc_bytealign (w1[0], w1[1], offset); + w2[3] = hc_bytealign (w0[3], w1[0], offset); + w2[2] = hc_bytealign (w0[2], w0[3], offset); + w2[1] = hc_bytealign (w0[1], w0[2], offset); + w2[0] = hc_bytealign (w0[0], w0[1], offset); + w1[3] = hc_bytealign ( 0, w0[0], offset); w1[2] = 0; w1[1] = 0; w1[0] = 0; @@ -2698,14 +2698,14 @@ DECLSPEC void switch_buffer_by_offset_be (u32x *w0, u32x *w1, u32x *w2, u32x *w3 break; case 8: - w3[3] = amd_bytealign (w1[2], w1[3], offset); - w3[2] = amd_bytealign (w1[1], w1[2], offset); - w3[1] = amd_bytealign (w1[0], w1[1], offset); - w3[0] = amd_bytealign (w0[3], w1[0], offset); - w2[3] = amd_bytealign (w0[2], w0[3], offset); - w2[2] = amd_bytealign (w0[1], w0[2], offset); - w2[1] = amd_bytealign (w0[0], w0[1], offset); - w2[0] = amd_bytealign ( 0, w0[0], offset); + w3[3] = hc_bytealign (w1[2], w1[3], offset); + w3[2] = hc_bytealign (w1[1], w1[2], offset); + w3[1] = hc_bytealign (w1[0], w1[1], offset); + w3[0] = hc_bytealign (w0[3], w1[0], offset); + w2[3] = hc_bytealign (w0[2], w0[3], offset); + w2[2] = hc_bytealign (w0[1], w0[2], offset); + w2[1] = hc_bytealign (w0[0], w0[1], offset); + w2[0] = hc_bytealign ( 0, w0[0], offset); w1[3] = 0; w1[2] = 0; w1[1] = 0; @@ -2718,13 +2718,13 @@ DECLSPEC void switch_buffer_by_offset_be (u32x *w0, u32x *w1, u32x *w2, u32x *w3 break; case 9: - w3[3] = amd_bytealign (w1[1], w1[2], offset); - w3[2] = amd_bytealign (w1[0], w1[1], offset); - w3[1] = amd_bytealign (w0[3], w1[0], offset); - w3[0] = amd_bytealign (w0[2], w0[3], offset); - w2[3] = amd_bytealign (w0[1], w0[2], offset); - w2[2] = amd_bytealign (w0[0], w0[1], offset); - w2[1] = amd_bytealign ( 0, w0[0], offset); + w3[3] = hc_bytealign (w1[1], w1[2], offset); + w3[2] = hc_bytealign (w1[0], w1[1], offset); + w3[1] = hc_bytealign (w0[3], w1[0], offset); + w3[0] = hc_bytealign (w0[2], w0[3], offset); + w2[3] = hc_bytealign (w0[1], w0[2], offset); + w2[2] = hc_bytealign (w0[0], w0[1], offset); + w2[1] = hc_bytealign ( 0, w0[0], offset); w2[0] = 0; w1[3] = 0; w1[2] = 0; @@ -2738,12 +2738,12 @@ DECLSPEC void switch_buffer_by_offset_be (u32x *w0, u32x *w1, u32x *w2, u32x *w3 break; case 10: - w3[3] = amd_bytealign (w1[0], w1[1], offset); - w3[2] = amd_bytealign (w0[3], w1[0], offset); - w3[1] = amd_bytealign (w0[2], w0[3], offset); - w3[0] = amd_bytealign (w0[1], w0[2], offset); - w2[3] = amd_bytealign (w0[0], w0[1], offset); - w2[2] = amd_bytealign ( 0, w0[0], offset); + w3[3] = hc_bytealign (w1[0], w1[1], offset); + w3[2] = hc_bytealign (w0[3], w1[0], offset); + w3[1] = hc_bytealign (w0[2], w0[3], offset); + w3[0] = hc_bytealign (w0[1], w0[2], offset); + w2[3] = hc_bytealign (w0[0], w0[1], offset); + w2[2] = hc_bytealign ( 0, w0[0], offset); w2[1] = 0; w2[0] = 0; w1[3] = 0; @@ -2758,11 +2758,11 @@ DECLSPEC void switch_buffer_by_offset_be (u32x *w0, u32x *w1, u32x *w2, u32x *w3 break; case 11: - w3[3] = amd_bytealign (w0[3], w1[0], offset); - w3[2] = amd_bytealign (w0[2], w0[3], offset); - w3[1] = amd_bytealign (w0[1], w0[2], offset); - w3[0] = amd_bytealign (w0[0], w0[1], offset); - w2[3] = amd_bytealign ( 0, w0[0], offset); + w3[3] = hc_bytealign (w0[3], w1[0], offset); + w3[2] = hc_bytealign (w0[2], w0[3], offset); + w3[1] = hc_bytealign (w0[1], w0[2], offset); + w3[0] = hc_bytealign (w0[0], w0[1], offset); + w2[3] = hc_bytealign ( 0, w0[0], offset); w2[2] = 0; w2[1] = 0; w2[0] = 0; @@ -2778,10 +2778,10 @@ DECLSPEC void switch_buffer_by_offset_be (u32x *w0, u32x *w1, u32x *w2, u32x *w3 break; case 12: - w3[3] = amd_bytealign (w0[2], w0[3], offset); - w3[2] = amd_bytealign (w0[1], w0[2], offset); - w3[1] = amd_bytealign (w0[0], w0[1], offset); - w3[0] = amd_bytealign ( 0, w0[0], offset); + w3[3] = hc_bytealign (w0[2], w0[3], offset); + w3[2] = hc_bytealign (w0[1], w0[2], offset); + w3[1] = hc_bytealign (w0[0], w0[1], offset); + w3[0] = hc_bytealign ( 0, w0[0], offset); w2[3] = 0; w2[2] = 0; w2[1] = 0; @@ -2798,9 +2798,9 @@ DECLSPEC void switch_buffer_by_offset_be (u32x *w0, u32x *w1, u32x *w2, u32x *w3 break; case 13: - w3[3] = amd_bytealign (w0[1], w0[2], offset); - w3[2] = amd_bytealign (w0[0], w0[1], offset); - w3[1] = amd_bytealign ( 0, w0[0], offset); + w3[3] = hc_bytealign (w0[1], w0[2], offset); + w3[2] = hc_bytealign (w0[0], w0[1], offset); + w3[1] = hc_bytealign ( 0, w0[0], offset); w3[0] = 0; w2[3] = 0; w2[2] = 0; @@ -2818,8 +2818,8 @@ DECLSPEC void switch_buffer_by_offset_be (u32x *w0, u32x *w1, u32x *w2, u32x *w3 break; case 14: - w3[3] = amd_bytealign (w0[0], w0[1], offset); - w3[2] = amd_bytealign ( 0, w0[0], offset); + w3[3] = hc_bytealign (w0[0], w0[1], offset); + w3[2] = hc_bytealign ( 0, w0[0], offset); w3[1] = 0; w3[0] = 0; w2[3] = 0; @@ -2838,7 +2838,7 @@ DECLSPEC void switch_buffer_by_offset_be (u32x *w0, u32x *w1, u32x *w2, u32x *w3 break; case 15: - w3[3] = amd_bytealign ( 0, w0[0], offset); + w3[3] = hc_bytealign ( 0, w0[0], offset); w3[2] = 0; w3[1] = 0; w3[0] = 0; @@ -2873,79 +2873,79 @@ DECLSPEC void switch_buffer_by_offset_be (u32x *w0, u32x *w1, u32x *w2, u32x *w3 switch (offset_switch) { case 0: - w3[3] = __byte_perm (w3[3], w3[2], selector); - w3[2] = __byte_perm (w3[2], w3[1], selector); - w3[1] = __byte_perm (w3[1], w3[0], selector); - w3[0] = __byte_perm (w3[0], w2[3], selector); - w2[3] = __byte_perm (w2[3], w2[2], selector); - w2[2] = __byte_perm (w2[2], w2[1], selector); - w2[1] = __byte_perm (w2[1], w2[0], selector); - w2[0] = __byte_perm (w2[0], w1[3], selector); - w1[3] = __byte_perm (w1[3], w1[2], selector); - w1[2] = __byte_perm (w1[2], w1[1], selector); - w1[1] = __byte_perm (w1[1], w1[0], selector); - w1[0] = __byte_perm (w1[0], w0[3], selector); - w0[3] = __byte_perm (w0[3], w0[2], selector); - w0[2] = __byte_perm (w0[2], w0[1], selector); - w0[1] = __byte_perm (w0[1], w0[0], selector); - w0[0] = __byte_perm (w0[0], 0, selector); + w3[3] = hc_byte_perm (w3[3], w3[2], selector); + w3[2] = hc_byte_perm (w3[2], w3[1], selector); + w3[1] = hc_byte_perm (w3[1], w3[0], selector); + w3[0] = hc_byte_perm (w3[0], w2[3], selector); + w2[3] = hc_byte_perm (w2[3], w2[2], selector); + w2[2] = hc_byte_perm (w2[2], w2[1], selector); + w2[1] = hc_byte_perm (w2[1], w2[0], selector); + w2[0] = hc_byte_perm (w2[0], w1[3], selector); + w1[3] = hc_byte_perm (w1[3], w1[2], selector); + w1[2] = hc_byte_perm (w1[2], w1[1], selector); + w1[1] = hc_byte_perm (w1[1], w1[0], selector); + w1[0] = hc_byte_perm (w1[0], w0[3], selector); + w0[3] = hc_byte_perm (w0[3], w0[2], selector); + w0[2] = hc_byte_perm (w0[2], w0[1], selector); + w0[1] = hc_byte_perm (w0[1], w0[0], selector); + w0[0] = hc_byte_perm (w0[0], 0, selector); break; case 1: - w3[3] = __byte_perm (w3[2], w3[1], selector); - w3[2] = __byte_perm (w3[1], w3[0], selector); - w3[1] = __byte_perm (w3[0], w2[3], selector); - w3[0] = __byte_perm (w2[3], w2[2], selector); - w2[3] = __byte_perm (w2[2], w2[1], selector); - w2[2] = __byte_perm (w2[1], w2[0], selector); - w2[1] = __byte_perm (w2[0], w1[3], selector); - w2[0] = __byte_perm (w1[3], w1[2], selector); - w1[3] = __byte_perm (w1[2], w1[1], selector); - w1[2] = __byte_perm (w1[1], w1[0], selector); - w1[1] = __byte_perm (w1[0], w0[3], selector); - w1[0] = __byte_perm (w0[3], w0[2], selector); - w0[3] = __byte_perm (w0[2], w0[1], selector); - w0[2] = __byte_perm (w0[1], w0[0], selector); - w0[1] = __byte_perm (w0[0], 0, selector); + w3[3] = hc_byte_perm (w3[2], w3[1], selector); + w3[2] = hc_byte_perm (w3[1], w3[0], selector); + w3[1] = hc_byte_perm (w3[0], w2[3], selector); + w3[0] = hc_byte_perm (w2[3], w2[2], selector); + w2[3] = hc_byte_perm (w2[2], w2[1], selector); + w2[2] = hc_byte_perm (w2[1], w2[0], selector); + w2[1] = hc_byte_perm (w2[0], w1[3], selector); + w2[0] = hc_byte_perm (w1[3], w1[2], selector); + w1[3] = hc_byte_perm (w1[2], w1[1], selector); + w1[2] = hc_byte_perm (w1[1], w1[0], selector); + w1[1] = hc_byte_perm (w1[0], w0[3], selector); + w1[0] = hc_byte_perm (w0[3], w0[2], selector); + w0[3] = hc_byte_perm (w0[2], w0[1], selector); + w0[2] = hc_byte_perm (w0[1], w0[0], selector); + w0[1] = hc_byte_perm (w0[0], 0, selector); w0[0] = 0; break; case 2: - w3[3] = __byte_perm (w3[1], w3[0], selector); - w3[2] = __byte_perm (w3[0], w2[3], selector); - w3[1] = __byte_perm (w2[3], w2[2], selector); - w3[0] = __byte_perm (w2[2], w2[1], selector); - w2[3] = __byte_perm (w2[1], w2[0], selector); - w2[2] = __byte_perm (w2[0], w1[3], selector); - w2[1] = __byte_perm (w1[3], w1[2], selector); - w2[0] = __byte_perm (w1[2], w1[1], selector); - w1[3] = __byte_perm (w1[1], w1[0], selector); - w1[2] = __byte_perm (w1[0], w0[3], selector); - w1[1] = __byte_perm (w0[3], w0[2], selector); - w1[0] = __byte_perm (w0[2], w0[1], selector); - w0[3] = __byte_perm (w0[1], w0[0], selector); - w0[2] = __byte_perm (w0[0], 0, selector); + w3[3] = hc_byte_perm (w3[1], w3[0], selector); + w3[2] = hc_byte_perm (w3[0], w2[3], selector); + w3[1] = hc_byte_perm (w2[3], w2[2], selector); + w3[0] = hc_byte_perm (w2[2], w2[1], selector); + w2[3] = hc_byte_perm (w2[1], w2[0], selector); + w2[2] = hc_byte_perm (w2[0], w1[3], selector); + w2[1] = hc_byte_perm (w1[3], w1[2], selector); + w2[0] = hc_byte_perm (w1[2], w1[1], selector); + w1[3] = hc_byte_perm (w1[1], w1[0], selector); + w1[2] = hc_byte_perm (w1[0], w0[3], selector); + w1[1] = hc_byte_perm (w0[3], w0[2], selector); + w1[0] = hc_byte_perm (w0[2], w0[1], selector); + w0[3] = hc_byte_perm (w0[1], w0[0], selector); + w0[2] = hc_byte_perm (w0[0], 0, selector); w0[1] = 0; w0[0] = 0; break; case 3: - w3[3] = __byte_perm (w3[0], w2[3], selector); - w3[2] = __byte_perm (w2[3], w2[2], selector); - w3[1] = __byte_perm (w2[2], w2[1], selector); - w3[0] = __byte_perm (w2[1], w2[0], selector); - w2[3] = __byte_perm (w2[0], w1[3], selector); - w2[2] = __byte_perm (w1[3], w1[2], selector); - w2[1] = __byte_perm (w1[2], w1[1], selector); - w2[0] = __byte_perm (w1[1], w1[0], selector); - w1[3] = __byte_perm (w1[0], w0[3], selector); - w1[2] = __byte_perm (w0[3], w0[2], selector); - w1[1] = __byte_perm (w0[2], w0[1], selector); - w1[0] = __byte_perm (w0[1], w0[0], selector); - w0[3] = __byte_perm (w0[0], 0, selector); + w3[3] = hc_byte_perm (w3[0], w2[3], selector); + w3[2] = hc_byte_perm (w2[3], w2[2], selector); + w3[1] = hc_byte_perm (w2[2], w2[1], selector); + w3[0] = hc_byte_perm (w2[1], w2[0], selector); + w2[3] = hc_byte_perm (w2[0], w1[3], selector); + w2[2] = hc_byte_perm (w1[3], w1[2], selector); + w2[1] = hc_byte_perm (w1[2], w1[1], selector); + w2[0] = hc_byte_perm (w1[1], w1[0], selector); + w1[3] = hc_byte_perm (w1[0], w0[3], selector); + w1[2] = hc_byte_perm (w0[3], w0[2], selector); + w1[1] = hc_byte_perm (w0[2], w0[1], selector); + w1[0] = hc_byte_perm (w0[1], w0[0], selector); + w0[3] = hc_byte_perm (w0[0], 0, selector); w0[2] = 0; w0[1] = 0; w0[0] = 0; @@ -2953,18 +2953,18 @@ DECLSPEC void switch_buffer_by_offset_be (u32x *w0, u32x *w1, u32x *w2, u32x *w3 break; case 4: - w3[3] = __byte_perm (w2[3], w2[2], selector); - w3[2] = __byte_perm (w2[2], w2[1], selector); - w3[1] = __byte_perm (w2[1], w2[0], selector); - w3[0] = __byte_perm (w2[0], w1[3], selector); - w2[3] = __byte_perm (w1[3], w1[2], selector); - w2[2] = __byte_perm (w1[2], w1[1], selector); - w2[1] = __byte_perm (w1[1], w1[0], selector); - w2[0] = __byte_perm (w1[0], w0[3], selector); - w1[3] = __byte_perm (w0[3], w0[2], selector); - w1[2] = __byte_perm (w0[2], w0[1], selector); - w1[1] = __byte_perm (w0[1], w0[0], selector); - w1[0] = __byte_perm (w0[0], 0, selector); + w3[3] = hc_byte_perm (w2[3], w2[2], selector); + w3[2] = hc_byte_perm (w2[2], w2[1], selector); + w3[1] = hc_byte_perm (w2[1], w2[0], selector); + w3[0] = hc_byte_perm (w2[0], w1[3], selector); + w2[3] = hc_byte_perm (w1[3], w1[2], selector); + w2[2] = hc_byte_perm (w1[2], w1[1], selector); + w2[1] = hc_byte_perm (w1[1], w1[0], selector); + w2[0] = hc_byte_perm (w1[0], w0[3], selector); + w1[3] = hc_byte_perm (w0[3], w0[2], selector); + w1[2] = hc_byte_perm (w0[2], w0[1], selector); + w1[1] = hc_byte_perm (w0[1], w0[0], selector); + w1[0] = hc_byte_perm (w0[0], 0, selector); w0[3] = 0; w0[2] = 0; w0[1] = 0; @@ -2973,17 +2973,17 @@ DECLSPEC void switch_buffer_by_offset_be (u32x *w0, u32x *w1, u32x *w2, u32x *w3 break; case 5: - w3[3] = __byte_perm (w2[2], w2[1], selector); - w3[2] = __byte_perm (w2[1], w2[0], selector); - w3[1] = __byte_perm (w2[0], w1[3], selector); - w3[0] = __byte_perm (w1[3], w1[2], selector); - w2[3] = __byte_perm (w1[2], w1[1], selector); - w2[2] = __byte_perm (w1[1], w1[0], selector); - w2[1] = __byte_perm (w1[0], w0[3], selector); - w2[0] = __byte_perm (w0[3], w0[2], selector); - w1[3] = __byte_perm (w0[2], w0[1], selector); - w1[2] = __byte_perm (w0[1], w0[0], selector); - w1[1] = __byte_perm (w0[0], 0, selector); + w3[3] = hc_byte_perm (w2[2], w2[1], selector); + w3[2] = hc_byte_perm (w2[1], w2[0], selector); + w3[1] = hc_byte_perm (w2[0], w1[3], selector); + w3[0] = hc_byte_perm (w1[3], w1[2], selector); + w2[3] = hc_byte_perm (w1[2], w1[1], selector); + w2[2] = hc_byte_perm (w1[1], w1[0], selector); + w2[1] = hc_byte_perm (w1[0], w0[3], selector); + w2[0] = hc_byte_perm (w0[3], w0[2], selector); + w1[3] = hc_byte_perm (w0[2], w0[1], selector); + w1[2] = hc_byte_perm (w0[1], w0[0], selector); + w1[1] = hc_byte_perm (w0[0], 0, selector); w1[0] = 0; w0[3] = 0; w0[2] = 0; @@ -2993,16 +2993,16 @@ DECLSPEC void switch_buffer_by_offset_be (u32x *w0, u32x *w1, u32x *w2, u32x *w3 break; case 6: - w3[3] = __byte_perm (w2[1], w2[0], selector); - w3[2] = __byte_perm (w2[0], w1[3], selector); - w3[1] = __byte_perm (w1[3], w1[2], selector); - w3[0] = __byte_perm (w1[2], w1[1], selector); - w2[3] = __byte_perm (w1[1], w1[0], selector); - w2[2] = __byte_perm (w1[0], w0[3], selector); - w2[1] = __byte_perm (w0[3], w0[2], selector); - w2[0] = __byte_perm (w0[2], w0[1], selector); - w1[3] = __byte_perm (w0[1], w0[0], selector); - w1[2] = __byte_perm (w0[0], 0, selector); + w3[3] = hc_byte_perm (w2[1], w2[0], selector); + w3[2] = hc_byte_perm (w2[0], w1[3], selector); + w3[1] = hc_byte_perm (w1[3], w1[2], selector); + w3[0] = hc_byte_perm (w1[2], w1[1], selector); + w2[3] = hc_byte_perm (w1[1], w1[0], selector); + w2[2] = hc_byte_perm (w1[0], w0[3], selector); + w2[1] = hc_byte_perm (w0[3], w0[2], selector); + w2[0] = hc_byte_perm (w0[2], w0[1], selector); + w1[3] = hc_byte_perm (w0[1], w0[0], selector); + w1[2] = hc_byte_perm (w0[0], 0, selector); w1[1] = 0; w1[0] = 0; w0[3] = 0; @@ -3013,15 +3013,15 @@ DECLSPEC void switch_buffer_by_offset_be (u32x *w0, u32x *w1, u32x *w2, u32x *w3 break; case 7: - w3[3] = __byte_perm (w2[0], w1[3], selector); - w3[2] = __byte_perm (w1[3], w1[2], selector); - w3[1] = __byte_perm (w1[2], w1[1], selector); - w3[0] = __byte_perm (w1[1], w1[0], selector); - w2[3] = __byte_perm (w1[0], w0[3], selector); - w2[2] = __byte_perm (w0[3], w0[2], selector); - w2[1] = __byte_perm (w0[2], w0[1], selector); - w2[0] = __byte_perm (w0[1], w0[0], selector); - w1[3] = __byte_perm (w0[0], 0, selector); + w3[3] = hc_byte_perm (w2[0], w1[3], selector); + w3[2] = hc_byte_perm (w1[3], w1[2], selector); + w3[1] = hc_byte_perm (w1[2], w1[1], selector); + w3[0] = hc_byte_perm (w1[1], w1[0], selector); + w2[3] = hc_byte_perm (w1[0], w0[3], selector); + w2[2] = hc_byte_perm (w0[3], w0[2], selector); + w2[1] = hc_byte_perm (w0[2], w0[1], selector); + w2[0] = hc_byte_perm (w0[1], w0[0], selector); + w1[3] = hc_byte_perm (w0[0], 0, selector); w1[2] = 0; w1[1] = 0; w1[0] = 0; @@ -3033,14 +3033,14 @@ DECLSPEC void switch_buffer_by_offset_be (u32x *w0, u32x *w1, u32x *w2, u32x *w3 break; case 8: - w3[3] = __byte_perm (w1[3], w1[2], selector); - w3[2] = __byte_perm (w1[2], w1[1], selector); - w3[1] = __byte_perm (w1[1], w1[0], selector); - w3[0] = __byte_perm (w1[0], w0[3], selector); - w2[3] = __byte_perm (w0[3], w0[2], selector); - w2[2] = __byte_perm (w0[2], w0[1], selector); - w2[1] = __byte_perm (w0[1], w0[0], selector); - w2[0] = __byte_perm (w0[0], 0, selector); + w3[3] = hc_byte_perm (w1[3], w1[2], selector); + w3[2] = hc_byte_perm (w1[2], w1[1], selector); + w3[1] = hc_byte_perm (w1[1], w1[0], selector); + w3[0] = hc_byte_perm (w1[0], w0[3], selector); + w2[3] = hc_byte_perm (w0[3], w0[2], selector); + w2[2] = hc_byte_perm (w0[2], w0[1], selector); + w2[1] = hc_byte_perm (w0[1], w0[0], selector); + w2[0] = hc_byte_perm (w0[0], 0, selector); w1[3] = 0; w1[2] = 0; w1[1] = 0; @@ -3053,13 +3053,13 @@ DECLSPEC void switch_buffer_by_offset_be (u32x *w0, u32x *w1, u32x *w2, u32x *w3 break; case 9: - w3[3] = __byte_perm (w1[2], w1[1], selector); - w3[2] = __byte_perm (w1[1], w1[0], selector); - w3[1] = __byte_perm (w1[0], w0[3], selector); - w3[0] = __byte_perm (w0[3], w0[2], selector); - w2[3] = __byte_perm (w0[2], w0[1], selector); - w2[2] = __byte_perm (w0[1], w0[0], selector); - w2[1] = __byte_perm (w0[0], 0, selector); + w3[3] = hc_byte_perm (w1[2], w1[1], selector); + w3[2] = hc_byte_perm (w1[1], w1[0], selector); + w3[1] = hc_byte_perm (w1[0], w0[3], selector); + w3[0] = hc_byte_perm (w0[3], w0[2], selector); + w2[3] = hc_byte_perm (w0[2], w0[1], selector); + w2[2] = hc_byte_perm (w0[1], w0[0], selector); + w2[1] = hc_byte_perm (w0[0], 0, selector); w2[0] = 0; w1[3] = 0; w1[2] = 0; @@ -3073,12 +3073,12 @@ DECLSPEC void switch_buffer_by_offset_be (u32x *w0, u32x *w1, u32x *w2, u32x *w3 break; case 10: - w3[3] = __byte_perm (w1[1], w1[0], selector); - w3[2] = __byte_perm (w1[0], w0[3], selector); - w3[1] = __byte_perm (w0[3], w0[2], selector); - w3[0] = __byte_perm (w0[2], w0[1], selector); - w2[3] = __byte_perm (w0[1], w0[0], selector); - w2[2] = __byte_perm (w0[0], 0, selector); + w3[3] = hc_byte_perm (w1[1], w1[0], selector); + w3[2] = hc_byte_perm (w1[0], w0[3], selector); + w3[1] = hc_byte_perm (w0[3], w0[2], selector); + w3[0] = hc_byte_perm (w0[2], w0[1], selector); + w2[3] = hc_byte_perm (w0[1], w0[0], selector); + w2[2] = hc_byte_perm (w0[0], 0, selector); w2[1] = 0; w2[0] = 0; w1[3] = 0; @@ -3093,11 +3093,11 @@ DECLSPEC void switch_buffer_by_offset_be (u32x *w0, u32x *w1, u32x *w2, u32x *w3 break; case 11: - w3[3] = __byte_perm (w1[0], w0[3], selector); - w3[2] = __byte_perm (w0[3], w0[2], selector); - w3[1] = __byte_perm (w0[2], w0[1], selector); - w3[0] = __byte_perm (w0[1], w0[0], selector); - w2[3] = __byte_perm (w0[0], 0, selector); + w3[3] = hc_byte_perm (w1[0], w0[3], selector); + w3[2] = hc_byte_perm (w0[3], w0[2], selector); + w3[1] = hc_byte_perm (w0[2], w0[1], selector); + w3[0] = hc_byte_perm (w0[1], w0[0], selector); + w2[3] = hc_byte_perm (w0[0], 0, selector); w2[2] = 0; w2[1] = 0; w2[0] = 0; @@ -3113,10 +3113,10 @@ DECLSPEC void switch_buffer_by_offset_be (u32x *w0, u32x *w1, u32x *w2, u32x *w3 break; case 12: - w3[3] = __byte_perm (w0[3], w0[2], selector); - w3[2] = __byte_perm (w0[2], w0[1], selector); - w3[1] = __byte_perm (w0[1], w0[0], selector); - w3[0] = __byte_perm (w0[0], 0, selector); + w3[3] = hc_byte_perm (w0[3], w0[2], selector); + w3[2] = hc_byte_perm (w0[2], w0[1], selector); + w3[1] = hc_byte_perm (w0[1], w0[0], selector); + w3[0] = hc_byte_perm (w0[0], 0, selector); w2[3] = 0; w2[2] = 0; w2[1] = 0; @@ -3133,9 +3133,9 @@ DECLSPEC void switch_buffer_by_offset_be (u32x *w0, u32x *w1, u32x *w2, u32x *w3 break; case 13: - w3[3] = __byte_perm (w0[2], w0[1], selector); - w3[2] = __byte_perm (w0[1], w0[0], selector); - w3[1] = __byte_perm (w0[0], 0, selector); + w3[3] = hc_byte_perm (w0[2], w0[1], selector); + w3[2] = hc_byte_perm (w0[1], w0[0], selector); + w3[1] = hc_byte_perm (w0[0], 0, selector); w3[0] = 0; w2[3] = 0; w2[2] = 0; @@ -3153,8 +3153,8 @@ DECLSPEC void switch_buffer_by_offset_be (u32x *w0, u32x *w1, u32x *w2, u32x *w3 break; case 14: - w3[3] = __byte_perm (w0[1], w0[0], selector); - w3[2] = __byte_perm (w0[0], 0, selector); + w3[3] = hc_byte_perm (w0[1], w0[0], selector); + w3[2] = hc_byte_perm (w0[0], 0, selector); w3[1] = 0; w3[0] = 0; w2[3] = 0; @@ -3173,7 +3173,7 @@ DECLSPEC void switch_buffer_by_offset_be (u32x *w0, u32x *w1, u32x *w2, u32x *w3 break; case 15: - w3[3] = __byte_perm (w0[0], 0, selector); + w3[3] = hc_byte_perm (w0[0], 0, selector); w3[2] = 0; w3[1] = 0; w3[0] = 0; @@ -3204,89 +3204,89 @@ DECLSPEC void switch_buffer_by_offset_carry_be (u32x *w0, u32x *w1, u32x *w2, u3 switch (offset_switch) { case 0: - c0[0] = amd_bytealign (w3[3], 0, offset); - w3[3] = amd_bytealign (w3[2], w3[3], offset); - w3[2] = amd_bytealign (w3[1], w3[2], offset); - w3[1] = amd_bytealign (w3[0], w3[1], offset); - w3[0] = amd_bytealign (w2[3], w3[0], offset); - w2[3] = amd_bytealign (w2[2], w2[3], offset); - w2[2] = amd_bytealign (w2[1], w2[2], offset); - w2[1] = amd_bytealign (w2[0], w2[1], offset); - w2[0] = amd_bytealign (w1[3], w2[0], offset); - w1[3] = amd_bytealign (w1[2], w1[3], offset); - w1[2] = amd_bytealign (w1[1], w1[2], offset); - w1[1] = amd_bytealign (w1[0], w1[1], offset); - w1[0] = amd_bytealign (w0[3], w1[0], offset); - w0[3] = amd_bytealign (w0[2], w0[3], offset); - w0[2] = amd_bytealign (w0[1], w0[2], offset); - w0[1] = amd_bytealign (w0[0], w0[1], offset); - w0[0] = amd_bytealign ( 0, w0[0], offset); + c0[0] = hc_bytealign (w3[3], 0, offset); + w3[3] = hc_bytealign (w3[2], w3[3], offset); + w3[2] = hc_bytealign (w3[1], w3[2], offset); + w3[1] = hc_bytealign (w3[0], w3[1], offset); + w3[0] = hc_bytealign (w2[3], w3[0], offset); + w2[3] = hc_bytealign (w2[2], w2[3], offset); + w2[2] = hc_bytealign (w2[1], w2[2], offset); + w2[1] = hc_bytealign (w2[0], w2[1], offset); + w2[0] = hc_bytealign (w1[3], w2[0], offset); + w1[3] = hc_bytealign (w1[2], w1[3], offset); + w1[2] = hc_bytealign (w1[1], w1[2], offset); + w1[1] = hc_bytealign (w1[0], w1[1], offset); + w1[0] = hc_bytealign (w0[3], w1[0], offset); + w0[3] = hc_bytealign (w0[2], w0[3], offset); + w0[2] = hc_bytealign (w0[1], w0[2], offset); + w0[1] = hc_bytealign (w0[0], w0[1], offset); + w0[0] = hc_bytealign ( 0, w0[0], offset); break; case 1: - c0[1] = amd_bytealign (w3[3], 0, offset); - c0[0] = amd_bytealign (w3[2], w3[3], offset); - w3[3] = amd_bytealign (w3[1], w3[2], offset); - w3[2] = amd_bytealign (w3[0], w3[1], offset); - w3[1] = amd_bytealign (w2[3], w3[0], offset); - w3[0] = amd_bytealign (w2[2], w2[3], offset); - w2[3] = amd_bytealign (w2[1], w2[2], offset); - w2[2] = amd_bytealign (w2[0], w2[1], offset); - w2[1] = amd_bytealign (w1[3], w2[0], offset); - w2[0] = amd_bytealign (w1[2], w1[3], offset); - w1[3] = amd_bytealign (w1[1], w1[2], offset); - w1[2] = amd_bytealign (w1[0], w1[1], offset); - w1[1] = amd_bytealign (w0[3], w1[0], offset); - w1[0] = amd_bytealign (w0[2], w0[3], offset); - w0[3] = amd_bytealign (w0[1], w0[2], offset); - w0[2] = amd_bytealign (w0[0], w0[1], offset); - w0[1] = amd_bytealign ( 0, w0[0], offset); + c0[1] = hc_bytealign (w3[3], 0, offset); + c0[0] = hc_bytealign (w3[2], w3[3], offset); + w3[3] = hc_bytealign (w3[1], w3[2], offset); + w3[2] = hc_bytealign (w3[0], w3[1], offset); + w3[1] = hc_bytealign (w2[3], w3[0], offset); + w3[0] = hc_bytealign (w2[2], w2[3], offset); + w2[3] = hc_bytealign (w2[1], w2[2], offset); + w2[2] = hc_bytealign (w2[0], w2[1], offset); + w2[1] = hc_bytealign (w1[3], w2[0], offset); + w2[0] = hc_bytealign (w1[2], w1[3], offset); + w1[3] = hc_bytealign (w1[1], w1[2], offset); + w1[2] = hc_bytealign (w1[0], w1[1], offset); + w1[1] = hc_bytealign (w0[3], w1[0], offset); + w1[0] = hc_bytealign (w0[2], w0[3], offset); + w0[3] = hc_bytealign (w0[1], w0[2], offset); + w0[2] = hc_bytealign (w0[0], w0[1], offset); + w0[1] = hc_bytealign ( 0, w0[0], offset); w0[0] = 0; break; case 2: - c0[2] = amd_bytealign (w3[3], 0, offset); - c0[1] = amd_bytealign (w3[2], w3[3], offset); - c0[0] = amd_bytealign (w3[1], w3[2], offset); - w3[3] = amd_bytealign (w3[0], w3[1], offset); - w3[2] = amd_bytealign (w2[3], w3[0], offset); - w3[1] = amd_bytealign (w2[2], w2[3], offset); - w3[0] = amd_bytealign (w2[1], w2[2], offset); - w2[3] = amd_bytealign (w2[0], w2[1], offset); - w2[2] = amd_bytealign (w1[3], w2[0], offset); - w2[1] = amd_bytealign (w1[2], w1[3], offset); - w2[0] = amd_bytealign (w1[1], w1[2], offset); - w1[3] = amd_bytealign (w1[0], w1[1], offset); - w1[2] = amd_bytealign (w0[3], w1[0], offset); - w1[1] = amd_bytealign (w0[2], w0[3], offset); - w1[0] = amd_bytealign (w0[1], w0[2], offset); - w0[3] = amd_bytealign (w0[0], w0[1], offset); - w0[2] = amd_bytealign ( 0, w0[0], offset); + c0[2] = hc_bytealign (w3[3], 0, offset); + c0[1] = hc_bytealign (w3[2], w3[3], offset); + c0[0] = hc_bytealign (w3[1], w3[2], offset); + w3[3] = hc_bytealign (w3[0], w3[1], offset); + w3[2] = hc_bytealign (w2[3], w3[0], offset); + w3[1] = hc_bytealign (w2[2], w2[3], offset); + w3[0] = hc_bytealign (w2[1], w2[2], offset); + w2[3] = hc_bytealign (w2[0], w2[1], offset); + w2[2] = hc_bytealign (w1[3], w2[0], offset); + w2[1] = hc_bytealign (w1[2], w1[3], offset); + w2[0] = hc_bytealign (w1[1], w1[2], offset); + w1[3] = hc_bytealign (w1[0], w1[1], offset); + w1[2] = hc_bytealign (w0[3], w1[0], offset); + w1[1] = hc_bytealign (w0[2], w0[3], offset); + w1[0] = hc_bytealign (w0[1], w0[2], offset); + w0[3] = hc_bytealign (w0[0], w0[1], offset); + w0[2] = hc_bytealign ( 0, w0[0], offset); w0[1] = 0; w0[0] = 0; break; case 3: - c0[3] = amd_bytealign (w3[3], 0, offset); - c0[2] = amd_bytealign (w3[2], w3[3], offset); - c0[1] = amd_bytealign (w3[1], w3[2], offset); - c0[0] = amd_bytealign (w3[0], w3[1], offset); - w3[3] = amd_bytealign (w2[3], w3[0], offset); - w3[2] = amd_bytealign (w2[2], w2[3], offset); - w3[1] = amd_bytealign (w2[1], w2[2], offset); - w3[0] = amd_bytealign (w2[0], w2[1], offset); - w2[3] = amd_bytealign (w1[3], w2[0], offset); - w2[2] = amd_bytealign (w1[2], w1[3], offset); - w2[1] = amd_bytealign (w1[1], w1[2], offset); - w2[0] = amd_bytealign (w1[0], w1[1], offset); - w1[3] = amd_bytealign (w0[3], w1[0], offset); - w1[2] = amd_bytealign (w0[2], w0[3], offset); - w1[1] = amd_bytealign (w0[1], w0[2], offset); - w1[0] = amd_bytealign (w0[0], w0[1], offset); - w0[3] = amd_bytealign ( 0, w0[0], offset); + c0[3] = hc_bytealign (w3[3], 0, offset); + c0[2] = hc_bytealign (w3[2], w3[3], offset); + c0[1] = hc_bytealign (w3[1], w3[2], offset); + c0[0] = hc_bytealign (w3[0], w3[1], offset); + w3[3] = hc_bytealign (w2[3], w3[0], offset); + w3[2] = hc_bytealign (w2[2], w2[3], offset); + w3[1] = hc_bytealign (w2[1], w2[2], offset); + w3[0] = hc_bytealign (w2[0], w2[1], offset); + w2[3] = hc_bytealign (w1[3], w2[0], offset); + w2[2] = hc_bytealign (w1[2], w1[3], offset); + w2[1] = hc_bytealign (w1[1], w1[2], offset); + w2[0] = hc_bytealign (w1[0], w1[1], offset); + w1[3] = hc_bytealign (w0[3], w1[0], offset); + w1[2] = hc_bytealign (w0[2], w0[3], offset); + w1[1] = hc_bytealign (w0[1], w0[2], offset); + w1[0] = hc_bytealign (w0[0], w0[1], offset); + w0[3] = hc_bytealign ( 0, w0[0], offset); w0[2] = 0; w0[1] = 0; w0[0] = 0; @@ -3294,23 +3294,23 @@ DECLSPEC void switch_buffer_by_offset_carry_be (u32x *w0, u32x *w1, u32x *w2, u3 break; case 4: - c1[0] = amd_bytealign (w3[3], 0, offset); - c0[3] = amd_bytealign (w3[2], w3[3], offset); - c0[2] = amd_bytealign (w3[1], w3[2], offset); - c0[1] = amd_bytealign (w3[0], w3[1], offset); - c0[0] = amd_bytealign (w2[3], w3[0], offset); - w3[3] = amd_bytealign (w2[2], w2[3], offset); - w3[2] = amd_bytealign (w2[1], w2[2], offset); - w3[1] = amd_bytealign (w2[0], w2[1], offset); - w3[0] = amd_bytealign (w1[3], w2[0], offset); - w2[3] = amd_bytealign (w1[2], w1[3], offset); - w2[2] = amd_bytealign (w1[1], w1[2], offset); - w2[1] = amd_bytealign (w1[0], w1[1], offset); - w2[0] = amd_bytealign (w0[3], w1[0], offset); - w1[3] = amd_bytealign (w0[2], w0[3], offset); - w1[2] = amd_bytealign (w0[1], w0[2], offset); - w1[1] = amd_bytealign (w0[0], w0[1], offset); - w1[0] = amd_bytealign ( 0, w0[0], offset); + c1[0] = hc_bytealign (w3[3], 0, offset); + c0[3] = hc_bytealign (w3[2], w3[3], offset); + c0[2] = hc_bytealign (w3[1], w3[2], offset); + c0[1] = hc_bytealign (w3[0], w3[1], offset); + c0[0] = hc_bytealign (w2[3], w3[0], offset); + w3[3] = hc_bytealign (w2[2], w2[3], offset); + w3[2] = hc_bytealign (w2[1], w2[2], offset); + w3[1] = hc_bytealign (w2[0], w2[1], offset); + w3[0] = hc_bytealign (w1[3], w2[0], offset); + w2[3] = hc_bytealign (w1[2], w1[3], offset); + w2[2] = hc_bytealign (w1[1], w1[2], offset); + w2[1] = hc_bytealign (w1[0], w1[1], offset); + w2[0] = hc_bytealign (w0[3], w1[0], offset); + w1[3] = hc_bytealign (w0[2], w0[3], offset); + w1[2] = hc_bytealign (w0[1], w0[2], offset); + w1[1] = hc_bytealign (w0[0], w0[1], offset); + w1[0] = hc_bytealign ( 0, w0[0], offset); w0[3] = 0; w0[2] = 0; w0[1] = 0; @@ -3319,23 +3319,23 @@ DECLSPEC void switch_buffer_by_offset_carry_be (u32x *w0, u32x *w1, u32x *w2, u3 break; case 5: - c1[1] = amd_bytealign (w3[3], 0, offset); - c1[0] = amd_bytealign (w3[2], w3[3], offset); - c0[3] = amd_bytealign (w3[1], w3[2], offset); - c0[2] = amd_bytealign (w3[0], w3[1], offset); - c0[1] = amd_bytealign (w2[3], w3[0], offset); - c0[0] = amd_bytealign (w2[2], w2[3], offset); - w3[3] = amd_bytealign (w2[1], w2[2], offset); - w3[2] = amd_bytealign (w2[0], w2[1], offset); - w3[1] = amd_bytealign (w1[3], w2[0], offset); - w3[0] = amd_bytealign (w1[2], w1[3], offset); - w2[3] = amd_bytealign (w1[1], w1[2], offset); - w2[2] = amd_bytealign (w1[0], w1[1], offset); - w2[1] = amd_bytealign (w0[3], w1[0], offset); - w2[0] = amd_bytealign (w0[2], w0[3], offset); - w1[3] = amd_bytealign (w0[1], w0[2], offset); - w1[2] = amd_bytealign (w0[0], w0[1], offset); - w1[1] = amd_bytealign ( 0, w0[0], offset); + c1[1] = hc_bytealign (w3[3], 0, offset); + c1[0] = hc_bytealign (w3[2], w3[3], offset); + c0[3] = hc_bytealign (w3[1], w3[2], offset); + c0[2] = hc_bytealign (w3[0], w3[1], offset); + c0[1] = hc_bytealign (w2[3], w3[0], offset); + c0[0] = hc_bytealign (w2[2], w2[3], offset); + w3[3] = hc_bytealign (w2[1], w2[2], offset); + w3[2] = hc_bytealign (w2[0], w2[1], offset); + w3[1] = hc_bytealign (w1[3], w2[0], offset); + w3[0] = hc_bytealign (w1[2], w1[3], offset); + w2[3] = hc_bytealign (w1[1], w1[2], offset); + w2[2] = hc_bytealign (w1[0], w1[1], offset); + w2[1] = hc_bytealign (w0[3], w1[0], offset); + w2[0] = hc_bytealign (w0[2], w0[3], offset); + w1[3] = hc_bytealign (w0[1], w0[2], offset); + w1[2] = hc_bytealign (w0[0], w0[1], offset); + w1[1] = hc_bytealign ( 0, w0[0], offset); w1[0] = 0; w0[3] = 0; w0[2] = 0; @@ -3345,23 +3345,23 @@ DECLSPEC void switch_buffer_by_offset_carry_be (u32x *w0, u32x *w1, u32x *w2, u3 break; case 6: - c1[2] = amd_bytealign (w3[3], 0, offset); - c1[1] = amd_bytealign (w3[2], w3[3], offset); - c1[0] = amd_bytealign (w3[1], w3[2], offset); - c0[3] = amd_bytealign (w3[0], w3[1], offset); - c0[2] = amd_bytealign (w2[3], w3[0], offset); - c0[1] = amd_bytealign (w2[2], w2[3], offset); - c0[0] = amd_bytealign (w2[1], w2[2], offset); - w3[3] = amd_bytealign (w2[0], w2[1], offset); - w3[2] = amd_bytealign (w1[3], w2[0], offset); - w3[1] = amd_bytealign (w1[2], w1[3], offset); - w3[0] = amd_bytealign (w1[1], w1[2], offset); - w2[3] = amd_bytealign (w1[0], w1[1], offset); - w2[2] = amd_bytealign (w0[3], w1[0], offset); - w2[1] = amd_bytealign (w0[2], w0[3], offset); - w2[0] = amd_bytealign (w0[1], w0[2], offset); - w1[3] = amd_bytealign (w0[0], w0[1], offset); - w1[2] = amd_bytealign ( 0, w0[0], offset); + c1[2] = hc_bytealign (w3[3], 0, offset); + c1[1] = hc_bytealign (w3[2], w3[3], offset); + c1[0] = hc_bytealign (w3[1], w3[2], offset); + c0[3] = hc_bytealign (w3[0], w3[1], offset); + c0[2] = hc_bytealign (w2[3], w3[0], offset); + c0[1] = hc_bytealign (w2[2], w2[3], offset); + c0[0] = hc_bytealign (w2[1], w2[2], offset); + w3[3] = hc_bytealign (w2[0], w2[1], offset); + w3[2] = hc_bytealign (w1[3], w2[0], offset); + w3[1] = hc_bytealign (w1[2], w1[3], offset); + w3[0] = hc_bytealign (w1[1], w1[2], offset); + w2[3] = hc_bytealign (w1[0], w1[1], offset); + w2[2] = hc_bytealign (w0[3], w1[0], offset); + w2[1] = hc_bytealign (w0[2], w0[3], offset); + w2[0] = hc_bytealign (w0[1], w0[2], offset); + w1[3] = hc_bytealign (w0[0], w0[1], offset); + w1[2] = hc_bytealign ( 0, w0[0], offset); w1[1] = 0; w1[0] = 0; w0[3] = 0; @@ -3372,23 +3372,23 @@ DECLSPEC void switch_buffer_by_offset_carry_be (u32x *w0, u32x *w1, u32x *w2, u3 break; case 7: - c1[3] = amd_bytealign (w3[3], 0, offset); - c1[2] = amd_bytealign (w3[2], w3[3], offset); - c1[1] = amd_bytealign (w3[1], w3[2], offset); - c1[0] = amd_bytealign (w3[0], w3[1], offset); - c0[3] = amd_bytealign (w2[3], w3[0], offset); - c0[2] = amd_bytealign (w2[2], w2[3], offset); - c0[1] = amd_bytealign (w2[1], w2[2], offset); - c0[0] = amd_bytealign (w2[0], w2[1], offset); - w3[3] = amd_bytealign (w1[3], w2[0], offset); - w3[2] = amd_bytealign (w1[2], w1[3], offset); - w3[1] = amd_bytealign (w1[1], w1[2], offset); - w3[0] = amd_bytealign (w1[0], w1[1], offset); - w2[3] = amd_bytealign (w0[3], w1[0], offset); - w2[2] = amd_bytealign (w0[2], w0[3], offset); - w2[1] = amd_bytealign (w0[1], w0[2], offset); - w2[0] = amd_bytealign (w0[0], w0[1], offset); - w1[3] = amd_bytealign ( 0, w0[0], offset); + c1[3] = hc_bytealign (w3[3], 0, offset); + c1[2] = hc_bytealign (w3[2], w3[3], offset); + c1[1] = hc_bytealign (w3[1], w3[2], offset); + c1[0] = hc_bytealign (w3[0], w3[1], offset); + c0[3] = hc_bytealign (w2[3], w3[0], offset); + c0[2] = hc_bytealign (w2[2], w2[3], offset); + c0[1] = hc_bytealign (w2[1], w2[2], offset); + c0[0] = hc_bytealign (w2[0], w2[1], offset); + w3[3] = hc_bytealign (w1[3], w2[0], offset); + w3[2] = hc_bytealign (w1[2], w1[3], offset); + w3[1] = hc_bytealign (w1[1], w1[2], offset); + w3[0] = hc_bytealign (w1[0], w1[1], offset); + w2[3] = hc_bytealign (w0[3], w1[0], offset); + w2[2] = hc_bytealign (w0[2], w0[3], offset); + w2[1] = hc_bytealign (w0[1], w0[2], offset); + w2[0] = hc_bytealign (w0[0], w0[1], offset); + w1[3] = hc_bytealign ( 0, w0[0], offset); w1[2] = 0; w1[1] = 0; w1[0] = 0; @@ -3400,23 +3400,23 @@ DECLSPEC void switch_buffer_by_offset_carry_be (u32x *w0, u32x *w1, u32x *w2, u3 break; case 8: - c2[0] = amd_bytealign (w3[3], 0, offset); - c1[3] = amd_bytealign (w3[2], w3[3], offset); - c1[2] = amd_bytealign (w3[1], w3[2], offset); - c1[1] = amd_bytealign (w3[0], w3[1], offset); - c1[0] = amd_bytealign (w2[3], w3[0], offset); - c0[3] = amd_bytealign (w2[2], w2[3], offset); - c0[2] = amd_bytealign (w2[1], w2[2], offset); - c0[1] = amd_bytealign (w2[0], w2[1], offset); - c0[0] = amd_bytealign (w1[3], w2[0], offset); - w3[3] = amd_bytealign (w1[2], w1[3], offset); - w3[2] = amd_bytealign (w1[1], w1[2], offset); - w3[1] = amd_bytealign (w1[0], w1[1], offset); - w3[0] = amd_bytealign (w0[3], w1[0], offset); - w2[3] = amd_bytealign (w0[2], w0[3], offset); - w2[2] = amd_bytealign (w0[1], w0[2], offset); - w2[1] = amd_bytealign (w0[0], w0[1], offset); - w2[0] = amd_bytealign ( 0, w0[0], offset); + c2[0] = hc_bytealign (w3[3], 0, offset); + c1[3] = hc_bytealign (w3[2], w3[3], offset); + c1[2] = hc_bytealign (w3[1], w3[2], offset); + c1[1] = hc_bytealign (w3[0], w3[1], offset); + c1[0] = hc_bytealign (w2[3], w3[0], offset); + c0[3] = hc_bytealign (w2[2], w2[3], offset); + c0[2] = hc_bytealign (w2[1], w2[2], offset); + c0[1] = hc_bytealign (w2[0], w2[1], offset); + c0[0] = hc_bytealign (w1[3], w2[0], offset); + w3[3] = hc_bytealign (w1[2], w1[3], offset); + w3[2] = hc_bytealign (w1[1], w1[2], offset); + w3[1] = hc_bytealign (w1[0], w1[1], offset); + w3[0] = hc_bytealign (w0[3], w1[0], offset); + w2[3] = hc_bytealign (w0[2], w0[3], offset); + w2[2] = hc_bytealign (w0[1], w0[2], offset); + w2[1] = hc_bytealign (w0[0], w0[1], offset); + w2[0] = hc_bytealign ( 0, w0[0], offset); w1[3] = 0; w1[2] = 0; w1[1] = 0; @@ -3429,23 +3429,23 @@ DECLSPEC void switch_buffer_by_offset_carry_be (u32x *w0, u32x *w1, u32x *w2, u3 break; case 9: - c2[1] = amd_bytealign (w3[3], 0, offset); - c2[0] = amd_bytealign (w3[2], w3[3], offset); - c1[3] = amd_bytealign (w3[1], w3[2], offset); - c1[2] = amd_bytealign (w3[0], w3[1], offset); - c1[1] = amd_bytealign (w2[3], w3[0], offset); - c1[0] = amd_bytealign (w2[2], w2[3], offset); - c0[3] = amd_bytealign (w2[1], w2[2], offset); - c0[2] = amd_bytealign (w2[0], w2[1], offset); - c0[1] = amd_bytealign (w1[3], w2[0], offset); - c0[0] = amd_bytealign (w1[2], w1[3], offset); - w3[3] = amd_bytealign (w1[1], w1[2], offset); - w3[2] = amd_bytealign (w1[0], w1[1], offset); - w3[1] = amd_bytealign (w0[3], w1[0], offset); - w3[0] = amd_bytealign (w0[2], w0[3], offset); - w2[3] = amd_bytealign (w0[1], w0[2], offset); - w2[2] = amd_bytealign (w0[0], w0[1], offset); - w2[1] = amd_bytealign ( 0, w0[0], offset); + c2[1] = hc_bytealign (w3[3], 0, offset); + c2[0] = hc_bytealign (w3[2], w3[3], offset); + c1[3] = hc_bytealign (w3[1], w3[2], offset); + c1[2] = hc_bytealign (w3[0], w3[1], offset); + c1[1] = hc_bytealign (w2[3], w3[0], offset); + c1[0] = hc_bytealign (w2[2], w2[3], offset); + c0[3] = hc_bytealign (w2[1], w2[2], offset); + c0[2] = hc_bytealign (w2[0], w2[1], offset); + c0[1] = hc_bytealign (w1[3], w2[0], offset); + c0[0] = hc_bytealign (w1[2], w1[3], offset); + w3[3] = hc_bytealign (w1[1], w1[2], offset); + w3[2] = hc_bytealign (w1[0], w1[1], offset); + w3[1] = hc_bytealign (w0[3], w1[0], offset); + w3[0] = hc_bytealign (w0[2], w0[3], offset); + w2[3] = hc_bytealign (w0[1], w0[2], offset); + w2[2] = hc_bytealign (w0[0], w0[1], offset); + w2[1] = hc_bytealign ( 0, w0[0], offset); w2[0] = 0; w1[3] = 0; w1[2] = 0; @@ -3459,23 +3459,23 @@ DECLSPEC void switch_buffer_by_offset_carry_be (u32x *w0, u32x *w1, u32x *w2, u3 break; case 10: - c2[2] = amd_bytealign (w3[3], 0, offset); - c2[1] = amd_bytealign (w3[2], w3[3], offset); - c2[0] = amd_bytealign (w3[1], w3[2], offset); - c1[3] = amd_bytealign (w3[0], w3[1], offset); - c1[2] = amd_bytealign (w2[3], w3[0], offset); - c1[1] = amd_bytealign (w2[2], w2[3], offset); - c1[0] = amd_bytealign (w2[1], w2[2], offset); - c0[3] = amd_bytealign (w2[0], w2[1], offset); - c0[2] = amd_bytealign (w1[3], w2[0], offset); - c0[1] = amd_bytealign (w1[2], w1[3], offset); - c0[0] = amd_bytealign (w1[1], w1[2], offset); - w3[3] = amd_bytealign (w1[0], w1[1], offset); - w3[2] = amd_bytealign (w0[3], w1[0], offset); - w3[1] = amd_bytealign (w0[2], w0[3], offset); - w3[0] = amd_bytealign (w0[1], w0[2], offset); - w2[3] = amd_bytealign (w0[0], w0[1], offset); - w2[2] = amd_bytealign ( 0, w0[0], offset); + c2[2] = hc_bytealign (w3[3], 0, offset); + c2[1] = hc_bytealign (w3[2], w3[3], offset); + c2[0] = hc_bytealign (w3[1], w3[2], offset); + c1[3] = hc_bytealign (w3[0], w3[1], offset); + c1[2] = hc_bytealign (w2[3], w3[0], offset); + c1[1] = hc_bytealign (w2[2], w2[3], offset); + c1[0] = hc_bytealign (w2[1], w2[2], offset); + c0[3] = hc_bytealign (w2[0], w2[1], offset); + c0[2] = hc_bytealign (w1[3], w2[0], offset); + c0[1] = hc_bytealign (w1[2], w1[3], offset); + c0[0] = hc_bytealign (w1[1], w1[2], offset); + w3[3] = hc_bytealign (w1[0], w1[1], offset); + w3[2] = hc_bytealign (w0[3], w1[0], offset); + w3[1] = hc_bytealign (w0[2], w0[3], offset); + w3[0] = hc_bytealign (w0[1], w0[2], offset); + w2[3] = hc_bytealign (w0[0], w0[1], offset); + w2[2] = hc_bytealign ( 0, w0[0], offset); w2[1] = 0; w2[0] = 0; w1[3] = 0; @@ -3490,23 +3490,23 @@ DECLSPEC void switch_buffer_by_offset_carry_be (u32x *w0, u32x *w1, u32x *w2, u3 break; case 11: - c2[3] = amd_bytealign (w3[3], 0, offset); - c2[2] = amd_bytealign (w3[2], w3[3], offset); - c2[1] = amd_bytealign (w3[1], w3[2], offset); - c2[0] = amd_bytealign (w3[0], w3[1], offset); - c1[3] = amd_bytealign (w2[3], w3[0], offset); - c1[2] = amd_bytealign (w2[2], w2[3], offset); - c1[1] = amd_bytealign (w2[1], w2[2], offset); - c1[0] = amd_bytealign (w2[0], w2[1], offset); - c0[3] = amd_bytealign (w1[3], w2[0], offset); - c0[2] = amd_bytealign (w1[2], w1[3], offset); - c0[1] = amd_bytealign (w1[1], w1[2], offset); - c0[0] = amd_bytealign (w1[0], w1[1], offset); - w3[3] = amd_bytealign (w0[3], w1[0], offset); - w3[2] = amd_bytealign (w0[2], w0[3], offset); - w3[1] = amd_bytealign (w0[1], w0[2], offset); - w3[0] = amd_bytealign (w0[0], w0[1], offset); - w2[3] = amd_bytealign ( 0, w0[0], offset); + c2[3] = hc_bytealign (w3[3], 0, offset); + c2[2] = hc_bytealign (w3[2], w3[3], offset); + c2[1] = hc_bytealign (w3[1], w3[2], offset); + c2[0] = hc_bytealign (w3[0], w3[1], offset); + c1[3] = hc_bytealign (w2[3], w3[0], offset); + c1[2] = hc_bytealign (w2[2], w2[3], offset); + c1[1] = hc_bytealign (w2[1], w2[2], offset); + c1[0] = hc_bytealign (w2[0], w2[1], offset); + c0[3] = hc_bytealign (w1[3], w2[0], offset); + c0[2] = hc_bytealign (w1[2], w1[3], offset); + c0[1] = hc_bytealign (w1[1], w1[2], offset); + c0[0] = hc_bytealign (w1[0], w1[1], offset); + w3[3] = hc_bytealign (w0[3], w1[0], offset); + w3[2] = hc_bytealign (w0[2], w0[3], offset); + w3[1] = hc_bytealign (w0[1], w0[2], offset); + w3[0] = hc_bytealign (w0[0], w0[1], offset); + w2[3] = hc_bytealign ( 0, w0[0], offset); w2[2] = 0; w2[1] = 0; w2[0] = 0; @@ -3522,23 +3522,23 @@ DECLSPEC void switch_buffer_by_offset_carry_be (u32x *w0, u32x *w1, u32x *w2, u3 break; case 12: - c3[0] = amd_bytealign (w3[3], 0, offset); - c2[3] = amd_bytealign (w3[2], w3[3], offset); - c2[2] = amd_bytealign (w3[1], w3[2], offset); - c2[1] = amd_bytealign (w3[0], w3[1], offset); - c2[0] = amd_bytealign (w2[3], w3[0], offset); - c1[3] = amd_bytealign (w2[2], w2[3], offset); - c1[2] = amd_bytealign (w2[1], w2[2], offset); - c1[1] = amd_bytealign (w2[0], w2[1], offset); - c1[0] = amd_bytealign (w1[3], w2[0], offset); - c0[3] = amd_bytealign (w1[2], w1[3], offset); - c0[2] = amd_bytealign (w1[1], w1[2], offset); - c0[1] = amd_bytealign (w1[0], w1[1], offset); - c0[0] = amd_bytealign (w0[3], w1[0], offset); - w3[3] = amd_bytealign (w0[2], w0[3], offset); - w3[2] = amd_bytealign (w0[1], w0[2], offset); - w3[1] = amd_bytealign (w0[0], w0[1], offset); - w3[0] = amd_bytealign ( 0, w0[0], offset); + c3[0] = hc_bytealign (w3[3], 0, offset); + c2[3] = hc_bytealign (w3[2], w3[3], offset); + c2[2] = hc_bytealign (w3[1], w3[2], offset); + c2[1] = hc_bytealign (w3[0], w3[1], offset); + c2[0] = hc_bytealign (w2[3], w3[0], offset); + c1[3] = hc_bytealign (w2[2], w2[3], offset); + c1[2] = hc_bytealign (w2[1], w2[2], offset); + c1[1] = hc_bytealign (w2[0], w2[1], offset); + c1[0] = hc_bytealign (w1[3], w2[0], offset); + c0[3] = hc_bytealign (w1[2], w1[3], offset); + c0[2] = hc_bytealign (w1[1], w1[2], offset); + c0[1] = hc_bytealign (w1[0], w1[1], offset); + c0[0] = hc_bytealign (w0[3], w1[0], offset); + w3[3] = hc_bytealign (w0[2], w0[3], offset); + w3[2] = hc_bytealign (w0[1], w0[2], offset); + w3[1] = hc_bytealign (w0[0], w0[1], offset); + w3[0] = hc_bytealign ( 0, w0[0], offset); w2[3] = 0; w2[2] = 0; w2[1] = 0; @@ -3555,23 +3555,23 @@ DECLSPEC void switch_buffer_by_offset_carry_be (u32x *w0, u32x *w1, u32x *w2, u3 break; case 13: - c3[1] = amd_bytealign (w3[3], 0, offset); - c3[0] = amd_bytealign (w3[2], w3[3], offset); - c2[3] = amd_bytealign (w3[1], w3[2], offset); - c2[2] = amd_bytealign (w3[0], w3[1], offset); - c2[1] = amd_bytealign (w2[3], w3[0], offset); - c2[0] = amd_bytealign (w2[2], w2[3], offset); - c1[3] = amd_bytealign (w2[1], w2[2], offset); - c1[2] = amd_bytealign (w2[0], w2[1], offset); - c1[1] = amd_bytealign (w1[3], w2[0], offset); - c1[0] = amd_bytealign (w1[2], w1[3], offset); - c0[3] = amd_bytealign (w1[1], w1[2], offset); - c0[2] = amd_bytealign (w1[0], w1[1], offset); - c0[1] = amd_bytealign (w0[3], w1[0], offset); - c0[0] = amd_bytealign (w0[2], w0[3], offset); - w3[3] = amd_bytealign (w0[1], w0[2], offset); - w3[2] = amd_bytealign (w0[0], w0[1], offset); - w3[1] = amd_bytealign ( 0, w0[0], offset); + c3[1] = hc_bytealign (w3[3], 0, offset); + c3[0] = hc_bytealign (w3[2], w3[3], offset); + c2[3] = hc_bytealign (w3[1], w3[2], offset); + c2[2] = hc_bytealign (w3[0], w3[1], offset); + c2[1] = hc_bytealign (w2[3], w3[0], offset); + c2[0] = hc_bytealign (w2[2], w2[3], offset); + c1[3] = hc_bytealign (w2[1], w2[2], offset); + c1[2] = hc_bytealign (w2[0], w2[1], offset); + c1[1] = hc_bytealign (w1[3], w2[0], offset); + c1[0] = hc_bytealign (w1[2], w1[3], offset); + c0[3] = hc_bytealign (w1[1], w1[2], offset); + c0[2] = hc_bytealign (w1[0], w1[1], offset); + c0[1] = hc_bytealign (w0[3], w1[0], offset); + c0[0] = hc_bytealign (w0[2], w0[3], offset); + w3[3] = hc_bytealign (w0[1], w0[2], offset); + w3[2] = hc_bytealign (w0[0], w0[1], offset); + w3[1] = hc_bytealign ( 0, w0[0], offset); w3[0] = 0; w2[3] = 0; w2[2] = 0; @@ -3589,23 +3589,23 @@ DECLSPEC void switch_buffer_by_offset_carry_be (u32x *w0, u32x *w1, u32x *w2, u3 break; case 14: - c3[2] = amd_bytealign (w3[3], 0, offset); - c3[1] = amd_bytealign (w3[2], w3[3], offset); - c3[0] = amd_bytealign (w3[1], w3[2], offset); - c2[3] = amd_bytealign (w3[0], w3[1], offset); - c2[2] = amd_bytealign (w2[3], w3[0], offset); - c2[1] = amd_bytealign (w2[2], w2[3], offset); - c2[0] = amd_bytealign (w2[1], w2[2], offset); - c1[3] = amd_bytealign (w2[0], w2[1], offset); - c1[2] = amd_bytealign (w1[3], w2[0], offset); - c1[1] = amd_bytealign (w1[2], w1[3], offset); - c1[0] = amd_bytealign (w1[1], w1[2], offset); - c0[3] = amd_bytealign (w1[0], w1[1], offset); - c0[2] = amd_bytealign (w0[3], w1[0], offset); - c0[1] = amd_bytealign (w0[2], w0[3], offset); - c0[0] = amd_bytealign (w0[1], w0[2], offset); - w3[3] = amd_bytealign (w0[0], w0[1], offset); - w3[2] = amd_bytealign ( 0, w0[0], offset); + c3[2] = hc_bytealign (w3[3], 0, offset); + c3[1] = hc_bytealign (w3[2], w3[3], offset); + c3[0] = hc_bytealign (w3[1], w3[2], offset); + c2[3] = hc_bytealign (w3[0], w3[1], offset); + c2[2] = hc_bytealign (w2[3], w3[0], offset); + c2[1] = hc_bytealign (w2[2], w2[3], offset); + c2[0] = hc_bytealign (w2[1], w2[2], offset); + c1[3] = hc_bytealign (w2[0], w2[1], offset); + c1[2] = hc_bytealign (w1[3], w2[0], offset); + c1[1] = hc_bytealign (w1[2], w1[3], offset); + c1[0] = hc_bytealign (w1[1], w1[2], offset); + c0[3] = hc_bytealign (w1[0], w1[1], offset); + c0[2] = hc_bytealign (w0[3], w1[0], offset); + c0[1] = hc_bytealign (w0[2], w0[3], offset); + c0[0] = hc_bytealign (w0[1], w0[2], offset); + w3[3] = hc_bytealign (w0[0], w0[1], offset); + w3[2] = hc_bytealign ( 0, w0[0], offset); w3[1] = 0; w3[0] = 0; w2[3] = 0; @@ -3624,23 +3624,23 @@ DECLSPEC void switch_buffer_by_offset_carry_be (u32x *w0, u32x *w1, u32x *w2, u3 break; case 15: - c3[3] = amd_bytealign (w3[3], 0, offset); - c3[2] = amd_bytealign (w3[2], w3[3], offset); - c3[1] = amd_bytealign (w3[1], w3[2], offset); - c3[0] = amd_bytealign (w3[0], w3[1], offset); - c2[3] = amd_bytealign (w2[3], w3[0], offset); - c2[2] = amd_bytealign (w2[2], w2[3], offset); - c2[1] = amd_bytealign (w2[1], w2[2], offset); - c2[0] = amd_bytealign (w2[0], w2[1], offset); - c1[3] = amd_bytealign (w1[3], w2[0], offset); - c1[2] = amd_bytealign (w1[2], w1[3], offset); - c1[1] = amd_bytealign (w1[1], w1[2], offset); - c1[0] = amd_bytealign (w1[0], w1[1], offset); - c0[3] = amd_bytealign (w0[3], w1[0], offset); - c0[2] = amd_bytealign (w0[2], w0[3], offset); - c0[1] = amd_bytealign (w0[1], w0[2], offset); - c0[0] = amd_bytealign (w0[0], w0[1], offset); - w3[3] = amd_bytealign ( 0, w0[0], offset); + c3[3] = hc_bytealign (w3[3], 0, offset); + c3[2] = hc_bytealign (w3[2], w3[3], offset); + c3[1] = hc_bytealign (w3[1], w3[2], offset); + c3[0] = hc_bytealign (w3[0], w3[1], offset); + c2[3] = hc_bytealign (w2[3], w3[0], offset); + c2[2] = hc_bytealign (w2[2], w2[3], offset); + c2[1] = hc_bytealign (w2[1], w2[2], offset); + c2[0] = hc_bytealign (w2[0], w2[1], offset); + c1[3] = hc_bytealign (w1[3], w2[0], offset); + c1[2] = hc_bytealign (w1[2], w1[3], offset); + c1[1] = hc_bytealign (w1[1], w1[2], offset); + c1[0] = hc_bytealign (w1[0], w1[1], offset); + c0[3] = hc_bytealign (w0[3], w1[0], offset); + c0[2] = hc_bytealign (w0[2], w0[3], offset); + c0[1] = hc_bytealign (w0[1], w0[2], offset); + c0[0] = hc_bytealign (w0[0], w0[1], offset); + w3[3] = hc_bytealign ( 0, w0[0], offset); w3[2] = 0; w3[1] = 0; w3[0] = 0; @@ -3674,89 +3674,89 @@ DECLSPEC void switch_buffer_by_offset_carry_be (u32x *w0, u32x *w1, u32x *w2, u3 switch (offset_switch) { case 0: - c0[0] = __byte_perm ( 0, w3[3], selector); - w3[3] = __byte_perm (w3[3], w3[2], selector); - w3[2] = __byte_perm (w3[2], w3[1], selector); - w3[1] = __byte_perm (w3[1], w3[0], selector); - w3[0] = __byte_perm (w3[0], w2[3], selector); - w2[3] = __byte_perm (w2[3], w2[2], selector); - w2[2] = __byte_perm (w2[2], w2[1], selector); - w2[1] = __byte_perm (w2[1], w2[0], selector); - w2[0] = __byte_perm (w2[0], w1[3], selector); - w1[3] = __byte_perm (w1[3], w1[2], selector); - w1[2] = __byte_perm (w1[2], w1[1], selector); - w1[1] = __byte_perm (w1[1], w1[0], selector); - w1[0] = __byte_perm (w1[0], w0[3], selector); - w0[3] = __byte_perm (w0[3], w0[2], selector); - w0[2] = __byte_perm (w0[2], w0[1], selector); - w0[1] = __byte_perm (w0[1], w0[0], selector); - w0[0] = __byte_perm (w0[0], 0, selector); + c0[0] = hc_byte_perm ( 0, w3[3], selector); + w3[3] = hc_byte_perm (w3[3], w3[2], selector); + w3[2] = hc_byte_perm (w3[2], w3[1], selector); + w3[1] = hc_byte_perm (w3[1], w3[0], selector); + w3[0] = hc_byte_perm (w3[0], w2[3], selector); + w2[3] = hc_byte_perm (w2[3], w2[2], selector); + w2[2] = hc_byte_perm (w2[2], w2[1], selector); + w2[1] = hc_byte_perm (w2[1], w2[0], selector); + w2[0] = hc_byte_perm (w2[0], w1[3], selector); + w1[3] = hc_byte_perm (w1[3], w1[2], selector); + w1[2] = hc_byte_perm (w1[2], w1[1], selector); + w1[1] = hc_byte_perm (w1[1], w1[0], selector); + w1[0] = hc_byte_perm (w1[0], w0[3], selector); + w0[3] = hc_byte_perm (w0[3], w0[2], selector); + w0[2] = hc_byte_perm (w0[2], w0[1], selector); + w0[1] = hc_byte_perm (w0[1], w0[0], selector); + w0[0] = hc_byte_perm (w0[0], 0, selector); break; case 1: - c0[1] = __byte_perm ( 0, w3[3], selector); - c0[0] = __byte_perm (w3[3], w3[2], selector); - w3[3] = __byte_perm (w3[2], w3[1], selector); - w3[2] = __byte_perm (w3[1], w3[0], selector); - w3[1] = __byte_perm (w3[0], w2[3], selector); - w3[0] = __byte_perm (w2[3], w2[2], selector); - w2[3] = __byte_perm (w2[2], w2[1], selector); - w2[2] = __byte_perm (w2[1], w2[0], selector); - w2[1] = __byte_perm (w2[0], w1[3], selector); - w2[0] = __byte_perm (w1[3], w1[2], selector); - w1[3] = __byte_perm (w1[2], w1[1], selector); - w1[2] = __byte_perm (w1[1], w1[0], selector); - w1[1] = __byte_perm (w1[0], w0[3], selector); - w1[0] = __byte_perm (w0[3], w0[2], selector); - w0[3] = __byte_perm (w0[2], w0[1], selector); - w0[2] = __byte_perm (w0[1], w0[0], selector); - w0[1] = __byte_perm (w0[0], 0, selector); + c0[1] = hc_byte_perm ( 0, w3[3], selector); + c0[0] = hc_byte_perm (w3[3], w3[2], selector); + w3[3] = hc_byte_perm (w3[2], w3[1], selector); + w3[2] = hc_byte_perm (w3[1], w3[0], selector); + w3[1] = hc_byte_perm (w3[0], w2[3], selector); + w3[0] = hc_byte_perm (w2[3], w2[2], selector); + w2[3] = hc_byte_perm (w2[2], w2[1], selector); + w2[2] = hc_byte_perm (w2[1], w2[0], selector); + w2[1] = hc_byte_perm (w2[0], w1[3], selector); + w2[0] = hc_byte_perm (w1[3], w1[2], selector); + w1[3] = hc_byte_perm (w1[2], w1[1], selector); + w1[2] = hc_byte_perm (w1[1], w1[0], selector); + w1[1] = hc_byte_perm (w1[0], w0[3], selector); + w1[0] = hc_byte_perm (w0[3], w0[2], selector); + w0[3] = hc_byte_perm (w0[2], w0[1], selector); + w0[2] = hc_byte_perm (w0[1], w0[0], selector); + w0[1] = hc_byte_perm (w0[0], 0, selector); w0[0] = 0; break; case 2: - c0[2] = __byte_perm ( 0, w3[3], selector); - c0[1] = __byte_perm (w3[3], w3[2], selector); - c0[0] = __byte_perm (w3[2], w3[1], selector); - w3[3] = __byte_perm (w3[1], w3[0], selector); - w3[2] = __byte_perm (w3[0], w2[3], selector); - w3[1] = __byte_perm (w2[3], w2[2], selector); - w3[0] = __byte_perm (w2[2], w2[1], selector); - w2[3] = __byte_perm (w2[1], w2[0], selector); - w2[2] = __byte_perm (w2[0], w1[3], selector); - w2[1] = __byte_perm (w1[3], w1[2], selector); - w2[0] = __byte_perm (w1[2], w1[1], selector); - w1[3] = __byte_perm (w1[1], w1[0], selector); - w1[2] = __byte_perm (w1[0], w0[3], selector); - w1[1] = __byte_perm (w0[3], w0[2], selector); - w1[0] = __byte_perm (w0[2], w0[1], selector); - w0[3] = __byte_perm (w0[1], w0[0], selector); - w0[2] = __byte_perm (w0[0], 0, selector); + c0[2] = hc_byte_perm ( 0, w3[3], selector); + c0[1] = hc_byte_perm (w3[3], w3[2], selector); + c0[0] = hc_byte_perm (w3[2], w3[1], selector); + w3[3] = hc_byte_perm (w3[1], w3[0], selector); + w3[2] = hc_byte_perm (w3[0], w2[3], selector); + w3[1] = hc_byte_perm (w2[3], w2[2], selector); + w3[0] = hc_byte_perm (w2[2], w2[1], selector); + w2[3] = hc_byte_perm (w2[1], w2[0], selector); + w2[2] = hc_byte_perm (w2[0], w1[3], selector); + w2[1] = hc_byte_perm (w1[3], w1[2], selector); + w2[0] = hc_byte_perm (w1[2], w1[1], selector); + w1[3] = hc_byte_perm (w1[1], w1[0], selector); + w1[2] = hc_byte_perm (w1[0], w0[3], selector); + w1[1] = hc_byte_perm (w0[3], w0[2], selector); + w1[0] = hc_byte_perm (w0[2], w0[1], selector); + w0[3] = hc_byte_perm (w0[1], w0[0], selector); + w0[2] = hc_byte_perm (w0[0], 0, selector); w0[1] = 0; w0[0] = 0; break; case 3: - c0[3] = __byte_perm ( 0, w3[3], selector); - c0[2] = __byte_perm (w3[3], w3[2], selector); - c0[1] = __byte_perm (w3[2], w3[1], selector); - c0[0] = __byte_perm (w3[1], w3[0], selector); - w3[3] = __byte_perm (w3[0], w2[3], selector); - w3[2] = __byte_perm (w2[3], w2[2], selector); - w3[1] = __byte_perm (w2[2], w2[1], selector); - w3[0] = __byte_perm (w2[1], w2[0], selector); - w2[3] = __byte_perm (w2[0], w1[3], selector); - w2[2] = __byte_perm (w1[3], w1[2], selector); - w2[1] = __byte_perm (w1[2], w1[1], selector); - w2[0] = __byte_perm (w1[1], w1[0], selector); - w1[3] = __byte_perm (w1[0], w0[3], selector); - w1[2] = __byte_perm (w0[3], w0[2], selector); - w1[1] = __byte_perm (w0[2], w0[1], selector); - w1[0] = __byte_perm (w0[1], w0[0], selector); - w0[3] = __byte_perm (w0[0], 0, selector); + c0[3] = hc_byte_perm ( 0, w3[3], selector); + c0[2] = hc_byte_perm (w3[3], w3[2], selector); + c0[1] = hc_byte_perm (w3[2], w3[1], selector); + c0[0] = hc_byte_perm (w3[1], w3[0], selector); + w3[3] = hc_byte_perm (w3[0], w2[3], selector); + w3[2] = hc_byte_perm (w2[3], w2[2], selector); + w3[1] = hc_byte_perm (w2[2], w2[1], selector); + w3[0] = hc_byte_perm (w2[1], w2[0], selector); + w2[3] = hc_byte_perm (w2[0], w1[3], selector); + w2[2] = hc_byte_perm (w1[3], w1[2], selector); + w2[1] = hc_byte_perm (w1[2], w1[1], selector); + w2[0] = hc_byte_perm (w1[1], w1[0], selector); + w1[3] = hc_byte_perm (w1[0], w0[3], selector); + w1[2] = hc_byte_perm (w0[3], w0[2], selector); + w1[1] = hc_byte_perm (w0[2], w0[1], selector); + w1[0] = hc_byte_perm (w0[1], w0[0], selector); + w0[3] = hc_byte_perm (w0[0], 0, selector); w0[2] = 0; w0[1] = 0; w0[0] = 0; @@ -3764,23 +3764,23 @@ DECLSPEC void switch_buffer_by_offset_carry_be (u32x *w0, u32x *w1, u32x *w2, u3 break; case 4: - c1[0] = __byte_perm ( 0, w3[3], selector); - c0[3] = __byte_perm (w3[3], w3[2], selector); - c0[2] = __byte_perm (w3[2], w3[1], selector); - c0[1] = __byte_perm (w3[1], w3[0], selector); - c0[0] = __byte_perm (w3[0], w2[3], selector); - w3[3] = __byte_perm (w2[3], w2[2], selector); - w3[2] = __byte_perm (w2[2], w2[1], selector); - w3[1] = __byte_perm (w2[1], w2[0], selector); - w3[0] = __byte_perm (w2[0], w1[3], selector); - w2[3] = __byte_perm (w1[3], w1[2], selector); - w2[2] = __byte_perm (w1[2], w1[1], selector); - w2[1] = __byte_perm (w1[1], w1[0], selector); - w2[0] = __byte_perm (w1[0], w0[3], selector); - w1[3] = __byte_perm (w0[3], w0[2], selector); - w1[2] = __byte_perm (w0[2], w0[1], selector); - w1[1] = __byte_perm (w0[1], w0[0], selector); - w1[0] = __byte_perm (w0[0], 0, selector); + c1[0] = hc_byte_perm ( 0, w3[3], selector); + c0[3] = hc_byte_perm (w3[3], w3[2], selector); + c0[2] = hc_byte_perm (w3[2], w3[1], selector); + c0[1] = hc_byte_perm (w3[1], w3[0], selector); + c0[0] = hc_byte_perm (w3[0], w2[3], selector); + w3[3] = hc_byte_perm (w2[3], w2[2], selector); + w3[2] = hc_byte_perm (w2[2], w2[1], selector); + w3[1] = hc_byte_perm (w2[1], w2[0], selector); + w3[0] = hc_byte_perm (w2[0], w1[3], selector); + w2[3] = hc_byte_perm (w1[3], w1[2], selector); + w2[2] = hc_byte_perm (w1[2], w1[1], selector); + w2[1] = hc_byte_perm (w1[1], w1[0], selector); + w2[0] = hc_byte_perm (w1[0], w0[3], selector); + w1[3] = hc_byte_perm (w0[3], w0[2], selector); + w1[2] = hc_byte_perm (w0[2], w0[1], selector); + w1[1] = hc_byte_perm (w0[1], w0[0], selector); + w1[0] = hc_byte_perm (w0[0], 0, selector); w0[3] = 0; w0[2] = 0; w0[1] = 0; @@ -3789,23 +3789,23 @@ DECLSPEC void switch_buffer_by_offset_carry_be (u32x *w0, u32x *w1, u32x *w2, u3 break; case 5: - c1[1] = __byte_perm ( 0, w3[3], selector); - c1[0] = __byte_perm (w3[3], w3[2], selector); - c0[3] = __byte_perm (w3[2], w3[1], selector); - c0[2] = __byte_perm (w3[1], w3[0], selector); - c0[1] = __byte_perm (w3[0], w2[3], selector); - c0[0] = __byte_perm (w2[3], w2[2], selector); - w3[3] = __byte_perm (w2[2], w2[1], selector); - w3[2] = __byte_perm (w2[1], w2[0], selector); - w3[1] = __byte_perm (w2[0], w1[3], selector); - w3[0] = __byte_perm (w1[3], w1[2], selector); - w2[3] = __byte_perm (w1[2], w1[1], selector); - w2[2] = __byte_perm (w1[1], w1[0], selector); - w2[1] = __byte_perm (w1[0], w0[3], selector); - w2[0] = __byte_perm (w0[3], w0[2], selector); - w1[3] = __byte_perm (w0[2], w0[1], selector); - w1[2] = __byte_perm (w0[1], w0[0], selector); - w1[1] = __byte_perm (w0[0], 0, selector); + c1[1] = hc_byte_perm ( 0, w3[3], selector); + c1[0] = hc_byte_perm (w3[3], w3[2], selector); + c0[3] = hc_byte_perm (w3[2], w3[1], selector); + c0[2] = hc_byte_perm (w3[1], w3[0], selector); + c0[1] = hc_byte_perm (w3[0], w2[3], selector); + c0[0] = hc_byte_perm (w2[3], w2[2], selector); + w3[3] = hc_byte_perm (w2[2], w2[1], selector); + w3[2] = hc_byte_perm (w2[1], w2[0], selector); + w3[1] = hc_byte_perm (w2[0], w1[3], selector); + w3[0] = hc_byte_perm (w1[3], w1[2], selector); + w2[3] = hc_byte_perm (w1[2], w1[1], selector); + w2[2] = hc_byte_perm (w1[1], w1[0], selector); + w2[1] = hc_byte_perm (w1[0], w0[3], selector); + w2[0] = hc_byte_perm (w0[3], w0[2], selector); + w1[3] = hc_byte_perm (w0[2], w0[1], selector); + w1[2] = hc_byte_perm (w0[1], w0[0], selector); + w1[1] = hc_byte_perm (w0[0], 0, selector); w1[0] = 0; w0[3] = 0; w0[2] = 0; @@ -3815,23 +3815,23 @@ DECLSPEC void switch_buffer_by_offset_carry_be (u32x *w0, u32x *w1, u32x *w2, u3 break; case 6: - c1[2] = __byte_perm ( 0, w3[3], selector); - c1[1] = __byte_perm (w3[3], w3[2], selector); - c1[0] = __byte_perm (w3[2], w3[1], selector); - c0[3] = __byte_perm (w3[1], w3[0], selector); - c0[2] = __byte_perm (w3[0], w2[3], selector); - c0[1] = __byte_perm (w2[3], w2[2], selector); - c0[0] = __byte_perm (w2[2], w2[1], selector); - w3[3] = __byte_perm (w2[1], w2[0], selector); - w3[2] = __byte_perm (w2[0], w1[3], selector); - w3[1] = __byte_perm (w1[3], w1[2], selector); - w3[0] = __byte_perm (w1[2], w1[1], selector); - w2[3] = __byte_perm (w1[1], w1[0], selector); - w2[2] = __byte_perm (w1[0], w0[3], selector); - w2[1] = __byte_perm (w0[3], w0[2], selector); - w2[0] = __byte_perm (w0[2], w0[1], selector); - w1[3] = __byte_perm (w0[1], w0[0], selector); - w1[2] = __byte_perm (w0[0], 0, selector); + c1[2] = hc_byte_perm ( 0, w3[3], selector); + c1[1] = hc_byte_perm (w3[3], w3[2], selector); + c1[0] = hc_byte_perm (w3[2], w3[1], selector); + c0[3] = hc_byte_perm (w3[1], w3[0], selector); + c0[2] = hc_byte_perm (w3[0], w2[3], selector); + c0[1] = hc_byte_perm (w2[3], w2[2], selector); + c0[0] = hc_byte_perm (w2[2], w2[1], selector); + w3[3] = hc_byte_perm (w2[1], w2[0], selector); + w3[2] = hc_byte_perm (w2[0], w1[3], selector); + w3[1] = hc_byte_perm (w1[3], w1[2], selector); + w3[0] = hc_byte_perm (w1[2], w1[1], selector); + w2[3] = hc_byte_perm (w1[1], w1[0], selector); + w2[2] = hc_byte_perm (w1[0], w0[3], selector); + w2[1] = hc_byte_perm (w0[3], w0[2], selector); + w2[0] = hc_byte_perm (w0[2], w0[1], selector); + w1[3] = hc_byte_perm (w0[1], w0[0], selector); + w1[2] = hc_byte_perm (w0[0], 0, selector); w1[1] = 0; w1[0] = 0; w0[3] = 0; @@ -3842,23 +3842,23 @@ DECLSPEC void switch_buffer_by_offset_carry_be (u32x *w0, u32x *w1, u32x *w2, u3 break; case 7: - c1[3] = __byte_perm ( 0, w3[3], selector); - c1[2] = __byte_perm (w3[3], w3[2], selector); - c1[1] = __byte_perm (w3[2], w3[1], selector); - c1[0] = __byte_perm (w3[1], w3[0], selector); - c0[3] = __byte_perm (w3[0], w2[3], selector); - c0[2] = __byte_perm (w2[3], w2[2], selector); - c0[1] = __byte_perm (w2[2], w2[1], selector); - c0[0] = __byte_perm (w2[1], w2[0], selector); - w3[3] = __byte_perm (w2[0], w1[3], selector); - w3[2] = __byte_perm (w1[3], w1[2], selector); - w3[1] = __byte_perm (w1[2], w1[1], selector); - w3[0] = __byte_perm (w1[1], w1[0], selector); - w2[3] = __byte_perm (w1[0], w0[3], selector); - w2[2] = __byte_perm (w0[3], w0[2], selector); - w2[1] = __byte_perm (w0[2], w0[1], selector); - w2[0] = __byte_perm (w0[1], w0[0], selector); - w1[3] = __byte_perm (w0[0], 0, selector); + c1[3] = hc_byte_perm ( 0, w3[3], selector); + c1[2] = hc_byte_perm (w3[3], w3[2], selector); + c1[1] = hc_byte_perm (w3[2], w3[1], selector); + c1[0] = hc_byte_perm (w3[1], w3[0], selector); + c0[3] = hc_byte_perm (w3[0], w2[3], selector); + c0[2] = hc_byte_perm (w2[3], w2[2], selector); + c0[1] = hc_byte_perm (w2[2], w2[1], selector); + c0[0] = hc_byte_perm (w2[1], w2[0], selector); + w3[3] = hc_byte_perm (w2[0], w1[3], selector); + w3[2] = hc_byte_perm (w1[3], w1[2], selector); + w3[1] = hc_byte_perm (w1[2], w1[1], selector); + w3[0] = hc_byte_perm (w1[1], w1[0], selector); + w2[3] = hc_byte_perm (w1[0], w0[3], selector); + w2[2] = hc_byte_perm (w0[3], w0[2], selector); + w2[1] = hc_byte_perm (w0[2], w0[1], selector); + w2[0] = hc_byte_perm (w0[1], w0[0], selector); + w1[3] = hc_byte_perm (w0[0], 0, selector); w1[2] = 0; w1[1] = 0; w1[0] = 0; @@ -3870,23 +3870,23 @@ DECLSPEC void switch_buffer_by_offset_carry_be (u32x *w0, u32x *w1, u32x *w2, u3 break; case 8: - c2[0] = __byte_perm ( 0, w3[3], selector); - c1[3] = __byte_perm (w3[3], w3[2], selector); - c1[2] = __byte_perm (w3[2], w3[1], selector); - c1[1] = __byte_perm (w3[1], w3[0], selector); - c1[0] = __byte_perm (w3[0], w2[3], selector); - c0[3] = __byte_perm (w2[3], w2[2], selector); - c0[2] = __byte_perm (w2[2], w2[1], selector); - c0[1] = __byte_perm (w2[1], w2[0], selector); - c0[0] = __byte_perm (w2[0], w1[3], selector); - w3[3] = __byte_perm (w1[3], w1[2], selector); - w3[2] = __byte_perm (w1[2], w1[1], selector); - w3[1] = __byte_perm (w1[1], w1[0], selector); - w3[0] = __byte_perm (w1[0], w0[3], selector); - w2[3] = __byte_perm (w0[3], w0[2], selector); - w2[2] = __byte_perm (w0[2], w0[1], selector); - w2[1] = __byte_perm (w0[1], w0[0], selector); - w2[0] = __byte_perm (w0[0], 0, selector); + c2[0] = hc_byte_perm ( 0, w3[3], selector); + c1[3] = hc_byte_perm (w3[3], w3[2], selector); + c1[2] = hc_byte_perm (w3[2], w3[1], selector); + c1[1] = hc_byte_perm (w3[1], w3[0], selector); + c1[0] = hc_byte_perm (w3[0], w2[3], selector); + c0[3] = hc_byte_perm (w2[3], w2[2], selector); + c0[2] = hc_byte_perm (w2[2], w2[1], selector); + c0[1] = hc_byte_perm (w2[1], w2[0], selector); + c0[0] = hc_byte_perm (w2[0], w1[3], selector); + w3[3] = hc_byte_perm (w1[3], w1[2], selector); + w3[2] = hc_byte_perm (w1[2], w1[1], selector); + w3[1] = hc_byte_perm (w1[1], w1[0], selector); + w3[0] = hc_byte_perm (w1[0], w0[3], selector); + w2[3] = hc_byte_perm (w0[3], w0[2], selector); + w2[2] = hc_byte_perm (w0[2], w0[1], selector); + w2[1] = hc_byte_perm (w0[1], w0[0], selector); + w2[0] = hc_byte_perm (w0[0], 0, selector); w1[3] = 0; w1[2] = 0; w1[1] = 0; @@ -3899,23 +3899,23 @@ DECLSPEC void switch_buffer_by_offset_carry_be (u32x *w0, u32x *w1, u32x *w2, u3 break; case 9: - c2[1] = __byte_perm ( 0, w3[3], selector); - c2[0] = __byte_perm (w3[3], w3[2], selector); - c1[3] = __byte_perm (w3[2], w3[1], selector); - c1[2] = __byte_perm (w3[1], w3[0], selector); - c1[1] = __byte_perm (w3[0], w2[3], selector); - c1[0] = __byte_perm (w2[3], w2[2], selector); - c0[3] = __byte_perm (w2[2], w2[1], selector); - c0[2] = __byte_perm (w2[1], w2[0], selector); - c0[1] = __byte_perm (w2[0], w1[3], selector); - c0[0] = __byte_perm (w1[3], w1[2], selector); - w3[3] = __byte_perm (w1[2], w1[1], selector); - w3[2] = __byte_perm (w1[1], w1[0], selector); - w3[1] = __byte_perm (w1[0], w0[3], selector); - w3[0] = __byte_perm (w0[3], w0[2], selector); - w2[3] = __byte_perm (w0[2], w0[1], selector); - w2[2] = __byte_perm (w0[1], w0[0], selector); - w2[1] = __byte_perm (w0[0], 0, selector); + c2[1] = hc_byte_perm ( 0, w3[3], selector); + c2[0] = hc_byte_perm (w3[3], w3[2], selector); + c1[3] = hc_byte_perm (w3[2], w3[1], selector); + c1[2] = hc_byte_perm (w3[1], w3[0], selector); + c1[1] = hc_byte_perm (w3[0], w2[3], selector); + c1[0] = hc_byte_perm (w2[3], w2[2], selector); + c0[3] = hc_byte_perm (w2[2], w2[1], selector); + c0[2] = hc_byte_perm (w2[1], w2[0], selector); + c0[1] = hc_byte_perm (w2[0], w1[3], selector); + c0[0] = hc_byte_perm (w1[3], w1[2], selector); + w3[3] = hc_byte_perm (w1[2], w1[1], selector); + w3[2] = hc_byte_perm (w1[1], w1[0], selector); + w3[1] = hc_byte_perm (w1[0], w0[3], selector); + w3[0] = hc_byte_perm (w0[3], w0[2], selector); + w2[3] = hc_byte_perm (w0[2], w0[1], selector); + w2[2] = hc_byte_perm (w0[1], w0[0], selector); + w2[1] = hc_byte_perm (w0[0], 0, selector); w2[0] = 0; w1[3] = 0; w1[2] = 0; @@ -3929,23 +3929,23 @@ DECLSPEC void switch_buffer_by_offset_carry_be (u32x *w0, u32x *w1, u32x *w2, u3 break; case 10: - c2[2] = __byte_perm ( 0, w3[3], selector); - c2[1] = __byte_perm (w3[3], w3[2], selector); - c2[0] = __byte_perm (w3[2], w3[1], selector); - c1[3] = __byte_perm (w3[1], w3[0], selector); - c1[2] = __byte_perm (w3[0], w2[3], selector); - c1[1] = __byte_perm (w2[3], w2[2], selector); - c1[0] = __byte_perm (w2[2], w2[1], selector); - c0[3] = __byte_perm (w2[1], w2[0], selector); - c0[2] = __byte_perm (w2[0], w1[3], selector); - c0[1] = __byte_perm (w1[3], w1[2], selector); - c0[0] = __byte_perm (w1[2], w1[1], selector); - w3[3] = __byte_perm (w1[1], w1[0], selector); - w3[2] = __byte_perm (w1[0], w0[3], selector); - w3[1] = __byte_perm (w0[3], w0[2], selector); - w3[0] = __byte_perm (w0[2], w0[1], selector); - w2[3] = __byte_perm (w0[1], w0[0], selector); - w2[2] = __byte_perm (w0[0], 0, selector); + c2[2] = hc_byte_perm ( 0, w3[3], selector); + c2[1] = hc_byte_perm (w3[3], w3[2], selector); + c2[0] = hc_byte_perm (w3[2], w3[1], selector); + c1[3] = hc_byte_perm (w3[1], w3[0], selector); + c1[2] = hc_byte_perm (w3[0], w2[3], selector); + c1[1] = hc_byte_perm (w2[3], w2[2], selector); + c1[0] = hc_byte_perm (w2[2], w2[1], selector); + c0[3] = hc_byte_perm (w2[1], w2[0], selector); + c0[2] = hc_byte_perm (w2[0], w1[3], selector); + c0[1] = hc_byte_perm (w1[3], w1[2], selector); + c0[0] = hc_byte_perm (w1[2], w1[1], selector); + w3[3] = hc_byte_perm (w1[1], w1[0], selector); + w3[2] = hc_byte_perm (w1[0], w0[3], selector); + w3[1] = hc_byte_perm (w0[3], w0[2], selector); + w3[0] = hc_byte_perm (w0[2], w0[1], selector); + w2[3] = hc_byte_perm (w0[1], w0[0], selector); + w2[2] = hc_byte_perm (w0[0], 0, selector); w2[1] = 0; w2[0] = 0; w1[3] = 0; @@ -3960,23 +3960,23 @@ DECLSPEC void switch_buffer_by_offset_carry_be (u32x *w0, u32x *w1, u32x *w2, u3 break; case 11: - c2[3] = __byte_perm ( 0, w3[3], selector); - c2[2] = __byte_perm (w3[3], w3[2], selector); - c2[1] = __byte_perm (w3[2], w3[1], selector); - c2[0] = __byte_perm (w3[1], w3[0], selector); - c1[3] = __byte_perm (w3[0], w2[3], selector); - c1[2] = __byte_perm (w2[3], w2[2], selector); - c1[1] = __byte_perm (w2[2], w2[1], selector); - c1[0] = __byte_perm (w2[1], w2[0], selector); - c0[3] = __byte_perm (w2[0], w1[3], selector); - c0[2] = __byte_perm (w1[3], w1[2], selector); - c0[1] = __byte_perm (w1[2], w1[1], selector); - c0[0] = __byte_perm (w1[1], w1[0], selector); - w3[3] = __byte_perm (w1[0], w0[3], selector); - w3[2] = __byte_perm (w0[3], w0[2], selector); - w3[1] = __byte_perm (w0[2], w0[1], selector); - w3[0] = __byte_perm (w0[1], w0[0], selector); - w2[3] = __byte_perm (w0[0], 0, selector); + c2[3] = hc_byte_perm ( 0, w3[3], selector); + c2[2] = hc_byte_perm (w3[3], w3[2], selector); + c2[1] = hc_byte_perm (w3[2], w3[1], selector); + c2[0] = hc_byte_perm (w3[1], w3[0], selector); + c1[3] = hc_byte_perm (w3[0], w2[3], selector); + c1[2] = hc_byte_perm (w2[3], w2[2], selector); + c1[1] = hc_byte_perm (w2[2], w2[1], selector); + c1[0] = hc_byte_perm (w2[1], w2[0], selector); + c0[3] = hc_byte_perm (w2[0], w1[3], selector); + c0[2] = hc_byte_perm (w1[3], w1[2], selector); + c0[1] = hc_byte_perm (w1[2], w1[1], selector); + c0[0] = hc_byte_perm (w1[1], w1[0], selector); + w3[3] = hc_byte_perm (w1[0], w0[3], selector); + w3[2] = hc_byte_perm (w0[3], w0[2], selector); + w3[1] = hc_byte_perm (w0[2], w0[1], selector); + w3[0] = hc_byte_perm (w0[1], w0[0], selector); + w2[3] = hc_byte_perm (w0[0], 0, selector); w2[2] = 0; w2[1] = 0; w2[0] = 0; @@ -3992,23 +3992,23 @@ DECLSPEC void switch_buffer_by_offset_carry_be (u32x *w0, u32x *w1, u32x *w2, u3 break; case 12: - c3[0] = __byte_perm ( 0, w3[3], selector); - c2[3] = __byte_perm (w3[3], w3[2], selector); - c2[2] = __byte_perm (w3[2], w3[1], selector); - c2[1] = __byte_perm (w3[1], w3[0], selector); - c2[0] = __byte_perm (w3[0], w2[3], selector); - c1[3] = __byte_perm (w2[3], w2[2], selector); - c1[2] = __byte_perm (w2[2], w2[1], selector); - c1[1] = __byte_perm (w2[1], w2[0], selector); - c1[0] = __byte_perm (w2[0], w1[3], selector); - c0[3] = __byte_perm (w1[3], w1[2], selector); - c0[2] = __byte_perm (w1[2], w1[1], selector); - c0[1] = __byte_perm (w1[1], w1[0], selector); - c0[0] = __byte_perm (w1[0], w0[3], selector); - w3[3] = __byte_perm (w0[3], w0[2], selector); - w3[2] = __byte_perm (w0[2], w0[1], selector); - w3[1] = __byte_perm (w0[1], w0[0], selector); - w3[0] = __byte_perm (w0[0], 0, selector); + c3[0] = hc_byte_perm ( 0, w3[3], selector); + c2[3] = hc_byte_perm (w3[3], w3[2], selector); + c2[2] = hc_byte_perm (w3[2], w3[1], selector); + c2[1] = hc_byte_perm (w3[1], w3[0], selector); + c2[0] = hc_byte_perm (w3[0], w2[3], selector); + c1[3] = hc_byte_perm (w2[3], w2[2], selector); + c1[2] = hc_byte_perm (w2[2], w2[1], selector); + c1[1] = hc_byte_perm (w2[1], w2[0], selector); + c1[0] = hc_byte_perm (w2[0], w1[3], selector); + c0[3] = hc_byte_perm (w1[3], w1[2], selector); + c0[2] = hc_byte_perm (w1[2], w1[1], selector); + c0[1] = hc_byte_perm (w1[1], w1[0], selector); + c0[0] = hc_byte_perm (w1[0], w0[3], selector); + w3[3] = hc_byte_perm (w0[3], w0[2], selector); + w3[2] = hc_byte_perm (w0[2], w0[1], selector); + w3[1] = hc_byte_perm (w0[1], w0[0], selector); + w3[0] = hc_byte_perm (w0[0], 0, selector); w2[3] = 0; w2[2] = 0; w2[1] = 0; @@ -4025,23 +4025,23 @@ DECLSPEC void switch_buffer_by_offset_carry_be (u32x *w0, u32x *w1, u32x *w2, u3 break; case 13: - c3[1] = __byte_perm ( 0, w3[3], selector); - c3[0] = __byte_perm (w3[3], w3[2], selector); - c2[3] = __byte_perm (w3[2], w3[1], selector); - c2[2] = __byte_perm (w3[1], w3[0], selector); - c2[1] = __byte_perm (w3[0], w2[3], selector); - c2[0] = __byte_perm (w2[3], w2[2], selector); - c1[3] = __byte_perm (w2[2], w2[1], selector); - c1[2] = __byte_perm (w2[1], w2[0], selector); - c1[1] = __byte_perm (w2[0], w1[3], selector); - c1[0] = __byte_perm (w1[3], w1[2], selector); - c0[3] = __byte_perm (w1[2], w1[1], selector); - c0[2] = __byte_perm (w1[1], w1[0], selector); - c0[1] = __byte_perm (w1[0], w0[3], selector); - c0[0] = __byte_perm (w0[3], w0[2], selector); - w3[3] = __byte_perm (w0[2], w0[1], selector); - w3[2] = __byte_perm (w0[1], w0[0], selector); - w3[1] = __byte_perm (w0[0], 0, selector); + c3[1] = hc_byte_perm ( 0, w3[3], selector); + c3[0] = hc_byte_perm (w3[3], w3[2], selector); + c2[3] = hc_byte_perm (w3[2], w3[1], selector); + c2[2] = hc_byte_perm (w3[1], w3[0], selector); + c2[1] = hc_byte_perm (w3[0], w2[3], selector); + c2[0] = hc_byte_perm (w2[3], w2[2], selector); + c1[3] = hc_byte_perm (w2[2], w2[1], selector); + c1[2] = hc_byte_perm (w2[1], w2[0], selector); + c1[1] = hc_byte_perm (w2[0], w1[3], selector); + c1[0] = hc_byte_perm (w1[3], w1[2], selector); + c0[3] = hc_byte_perm (w1[2], w1[1], selector); + c0[2] = hc_byte_perm (w1[1], w1[0], selector); + c0[1] = hc_byte_perm (w1[0], w0[3], selector); + c0[0] = hc_byte_perm (w0[3], w0[2], selector); + w3[3] = hc_byte_perm (w0[2], w0[1], selector); + w3[2] = hc_byte_perm (w0[1], w0[0], selector); + w3[1] = hc_byte_perm (w0[0], 0, selector); w3[0] = 0; w2[3] = 0; w2[2] = 0; @@ -4059,23 +4059,23 @@ DECLSPEC void switch_buffer_by_offset_carry_be (u32x *w0, u32x *w1, u32x *w2, u3 break; case 14: - c3[2] = __byte_perm ( 0, w3[3], selector); - c3[1] = __byte_perm (w3[3], w3[2], selector); - c3[0] = __byte_perm (w3[2], w3[1], selector); - c2[3] = __byte_perm (w3[1], w3[0], selector); - c2[2] = __byte_perm (w3[0], w2[3], selector); - c2[1] = __byte_perm (w2[3], w2[2], selector); - c2[0] = __byte_perm (w2[2], w2[1], selector); - c1[3] = __byte_perm (w2[1], w2[0], selector); - c1[2] = __byte_perm (w2[0], w1[3], selector); - c1[1] = __byte_perm (w1[3], w1[2], selector); - c1[0] = __byte_perm (w1[2], w1[1], selector); - c0[3] = __byte_perm (w1[1], w1[0], selector); - c0[2] = __byte_perm (w1[0], w0[3], selector); - c0[1] = __byte_perm (w0[3], w0[2], selector); - c0[0] = __byte_perm (w0[2], w0[1], selector); - w3[3] = __byte_perm (w0[1], w0[0], selector); - w3[2] = __byte_perm (w0[0], 0, selector); + c3[2] = hc_byte_perm ( 0, w3[3], selector); + c3[1] = hc_byte_perm (w3[3], w3[2], selector); + c3[0] = hc_byte_perm (w3[2], w3[1], selector); + c2[3] = hc_byte_perm (w3[1], w3[0], selector); + c2[2] = hc_byte_perm (w3[0], w2[3], selector); + c2[1] = hc_byte_perm (w2[3], w2[2], selector); + c2[0] = hc_byte_perm (w2[2], w2[1], selector); + c1[3] = hc_byte_perm (w2[1], w2[0], selector); + c1[2] = hc_byte_perm (w2[0], w1[3], selector); + c1[1] = hc_byte_perm (w1[3], w1[2], selector); + c1[0] = hc_byte_perm (w1[2], w1[1], selector); + c0[3] = hc_byte_perm (w1[1], w1[0], selector); + c0[2] = hc_byte_perm (w1[0], w0[3], selector); + c0[1] = hc_byte_perm (w0[3], w0[2], selector); + c0[0] = hc_byte_perm (w0[2], w0[1], selector); + w3[3] = hc_byte_perm (w0[1], w0[0], selector); + w3[2] = hc_byte_perm (w0[0], 0, selector); w3[1] = 0; w3[0] = 0; w2[3] = 0; @@ -4094,23 +4094,23 @@ DECLSPEC void switch_buffer_by_offset_carry_be (u32x *w0, u32x *w1, u32x *w2, u3 break; case 15: - c3[3] = __byte_perm ( 0, w3[3], selector); - c3[2] = __byte_perm (w3[3], w3[2], selector); - c3[1] = __byte_perm (w3[2], w3[1], selector); - c3[0] = __byte_perm (w3[1], w3[0], selector); - c2[3] = __byte_perm (w3[0], w2[3], selector); - c2[2] = __byte_perm (w2[3], w2[2], selector); - c2[1] = __byte_perm (w2[2], w2[1], selector); - c2[0] = __byte_perm (w2[1], w2[0], selector); - c1[3] = __byte_perm (w2[0], w1[3], selector); - c1[2] = __byte_perm (w1[3], w1[2], selector); - c1[1] = __byte_perm (w1[2], w1[1], selector); - c1[0] = __byte_perm (w1[1], w1[0], selector); - c0[3] = __byte_perm (w1[0], w0[3], selector); - c0[2] = __byte_perm (w0[3], w0[2], selector); - c0[1] = __byte_perm (w0[2], w0[1], selector); - c0[0] = __byte_perm (w0[1], w0[0], selector); - w3[3] = __byte_perm (w0[0], 0, selector); + c3[3] = hc_byte_perm ( 0, w3[3], selector); + c3[2] = hc_byte_perm (w3[3], w3[2], selector); + c3[1] = hc_byte_perm (w3[2], w3[1], selector); + c3[0] = hc_byte_perm (w3[1], w3[0], selector); + c2[3] = hc_byte_perm (w3[0], w2[3], selector); + c2[2] = hc_byte_perm (w2[3], w2[2], selector); + c2[1] = hc_byte_perm (w2[2], w2[1], selector); + c2[0] = hc_byte_perm (w2[1], w2[0], selector); + c1[3] = hc_byte_perm (w2[0], w1[3], selector); + c1[2] = hc_byte_perm (w1[3], w1[2], selector); + c1[1] = hc_byte_perm (w1[2], w1[1], selector); + c1[0] = hc_byte_perm (w1[1], w1[0], selector); + c0[3] = hc_byte_perm (w1[0], w0[3], selector); + c0[2] = hc_byte_perm (w0[3], w0[2], selector); + c0[1] = hc_byte_perm (w0[2], w0[1], selector); + c0[0] = hc_byte_perm (w0[1], w0[0], selector); + w3[3] = hc_byte_perm (w0[0], 0, selector); w3[2] = 0; w3[1] = 0; w3[0] = 0; @@ -4177,143 +4177,143 @@ DECLSPEC void switch_buffer_by_offset_8x4_le (u32x *w0, u32x *w1, u32x *w2, u32x switch (offset_switch) { case 0: - w7[3] = amd_bytealign (w7[2], w7[3], offset); - w7[2] = amd_bytealign (w7[1], w7[2], offset); - w7[1] = amd_bytealign (w7[0], w7[1], offset); - w7[0] = amd_bytealign (w6[3], w7[0], offset); - w6[3] = amd_bytealign (w6[2], w6[3], offset); - w6[2] = amd_bytealign (w6[1], w6[2], offset); - w6[1] = amd_bytealign (w6[0], w6[1], offset); - w6[0] = amd_bytealign (w5[3], w6[0], offset); - w5[3] = amd_bytealign (w5[2], w5[3], offset); - w5[2] = amd_bytealign (w5[1], w5[2], offset); - w5[1] = amd_bytealign (w5[0], w5[1], offset); - w5[0] = amd_bytealign (w4[3], w5[0], offset); - w4[3] = amd_bytealign (w4[2], w4[3], offset); - w4[2] = amd_bytealign (w4[1], w4[2], offset); - w4[1] = amd_bytealign (w4[0], w4[1], offset); - w4[0] = amd_bytealign (w3[3], w4[0], offset); - w3[3] = amd_bytealign (w3[2], w3[3], offset); - w3[2] = amd_bytealign (w3[1], w3[2], offset); - w3[1] = amd_bytealign (w3[0], w3[1], offset); - w3[0] = amd_bytealign (w2[3], w3[0], offset); - w2[3] = amd_bytealign (w2[2], w2[3], offset); - w2[2] = amd_bytealign (w2[1], w2[2], offset); - w2[1] = amd_bytealign (w2[0], w2[1], offset); - w2[0] = amd_bytealign (w1[3], w2[0], offset); - w1[3] = amd_bytealign (w1[2], w1[3], offset); - w1[2] = amd_bytealign (w1[1], w1[2], offset); - w1[1] = amd_bytealign (w1[0], w1[1], offset); - w1[0] = amd_bytealign (w0[3], w1[0], offset); - w0[3] = amd_bytealign (w0[2], w0[3], offset); - w0[2] = amd_bytealign (w0[1], w0[2], offset); - w0[1] = amd_bytealign (w0[0], w0[1], offset); - w0[0] = amd_bytealign ( 0, w0[0], offset); + w7[3] = hc_bytealign (w7[2], w7[3], offset); + w7[2] = hc_bytealign (w7[1], w7[2], offset); + w7[1] = hc_bytealign (w7[0], w7[1], offset); + w7[0] = hc_bytealign (w6[3], w7[0], offset); + w6[3] = hc_bytealign (w6[2], w6[3], offset); + w6[2] = hc_bytealign (w6[1], w6[2], offset); + w6[1] = hc_bytealign (w6[0], w6[1], offset); + w6[0] = hc_bytealign (w5[3], w6[0], offset); + w5[3] = hc_bytealign (w5[2], w5[3], offset); + w5[2] = hc_bytealign (w5[1], w5[2], offset); + w5[1] = hc_bytealign (w5[0], w5[1], offset); + w5[0] = hc_bytealign (w4[3], w5[0], offset); + w4[3] = hc_bytealign (w4[2], w4[3], offset); + w4[2] = hc_bytealign (w4[1], w4[2], offset); + w4[1] = hc_bytealign (w4[0], w4[1], offset); + w4[0] = hc_bytealign (w3[3], w4[0], offset); + w3[3] = hc_bytealign (w3[2], w3[3], offset); + w3[2] = hc_bytealign (w3[1], w3[2], offset); + w3[1] = hc_bytealign (w3[0], w3[1], offset); + w3[0] = hc_bytealign (w2[3], w3[0], offset); + w2[3] = hc_bytealign (w2[2], w2[3], offset); + w2[2] = hc_bytealign (w2[1], w2[2], offset); + w2[1] = hc_bytealign (w2[0], w2[1], offset); + w2[0] = hc_bytealign (w1[3], w2[0], offset); + w1[3] = hc_bytealign (w1[2], w1[3], offset); + w1[2] = hc_bytealign (w1[1], w1[2], offset); + w1[1] = hc_bytealign (w1[0], w1[1], offset); + w1[0] = hc_bytealign (w0[3], w1[0], offset); + w0[3] = hc_bytealign (w0[2], w0[3], offset); + w0[2] = hc_bytealign (w0[1], w0[2], offset); + w0[1] = hc_bytealign (w0[0], w0[1], offset); + w0[0] = hc_bytealign ( 0, w0[0], offset); break; case 1: - w7[3] = amd_bytealign (w7[1], w7[2], offset); - w7[2] = amd_bytealign (w7[0], w7[1], offset); - w7[1] = amd_bytealign (w6[3], w7[0], offset); - w7[0] = amd_bytealign (w6[2], w6[3], offset); - w6[3] = amd_bytealign (w6[1], w6[2], offset); - w6[2] = amd_bytealign (w6[0], w6[1], offset); - w6[1] = amd_bytealign (w5[3], w6[0], offset); - w6[0] = amd_bytealign (w5[2], w5[3], offset); - w5[3] = amd_bytealign (w5[1], w5[2], offset); - w5[2] = amd_bytealign (w5[0], w5[1], offset); - w5[1] = amd_bytealign (w4[3], w5[0], offset); - w5[0] = amd_bytealign (w4[2], w4[3], offset); - w4[3] = amd_bytealign (w4[1], w4[2], offset); - w4[2] = amd_bytealign (w4[0], w4[1], offset); - w4[1] = amd_bytealign (w3[3], w4[0], offset); - w4[0] = amd_bytealign (w3[2], w3[3], offset); - w3[3] = amd_bytealign (w3[1], w3[2], offset); - w3[2] = amd_bytealign (w3[0], w3[1], offset); - w3[1] = amd_bytealign (w2[3], w3[0], offset); - w3[0] = amd_bytealign (w2[2], w2[3], offset); - w2[3] = amd_bytealign (w2[1], w2[2], offset); - w2[2] = amd_bytealign (w2[0], w2[1], offset); - w2[1] = amd_bytealign (w1[3], w2[0], offset); - w2[0] = amd_bytealign (w1[2], w1[3], offset); - w1[3] = amd_bytealign (w1[1], w1[2], offset); - w1[2] = amd_bytealign (w1[0], w1[1], offset); - w1[1] = amd_bytealign (w0[3], w1[0], offset); - w1[0] = amd_bytealign (w0[2], w0[3], offset); - w0[3] = amd_bytealign (w0[1], w0[2], offset); - w0[2] = amd_bytealign (w0[0], w0[1], offset); - w0[1] = amd_bytealign ( 0, w0[0], offset); + w7[3] = hc_bytealign (w7[1], w7[2], offset); + w7[2] = hc_bytealign (w7[0], w7[1], offset); + w7[1] = hc_bytealign (w6[3], w7[0], offset); + w7[0] = hc_bytealign (w6[2], w6[3], offset); + w6[3] = hc_bytealign (w6[1], w6[2], offset); + w6[2] = hc_bytealign (w6[0], w6[1], offset); + w6[1] = hc_bytealign (w5[3], w6[0], offset); + w6[0] = hc_bytealign (w5[2], w5[3], offset); + w5[3] = hc_bytealign (w5[1], w5[2], offset); + w5[2] = hc_bytealign (w5[0], w5[1], offset); + w5[1] = hc_bytealign (w4[3], w5[0], offset); + w5[0] = hc_bytealign (w4[2], w4[3], offset); + w4[3] = hc_bytealign (w4[1], w4[2], offset); + w4[2] = hc_bytealign (w4[0], w4[1], offset); + w4[1] = hc_bytealign (w3[3], w4[0], offset); + w4[0] = hc_bytealign (w3[2], w3[3], offset); + w3[3] = hc_bytealign (w3[1], w3[2], offset); + w3[2] = hc_bytealign (w3[0], w3[1], offset); + w3[1] = hc_bytealign (w2[3], w3[0], offset); + w3[0] = hc_bytealign (w2[2], w2[3], offset); + w2[3] = hc_bytealign (w2[1], w2[2], offset); + w2[2] = hc_bytealign (w2[0], w2[1], offset); + w2[1] = hc_bytealign (w1[3], w2[0], offset); + w2[0] = hc_bytealign (w1[2], w1[3], offset); + w1[3] = hc_bytealign (w1[1], w1[2], offset); + w1[2] = hc_bytealign (w1[0], w1[1], offset); + w1[1] = hc_bytealign (w0[3], w1[0], offset); + w1[0] = hc_bytealign (w0[2], w0[3], offset); + w0[3] = hc_bytealign (w0[1], w0[2], offset); + w0[2] = hc_bytealign (w0[0], w0[1], offset); + w0[1] = hc_bytealign ( 0, w0[0], offset); w0[0] = 0; break; case 2: - w7[3] = amd_bytealign (w7[0], w7[1], offset); - w7[2] = amd_bytealign (w6[3], w7[0], offset); - w7[1] = amd_bytealign (w6[2], w6[3], offset); - w7[0] = amd_bytealign (w6[1], w6[2], offset); - w6[3] = amd_bytealign (w6[0], w6[1], offset); - w6[2] = amd_bytealign (w5[3], w6[0], offset); - w6[1] = amd_bytealign (w5[2], w5[3], offset); - w6[0] = amd_bytealign (w5[1], w5[2], offset); - w5[3] = amd_bytealign (w5[0], w5[1], offset); - w5[2] = amd_bytealign (w4[3], w5[0], offset); - w5[1] = amd_bytealign (w4[2], w4[3], offset); - w5[0] = amd_bytealign (w4[1], w4[2], offset); - w4[3] = amd_bytealign (w4[0], w4[1], offset); - w4[2] = amd_bytealign (w3[3], w4[0], offset); - w4[1] = amd_bytealign (w3[2], w3[3], offset); - w4[0] = amd_bytealign (w3[1], w3[2], offset); - w3[3] = amd_bytealign (w3[0], w3[1], offset); - w3[2] = amd_bytealign (w2[3], w3[0], offset); - w3[1] = amd_bytealign (w2[2], w2[3], offset); - w3[0] = amd_bytealign (w2[1], w2[2], offset); - w2[3] = amd_bytealign (w2[0], w2[1], offset); - w2[2] = amd_bytealign (w1[3], w2[0], offset); - w2[1] = amd_bytealign (w1[2], w1[3], offset); - w2[0] = amd_bytealign (w1[1], w1[2], offset); - w1[3] = amd_bytealign (w1[0], w1[1], offset); - w1[2] = amd_bytealign (w0[3], w1[0], offset); - w1[1] = amd_bytealign (w0[2], w0[3], offset); - w1[0] = amd_bytealign (w0[1], w0[2], offset); - w0[3] = amd_bytealign (w0[0], w0[1], offset); - w0[2] = amd_bytealign ( 0, w0[0], offset); + w7[3] = hc_bytealign (w7[0], w7[1], offset); + w7[2] = hc_bytealign (w6[3], w7[0], offset); + w7[1] = hc_bytealign (w6[2], w6[3], offset); + w7[0] = hc_bytealign (w6[1], w6[2], offset); + w6[3] = hc_bytealign (w6[0], w6[1], offset); + w6[2] = hc_bytealign (w5[3], w6[0], offset); + w6[1] = hc_bytealign (w5[2], w5[3], offset); + w6[0] = hc_bytealign (w5[1], w5[2], offset); + w5[3] = hc_bytealign (w5[0], w5[1], offset); + w5[2] = hc_bytealign (w4[3], w5[0], offset); + w5[1] = hc_bytealign (w4[2], w4[3], offset); + w5[0] = hc_bytealign (w4[1], w4[2], offset); + w4[3] = hc_bytealign (w4[0], w4[1], offset); + w4[2] = hc_bytealign (w3[3], w4[0], offset); + w4[1] = hc_bytealign (w3[2], w3[3], offset); + w4[0] = hc_bytealign (w3[1], w3[2], offset); + w3[3] = hc_bytealign (w3[0], w3[1], offset); + w3[2] = hc_bytealign (w2[3], w3[0], offset); + w3[1] = hc_bytealign (w2[2], w2[3], offset); + w3[0] = hc_bytealign (w2[1], w2[2], offset); + w2[3] = hc_bytealign (w2[0], w2[1], offset); + w2[2] = hc_bytealign (w1[3], w2[0], offset); + w2[1] = hc_bytealign (w1[2], w1[3], offset); + w2[0] = hc_bytealign (w1[1], w1[2], offset); + w1[3] = hc_bytealign (w1[0], w1[1], offset); + w1[2] = hc_bytealign (w0[3], w1[0], offset); + w1[1] = hc_bytealign (w0[2], w0[3], offset); + w1[0] = hc_bytealign (w0[1], w0[2], offset); + w0[3] = hc_bytealign (w0[0], w0[1], offset); + w0[2] = hc_bytealign ( 0, w0[0], offset); w0[1] = 0; w0[0] = 0; break; case 3: - w7[3] = amd_bytealign (w6[3], w7[0], offset); - w7[2] = amd_bytealign (w6[2], w6[3], offset); - w7[1] = amd_bytealign (w6[1], w6[2], offset); - w7[0] = amd_bytealign (w6[0], w6[1], offset); - w6[3] = amd_bytealign (w5[3], w6[0], offset); - w6[2] = amd_bytealign (w5[2], w5[3], offset); - w6[1] = amd_bytealign (w5[1], w5[2], offset); - w6[0] = amd_bytealign (w5[0], w5[1], offset); - w5[3] = amd_bytealign (w4[3], w5[0], offset); - w5[2] = amd_bytealign (w4[2], w4[3], offset); - w5[1] = amd_bytealign (w4[1], w4[2], offset); - w5[0] = amd_bytealign (w4[0], w4[1], offset); - w4[3] = amd_bytealign (w3[3], w4[0], offset); - w4[2] = amd_bytealign (w3[2], w3[3], offset); - w4[1] = amd_bytealign (w3[1], w3[2], offset); - w4[0] = amd_bytealign (w3[0], w3[1], offset); - w3[3] = amd_bytealign (w2[3], w3[0], offset); - w3[2] = amd_bytealign (w2[2], w2[3], offset); - w3[1] = amd_bytealign (w2[1], w2[2], offset); - w3[0] = amd_bytealign (w2[0], w2[1], offset); - w2[3] = amd_bytealign (w1[3], w2[0], offset); - w2[2] = amd_bytealign (w1[2], w1[3], offset); - w2[1] = amd_bytealign (w1[1], w1[2], offset); - w2[0] = amd_bytealign (w1[0], w1[1], offset); - w1[3] = amd_bytealign (w0[3], w1[0], offset); - w1[2] = amd_bytealign (w0[2], w0[3], offset); - w1[1] = amd_bytealign (w0[1], w0[2], offset); - w1[0] = amd_bytealign (w0[0], w0[1], offset); - w0[3] = amd_bytealign ( 0, w0[0], offset); + w7[3] = hc_bytealign (w6[3], w7[0], offset); + w7[2] = hc_bytealign (w6[2], w6[3], offset); + w7[1] = hc_bytealign (w6[1], w6[2], offset); + w7[0] = hc_bytealign (w6[0], w6[1], offset); + w6[3] = hc_bytealign (w5[3], w6[0], offset); + w6[2] = hc_bytealign (w5[2], w5[3], offset); + w6[1] = hc_bytealign (w5[1], w5[2], offset); + w6[0] = hc_bytealign (w5[0], w5[1], offset); + w5[3] = hc_bytealign (w4[3], w5[0], offset); + w5[2] = hc_bytealign (w4[2], w4[3], offset); + w5[1] = hc_bytealign (w4[1], w4[2], offset); + w5[0] = hc_bytealign (w4[0], w4[1], offset); + w4[3] = hc_bytealign (w3[3], w4[0], offset); + w4[2] = hc_bytealign (w3[2], w3[3], offset); + w4[1] = hc_bytealign (w3[1], w3[2], offset); + w4[0] = hc_bytealign (w3[0], w3[1], offset); + w3[3] = hc_bytealign (w2[3], w3[0], offset); + w3[2] = hc_bytealign (w2[2], w2[3], offset); + w3[1] = hc_bytealign (w2[1], w2[2], offset); + w3[0] = hc_bytealign (w2[0], w2[1], offset); + w2[3] = hc_bytealign (w1[3], w2[0], offset); + w2[2] = hc_bytealign (w1[2], w1[3], offset); + w2[1] = hc_bytealign (w1[1], w1[2], offset); + w2[0] = hc_bytealign (w1[0], w1[1], offset); + w1[3] = hc_bytealign (w0[3], w1[0], offset); + w1[2] = hc_bytealign (w0[2], w0[3], offset); + w1[1] = hc_bytealign (w0[1], w0[2], offset); + w1[0] = hc_bytealign (w0[0], w0[1], offset); + w0[3] = hc_bytealign ( 0, w0[0], offset); w0[2] = 0; w0[1] = 0; w0[0] = 0; @@ -4321,34 +4321,34 @@ DECLSPEC void switch_buffer_by_offset_8x4_le (u32x *w0, u32x *w1, u32x *w2, u32x break; case 4: - w7[3] = amd_bytealign (w6[2], w6[3], offset); - w7[2] = amd_bytealign (w6[1], w6[2], offset); - w7[1] = amd_bytealign (w6[0], w6[1], offset); - w7[0] = amd_bytealign (w5[3], w6[0], offset); - w6[3] = amd_bytealign (w5[2], w5[3], offset); - w6[2] = amd_bytealign (w5[1], w5[2], offset); - w6[1] = amd_bytealign (w5[0], w5[1], offset); - w6[0] = amd_bytealign (w4[3], w5[0], offset); - w5[3] = amd_bytealign (w4[2], w4[3], offset); - w5[2] = amd_bytealign (w4[1], w4[2], offset); - w5[1] = amd_bytealign (w4[0], w4[1], offset); - w5[0] = amd_bytealign (w3[3], w4[0], offset); - w4[3] = amd_bytealign (w3[2], w3[3], offset); - w4[2] = amd_bytealign (w3[1], w3[2], offset); - w4[1] = amd_bytealign (w3[0], w3[1], offset); - w4[0] = amd_bytealign (w2[3], w3[0], offset); - w3[3] = amd_bytealign (w2[2], w2[3], offset); - w3[2] = amd_bytealign (w2[1], w2[2], offset); - w3[1] = amd_bytealign (w2[0], w2[1], offset); - w3[0] = amd_bytealign (w1[3], w2[0], offset); - w2[3] = amd_bytealign (w1[2], w1[3], offset); - w2[2] = amd_bytealign (w1[1], w1[2], offset); - w2[1] = amd_bytealign (w1[0], w1[1], offset); - w2[0] = amd_bytealign (w0[3], w1[0], offset); - w1[3] = amd_bytealign (w0[2], w0[3], offset); - w1[2] = amd_bytealign (w0[1], w0[2], offset); - w1[1] = amd_bytealign (w0[0], w0[1], offset); - w1[0] = amd_bytealign ( 0, w0[0], offset); + w7[3] = hc_bytealign (w6[2], w6[3], offset); + w7[2] = hc_bytealign (w6[1], w6[2], offset); + w7[1] = hc_bytealign (w6[0], w6[1], offset); + w7[0] = hc_bytealign (w5[3], w6[0], offset); + w6[3] = hc_bytealign (w5[2], w5[3], offset); + w6[2] = hc_bytealign (w5[1], w5[2], offset); + w6[1] = hc_bytealign (w5[0], w5[1], offset); + w6[0] = hc_bytealign (w4[3], w5[0], offset); + w5[3] = hc_bytealign (w4[2], w4[3], offset); + w5[2] = hc_bytealign (w4[1], w4[2], offset); + w5[1] = hc_bytealign (w4[0], w4[1], offset); + w5[0] = hc_bytealign (w3[3], w4[0], offset); + w4[3] = hc_bytealign (w3[2], w3[3], offset); + w4[2] = hc_bytealign (w3[1], w3[2], offset); + w4[1] = hc_bytealign (w3[0], w3[1], offset); + w4[0] = hc_bytealign (w2[3], w3[0], offset); + w3[3] = hc_bytealign (w2[2], w2[3], offset); + w3[2] = hc_bytealign (w2[1], w2[2], offset); + w3[1] = hc_bytealign (w2[0], w2[1], offset); + w3[0] = hc_bytealign (w1[3], w2[0], offset); + w2[3] = hc_bytealign (w1[2], w1[3], offset); + w2[2] = hc_bytealign (w1[1], w1[2], offset); + w2[1] = hc_bytealign (w1[0], w1[1], offset); + w2[0] = hc_bytealign (w0[3], w1[0], offset); + w1[3] = hc_bytealign (w0[2], w0[3], offset); + w1[2] = hc_bytealign (w0[1], w0[2], offset); + w1[1] = hc_bytealign (w0[0], w0[1], offset); + w1[0] = hc_bytealign ( 0, w0[0], offset); w0[3] = 0; w0[2] = 0; w0[1] = 0; @@ -4357,33 +4357,33 @@ DECLSPEC void switch_buffer_by_offset_8x4_le (u32x *w0, u32x *w1, u32x *w2, u32x break; case 5: - w7[3] = amd_bytealign (w6[1], w6[2], offset); - w7[2] = amd_bytealign (w6[0], w6[1], offset); - w7[1] = amd_bytealign (w5[3], w6[0], offset); - w7[0] = amd_bytealign (w5[2], w5[3], offset); - w6[3] = amd_bytealign (w5[1], w5[2], offset); - w6[2] = amd_bytealign (w5[0], w5[1], offset); - w6[1] = amd_bytealign (w4[3], w5[0], offset); - w6[0] = amd_bytealign (w4[2], w4[3], offset); - w5[3] = amd_bytealign (w4[1], w4[2], offset); - w5[2] = amd_bytealign (w4[0], w4[1], offset); - w5[1] = amd_bytealign (w3[3], w4[0], offset); - w5[0] = amd_bytealign (w3[2], w3[3], offset); - w4[3] = amd_bytealign (w3[1], w3[2], offset); - w4[2] = amd_bytealign (w3[0], w3[1], offset); - w4[1] = amd_bytealign (w2[3], w3[0], offset); - w4[0] = amd_bytealign (w2[2], w2[3], offset); - w3[3] = amd_bytealign (w2[1], w2[2], offset); - w3[2] = amd_bytealign (w2[0], w2[1], offset); - w3[1] = amd_bytealign (w1[3], w2[0], offset); - w3[0] = amd_bytealign (w1[2], w1[3], offset); - w2[3] = amd_bytealign (w1[1], w1[2], offset); - w2[2] = amd_bytealign (w1[0], w1[1], offset); - w2[1] = amd_bytealign (w0[3], w1[0], offset); - w2[0] = amd_bytealign (w0[2], w0[3], offset); - w1[3] = amd_bytealign (w0[1], w0[2], offset); - w1[2] = amd_bytealign (w0[0], w0[1], offset); - w1[1] = amd_bytealign ( 0, w0[0], offset); + w7[3] = hc_bytealign (w6[1], w6[2], offset); + w7[2] = hc_bytealign (w6[0], w6[1], offset); + w7[1] = hc_bytealign (w5[3], w6[0], offset); + w7[0] = hc_bytealign (w5[2], w5[3], offset); + w6[3] = hc_bytealign (w5[1], w5[2], offset); + w6[2] = hc_bytealign (w5[0], w5[1], offset); + w6[1] = hc_bytealign (w4[3], w5[0], offset); + w6[0] = hc_bytealign (w4[2], w4[3], offset); + w5[3] = hc_bytealign (w4[1], w4[2], offset); + w5[2] = hc_bytealign (w4[0], w4[1], offset); + w5[1] = hc_bytealign (w3[3], w4[0], offset); + w5[0] = hc_bytealign (w3[2], w3[3], offset); + w4[3] = hc_bytealign (w3[1], w3[2], offset); + w4[2] = hc_bytealign (w3[0], w3[1], offset); + w4[1] = hc_bytealign (w2[3], w3[0], offset); + w4[0] = hc_bytealign (w2[2], w2[3], offset); + w3[3] = hc_bytealign (w2[1], w2[2], offset); + w3[2] = hc_bytealign (w2[0], w2[1], offset); + w3[1] = hc_bytealign (w1[3], w2[0], offset); + w3[0] = hc_bytealign (w1[2], w1[3], offset); + w2[3] = hc_bytealign (w1[1], w1[2], offset); + w2[2] = hc_bytealign (w1[0], w1[1], offset); + w2[1] = hc_bytealign (w0[3], w1[0], offset); + w2[0] = hc_bytealign (w0[2], w0[3], offset); + w1[3] = hc_bytealign (w0[1], w0[2], offset); + w1[2] = hc_bytealign (w0[0], w0[1], offset); + w1[1] = hc_bytealign ( 0, w0[0], offset); w1[0] = 0; w0[3] = 0; w0[2] = 0; @@ -4393,32 +4393,32 @@ DECLSPEC void switch_buffer_by_offset_8x4_le (u32x *w0, u32x *w1, u32x *w2, u32x break; case 6: - w7[3] = amd_bytealign (w6[0], w6[1], offset); - w7[2] = amd_bytealign (w5[3], w6[0], offset); - w7[1] = amd_bytealign (w5[2], w5[3], offset); - w7[0] = amd_bytealign (w5[1], w5[2], offset); - w6[3] = amd_bytealign (w5[0], w5[1], offset); - w6[2] = amd_bytealign (w4[3], w5[0], offset); - w6[1] = amd_bytealign (w4[2], w4[3], offset); - w6[0] = amd_bytealign (w4[1], w4[2], offset); - w5[3] = amd_bytealign (w4[0], w4[1], offset); - w5[2] = amd_bytealign (w3[3], w4[0], offset); - w5[1] = amd_bytealign (w3[2], w3[3], offset); - w5[0] = amd_bytealign (w3[1], w3[2], offset); - w4[3] = amd_bytealign (w3[0], w3[1], offset); - w4[2] = amd_bytealign (w2[3], w3[0], offset); - w4[1] = amd_bytealign (w2[2], w2[3], offset); - w4[0] = amd_bytealign (w2[1], w2[2], offset); - w3[3] = amd_bytealign (w2[0], w2[1], offset); - w3[2] = amd_bytealign (w1[3], w2[0], offset); - w3[1] = amd_bytealign (w1[2], w1[3], offset); - w3[0] = amd_bytealign (w1[1], w1[2], offset); - w2[3] = amd_bytealign (w1[0], w1[1], offset); - w2[2] = amd_bytealign (w0[3], w1[0], offset); - w2[1] = amd_bytealign (w0[2], w0[3], offset); - w2[0] = amd_bytealign (w0[1], w0[2], offset); - w1[3] = amd_bytealign (w0[0], w0[1], offset); - w1[2] = amd_bytealign ( 0, w0[0], offset); + w7[3] = hc_bytealign (w6[0], w6[1], offset); + w7[2] = hc_bytealign (w5[3], w6[0], offset); + w7[1] = hc_bytealign (w5[2], w5[3], offset); + w7[0] = hc_bytealign (w5[1], w5[2], offset); + w6[3] = hc_bytealign (w5[0], w5[1], offset); + w6[2] = hc_bytealign (w4[3], w5[0], offset); + w6[1] = hc_bytealign (w4[2], w4[3], offset); + w6[0] = hc_bytealign (w4[1], w4[2], offset); + w5[3] = hc_bytealign (w4[0], w4[1], offset); + w5[2] = hc_bytealign (w3[3], w4[0], offset); + w5[1] = hc_bytealign (w3[2], w3[3], offset); + w5[0] = hc_bytealign (w3[1], w3[2], offset); + w4[3] = hc_bytealign (w3[0], w3[1], offset); + w4[2] = hc_bytealign (w2[3], w3[0], offset); + w4[1] = hc_bytealign (w2[2], w2[3], offset); + w4[0] = hc_bytealign (w2[1], w2[2], offset); + w3[3] = hc_bytealign (w2[0], w2[1], offset); + w3[2] = hc_bytealign (w1[3], w2[0], offset); + w3[1] = hc_bytealign (w1[2], w1[3], offset); + w3[0] = hc_bytealign (w1[1], w1[2], offset); + w2[3] = hc_bytealign (w1[0], w1[1], offset); + w2[2] = hc_bytealign (w0[3], w1[0], offset); + w2[1] = hc_bytealign (w0[2], w0[3], offset); + w2[0] = hc_bytealign (w0[1], w0[2], offset); + w1[3] = hc_bytealign (w0[0], w0[1], offset); + w1[2] = hc_bytealign ( 0, w0[0], offset); w1[1] = 0; w1[0] = 0; w0[3] = 0; @@ -4429,31 +4429,31 @@ DECLSPEC void switch_buffer_by_offset_8x4_le (u32x *w0, u32x *w1, u32x *w2, u32x break; case 7: - w7[3] = amd_bytealign (w5[3], w6[0], offset); - w7[2] = amd_bytealign (w5[2], w5[3], offset); - w7[1] = amd_bytealign (w5[1], w5[2], offset); - w7[0] = amd_bytealign (w5[0], w5[1], offset); - w6[3] = amd_bytealign (w4[3], w5[0], offset); - w6[2] = amd_bytealign (w4[2], w4[3], offset); - w6[1] = amd_bytealign (w4[1], w4[2], offset); - w6[0] = amd_bytealign (w4[0], w4[1], offset); - w5[3] = amd_bytealign (w3[3], w4[0], offset); - w5[2] = amd_bytealign (w3[2], w3[3], offset); - w5[1] = amd_bytealign (w3[1], w3[2], offset); - w5[0] = amd_bytealign (w3[0], w3[1], offset); - w4[3] = amd_bytealign (w2[3], w3[0], offset); - w4[2] = amd_bytealign (w2[2], w2[3], offset); - w4[1] = amd_bytealign (w2[1], w2[2], offset); - w4[0] = amd_bytealign (w2[0], w2[1], offset); - w3[3] = amd_bytealign (w1[3], w2[0], offset); - w3[2] = amd_bytealign (w1[2], w1[3], offset); - w3[1] = amd_bytealign (w1[1], w1[2], offset); - w3[0] = amd_bytealign (w1[0], w1[1], offset); - w2[3] = amd_bytealign (w0[3], w1[0], offset); - w2[2] = amd_bytealign (w0[2], w0[3], offset); - w2[1] = amd_bytealign (w0[1], w0[2], offset); - w2[0] = amd_bytealign (w0[0], w0[1], offset); - w1[3] = amd_bytealign ( 0, w0[0], offset); + w7[3] = hc_bytealign (w5[3], w6[0], offset); + w7[2] = hc_bytealign (w5[2], w5[3], offset); + w7[1] = hc_bytealign (w5[1], w5[2], offset); + w7[0] = hc_bytealign (w5[0], w5[1], offset); + w6[3] = hc_bytealign (w4[3], w5[0], offset); + w6[2] = hc_bytealign (w4[2], w4[3], offset); + w6[1] = hc_bytealign (w4[1], w4[2], offset); + w6[0] = hc_bytealign (w4[0], w4[1], offset); + w5[3] = hc_bytealign (w3[3], w4[0], offset); + w5[2] = hc_bytealign (w3[2], w3[3], offset); + w5[1] = hc_bytealign (w3[1], w3[2], offset); + w5[0] = hc_bytealign (w3[0], w3[1], offset); + w4[3] = hc_bytealign (w2[3], w3[0], offset); + w4[2] = hc_bytealign (w2[2], w2[3], offset); + w4[1] = hc_bytealign (w2[1], w2[2], offset); + w4[0] = hc_bytealign (w2[0], w2[1], offset); + w3[3] = hc_bytealign (w1[3], w2[0], offset); + w3[2] = hc_bytealign (w1[2], w1[3], offset); + w3[1] = hc_bytealign (w1[1], w1[2], offset); + w3[0] = hc_bytealign (w1[0], w1[1], offset); + w2[3] = hc_bytealign (w0[3], w1[0], offset); + w2[2] = hc_bytealign (w0[2], w0[3], offset); + w2[1] = hc_bytealign (w0[1], w0[2], offset); + w2[0] = hc_bytealign (w0[0], w0[1], offset); + w1[3] = hc_bytealign ( 0, w0[0], offset); w1[2] = 0; w1[1] = 0; w1[0] = 0; @@ -4465,30 +4465,30 @@ DECLSPEC void switch_buffer_by_offset_8x4_le (u32x *w0, u32x *w1, u32x *w2, u32x break; case 8: - w7[3] = amd_bytealign (w5[2], w5[3], offset); - w7[2] = amd_bytealign (w5[1], w5[2], offset); - w7[1] = amd_bytealign (w5[0], w5[1], offset); - w7[0] = amd_bytealign (w4[3], w5[0], offset); - w6[3] = amd_bytealign (w4[2], w4[3], offset); - w6[2] = amd_bytealign (w4[1], w4[2], offset); - w6[1] = amd_bytealign (w4[0], w4[1], offset); - w6[0] = amd_bytealign (w3[3], w4[0], offset); - w5[3] = amd_bytealign (w3[2], w3[3], offset); - w5[2] = amd_bytealign (w3[1], w3[2], offset); - w5[1] = amd_bytealign (w3[0], w3[1], offset); - w5[0] = amd_bytealign (w2[3], w3[0], offset); - w4[3] = amd_bytealign (w2[2], w2[3], offset); - w4[2] = amd_bytealign (w2[1], w2[2], offset); - w4[1] = amd_bytealign (w2[0], w2[1], offset); - w4[0] = amd_bytealign (w1[3], w2[0], offset); - w3[3] = amd_bytealign (w1[2], w1[3], offset); - w3[2] = amd_bytealign (w1[1], w1[2], offset); - w3[1] = amd_bytealign (w1[0], w1[1], offset); - w3[0] = amd_bytealign (w0[3], w1[0], offset); - w2[3] = amd_bytealign (w0[2], w0[3], offset); - w2[2] = amd_bytealign (w0[1], w0[2], offset); - w2[1] = amd_bytealign (w0[0], w0[1], offset); - w2[0] = amd_bytealign ( 0, w0[0], offset); + w7[3] = hc_bytealign (w5[2], w5[3], offset); + w7[2] = hc_bytealign (w5[1], w5[2], offset); + w7[1] = hc_bytealign (w5[0], w5[1], offset); + w7[0] = hc_bytealign (w4[3], w5[0], offset); + w6[3] = hc_bytealign (w4[2], w4[3], offset); + w6[2] = hc_bytealign (w4[1], w4[2], offset); + w6[1] = hc_bytealign (w4[0], w4[1], offset); + w6[0] = hc_bytealign (w3[3], w4[0], offset); + w5[3] = hc_bytealign (w3[2], w3[3], offset); + w5[2] = hc_bytealign (w3[1], w3[2], offset); + w5[1] = hc_bytealign (w3[0], w3[1], offset); + w5[0] = hc_bytealign (w2[3], w3[0], offset); + w4[3] = hc_bytealign (w2[2], w2[3], offset); + w4[2] = hc_bytealign (w2[1], w2[2], offset); + w4[1] = hc_bytealign (w2[0], w2[1], offset); + w4[0] = hc_bytealign (w1[3], w2[0], offset); + w3[3] = hc_bytealign (w1[2], w1[3], offset); + w3[2] = hc_bytealign (w1[1], w1[2], offset); + w3[1] = hc_bytealign (w1[0], w1[1], offset); + w3[0] = hc_bytealign (w0[3], w1[0], offset); + w2[3] = hc_bytealign (w0[2], w0[3], offset); + w2[2] = hc_bytealign (w0[1], w0[2], offset); + w2[1] = hc_bytealign (w0[0], w0[1], offset); + w2[0] = hc_bytealign ( 0, w0[0], offset); w1[3] = 0; w1[2] = 0; w1[1] = 0; @@ -4501,29 +4501,29 @@ DECLSPEC void switch_buffer_by_offset_8x4_le (u32x *w0, u32x *w1, u32x *w2, u32x break; case 9: - w7[3] = amd_bytealign (w5[1], w5[2], offset); - w7[2] = amd_bytealign (w5[0], w5[1], offset); - w7[1] = amd_bytealign (w4[3], w5[0], offset); - w7[0] = amd_bytealign (w4[2], w4[3], offset); - w6[3] = amd_bytealign (w4[1], w4[2], offset); - w6[2] = amd_bytealign (w4[0], w4[1], offset); - w6[1] = amd_bytealign (w3[3], w4[0], offset); - w6[0] = amd_bytealign (w3[2], w3[3], offset); - w5[3] = amd_bytealign (w3[1], w3[2], offset); - w5[2] = amd_bytealign (w3[0], w3[1], offset); - w5[1] = amd_bytealign (w2[3], w3[0], offset); - w5[0] = amd_bytealign (w2[2], w2[3], offset); - w4[3] = amd_bytealign (w2[1], w2[2], offset); - w4[2] = amd_bytealign (w2[0], w2[1], offset); - w4[1] = amd_bytealign (w1[3], w2[0], offset); - w4[0] = amd_bytealign (w1[2], w1[3], offset); - w3[3] = amd_bytealign (w1[1], w1[2], offset); - w3[2] = amd_bytealign (w1[0], w1[1], offset); - w3[1] = amd_bytealign (w0[3], w1[0], offset); - w3[0] = amd_bytealign (w0[2], w0[3], offset); - w2[3] = amd_bytealign (w0[1], w0[2], offset); - w2[2] = amd_bytealign (w0[0], w0[1], offset); - w2[1] = amd_bytealign ( 0, w0[0], offset); + w7[3] = hc_bytealign (w5[1], w5[2], offset); + w7[2] = hc_bytealign (w5[0], w5[1], offset); + w7[1] = hc_bytealign (w4[3], w5[0], offset); + w7[0] = hc_bytealign (w4[2], w4[3], offset); + w6[3] = hc_bytealign (w4[1], w4[2], offset); + w6[2] = hc_bytealign (w4[0], w4[1], offset); + w6[1] = hc_bytealign (w3[3], w4[0], offset); + w6[0] = hc_bytealign (w3[2], w3[3], offset); + w5[3] = hc_bytealign (w3[1], w3[2], offset); + w5[2] = hc_bytealign (w3[0], w3[1], offset); + w5[1] = hc_bytealign (w2[3], w3[0], offset); + w5[0] = hc_bytealign (w2[2], w2[3], offset); + w4[3] = hc_bytealign (w2[1], w2[2], offset); + w4[2] = hc_bytealign (w2[0], w2[1], offset); + w4[1] = hc_bytealign (w1[3], w2[0], offset); + w4[0] = hc_bytealign (w1[2], w1[3], offset); + w3[3] = hc_bytealign (w1[1], w1[2], offset); + w3[2] = hc_bytealign (w1[0], w1[1], offset); + w3[1] = hc_bytealign (w0[3], w1[0], offset); + w3[0] = hc_bytealign (w0[2], w0[3], offset); + w2[3] = hc_bytealign (w0[1], w0[2], offset); + w2[2] = hc_bytealign (w0[0], w0[1], offset); + w2[1] = hc_bytealign ( 0, w0[0], offset); w2[0] = 0; w1[3] = 0; w1[2] = 0; @@ -4537,28 +4537,28 @@ DECLSPEC void switch_buffer_by_offset_8x4_le (u32x *w0, u32x *w1, u32x *w2, u32x break; case 10: - w7[3] = amd_bytealign (w5[0], w5[1], offset); - w7[2] = amd_bytealign (w4[3], w5[0], offset); - w7[1] = amd_bytealign (w4[2], w4[3], offset); - w7[0] = amd_bytealign (w4[1], w4[2], offset); - w6[3] = amd_bytealign (w4[0], w4[1], offset); - w6[2] = amd_bytealign (w3[3], w4[0], offset); - w6[1] = amd_bytealign (w3[2], w3[3], offset); - w6[0] = amd_bytealign (w3[1], w3[2], offset); - w5[3] = amd_bytealign (w3[0], w3[1], offset); - w5[2] = amd_bytealign (w2[3], w3[0], offset); - w5[1] = amd_bytealign (w2[2], w2[3], offset); - w5[0] = amd_bytealign (w2[1], w2[2], offset); - w4[3] = amd_bytealign (w2[0], w2[1], offset); - w4[2] = amd_bytealign (w1[3], w2[0], offset); - w4[1] = amd_bytealign (w1[2], w1[3], offset); - w4[0] = amd_bytealign (w1[1], w1[2], offset); - w3[3] = amd_bytealign (w1[0], w1[1], offset); - w3[2] = amd_bytealign (w0[3], w1[0], offset); - w3[1] = amd_bytealign (w0[2], w0[3], offset); - w3[0] = amd_bytealign (w0[1], w0[2], offset); - w2[3] = amd_bytealign (w0[0], w0[1], offset); - w2[2] = amd_bytealign ( 0, w0[0], offset); + w7[3] = hc_bytealign (w5[0], w5[1], offset); + w7[2] = hc_bytealign (w4[3], w5[0], offset); + w7[1] = hc_bytealign (w4[2], w4[3], offset); + w7[0] = hc_bytealign (w4[1], w4[2], offset); + w6[3] = hc_bytealign (w4[0], w4[1], offset); + w6[2] = hc_bytealign (w3[3], w4[0], offset); + w6[1] = hc_bytealign (w3[2], w3[3], offset); + w6[0] = hc_bytealign (w3[1], w3[2], offset); + w5[3] = hc_bytealign (w3[0], w3[1], offset); + w5[2] = hc_bytealign (w2[3], w3[0], offset); + w5[1] = hc_bytealign (w2[2], w2[3], offset); + w5[0] = hc_bytealign (w2[1], w2[2], offset); + w4[3] = hc_bytealign (w2[0], w2[1], offset); + w4[2] = hc_bytealign (w1[3], w2[0], offset); + w4[1] = hc_bytealign (w1[2], w1[3], offset); + w4[0] = hc_bytealign (w1[1], w1[2], offset); + w3[3] = hc_bytealign (w1[0], w1[1], offset); + w3[2] = hc_bytealign (w0[3], w1[0], offset); + w3[1] = hc_bytealign (w0[2], w0[3], offset); + w3[0] = hc_bytealign (w0[1], w0[2], offset); + w2[3] = hc_bytealign (w0[0], w0[1], offset); + w2[2] = hc_bytealign ( 0, w0[0], offset); w2[1] = 0; w2[0] = 0; w1[3] = 0; @@ -4573,27 +4573,27 @@ DECLSPEC void switch_buffer_by_offset_8x4_le (u32x *w0, u32x *w1, u32x *w2, u32x break; case 11: - w7[3] = amd_bytealign (w4[3], w5[0], offset); - w7[2] = amd_bytealign (w4[2], w4[3], offset); - w7[1] = amd_bytealign (w4[1], w4[2], offset); - w7[0] = amd_bytealign (w4[0], w4[1], offset); - w6[3] = amd_bytealign (w3[3], w4[0], offset); - w6[2] = amd_bytealign (w3[2], w3[3], offset); - w6[1] = amd_bytealign (w3[1], w3[2], offset); - w6[0] = amd_bytealign (w3[0], w3[1], offset); - w5[3] = amd_bytealign (w2[3], w3[0], offset); - w5[2] = amd_bytealign (w2[2], w2[3], offset); - w5[1] = amd_bytealign (w2[1], w2[2], offset); - w5[0] = amd_bytealign (w2[0], w2[1], offset); - w4[3] = amd_bytealign (w1[3], w2[0], offset); - w4[2] = amd_bytealign (w1[2], w1[3], offset); - w4[1] = amd_bytealign (w1[1], w1[2], offset); - w4[0] = amd_bytealign (w1[0], w1[1], offset); - w3[3] = amd_bytealign (w0[3], w1[0], offset); - w3[2] = amd_bytealign (w0[2], w0[3], offset); - w3[1] = amd_bytealign (w0[1], w0[2], offset); - w3[0] = amd_bytealign (w0[0], w0[1], offset); - w2[3] = amd_bytealign ( 0, w0[0], offset); + w7[3] = hc_bytealign (w4[3], w5[0], offset); + w7[2] = hc_bytealign (w4[2], w4[3], offset); + w7[1] = hc_bytealign (w4[1], w4[2], offset); + w7[0] = hc_bytealign (w4[0], w4[1], offset); + w6[3] = hc_bytealign (w3[3], w4[0], offset); + w6[2] = hc_bytealign (w3[2], w3[3], offset); + w6[1] = hc_bytealign (w3[1], w3[2], offset); + w6[0] = hc_bytealign (w3[0], w3[1], offset); + w5[3] = hc_bytealign (w2[3], w3[0], offset); + w5[2] = hc_bytealign (w2[2], w2[3], offset); + w5[1] = hc_bytealign (w2[1], w2[2], offset); + w5[0] = hc_bytealign (w2[0], w2[1], offset); + w4[3] = hc_bytealign (w1[3], w2[0], offset); + w4[2] = hc_bytealign (w1[2], w1[3], offset); + w4[1] = hc_bytealign (w1[1], w1[2], offset); + w4[0] = hc_bytealign (w1[0], w1[1], offset); + w3[3] = hc_bytealign (w0[3], w1[0], offset); + w3[2] = hc_bytealign (w0[2], w0[3], offset); + w3[1] = hc_bytealign (w0[1], w0[2], offset); + w3[0] = hc_bytealign (w0[0], w0[1], offset); + w2[3] = hc_bytealign ( 0, w0[0], offset); w2[2] = 0; w2[1] = 0; w2[0] = 0; @@ -4609,26 +4609,26 @@ DECLSPEC void switch_buffer_by_offset_8x4_le (u32x *w0, u32x *w1, u32x *w2, u32x break; case 12: - w7[3] = amd_bytealign (w4[2], w4[3], offset); - w7[2] = amd_bytealign (w4[1], w4[2], offset); - w7[1] = amd_bytealign (w4[0], w4[1], offset); - w7[0] = amd_bytealign (w3[3], w4[0], offset); - w6[3] = amd_bytealign (w3[2], w3[3], offset); - w6[2] = amd_bytealign (w3[1], w3[2], offset); - w6[1] = amd_bytealign (w3[0], w3[1], offset); - w6[0] = amd_bytealign (w2[3], w3[0], offset); - w5[3] = amd_bytealign (w2[2], w2[3], offset); - w5[2] = amd_bytealign (w2[1], w2[2], offset); - w5[1] = amd_bytealign (w2[0], w2[1], offset); - w5[0] = amd_bytealign (w1[3], w2[0], offset); - w4[3] = amd_bytealign (w1[2], w1[3], offset); - w4[2] = amd_bytealign (w1[1], w1[2], offset); - w4[1] = amd_bytealign (w1[0], w1[1], offset); - w4[0] = amd_bytealign (w0[3], w1[0], offset); - w3[3] = amd_bytealign (w0[2], w0[3], offset); - w3[2] = amd_bytealign (w0[1], w0[2], offset); - w3[1] = amd_bytealign (w0[0], w0[1], offset); - w3[0] = amd_bytealign ( 0, w0[0], offset); + w7[3] = hc_bytealign (w4[2], w4[3], offset); + w7[2] = hc_bytealign (w4[1], w4[2], offset); + w7[1] = hc_bytealign (w4[0], w4[1], offset); + w7[0] = hc_bytealign (w3[3], w4[0], offset); + w6[3] = hc_bytealign (w3[2], w3[3], offset); + w6[2] = hc_bytealign (w3[1], w3[2], offset); + w6[1] = hc_bytealign (w3[0], w3[1], offset); + w6[0] = hc_bytealign (w2[3], w3[0], offset); + w5[3] = hc_bytealign (w2[2], w2[3], offset); + w5[2] = hc_bytealign (w2[1], w2[2], offset); + w5[1] = hc_bytealign (w2[0], w2[1], offset); + w5[0] = hc_bytealign (w1[3], w2[0], offset); + w4[3] = hc_bytealign (w1[2], w1[3], offset); + w4[2] = hc_bytealign (w1[1], w1[2], offset); + w4[1] = hc_bytealign (w1[0], w1[1], offset); + w4[0] = hc_bytealign (w0[3], w1[0], offset); + w3[3] = hc_bytealign (w0[2], w0[3], offset); + w3[2] = hc_bytealign (w0[1], w0[2], offset); + w3[1] = hc_bytealign (w0[0], w0[1], offset); + w3[0] = hc_bytealign ( 0, w0[0], offset); w2[3] = 0; w2[2] = 0; w2[1] = 0; @@ -4645,25 +4645,25 @@ DECLSPEC void switch_buffer_by_offset_8x4_le (u32x *w0, u32x *w1, u32x *w2, u32x break; case 13: - w7[3] = amd_bytealign (w4[1], w4[2], offset); - w7[2] = amd_bytealign (w4[0], w4[1], offset); - w7[1] = amd_bytealign (w3[3], w4[0], offset); - w7[0] = amd_bytealign (w3[2], w3[3], offset); - w6[3] = amd_bytealign (w3[1], w3[2], offset); - w6[2] = amd_bytealign (w3[0], w3[1], offset); - w6[1] = amd_bytealign (w2[3], w3[0], offset); - w6[0] = amd_bytealign (w2[2], w2[3], offset); - w5[3] = amd_bytealign (w2[1], w2[2], offset); - w5[2] = amd_bytealign (w2[0], w2[1], offset); - w5[1] = amd_bytealign (w1[3], w2[0], offset); - w5[0] = amd_bytealign (w1[2], w1[3], offset); - w4[3] = amd_bytealign (w1[1], w1[2], offset); - w4[2] = amd_bytealign (w1[0], w1[1], offset); - w4[1] = amd_bytealign (w0[3], w1[0], offset); - w4[0] = amd_bytealign (w0[2], w0[3], offset); - w3[3] = amd_bytealign (w0[1], w0[2], offset); - w3[2] = amd_bytealign (w0[0], w0[1], offset); - w3[1] = amd_bytealign ( 0, w0[0], offset); + w7[3] = hc_bytealign (w4[1], w4[2], offset); + w7[2] = hc_bytealign (w4[0], w4[1], offset); + w7[1] = hc_bytealign (w3[3], w4[0], offset); + w7[0] = hc_bytealign (w3[2], w3[3], offset); + w6[3] = hc_bytealign (w3[1], w3[2], offset); + w6[2] = hc_bytealign (w3[0], w3[1], offset); + w6[1] = hc_bytealign (w2[3], w3[0], offset); + w6[0] = hc_bytealign (w2[2], w2[3], offset); + w5[3] = hc_bytealign (w2[1], w2[2], offset); + w5[2] = hc_bytealign (w2[0], w2[1], offset); + w5[1] = hc_bytealign (w1[3], w2[0], offset); + w5[0] = hc_bytealign (w1[2], w1[3], offset); + w4[3] = hc_bytealign (w1[1], w1[2], offset); + w4[2] = hc_bytealign (w1[0], w1[1], offset); + w4[1] = hc_bytealign (w0[3], w1[0], offset); + w4[0] = hc_bytealign (w0[2], w0[3], offset); + w3[3] = hc_bytealign (w0[1], w0[2], offset); + w3[2] = hc_bytealign (w0[0], w0[1], offset); + w3[1] = hc_bytealign ( 0, w0[0], offset); w3[0] = 0; w2[3] = 0; w2[2] = 0; @@ -4681,24 +4681,24 @@ DECLSPEC void switch_buffer_by_offset_8x4_le (u32x *w0, u32x *w1, u32x *w2, u32x break; case 14: - w7[3] = amd_bytealign (w4[0], w4[1], offset); - w7[2] = amd_bytealign (w3[3], w4[0], offset); - w7[1] = amd_bytealign (w3[2], w3[3], offset); - w7[0] = amd_bytealign (w3[1], w3[2], offset); - w6[3] = amd_bytealign (w3[0], w3[1], offset); - w6[2] = amd_bytealign (w2[3], w3[0], offset); - w6[1] = amd_bytealign (w2[2], w2[3], offset); - w6[0] = amd_bytealign (w2[1], w2[2], offset); - w5[3] = amd_bytealign (w2[0], w2[1], offset); - w5[2] = amd_bytealign (w1[3], w2[0], offset); - w5[1] = amd_bytealign (w1[2], w1[3], offset); - w5[0] = amd_bytealign (w1[1], w1[2], offset); - w4[3] = amd_bytealign (w1[0], w1[1], offset); - w4[2] = amd_bytealign (w0[3], w1[0], offset); - w4[1] = amd_bytealign (w0[2], w0[3], offset); - w4[0] = amd_bytealign (w0[1], w0[2], offset); - w3[3] = amd_bytealign (w0[0], w0[1], offset); - w3[2] = amd_bytealign ( 0, w0[0], offset); + w7[3] = hc_bytealign (w4[0], w4[1], offset); + w7[2] = hc_bytealign (w3[3], w4[0], offset); + w7[1] = hc_bytealign (w3[2], w3[3], offset); + w7[0] = hc_bytealign (w3[1], w3[2], offset); + w6[3] = hc_bytealign (w3[0], w3[1], offset); + w6[2] = hc_bytealign (w2[3], w3[0], offset); + w6[1] = hc_bytealign (w2[2], w2[3], offset); + w6[0] = hc_bytealign (w2[1], w2[2], offset); + w5[3] = hc_bytealign (w2[0], w2[1], offset); + w5[2] = hc_bytealign (w1[3], w2[0], offset); + w5[1] = hc_bytealign (w1[2], w1[3], offset); + w5[0] = hc_bytealign (w1[1], w1[2], offset); + w4[3] = hc_bytealign (w1[0], w1[1], offset); + w4[2] = hc_bytealign (w0[3], w1[0], offset); + w4[1] = hc_bytealign (w0[2], w0[3], offset); + w4[0] = hc_bytealign (w0[1], w0[2], offset); + w3[3] = hc_bytealign (w0[0], w0[1], offset); + w3[2] = hc_bytealign ( 0, w0[0], offset); w3[1] = 0; w3[0] = 0; w2[3] = 0; @@ -4717,23 +4717,23 @@ DECLSPEC void switch_buffer_by_offset_8x4_le (u32x *w0, u32x *w1, u32x *w2, u32x break; case 15: - w7[3] = amd_bytealign (w3[3], w4[0], offset); - w7[2] = amd_bytealign (w3[2], w3[3], offset); - w7[1] = amd_bytealign (w3[1], w3[2], offset); - w7[0] = amd_bytealign (w3[0], w3[1], offset); - w6[3] = amd_bytealign (w2[3], w3[0], offset); - w6[2] = amd_bytealign (w2[2], w2[3], offset); - w6[1] = amd_bytealign (w2[1], w2[2], offset); - w6[0] = amd_bytealign (w2[0], w2[1], offset); - w5[3] = amd_bytealign (w1[3], w2[0], offset); - w5[2] = amd_bytealign (w1[2], w1[3], offset); - w5[1] = amd_bytealign (w1[1], w1[2], offset); - w5[0] = amd_bytealign (w1[0], w1[1], offset); - w4[3] = amd_bytealign (w0[3], w1[0], offset); - w4[2] = amd_bytealign (w0[2], w0[3], offset); - w4[1] = amd_bytealign (w0[1], w0[2], offset); - w4[0] = amd_bytealign (w0[0], w0[1], offset); - w3[3] = amd_bytealign ( 0, w0[0], offset); + w7[3] = hc_bytealign (w3[3], w4[0], offset); + w7[2] = hc_bytealign (w3[2], w3[3], offset); + w7[1] = hc_bytealign (w3[1], w3[2], offset); + w7[0] = hc_bytealign (w3[0], w3[1], offset); + w6[3] = hc_bytealign (w2[3], w3[0], offset); + w6[2] = hc_bytealign (w2[2], w2[3], offset); + w6[1] = hc_bytealign (w2[1], w2[2], offset); + w6[0] = hc_bytealign (w2[0], w2[1], offset); + w5[3] = hc_bytealign (w1[3], w2[0], offset); + w5[2] = hc_bytealign (w1[2], w1[3], offset); + w5[1] = hc_bytealign (w1[1], w1[2], offset); + w5[0] = hc_bytealign (w1[0], w1[1], offset); + w4[3] = hc_bytealign (w0[3], w1[0], offset); + w4[2] = hc_bytealign (w0[2], w0[3], offset); + w4[1] = hc_bytealign (w0[1], w0[2], offset); + w4[0] = hc_bytealign (w0[0], w0[1], offset); + w3[3] = hc_bytealign ( 0, w0[0], offset); w3[2] = 0; w3[1] = 0; w3[0] = 0; @@ -4753,22 +4753,22 @@ DECLSPEC void switch_buffer_by_offset_8x4_le (u32x *w0, u32x *w1, u32x *w2, u32x break; case 16: - w7[3] = amd_bytealign (w3[2], w3[3], offset); - w7[2] = amd_bytealign (w3[1], w3[2], offset); - w7[1] = amd_bytealign (w3[0], w3[1], offset); - w7[0] = amd_bytealign (w2[3], w3[0], offset); - w6[3] = amd_bytealign (w2[2], w2[3], offset); - w6[2] = amd_bytealign (w2[1], w2[2], offset); - w6[1] = amd_bytealign (w2[0], w2[1], offset); - w6[0] = amd_bytealign (w1[3], w2[0], offset); - w5[3] = amd_bytealign (w1[2], w1[3], offset); - w5[2] = amd_bytealign (w1[1], w1[2], offset); - w5[1] = amd_bytealign (w1[0], w1[1], offset); - w5[0] = amd_bytealign (w0[3], w1[0], offset); - w4[3] = amd_bytealign (w0[2], w0[3], offset); - w4[2] = amd_bytealign (w0[1], w0[2], offset); - w4[1] = amd_bytealign (w0[0], w0[1], offset); - w4[0] = amd_bytealign ( 0, w0[0], offset); + w7[3] = hc_bytealign (w3[2], w3[3], offset); + w7[2] = hc_bytealign (w3[1], w3[2], offset); + w7[1] = hc_bytealign (w3[0], w3[1], offset); + w7[0] = hc_bytealign (w2[3], w3[0], offset); + w6[3] = hc_bytealign (w2[2], w2[3], offset); + w6[2] = hc_bytealign (w2[1], w2[2], offset); + w6[1] = hc_bytealign (w2[0], w2[1], offset); + w6[0] = hc_bytealign (w1[3], w2[0], offset); + w5[3] = hc_bytealign (w1[2], w1[3], offset); + w5[2] = hc_bytealign (w1[1], w1[2], offset); + w5[1] = hc_bytealign (w1[0], w1[1], offset); + w5[0] = hc_bytealign (w0[3], w1[0], offset); + w4[3] = hc_bytealign (w0[2], w0[3], offset); + w4[2] = hc_bytealign (w0[1], w0[2], offset); + w4[1] = hc_bytealign (w0[0], w0[1], offset); + w4[0] = hc_bytealign ( 0, w0[0], offset); w3[3] = 0; w3[2] = 0; w3[1] = 0; @@ -4789,21 +4789,21 @@ DECLSPEC void switch_buffer_by_offset_8x4_le (u32x *w0, u32x *w1, u32x *w2, u32x break; case 17: - w7[3] = amd_bytealign (w3[1], w3[2], offset); - w7[2] = amd_bytealign (w3[0], w3[1], offset); - w7[1] = amd_bytealign (w2[3], w3[0], offset); - w7[0] = amd_bytealign (w2[2], w2[3], offset); - w6[3] = amd_bytealign (w2[1], w2[2], offset); - w6[2] = amd_bytealign (w2[0], w2[1], offset); - w6[1] = amd_bytealign (w1[3], w2[0], offset); - w6[0] = amd_bytealign (w1[2], w1[3], offset); - w5[3] = amd_bytealign (w1[1], w1[2], offset); - w5[2] = amd_bytealign (w1[0], w1[1], offset); - w5[1] = amd_bytealign (w0[3], w1[0], offset); - w5[0] = amd_bytealign (w0[2], w0[3], offset); - w4[3] = amd_bytealign (w0[1], w0[2], offset); - w4[2] = amd_bytealign (w0[0], w0[1], offset); - w4[1] = amd_bytealign ( 0, w0[0], offset); + w7[3] = hc_bytealign (w3[1], w3[2], offset); + w7[2] = hc_bytealign (w3[0], w3[1], offset); + w7[1] = hc_bytealign (w2[3], w3[0], offset); + w7[0] = hc_bytealign (w2[2], w2[3], offset); + w6[3] = hc_bytealign (w2[1], w2[2], offset); + w6[2] = hc_bytealign (w2[0], w2[1], offset); + w6[1] = hc_bytealign (w1[3], w2[0], offset); + w6[0] = hc_bytealign (w1[2], w1[3], offset); + w5[3] = hc_bytealign (w1[1], w1[2], offset); + w5[2] = hc_bytealign (w1[0], w1[1], offset); + w5[1] = hc_bytealign (w0[3], w1[0], offset); + w5[0] = hc_bytealign (w0[2], w0[3], offset); + w4[3] = hc_bytealign (w0[1], w0[2], offset); + w4[2] = hc_bytealign (w0[0], w0[1], offset); + w4[1] = hc_bytealign ( 0, w0[0], offset); w4[0] = 0; w3[3] = 0; w3[2] = 0; @@ -4825,20 +4825,20 @@ DECLSPEC void switch_buffer_by_offset_8x4_le (u32x *w0, u32x *w1, u32x *w2, u32x break; case 18: - w7[3] = amd_bytealign (w3[0], w3[1], offset); - w7[2] = amd_bytealign (w2[3], w3[0], offset); - w7[1] = amd_bytealign (w2[2], w2[3], offset); - w7[0] = amd_bytealign (w2[1], w2[2], offset); - w6[3] = amd_bytealign (w2[0], w2[1], offset); - w6[2] = amd_bytealign (w1[3], w2[0], offset); - w6[1] = amd_bytealign (w1[2], w1[3], offset); - w6[0] = amd_bytealign (w1[1], w1[2], offset); - w5[3] = amd_bytealign (w1[0], w1[1], offset); - w5[2] = amd_bytealign (w0[3], w1[0], offset); - w5[1] = amd_bytealign (w0[2], w0[3], offset); - w5[0] = amd_bytealign (w0[1], w0[2], offset); - w4[3] = amd_bytealign (w0[0], w0[1], offset); - w4[2] = amd_bytealign ( 0, w0[0], offset); + w7[3] = hc_bytealign (w3[0], w3[1], offset); + w7[2] = hc_bytealign (w2[3], w3[0], offset); + w7[1] = hc_bytealign (w2[2], w2[3], offset); + w7[0] = hc_bytealign (w2[1], w2[2], offset); + w6[3] = hc_bytealign (w2[0], w2[1], offset); + w6[2] = hc_bytealign (w1[3], w2[0], offset); + w6[1] = hc_bytealign (w1[2], w1[3], offset); + w6[0] = hc_bytealign (w1[1], w1[2], offset); + w5[3] = hc_bytealign (w1[0], w1[1], offset); + w5[2] = hc_bytealign (w0[3], w1[0], offset); + w5[1] = hc_bytealign (w0[2], w0[3], offset); + w5[0] = hc_bytealign (w0[1], w0[2], offset); + w4[3] = hc_bytealign (w0[0], w0[1], offset); + w4[2] = hc_bytealign ( 0, w0[0], offset); w4[1] = 0; w4[0] = 0; w3[3] = 0; @@ -4861,19 +4861,19 @@ DECLSPEC void switch_buffer_by_offset_8x4_le (u32x *w0, u32x *w1, u32x *w2, u32x break; case 19: - w7[3] = amd_bytealign (w2[3], w3[0], offset); - w7[2] = amd_bytealign (w2[2], w2[3], offset); - w7[1] = amd_bytealign (w2[1], w2[2], offset); - w7[0] = amd_bytealign (w2[0], w2[1], offset); - w6[3] = amd_bytealign (w1[3], w2[0], offset); - w6[2] = amd_bytealign (w1[2], w1[3], offset); - w6[1] = amd_bytealign (w1[1], w1[2], offset); - w6[0] = amd_bytealign (w1[0], w1[1], offset); - w5[3] = amd_bytealign (w0[3], w1[0], offset); - w5[2] = amd_bytealign (w0[2], w0[3], offset); - w5[1] = amd_bytealign (w0[1], w0[2], offset); - w5[0] = amd_bytealign (w0[0], w0[1], offset); - w4[3] = amd_bytealign ( 0, w0[0], offset); + w7[3] = hc_bytealign (w2[3], w3[0], offset); + w7[2] = hc_bytealign (w2[2], w2[3], offset); + w7[1] = hc_bytealign (w2[1], w2[2], offset); + w7[0] = hc_bytealign (w2[0], w2[1], offset); + w6[3] = hc_bytealign (w1[3], w2[0], offset); + w6[2] = hc_bytealign (w1[2], w1[3], offset); + w6[1] = hc_bytealign (w1[1], w1[2], offset); + w6[0] = hc_bytealign (w1[0], w1[1], offset); + w5[3] = hc_bytealign (w0[3], w1[0], offset); + w5[2] = hc_bytealign (w0[2], w0[3], offset); + w5[1] = hc_bytealign (w0[1], w0[2], offset); + w5[0] = hc_bytealign (w0[0], w0[1], offset); + w4[3] = hc_bytealign ( 0, w0[0], offset); w4[2] = 0; w4[1] = 0; w4[0] = 0; @@ -4897,18 +4897,18 @@ DECLSPEC void switch_buffer_by_offset_8x4_le (u32x *w0, u32x *w1, u32x *w2, u32x break; case 20: - w7[3] = amd_bytealign (w2[2], w2[3], offset); - w7[2] = amd_bytealign (w2[1], w2[2], offset); - w7[1] = amd_bytealign (w2[0], w2[1], offset); - w7[0] = amd_bytealign (w1[3], w2[0], offset); - w6[3] = amd_bytealign (w1[2], w1[3], offset); - w6[2] = amd_bytealign (w1[1], w1[2], offset); - w6[1] = amd_bytealign (w1[0], w1[1], offset); - w6[0] = amd_bytealign (w0[3], w1[0], offset); - w5[3] = amd_bytealign (w0[2], w0[3], offset); - w5[2] = amd_bytealign (w0[1], w0[2], offset); - w5[1] = amd_bytealign (w0[0], w0[1], offset); - w5[0] = amd_bytealign ( 0, w0[0], offset); + w7[3] = hc_bytealign (w2[2], w2[3], offset); + w7[2] = hc_bytealign (w2[1], w2[2], offset); + w7[1] = hc_bytealign (w2[0], w2[1], offset); + w7[0] = hc_bytealign (w1[3], w2[0], offset); + w6[3] = hc_bytealign (w1[2], w1[3], offset); + w6[2] = hc_bytealign (w1[1], w1[2], offset); + w6[1] = hc_bytealign (w1[0], w1[1], offset); + w6[0] = hc_bytealign (w0[3], w1[0], offset); + w5[3] = hc_bytealign (w0[2], w0[3], offset); + w5[2] = hc_bytealign (w0[1], w0[2], offset); + w5[1] = hc_bytealign (w0[0], w0[1], offset); + w5[0] = hc_bytealign ( 0, w0[0], offset); w4[3] = 0; w4[2] = 0; w4[1] = 0; @@ -4933,17 +4933,17 @@ DECLSPEC void switch_buffer_by_offset_8x4_le (u32x *w0, u32x *w1, u32x *w2, u32x break; case 21: - w7[3] = amd_bytealign (w2[1], w2[2], offset); - w7[2] = amd_bytealign (w2[0], w2[1], offset); - w7[1] = amd_bytealign (w1[3], w2[0], offset); - w7[0] = amd_bytealign (w1[2], w1[3], offset); - w6[3] = amd_bytealign (w1[1], w1[2], offset); - w6[2] = amd_bytealign (w1[0], w1[1], offset); - w6[1] = amd_bytealign (w0[3], w1[0], offset); - w6[0] = amd_bytealign (w0[2], w0[3], offset); - w5[3] = amd_bytealign (w0[1], w0[2], offset); - w5[2] = amd_bytealign (w0[0], w0[1], offset); - w5[1] = amd_bytealign ( 0, w0[0], offset); + w7[3] = hc_bytealign (w2[1], w2[2], offset); + w7[2] = hc_bytealign (w2[0], w2[1], offset); + w7[1] = hc_bytealign (w1[3], w2[0], offset); + w7[0] = hc_bytealign (w1[2], w1[3], offset); + w6[3] = hc_bytealign (w1[1], w1[2], offset); + w6[2] = hc_bytealign (w1[0], w1[1], offset); + w6[1] = hc_bytealign (w0[3], w1[0], offset); + w6[0] = hc_bytealign (w0[2], w0[3], offset); + w5[3] = hc_bytealign (w0[1], w0[2], offset); + w5[2] = hc_bytealign (w0[0], w0[1], offset); + w5[1] = hc_bytealign ( 0, w0[0], offset); w5[0] = 0; w4[3] = 0; w4[2] = 0; @@ -4969,16 +4969,16 @@ DECLSPEC void switch_buffer_by_offset_8x4_le (u32x *w0, u32x *w1, u32x *w2, u32x break; case 22: - w7[3] = amd_bytealign (w2[0], w2[1], offset); - w7[2] = amd_bytealign (w1[3], w2[0], offset); - w7[1] = amd_bytealign (w1[2], w1[3], offset); - w7[0] = amd_bytealign (w1[1], w1[2], offset); - w6[3] = amd_bytealign (w1[0], w1[1], offset); - w6[2] = amd_bytealign (w0[3], w1[0], offset); - w6[1] = amd_bytealign (w0[2], w0[3], offset); - w6[0] = amd_bytealign (w0[1], w0[2], offset); - w5[3] = amd_bytealign (w0[0], w0[1], offset); - w5[2] = amd_bytealign ( 0, w0[0], offset); + w7[3] = hc_bytealign (w2[0], w2[1], offset); + w7[2] = hc_bytealign (w1[3], w2[0], offset); + w7[1] = hc_bytealign (w1[2], w1[3], offset); + w7[0] = hc_bytealign (w1[1], w1[2], offset); + w6[3] = hc_bytealign (w1[0], w1[1], offset); + w6[2] = hc_bytealign (w0[3], w1[0], offset); + w6[1] = hc_bytealign (w0[2], w0[3], offset); + w6[0] = hc_bytealign (w0[1], w0[2], offset); + w5[3] = hc_bytealign (w0[0], w0[1], offset); + w5[2] = hc_bytealign ( 0, w0[0], offset); w5[1] = 0; w5[0] = 0; w4[3] = 0; @@ -5005,15 +5005,15 @@ DECLSPEC void switch_buffer_by_offset_8x4_le (u32x *w0, u32x *w1, u32x *w2, u32x break; case 23: - w7[3] = amd_bytealign (w1[3], w2[0], offset); - w7[2] = amd_bytealign (w1[2], w1[3], offset); - w7[1] = amd_bytealign (w1[1], w1[2], offset); - w7[0] = amd_bytealign (w1[0], w1[1], offset); - w6[3] = amd_bytealign (w0[3], w1[0], offset); - w6[2] = amd_bytealign (w0[2], w0[3], offset); - w6[1] = amd_bytealign (w0[1], w0[2], offset); - w6[0] = amd_bytealign (w0[0], w0[1], offset); - w5[3] = amd_bytealign ( 0, w0[0], offset); + w7[3] = hc_bytealign (w1[3], w2[0], offset); + w7[2] = hc_bytealign (w1[2], w1[3], offset); + w7[1] = hc_bytealign (w1[1], w1[2], offset); + w7[0] = hc_bytealign (w1[0], w1[1], offset); + w6[3] = hc_bytealign (w0[3], w1[0], offset); + w6[2] = hc_bytealign (w0[2], w0[3], offset); + w6[1] = hc_bytealign (w0[1], w0[2], offset); + w6[0] = hc_bytealign (w0[0], w0[1], offset); + w5[3] = hc_bytealign ( 0, w0[0], offset); w5[2] = 0; w5[1] = 0; w5[0] = 0; @@ -5041,14 +5041,14 @@ DECLSPEC void switch_buffer_by_offset_8x4_le (u32x *w0, u32x *w1, u32x *w2, u32x break; case 24: - w7[3] = amd_bytealign (w1[2], w1[3], offset); - w7[2] = amd_bytealign (w1[1], w1[2], offset); - w7[1] = amd_bytealign (w1[0], w1[1], offset); - w7[0] = amd_bytealign (w0[3], w1[0], offset); - w6[3] = amd_bytealign (w0[2], w0[3], offset); - w6[2] = amd_bytealign (w0[1], w0[2], offset); - w6[1] = amd_bytealign (w0[0], w0[1], offset); - w6[0] = amd_bytealign ( 0, w0[0], offset); + w7[3] = hc_bytealign (w1[2], w1[3], offset); + w7[2] = hc_bytealign (w1[1], w1[2], offset); + w7[1] = hc_bytealign (w1[0], w1[1], offset); + w7[0] = hc_bytealign (w0[3], w1[0], offset); + w6[3] = hc_bytealign (w0[2], w0[3], offset); + w6[2] = hc_bytealign (w0[1], w0[2], offset); + w6[1] = hc_bytealign (w0[0], w0[1], offset); + w6[0] = hc_bytealign ( 0, w0[0], offset); w5[3] = 0; w5[2] = 0; w5[1] = 0; @@ -5077,13 +5077,13 @@ DECLSPEC void switch_buffer_by_offset_8x4_le (u32x *w0, u32x *w1, u32x *w2, u32x break; case 25: - w7[3] = amd_bytealign (w1[1], w1[2], offset); - w7[2] = amd_bytealign (w1[0], w1[1], offset); - w7[1] = amd_bytealign (w0[3], w1[0], offset); - w7[0] = amd_bytealign (w0[2], w0[3], offset); - w6[3] = amd_bytealign (w0[1], w0[2], offset); - w6[2] = amd_bytealign (w0[0], w0[1], offset); - w6[1] = amd_bytealign ( 0, w0[0], offset); + w7[3] = hc_bytealign (w1[1], w1[2], offset); + w7[2] = hc_bytealign (w1[0], w1[1], offset); + w7[1] = hc_bytealign (w0[3], w1[0], offset); + w7[0] = hc_bytealign (w0[2], w0[3], offset); + w6[3] = hc_bytealign (w0[1], w0[2], offset); + w6[2] = hc_bytealign (w0[0], w0[1], offset); + w6[1] = hc_bytealign ( 0, w0[0], offset); w6[0] = 0; w5[3] = 0; w5[2] = 0; @@ -5113,12 +5113,12 @@ DECLSPEC void switch_buffer_by_offset_8x4_le (u32x *w0, u32x *w1, u32x *w2, u32x break; case 26: - w7[3] = amd_bytealign (w1[0], w1[1], offset); - w7[2] = amd_bytealign (w0[3], w1[0], offset); - w7[1] = amd_bytealign (w0[2], w0[3], offset); - w7[0] = amd_bytealign (w0[1], w0[2], offset); - w6[3] = amd_bytealign (w0[0], w0[1], offset); - w6[2] = amd_bytealign ( 0, w0[0], offset); + w7[3] = hc_bytealign (w1[0], w1[1], offset); + w7[2] = hc_bytealign (w0[3], w1[0], offset); + w7[1] = hc_bytealign (w0[2], w0[3], offset); + w7[0] = hc_bytealign (w0[1], w0[2], offset); + w6[3] = hc_bytealign (w0[0], w0[1], offset); + w6[2] = hc_bytealign ( 0, w0[0], offset); w6[1] = 0; w6[0] = 0; w5[3] = 0; @@ -5149,11 +5149,11 @@ DECLSPEC void switch_buffer_by_offset_8x4_le (u32x *w0, u32x *w1, u32x *w2, u32x break; case 27: - w7[3] = amd_bytealign (w0[3], w1[0], offset); - w7[2] = amd_bytealign (w0[2], w0[3], offset); - w7[1] = amd_bytealign (w0[1], w0[2], offset); - w7[0] = amd_bytealign (w0[0], w0[1], offset); - w6[3] = amd_bytealign ( 0, w0[0], offset); + w7[3] = hc_bytealign (w0[3], w1[0], offset); + w7[2] = hc_bytealign (w0[2], w0[3], offset); + w7[1] = hc_bytealign (w0[1], w0[2], offset); + w7[0] = hc_bytealign (w0[0], w0[1], offset); + w6[3] = hc_bytealign ( 0, w0[0], offset); w6[2] = 0; w6[1] = 0; w6[0] = 0; @@ -5185,10 +5185,10 @@ DECLSPEC void switch_buffer_by_offset_8x4_le (u32x *w0, u32x *w1, u32x *w2, u32x break; case 28: - w7[3] = amd_bytealign (w0[2], w0[3], offset); - w7[2] = amd_bytealign (w0[1], w0[2], offset); - w7[1] = amd_bytealign (w0[0], w0[1], offset); - w7[0] = amd_bytealign ( 0, w0[0], offset); + w7[3] = hc_bytealign (w0[2], w0[3], offset); + w7[2] = hc_bytealign (w0[1], w0[2], offset); + w7[1] = hc_bytealign (w0[0], w0[1], offset); + w7[0] = hc_bytealign ( 0, w0[0], offset); w6[3] = 0; w6[2] = 0; w6[1] = 0; @@ -5221,9 +5221,9 @@ DECLSPEC void switch_buffer_by_offset_8x4_le (u32x *w0, u32x *w1, u32x *w2, u32x break; case 29: - w7[3] = amd_bytealign (w0[1], w0[2], offset); - w7[2] = amd_bytealign (w0[0], w0[1], offset); - w7[1] = amd_bytealign ( 0, w0[0], offset); + w7[3] = hc_bytealign (w0[1], w0[2], offset); + w7[2] = hc_bytealign (w0[0], w0[1], offset); + w7[1] = hc_bytealign ( 0, w0[0], offset); w7[0] = 0; w6[3] = 0; w6[2] = 0; @@ -5257,8 +5257,8 @@ DECLSPEC void switch_buffer_by_offset_8x4_le (u32x *w0, u32x *w1, u32x *w2, u32x break; case 30: - w7[3] = amd_bytealign (w0[0], w0[1], offset); - w7[2] = amd_bytealign ( 0, w0[0], offset); + w7[3] = hc_bytealign (w0[0], w0[1], offset); + w7[2] = hc_bytealign ( 0, w0[0], offset); w7[1] = 0; w7[0] = 0; w6[3] = 0; @@ -5293,7 +5293,7 @@ DECLSPEC void switch_buffer_by_offset_8x4_le (u32x *w0, u32x *w1, u32x *w2, u32x break; case 31: - w7[3] = amd_bytealign ( 0, w0[0], offset); + w7[3] = hc_bytealign ( 0, w0[0], offset); w7[2] = 0; w7[1] = 0; w7[0] = 0; @@ -5376,174 +5376,174 @@ DECLSPEC void switch_buffer_by_offset_8x4_le (u32x *w0, u32x *w1, u32x *w2, u32x switch (offset_switch) { case 0: - w7[3] = __byte_perm (w7[2], w7[3], selector); - w7[2] = __byte_perm (w7[1], w7[2], selector); - w7[1] = __byte_perm (w7[0], w7[1], selector); - w7[0] = __byte_perm (w6[3], w7[0], selector); - w6[3] = __byte_perm (w6[2], w6[3], selector); - w6[2] = __byte_perm (w6[1], w6[2], selector); - w6[1] = __byte_perm (w6[0], w6[1], selector); - w6[0] = __byte_perm (w5[3], w6[0], selector); - w5[3] = __byte_perm (w5[2], w5[3], selector); - w5[2] = __byte_perm (w5[1], w5[2], selector); - w5[1] = __byte_perm (w5[0], w5[1], selector); - w5[0] = __byte_perm (w4[3], w5[0], selector); - w4[3] = __byte_perm (w4[2], w4[3], selector); - w4[2] = __byte_perm (w4[1], w4[2], selector); - w4[1] = __byte_perm (w4[0], w4[1], selector); - w4[0] = __byte_perm (w3[3], w4[0], selector); - w3[3] = __byte_perm (w3[2], w3[3], selector); - w3[2] = __byte_perm (w3[1], w3[2], selector); - w3[1] = __byte_perm (w3[0], w3[1], selector); - w3[0] = __byte_perm (w2[3], w3[0], selector); - w2[3] = __byte_perm (w2[2], w2[3], selector); - w2[2] = __byte_perm (w2[1], w2[2], selector); - w2[1] = __byte_perm (w2[0], w2[1], selector); - w2[0] = __byte_perm (w1[3], w2[0], selector); - w1[3] = __byte_perm (w1[2], w1[3], selector); - w1[2] = __byte_perm (w1[1], w1[2], selector); - w1[1] = __byte_perm (w1[0], w1[1], selector); - w1[0] = __byte_perm (w0[3], w1[0], selector); - w0[3] = __byte_perm (w0[2], w0[3], selector); - w0[2] = __byte_perm (w0[1], w0[2], selector); - w0[1] = __byte_perm (w0[0], w0[1], selector); - w0[0] = __byte_perm ( 0, w0[0], selector); + w7[3] = hc_byte_perm (w7[2], w7[3], selector); + w7[2] = hc_byte_perm (w7[1], w7[2], selector); + w7[1] = hc_byte_perm (w7[0], w7[1], selector); + w7[0] = hc_byte_perm (w6[3], w7[0], selector); + w6[3] = hc_byte_perm (w6[2], w6[3], selector); + w6[2] = hc_byte_perm (w6[1], w6[2], selector); + w6[1] = hc_byte_perm (w6[0], w6[1], selector); + w6[0] = hc_byte_perm (w5[3], w6[0], selector); + w5[3] = hc_byte_perm (w5[2], w5[3], selector); + w5[2] = hc_byte_perm (w5[1], w5[2], selector); + w5[1] = hc_byte_perm (w5[0], w5[1], selector); + w5[0] = hc_byte_perm (w4[3], w5[0], selector); + w4[3] = hc_byte_perm (w4[2], w4[3], selector); + w4[2] = hc_byte_perm (w4[1], w4[2], selector); + w4[1] = hc_byte_perm (w4[0], w4[1], selector); + w4[0] = hc_byte_perm (w3[3], w4[0], selector); + w3[3] = hc_byte_perm (w3[2], w3[3], selector); + w3[2] = hc_byte_perm (w3[1], w3[2], selector); + w3[1] = hc_byte_perm (w3[0], w3[1], selector); + w3[0] = hc_byte_perm (w2[3], w3[0], selector); + w2[3] = hc_byte_perm (w2[2], w2[3], selector); + w2[2] = hc_byte_perm (w2[1], w2[2], selector); + w2[1] = hc_byte_perm (w2[0], w2[1], selector); + w2[0] = hc_byte_perm (w1[3], w2[0], selector); + w1[3] = hc_byte_perm (w1[2], w1[3], selector); + w1[2] = hc_byte_perm (w1[1], w1[2], selector); + w1[1] = hc_byte_perm (w1[0], w1[1], selector); + w1[0] = hc_byte_perm (w0[3], w1[0], selector); + w0[3] = hc_byte_perm (w0[2], w0[3], selector); + w0[2] = hc_byte_perm (w0[1], w0[2], selector); + w0[1] = hc_byte_perm (w0[0], w0[1], selector); + w0[0] = hc_byte_perm ( 0, w0[0], selector); break; case 1: - w7[3] = __byte_perm (w7[1], w7[2], selector); - w7[2] = __byte_perm (w7[0], w7[1], selector); - w7[1] = __byte_perm (w6[3], w7[0], selector); - w7[0] = __byte_perm (w6[2], w6[3], selector); - w6[3] = __byte_perm (w6[1], w6[2], selector); - w6[2] = __byte_perm (w6[0], w6[1], selector); - w6[1] = __byte_perm (w5[3], w6[0], selector); - w6[0] = __byte_perm (w5[2], w5[3], selector); - w5[3] = __byte_perm (w5[1], w5[2], selector); - w5[2] = __byte_perm (w5[0], w5[1], selector); - w5[1] = __byte_perm (w4[3], w5[0], selector); - w5[0] = __byte_perm (w4[2], w4[3], selector); - w4[3] = __byte_perm (w4[1], w4[2], selector); - w4[2] = __byte_perm (w4[0], w4[1], selector); - w4[1] = __byte_perm (w3[3], w4[0], selector); - w4[0] = __byte_perm (w3[2], w3[3], selector); - w3[3] = __byte_perm (w3[1], w3[2], selector); - w3[2] = __byte_perm (w3[0], w3[1], selector); - w3[1] = __byte_perm (w2[3], w3[0], selector); - w3[0] = __byte_perm (w2[2], w2[3], selector); - w2[3] = __byte_perm (w2[1], w2[2], selector); - w2[2] = __byte_perm (w2[0], w2[1], selector); - w2[1] = __byte_perm (w1[3], w2[0], selector); - w2[0] = __byte_perm (w1[2], w1[3], selector); - w1[3] = __byte_perm (w1[1], w1[2], selector); - w1[2] = __byte_perm (w1[0], w1[1], selector); - w1[1] = __byte_perm (w0[3], w1[0], selector); - w1[0] = __byte_perm (w0[2], w0[3], selector); - w0[3] = __byte_perm (w0[1], w0[2], selector); - w0[2] = __byte_perm (w0[0], w0[1], selector); - w0[1] = __byte_perm ( 0, w0[0], selector); + w7[3] = hc_byte_perm (w7[1], w7[2], selector); + w7[2] = hc_byte_perm (w7[0], w7[1], selector); + w7[1] = hc_byte_perm (w6[3], w7[0], selector); + w7[0] = hc_byte_perm (w6[2], w6[3], selector); + w6[3] = hc_byte_perm (w6[1], w6[2], selector); + w6[2] = hc_byte_perm (w6[0], w6[1], selector); + w6[1] = hc_byte_perm (w5[3], w6[0], selector); + w6[0] = hc_byte_perm (w5[2], w5[3], selector); + w5[3] = hc_byte_perm (w5[1], w5[2], selector); + w5[2] = hc_byte_perm (w5[0], w5[1], selector); + w5[1] = hc_byte_perm (w4[3], w5[0], selector); + w5[0] = hc_byte_perm (w4[2], w4[3], selector); + w4[3] = hc_byte_perm (w4[1], w4[2], selector); + w4[2] = hc_byte_perm (w4[0], w4[1], selector); + w4[1] = hc_byte_perm (w3[3], w4[0], selector); + w4[0] = hc_byte_perm (w3[2], w3[3], selector); + w3[3] = hc_byte_perm (w3[1], w3[2], selector); + w3[2] = hc_byte_perm (w3[0], w3[1], selector); + w3[1] = hc_byte_perm (w2[3], w3[0], selector); + w3[0] = hc_byte_perm (w2[2], w2[3], selector); + w2[3] = hc_byte_perm (w2[1], w2[2], selector); + w2[2] = hc_byte_perm (w2[0], w2[1], selector); + w2[1] = hc_byte_perm (w1[3], w2[0], selector); + w2[0] = hc_byte_perm (w1[2], w1[3], selector); + w1[3] = hc_byte_perm (w1[1], w1[2], selector); + w1[2] = hc_byte_perm (w1[0], w1[1], selector); + w1[1] = hc_byte_perm (w0[3], w1[0], selector); + w1[0] = hc_byte_perm (w0[2], w0[3], selector); + w0[3] = hc_byte_perm (w0[1], w0[2], selector); + w0[2] = hc_byte_perm (w0[0], w0[1], selector); + w0[1] = hc_byte_perm ( 0, w0[0], selector); w0[0] = 0; break; case 2: - w7[3] = __byte_perm (w7[0], w7[1], selector); - w7[2] = __byte_perm (w6[3], w7[0], selector); - w7[1] = __byte_perm (w6[2], w6[3], selector); - w7[0] = __byte_perm (w6[1], w6[2], selector); - w6[3] = __byte_perm (w6[0], w6[1], selector); - w6[2] = __byte_perm (w5[3], w6[0], selector); - w6[1] = __byte_perm (w5[2], w5[3], selector); - w6[0] = __byte_perm (w5[1], w5[2], selector); - w5[3] = __byte_perm (w5[0], w5[1], selector); - w5[2] = __byte_perm (w4[3], w5[0], selector); - w5[1] = __byte_perm (w4[2], w4[3], selector); - w5[0] = __byte_perm (w4[1], w4[2], selector); - w4[3] = __byte_perm (w4[0], w4[1], selector); - w4[2] = __byte_perm (w3[3], w4[0], selector); - w4[1] = __byte_perm (w3[2], w3[3], selector); - w4[0] = __byte_perm (w3[1], w3[2], selector); - w3[3] = __byte_perm (w3[0], w3[1], selector); - w3[2] = __byte_perm (w2[3], w3[0], selector); - w3[1] = __byte_perm (w2[2], w2[3], selector); - w3[0] = __byte_perm (w2[1], w2[2], selector); - w2[3] = __byte_perm (w2[0], w2[1], selector); - w2[2] = __byte_perm (w1[3], w2[0], selector); - w2[1] = __byte_perm (w1[2], w1[3], selector); - w2[0] = __byte_perm (w1[1], w1[2], selector); - w1[3] = __byte_perm (w1[0], w1[1], selector); - w1[2] = __byte_perm (w0[3], w1[0], selector); - w1[1] = __byte_perm (w0[2], w0[3], selector); - w1[0] = __byte_perm (w0[1], w0[2], selector); - w0[3] = __byte_perm (w0[0], w0[1], selector); - w0[2] = __byte_perm ( 0, w0[0], selector); + w7[3] = hc_byte_perm (w7[0], w7[1], selector); + w7[2] = hc_byte_perm (w6[3], w7[0], selector); + w7[1] = hc_byte_perm (w6[2], w6[3], selector); + w7[0] = hc_byte_perm (w6[1], w6[2], selector); + w6[3] = hc_byte_perm (w6[0], w6[1], selector); + w6[2] = hc_byte_perm (w5[3], w6[0], selector); + w6[1] = hc_byte_perm (w5[2], w5[3], selector); + w6[0] = hc_byte_perm (w5[1], w5[2], selector); + w5[3] = hc_byte_perm (w5[0], w5[1], selector); + w5[2] = hc_byte_perm (w4[3], w5[0], selector); + w5[1] = hc_byte_perm (w4[2], w4[3], selector); + w5[0] = hc_byte_perm (w4[1], w4[2], selector); + w4[3] = hc_byte_perm (w4[0], w4[1], selector); + w4[2] = hc_byte_perm (w3[3], w4[0], selector); + w4[1] = hc_byte_perm (w3[2], w3[3], selector); + w4[0] = hc_byte_perm (w3[1], w3[2], selector); + w3[3] = hc_byte_perm (w3[0], w3[1], selector); + w3[2] = hc_byte_perm (w2[3], w3[0], selector); + w3[1] = hc_byte_perm (w2[2], w2[3], selector); + w3[0] = hc_byte_perm (w2[1], w2[2], selector); + w2[3] = hc_byte_perm (w2[0], w2[1], selector); + w2[2] = hc_byte_perm (w1[3], w2[0], selector); + w2[1] = hc_byte_perm (w1[2], w1[3], selector); + w2[0] = hc_byte_perm (w1[1], w1[2], selector); + w1[3] = hc_byte_perm (w1[0], w1[1], selector); + w1[2] = hc_byte_perm (w0[3], w1[0], selector); + w1[1] = hc_byte_perm (w0[2], w0[3], selector); + w1[0] = hc_byte_perm (w0[1], w0[2], selector); + w0[3] = hc_byte_perm (w0[0], w0[1], selector); + w0[2] = hc_byte_perm ( 0, w0[0], selector); w0[1] = 0; w0[0] = 0; break; case 3: - w7[3] = __byte_perm (w6[3], w7[0], selector); - w7[2] = __byte_perm (w6[2], w6[3], selector); - w7[1] = __byte_perm (w6[1], w6[2], selector); - w7[0] = __byte_perm (w6[0], w6[1], selector); - w6[3] = __byte_perm (w5[3], w6[0], selector); - w6[2] = __byte_perm (w5[2], w5[3], selector); - w6[1] = __byte_perm (w5[1], w5[2], selector); - w6[0] = __byte_perm (w5[0], w5[1], selector); - w5[3] = __byte_perm (w4[3], w5[0], selector); - w5[2] = __byte_perm (w4[2], w4[3], selector); - w5[1] = __byte_perm (w4[1], w4[2], selector); - w5[0] = __byte_perm (w4[0], w4[1], selector); - w4[3] = __byte_perm (w3[3], w4[0], selector); - w4[2] = __byte_perm (w3[2], w3[3], selector); - w4[1] = __byte_perm (w3[1], w3[2], selector); - w4[0] = __byte_perm (w3[0], w3[1], selector); - w3[3] = __byte_perm (w2[3], w3[0], selector); - w3[2] = __byte_perm (w2[2], w2[3], selector); - w3[1] = __byte_perm (w2[1], w2[2], selector); - w3[0] = __byte_perm (w2[0], w2[1], selector); - w2[3] = __byte_perm (w1[3], w2[0], selector); - w2[2] = __byte_perm (w1[2], w1[3], selector); - w2[1] = __byte_perm (w1[1], w1[2], selector); - w2[0] = __byte_perm (w1[0], w1[1], selector); - w1[3] = __byte_perm (w0[3], w1[0], selector); - w1[2] = __byte_perm (w0[2], w0[3], selector); - w1[1] = __byte_perm (w0[1], w0[2], selector); - w1[0] = __byte_perm (w0[0], w0[1], selector); - w0[3] = __byte_perm ( 0, w0[0], selector); + w7[3] = hc_byte_perm (w6[3], w7[0], selector); + w7[2] = hc_byte_perm (w6[2], w6[3], selector); + w7[1] = hc_byte_perm (w6[1], w6[2], selector); + w7[0] = hc_byte_perm (w6[0], w6[1], selector); + w6[3] = hc_byte_perm (w5[3], w6[0], selector); + w6[2] = hc_byte_perm (w5[2], w5[3], selector); + w6[1] = hc_byte_perm (w5[1], w5[2], selector); + w6[0] = hc_byte_perm (w5[0], w5[1], selector); + w5[3] = hc_byte_perm (w4[3], w5[0], selector); + w5[2] = hc_byte_perm (w4[2], w4[3], selector); + w5[1] = hc_byte_perm (w4[1], w4[2], selector); + w5[0] = hc_byte_perm (w4[0], w4[1], selector); + w4[3] = hc_byte_perm (w3[3], w4[0], selector); + w4[2] = hc_byte_perm (w3[2], w3[3], selector); + w4[1] = hc_byte_perm (w3[1], w3[2], selector); + w4[0] = hc_byte_perm (w3[0], w3[1], selector); + w3[3] = hc_byte_perm (w2[3], w3[0], selector); + w3[2] = hc_byte_perm (w2[2], w2[3], selector); + w3[1] = hc_byte_perm (w2[1], w2[2], selector); + w3[0] = hc_byte_perm (w2[0], w2[1], selector); + w2[3] = hc_byte_perm (w1[3], w2[0], selector); + w2[2] = hc_byte_perm (w1[2], w1[3], selector); + w2[1] = hc_byte_perm (w1[1], w1[2], selector); + w2[0] = hc_byte_perm (w1[0], w1[1], selector); + w1[3] = hc_byte_perm (w0[3], w1[0], selector); + w1[2] = hc_byte_perm (w0[2], w0[3], selector); + w1[1] = hc_byte_perm (w0[1], w0[2], selector); + w1[0] = hc_byte_perm (w0[0], w0[1], selector); + w0[3] = hc_byte_perm ( 0, w0[0], selector); w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 4: - w7[3] = __byte_perm (w6[2], w6[3], selector); - w7[2] = __byte_perm (w6[1], w6[2], selector); - w7[1] = __byte_perm (w6[0], w6[1], selector); - w7[0] = __byte_perm (w5[3], w6[0], selector); - w6[3] = __byte_perm (w5[2], w5[3], selector); - w6[2] = __byte_perm (w5[1], w5[2], selector); - w6[1] = __byte_perm (w5[0], w5[1], selector); - w6[0] = __byte_perm (w4[3], w5[0], selector); - w5[3] = __byte_perm (w4[2], w4[3], selector); - w5[2] = __byte_perm (w4[1], w4[2], selector); - w5[1] = __byte_perm (w4[0], w4[1], selector); - w5[0] = __byte_perm (w3[3], w4[0], selector); - w4[3] = __byte_perm (w3[2], w3[3], selector); - w4[2] = __byte_perm (w3[1], w3[2], selector); - w4[1] = __byte_perm (w3[0], w3[1], selector); - w4[0] = __byte_perm (w2[3], w3[0], selector); - w3[3] = __byte_perm (w2[2], w2[3], selector); - w3[2] = __byte_perm (w2[1], w2[2], selector); - w3[1] = __byte_perm (w2[0], w2[1], selector); - w3[0] = __byte_perm (w1[3], w2[0], selector); - w2[3] = __byte_perm (w1[2], w1[3], selector); - w2[2] = __byte_perm (w1[1], w1[2], selector); - w2[1] = __byte_perm (w1[0], w1[1], selector); - w2[0] = __byte_perm (w0[3], w1[0], selector); - w1[3] = __byte_perm (w0[2], w0[3], selector); - w1[2] = __byte_perm (w0[1], w0[2], selector); - w1[1] = __byte_perm (w0[0], w0[1], selector); - w1[0] = __byte_perm ( 0, w0[0], selector); + w7[3] = hc_byte_perm (w6[2], w6[3], selector); + w7[2] = hc_byte_perm (w6[1], w6[2], selector); + w7[1] = hc_byte_perm (w6[0], w6[1], selector); + w7[0] = hc_byte_perm (w5[3], w6[0], selector); + w6[3] = hc_byte_perm (w5[2], w5[3], selector); + w6[2] = hc_byte_perm (w5[1], w5[2], selector); + w6[1] = hc_byte_perm (w5[0], w5[1], selector); + w6[0] = hc_byte_perm (w4[3], w5[0], selector); + w5[3] = hc_byte_perm (w4[2], w4[3], selector); + w5[2] = hc_byte_perm (w4[1], w4[2], selector); + w5[1] = hc_byte_perm (w4[0], w4[1], selector); + w5[0] = hc_byte_perm (w3[3], w4[0], selector); + w4[3] = hc_byte_perm (w3[2], w3[3], selector); + w4[2] = hc_byte_perm (w3[1], w3[2], selector); + w4[1] = hc_byte_perm (w3[0], w3[1], selector); + w4[0] = hc_byte_perm (w2[3], w3[0], selector); + w3[3] = hc_byte_perm (w2[2], w2[3], selector); + w3[2] = hc_byte_perm (w2[1], w2[2], selector); + w3[1] = hc_byte_perm (w2[0], w2[1], selector); + w3[0] = hc_byte_perm (w1[3], w2[0], selector); + w2[3] = hc_byte_perm (w1[2], w1[3], selector); + w2[2] = hc_byte_perm (w1[1], w1[2], selector); + w2[1] = hc_byte_perm (w1[0], w1[1], selector); + w2[0] = hc_byte_perm (w0[3], w1[0], selector); + w1[3] = hc_byte_perm (w0[2], w0[3], selector); + w1[2] = hc_byte_perm (w0[1], w0[2], selector); + w1[1] = hc_byte_perm (w0[0], w0[1], selector); + w1[0] = hc_byte_perm ( 0, w0[0], selector); w0[3] = 0; w0[2] = 0; w0[1] = 0; @@ -5551,33 +5551,33 @@ DECLSPEC void switch_buffer_by_offset_8x4_le (u32x *w0, u32x *w1, u32x *w2, u32x break; case 5: - w7[3] = __byte_perm (w6[1], w6[2], selector); - w7[2] = __byte_perm (w6[0], w6[1], selector); - w7[1] = __byte_perm (w5[3], w6[0], selector); - w7[0] = __byte_perm (w5[2], w5[3], selector); - w6[3] = __byte_perm (w5[1], w5[2], selector); - w6[2] = __byte_perm (w5[0], w5[1], selector); - w6[1] = __byte_perm (w4[3], w5[0], selector); - w6[0] = __byte_perm (w4[2], w4[3], selector); - w5[3] = __byte_perm (w4[1], w4[2], selector); - w5[2] = __byte_perm (w4[0], w4[1], selector); - w5[1] = __byte_perm (w3[3], w4[0], selector); - w5[0] = __byte_perm (w3[2], w3[3], selector); - w4[3] = __byte_perm (w3[1], w3[2], selector); - w4[2] = __byte_perm (w3[0], w3[1], selector); - w4[1] = __byte_perm (w2[3], w3[0], selector); - w4[0] = __byte_perm (w2[2], w2[3], selector); - w3[3] = __byte_perm (w2[1], w2[2], selector); - w3[2] = __byte_perm (w2[0], w2[1], selector); - w3[1] = __byte_perm (w1[3], w2[0], selector); - w3[0] = __byte_perm (w1[2], w1[3], selector); - w2[3] = __byte_perm (w1[1], w1[2], selector); - w2[2] = __byte_perm (w1[0], w1[1], selector); - w2[1] = __byte_perm (w0[3], w1[0], selector); - w2[0] = __byte_perm (w0[2], w0[3], selector); - w1[3] = __byte_perm (w0[1], w0[2], selector); - w1[2] = __byte_perm (w0[0], w0[1], selector); - w1[1] = __byte_perm ( 0, w0[0], selector); + w7[3] = hc_byte_perm (w6[1], w6[2], selector); + w7[2] = hc_byte_perm (w6[0], w6[1], selector); + w7[1] = hc_byte_perm (w5[3], w6[0], selector); + w7[0] = hc_byte_perm (w5[2], w5[3], selector); + w6[3] = hc_byte_perm (w5[1], w5[2], selector); + w6[2] = hc_byte_perm (w5[0], w5[1], selector); + w6[1] = hc_byte_perm (w4[3], w5[0], selector); + w6[0] = hc_byte_perm (w4[2], w4[3], selector); + w5[3] = hc_byte_perm (w4[1], w4[2], selector); + w5[2] = hc_byte_perm (w4[0], w4[1], selector); + w5[1] = hc_byte_perm (w3[3], w4[0], selector); + w5[0] = hc_byte_perm (w3[2], w3[3], selector); + w4[3] = hc_byte_perm (w3[1], w3[2], selector); + w4[2] = hc_byte_perm (w3[0], w3[1], selector); + w4[1] = hc_byte_perm (w2[3], w3[0], selector); + w4[0] = hc_byte_perm (w2[2], w2[3], selector); + w3[3] = hc_byte_perm (w2[1], w2[2], selector); + w3[2] = hc_byte_perm (w2[0], w2[1], selector); + w3[1] = hc_byte_perm (w1[3], w2[0], selector); + w3[0] = hc_byte_perm (w1[2], w1[3], selector); + w2[3] = hc_byte_perm (w1[1], w1[2], selector); + w2[2] = hc_byte_perm (w1[0], w1[1], selector); + w2[1] = hc_byte_perm (w0[3], w1[0], selector); + w2[0] = hc_byte_perm (w0[2], w0[3], selector); + w1[3] = hc_byte_perm (w0[1], w0[2], selector); + w1[2] = hc_byte_perm (w0[0], w0[1], selector); + w1[1] = hc_byte_perm ( 0, w0[0], selector); w1[0] = 0; w0[3] = 0; w0[2] = 0; @@ -5586,32 +5586,32 @@ DECLSPEC void switch_buffer_by_offset_8x4_le (u32x *w0, u32x *w1, u32x *w2, u32x break; case 6: - w7[3] = __byte_perm (w6[0], w6[1], selector); - w7[2] = __byte_perm (w5[3], w6[0], selector); - w7[1] = __byte_perm (w5[2], w5[3], selector); - w7[0] = __byte_perm (w5[1], w5[2], selector); - w6[3] = __byte_perm (w5[0], w5[1], selector); - w6[2] = __byte_perm (w4[3], w5[0], selector); - w6[1] = __byte_perm (w4[2], w4[3], selector); - w6[0] = __byte_perm (w4[1], w4[2], selector); - w5[3] = __byte_perm (w4[0], w4[1], selector); - w5[2] = __byte_perm (w3[3], w4[0], selector); - w5[1] = __byte_perm (w3[2], w3[3], selector); - w5[0] = __byte_perm (w3[1], w3[2], selector); - w4[3] = __byte_perm (w3[0], w3[1], selector); - w4[2] = __byte_perm (w2[3], w3[0], selector); - w4[1] = __byte_perm (w2[2], w2[3], selector); - w4[0] = __byte_perm (w2[1], w2[2], selector); - w3[3] = __byte_perm (w2[0], w2[1], selector); - w3[2] = __byte_perm (w1[3], w2[0], selector); - w3[1] = __byte_perm (w1[2], w1[3], selector); - w3[0] = __byte_perm (w1[1], w1[2], selector); - w2[3] = __byte_perm (w1[0], w1[1], selector); - w2[2] = __byte_perm (w0[3], w1[0], selector); - w2[1] = __byte_perm (w0[2], w0[3], selector); - w2[0] = __byte_perm (w0[1], w0[2], selector); - w1[3] = __byte_perm (w0[0], w0[1], selector); - w1[2] = __byte_perm ( 0, w0[0], selector); + w7[3] = hc_byte_perm (w6[0], w6[1], selector); + w7[2] = hc_byte_perm (w5[3], w6[0], selector); + w7[1] = hc_byte_perm (w5[2], w5[3], selector); + w7[0] = hc_byte_perm (w5[1], w5[2], selector); + w6[3] = hc_byte_perm (w5[0], w5[1], selector); + w6[2] = hc_byte_perm (w4[3], w5[0], selector); + w6[1] = hc_byte_perm (w4[2], w4[3], selector); + w6[0] = hc_byte_perm (w4[1], w4[2], selector); + w5[3] = hc_byte_perm (w4[0], w4[1], selector); + w5[2] = hc_byte_perm (w3[3], w4[0], selector); + w5[1] = hc_byte_perm (w3[2], w3[3], selector); + w5[0] = hc_byte_perm (w3[1], w3[2], selector); + w4[3] = hc_byte_perm (w3[0], w3[1], selector); + w4[2] = hc_byte_perm (w2[3], w3[0], selector); + w4[1] = hc_byte_perm (w2[2], w2[3], selector); + w4[0] = hc_byte_perm (w2[1], w2[2], selector); + w3[3] = hc_byte_perm (w2[0], w2[1], selector); + w3[2] = hc_byte_perm (w1[3], w2[0], selector); + w3[1] = hc_byte_perm (w1[2], w1[3], selector); + w3[0] = hc_byte_perm (w1[1], w1[2], selector); + w2[3] = hc_byte_perm (w1[0], w1[1], selector); + w2[2] = hc_byte_perm (w0[3], w1[0], selector); + w2[1] = hc_byte_perm (w0[2], w0[3], selector); + w2[0] = hc_byte_perm (w0[1], w0[2], selector); + w1[3] = hc_byte_perm (w0[0], w0[1], selector); + w1[2] = hc_byte_perm ( 0, w0[0], selector); w1[1] = 0; w1[0] = 0; w0[3] = 0; @@ -5621,31 +5621,31 @@ DECLSPEC void switch_buffer_by_offset_8x4_le (u32x *w0, u32x *w1, u32x *w2, u32x break; case 7: - w7[3] = __byte_perm (w5[3], w6[0], selector); - w7[2] = __byte_perm (w5[2], w5[3], selector); - w7[1] = __byte_perm (w5[1], w5[2], selector); - w7[0] = __byte_perm (w5[0], w5[1], selector); - w6[3] = __byte_perm (w4[3], w5[0], selector); - w6[2] = __byte_perm (w4[2], w4[3], selector); - w6[1] = __byte_perm (w4[1], w4[2], selector); - w6[0] = __byte_perm (w4[0], w4[1], selector); - w5[3] = __byte_perm (w3[3], w4[0], selector); - w5[2] = __byte_perm (w3[2], w3[3], selector); - w5[1] = __byte_perm (w3[1], w3[2], selector); - w5[0] = __byte_perm (w3[0], w3[1], selector); - w4[3] = __byte_perm (w2[3], w3[0], selector); - w4[2] = __byte_perm (w2[2], w2[3], selector); - w4[1] = __byte_perm (w2[1], w2[2], selector); - w4[0] = __byte_perm (w2[0], w2[1], selector); - w3[3] = __byte_perm (w1[3], w2[0], selector); - w3[2] = __byte_perm (w1[2], w1[3], selector); - w3[1] = __byte_perm (w1[1], w1[2], selector); - w3[0] = __byte_perm (w1[0], w1[1], selector); - w2[3] = __byte_perm (w0[3], w1[0], selector); - w2[2] = __byte_perm (w0[2], w0[3], selector); - w2[1] = __byte_perm (w0[1], w0[2], selector); - w2[0] = __byte_perm (w0[0], w0[1], selector); - w1[3] = __byte_perm ( 0, w0[0], selector); + w7[3] = hc_byte_perm (w5[3], w6[0], selector); + w7[2] = hc_byte_perm (w5[2], w5[3], selector); + w7[1] = hc_byte_perm (w5[1], w5[2], selector); + w7[0] = hc_byte_perm (w5[0], w5[1], selector); + w6[3] = hc_byte_perm (w4[3], w5[0], selector); + w6[2] = hc_byte_perm (w4[2], w4[3], selector); + w6[1] = hc_byte_perm (w4[1], w4[2], selector); + w6[0] = hc_byte_perm (w4[0], w4[1], selector); + w5[3] = hc_byte_perm (w3[3], w4[0], selector); + w5[2] = hc_byte_perm (w3[2], w3[3], selector); + w5[1] = hc_byte_perm (w3[1], w3[2], selector); + w5[0] = hc_byte_perm (w3[0], w3[1], selector); + w4[3] = hc_byte_perm (w2[3], w3[0], selector); + w4[2] = hc_byte_perm (w2[2], w2[3], selector); + w4[1] = hc_byte_perm (w2[1], w2[2], selector); + w4[0] = hc_byte_perm (w2[0], w2[1], selector); + w3[3] = hc_byte_perm (w1[3], w2[0], selector); + w3[2] = hc_byte_perm (w1[2], w1[3], selector); + w3[1] = hc_byte_perm (w1[1], w1[2], selector); + w3[0] = hc_byte_perm (w1[0], w1[1], selector); + w2[3] = hc_byte_perm (w0[3], w1[0], selector); + w2[2] = hc_byte_perm (w0[2], w0[3], selector); + w2[1] = hc_byte_perm (w0[1], w0[2], selector); + w2[0] = hc_byte_perm (w0[0], w0[1], selector); + w1[3] = hc_byte_perm ( 0, w0[0], selector); w1[2] = 0; w1[1] = 0; w1[0] = 0; @@ -5656,30 +5656,30 @@ DECLSPEC void switch_buffer_by_offset_8x4_le (u32x *w0, u32x *w1, u32x *w2, u32x break; case 8: - w7[3] = __byte_perm (w5[2], w5[3], selector); - w7[2] = __byte_perm (w5[1], w5[2], selector); - w7[1] = __byte_perm (w5[0], w5[1], selector); - w7[0] = __byte_perm (w4[3], w5[0], selector); - w6[3] = __byte_perm (w4[2], w4[3], selector); - w6[2] = __byte_perm (w4[1], w4[2], selector); - w6[1] = __byte_perm (w4[0], w4[1], selector); - w6[0] = __byte_perm (w3[3], w4[0], selector); - w5[3] = __byte_perm (w3[2], w3[3], selector); - w5[2] = __byte_perm (w3[1], w3[2], selector); - w5[1] = __byte_perm (w3[0], w3[1], selector); - w5[0] = __byte_perm (w2[3], w3[0], selector); - w4[3] = __byte_perm (w2[2], w2[3], selector); - w4[2] = __byte_perm (w2[1], w2[2], selector); - w4[1] = __byte_perm (w2[0], w2[1], selector); - w4[0] = __byte_perm (w1[3], w2[0], selector); - w3[3] = __byte_perm (w1[2], w1[3], selector); - w3[2] = __byte_perm (w1[1], w1[2], selector); - w3[1] = __byte_perm (w1[0], w1[1], selector); - w3[0] = __byte_perm (w0[3], w1[0], selector); - w2[3] = __byte_perm (w0[2], w0[3], selector); - w2[2] = __byte_perm (w0[1], w0[2], selector); - w2[1] = __byte_perm (w0[0], w0[1], selector); - w2[0] = __byte_perm ( 0, w0[0], selector); + w7[3] = hc_byte_perm (w5[2], w5[3], selector); + w7[2] = hc_byte_perm (w5[1], w5[2], selector); + w7[1] = hc_byte_perm (w5[0], w5[1], selector); + w7[0] = hc_byte_perm (w4[3], w5[0], selector); + w6[3] = hc_byte_perm (w4[2], w4[3], selector); + w6[2] = hc_byte_perm (w4[1], w4[2], selector); + w6[1] = hc_byte_perm (w4[0], w4[1], selector); + w6[0] = hc_byte_perm (w3[3], w4[0], selector); + w5[3] = hc_byte_perm (w3[2], w3[3], selector); + w5[2] = hc_byte_perm (w3[1], w3[2], selector); + w5[1] = hc_byte_perm (w3[0], w3[1], selector); + w5[0] = hc_byte_perm (w2[3], w3[0], selector); + w4[3] = hc_byte_perm (w2[2], w2[3], selector); + w4[2] = hc_byte_perm (w2[1], w2[2], selector); + w4[1] = hc_byte_perm (w2[0], w2[1], selector); + w4[0] = hc_byte_perm (w1[3], w2[0], selector); + w3[3] = hc_byte_perm (w1[2], w1[3], selector); + w3[2] = hc_byte_perm (w1[1], w1[2], selector); + w3[1] = hc_byte_perm (w1[0], w1[1], selector); + w3[0] = hc_byte_perm (w0[3], w1[0], selector); + w2[3] = hc_byte_perm (w0[2], w0[3], selector); + w2[2] = hc_byte_perm (w0[1], w0[2], selector); + w2[1] = hc_byte_perm (w0[0], w0[1], selector); + w2[0] = hc_byte_perm ( 0, w0[0], selector); w1[3] = 0; w1[2] = 0; w1[1] = 0; @@ -5691,29 +5691,29 @@ DECLSPEC void switch_buffer_by_offset_8x4_le (u32x *w0, u32x *w1, u32x *w2, u32x break; case 9: - w7[3] = __byte_perm (w5[1], w5[2], selector); - w7[2] = __byte_perm (w5[0], w5[1], selector); - w7[1] = __byte_perm (w4[3], w5[0], selector); - w7[0] = __byte_perm (w4[2], w4[3], selector); - w6[3] = __byte_perm (w4[1], w4[2], selector); - w6[2] = __byte_perm (w4[0], w4[1], selector); - w6[1] = __byte_perm (w3[3], w4[0], selector); - w6[0] = __byte_perm (w3[2], w3[3], selector); - w5[3] = __byte_perm (w3[1], w3[2], selector); - w5[2] = __byte_perm (w3[0], w3[1], selector); - w5[1] = __byte_perm (w2[3], w3[0], selector); - w5[0] = __byte_perm (w2[2], w2[3], selector); - w4[3] = __byte_perm (w2[1], w2[2], selector); - w4[2] = __byte_perm (w2[0], w2[1], selector); - w4[1] = __byte_perm (w1[3], w2[0], selector); - w4[0] = __byte_perm (w1[2], w1[3], selector); - w3[3] = __byte_perm (w1[1], w1[2], selector); - w3[2] = __byte_perm (w1[0], w1[1], selector); - w3[1] = __byte_perm (w0[3], w1[0], selector); - w3[0] = __byte_perm (w0[2], w0[3], selector); - w2[3] = __byte_perm (w0[1], w0[2], selector); - w2[2] = __byte_perm (w0[0], w0[1], selector); - w2[1] = __byte_perm ( 0, w0[0], selector); + w7[3] = hc_byte_perm (w5[1], w5[2], selector); + w7[2] = hc_byte_perm (w5[0], w5[1], selector); + w7[1] = hc_byte_perm (w4[3], w5[0], selector); + w7[0] = hc_byte_perm (w4[2], w4[3], selector); + w6[3] = hc_byte_perm (w4[1], w4[2], selector); + w6[2] = hc_byte_perm (w4[0], w4[1], selector); + w6[1] = hc_byte_perm (w3[3], w4[0], selector); + w6[0] = hc_byte_perm (w3[2], w3[3], selector); + w5[3] = hc_byte_perm (w3[1], w3[2], selector); + w5[2] = hc_byte_perm (w3[0], w3[1], selector); + w5[1] = hc_byte_perm (w2[3], w3[0], selector); + w5[0] = hc_byte_perm (w2[2], w2[3], selector); + w4[3] = hc_byte_perm (w2[1], w2[2], selector); + w4[2] = hc_byte_perm (w2[0], w2[1], selector); + w4[1] = hc_byte_perm (w1[3], w2[0], selector); + w4[0] = hc_byte_perm (w1[2], w1[3], selector); + w3[3] = hc_byte_perm (w1[1], w1[2], selector); + w3[2] = hc_byte_perm (w1[0], w1[1], selector); + w3[1] = hc_byte_perm (w0[3], w1[0], selector); + w3[0] = hc_byte_perm (w0[2], w0[3], selector); + w2[3] = hc_byte_perm (w0[1], w0[2], selector); + w2[2] = hc_byte_perm (w0[0], w0[1], selector); + w2[1] = hc_byte_perm ( 0, w0[0], selector); w2[0] = 0; w1[3] = 0; w1[2] = 0; @@ -5726,28 +5726,28 @@ DECLSPEC void switch_buffer_by_offset_8x4_le (u32x *w0, u32x *w1, u32x *w2, u32x break; case 10: - w7[3] = __byte_perm (w5[0], w5[1], selector); - w7[2] = __byte_perm (w4[3], w5[0], selector); - w7[1] = __byte_perm (w4[2], w4[3], selector); - w7[0] = __byte_perm (w4[1], w4[2], selector); - w6[3] = __byte_perm (w4[0], w4[1], selector); - w6[2] = __byte_perm (w3[3], w4[0], selector); - w6[1] = __byte_perm (w3[2], w3[3], selector); - w6[0] = __byte_perm (w3[1], w3[2], selector); - w5[3] = __byte_perm (w3[0], w3[1], selector); - w5[2] = __byte_perm (w2[3], w3[0], selector); - w5[1] = __byte_perm (w2[2], w2[3], selector); - w5[0] = __byte_perm (w2[1], w2[2], selector); - w4[3] = __byte_perm (w2[0], w2[1], selector); - w4[2] = __byte_perm (w1[3], w2[0], selector); - w4[1] = __byte_perm (w1[2], w1[3], selector); - w4[0] = __byte_perm (w1[1], w1[2], selector); - w3[3] = __byte_perm (w1[0], w1[1], selector); - w3[2] = __byte_perm (w0[3], w1[0], selector); - w3[1] = __byte_perm (w0[2], w0[3], selector); - w3[0] = __byte_perm (w0[1], w0[2], selector); - w2[3] = __byte_perm (w0[0], w0[1], selector); - w2[2] = __byte_perm ( 0, w0[0], selector); + w7[3] = hc_byte_perm (w5[0], w5[1], selector); + w7[2] = hc_byte_perm (w4[3], w5[0], selector); + w7[1] = hc_byte_perm (w4[2], w4[3], selector); + w7[0] = hc_byte_perm (w4[1], w4[2], selector); + w6[3] = hc_byte_perm (w4[0], w4[1], selector); + w6[2] = hc_byte_perm (w3[3], w4[0], selector); + w6[1] = hc_byte_perm (w3[2], w3[3], selector); + w6[0] = hc_byte_perm (w3[1], w3[2], selector); + w5[3] = hc_byte_perm (w3[0], w3[1], selector); + w5[2] = hc_byte_perm (w2[3], w3[0], selector); + w5[1] = hc_byte_perm (w2[2], w2[3], selector); + w5[0] = hc_byte_perm (w2[1], w2[2], selector); + w4[3] = hc_byte_perm (w2[0], w2[1], selector); + w4[2] = hc_byte_perm (w1[3], w2[0], selector); + w4[1] = hc_byte_perm (w1[2], w1[3], selector); + w4[0] = hc_byte_perm (w1[1], w1[2], selector); + w3[3] = hc_byte_perm (w1[0], w1[1], selector); + w3[2] = hc_byte_perm (w0[3], w1[0], selector); + w3[1] = hc_byte_perm (w0[2], w0[3], selector); + w3[0] = hc_byte_perm (w0[1], w0[2], selector); + w2[3] = hc_byte_perm (w0[0], w0[1], selector); + w2[2] = hc_byte_perm ( 0, w0[0], selector); w2[1] = 0; w2[0] = 0; w1[3] = 0; @@ -5761,27 +5761,27 @@ DECLSPEC void switch_buffer_by_offset_8x4_le (u32x *w0, u32x *w1, u32x *w2, u32x break; case 11: - w7[3] = __byte_perm (w4[3], w5[0], selector); - w7[2] = __byte_perm (w4[2], w4[3], selector); - w7[1] = __byte_perm (w4[1], w4[2], selector); - w7[0] = __byte_perm (w4[0], w4[1], selector); - w6[3] = __byte_perm (w3[3], w4[0], selector); - w6[2] = __byte_perm (w3[2], w3[3], selector); - w6[1] = __byte_perm (w3[1], w3[2], selector); - w6[0] = __byte_perm (w3[0], w3[1], selector); - w5[3] = __byte_perm (w2[3], w3[0], selector); - w5[2] = __byte_perm (w2[2], w2[3], selector); - w5[1] = __byte_perm (w2[1], w2[2], selector); - w5[0] = __byte_perm (w2[0], w2[1], selector); - w4[3] = __byte_perm (w1[3], w2[0], selector); - w4[2] = __byte_perm (w1[2], w1[3], selector); - w4[1] = __byte_perm (w1[1], w1[2], selector); - w4[0] = __byte_perm (w1[0], w1[1], selector); - w3[3] = __byte_perm (w0[3], w1[0], selector); - w3[2] = __byte_perm (w0[2], w0[3], selector); - w3[1] = __byte_perm (w0[1], w0[2], selector); - w3[0] = __byte_perm (w0[0], w0[1], selector); - w2[3] = __byte_perm ( 0, w0[0], selector); + w7[3] = hc_byte_perm (w4[3], w5[0], selector); + w7[2] = hc_byte_perm (w4[2], w4[3], selector); + w7[1] = hc_byte_perm (w4[1], w4[2], selector); + w7[0] = hc_byte_perm (w4[0], w4[1], selector); + w6[3] = hc_byte_perm (w3[3], w4[0], selector); + w6[2] = hc_byte_perm (w3[2], w3[3], selector); + w6[1] = hc_byte_perm (w3[1], w3[2], selector); + w6[0] = hc_byte_perm (w3[0], w3[1], selector); + w5[3] = hc_byte_perm (w2[3], w3[0], selector); + w5[2] = hc_byte_perm (w2[2], w2[3], selector); + w5[1] = hc_byte_perm (w2[1], w2[2], selector); + w5[0] = hc_byte_perm (w2[0], w2[1], selector); + w4[3] = hc_byte_perm (w1[3], w2[0], selector); + w4[2] = hc_byte_perm (w1[2], w1[3], selector); + w4[1] = hc_byte_perm (w1[1], w1[2], selector); + w4[0] = hc_byte_perm (w1[0], w1[1], selector); + w3[3] = hc_byte_perm (w0[3], w1[0], selector); + w3[2] = hc_byte_perm (w0[2], w0[3], selector); + w3[1] = hc_byte_perm (w0[1], w0[2], selector); + w3[0] = hc_byte_perm (w0[0], w0[1], selector); + w2[3] = hc_byte_perm ( 0, w0[0], selector); w2[2] = 0; w2[1] = 0; w2[0] = 0; @@ -5796,26 +5796,26 @@ DECLSPEC void switch_buffer_by_offset_8x4_le (u32x *w0, u32x *w1, u32x *w2, u32x break; case 12: - w7[3] = __byte_perm (w4[2], w4[3], selector); - w7[2] = __byte_perm (w4[1], w4[2], selector); - w7[1] = __byte_perm (w4[0], w4[1], selector); - w7[0] = __byte_perm (w3[3], w4[0], selector); - w6[3] = __byte_perm (w3[2], w3[3], selector); - w6[2] = __byte_perm (w3[1], w3[2], selector); - w6[1] = __byte_perm (w3[0], w3[1], selector); - w6[0] = __byte_perm (w2[3], w3[0], selector); - w5[3] = __byte_perm (w2[2], w2[3], selector); - w5[2] = __byte_perm (w2[1], w2[2], selector); - w5[1] = __byte_perm (w2[0], w2[1], selector); - w5[0] = __byte_perm (w1[3], w2[0], selector); - w4[3] = __byte_perm (w1[2], w1[3], selector); - w4[2] = __byte_perm (w1[1], w1[2], selector); - w4[1] = __byte_perm (w1[0], w1[1], selector); - w4[0] = __byte_perm (w0[3], w1[0], selector); - w3[3] = __byte_perm (w0[2], w0[3], selector); - w3[2] = __byte_perm (w0[1], w0[2], selector); - w3[1] = __byte_perm (w0[0], w0[1], selector); - w3[0] = __byte_perm ( 0, w0[0], selector); + w7[3] = hc_byte_perm (w4[2], w4[3], selector); + w7[2] = hc_byte_perm (w4[1], w4[2], selector); + w7[1] = hc_byte_perm (w4[0], w4[1], selector); + w7[0] = hc_byte_perm (w3[3], w4[0], selector); + w6[3] = hc_byte_perm (w3[2], w3[3], selector); + w6[2] = hc_byte_perm (w3[1], w3[2], selector); + w6[1] = hc_byte_perm (w3[0], w3[1], selector); + w6[0] = hc_byte_perm (w2[3], w3[0], selector); + w5[3] = hc_byte_perm (w2[2], w2[3], selector); + w5[2] = hc_byte_perm (w2[1], w2[2], selector); + w5[1] = hc_byte_perm (w2[0], w2[1], selector); + w5[0] = hc_byte_perm (w1[3], w2[0], selector); + w4[3] = hc_byte_perm (w1[2], w1[3], selector); + w4[2] = hc_byte_perm (w1[1], w1[2], selector); + w4[1] = hc_byte_perm (w1[0], w1[1], selector); + w4[0] = hc_byte_perm (w0[3], w1[0], selector); + w3[3] = hc_byte_perm (w0[2], w0[3], selector); + w3[2] = hc_byte_perm (w0[1], w0[2], selector); + w3[1] = hc_byte_perm (w0[0], w0[1], selector); + w3[0] = hc_byte_perm ( 0, w0[0], selector); w2[3] = 0; w2[2] = 0; w2[1] = 0; @@ -5831,25 +5831,25 @@ DECLSPEC void switch_buffer_by_offset_8x4_le (u32x *w0, u32x *w1, u32x *w2, u32x break; case 13: - w7[3] = __byte_perm (w4[1], w4[2], selector); - w7[2] = __byte_perm (w4[0], w4[1], selector); - w7[1] = __byte_perm (w3[3], w4[0], selector); - w7[0] = __byte_perm (w3[2], w3[3], selector); - w6[3] = __byte_perm (w3[1], w3[2], selector); - w6[2] = __byte_perm (w3[0], w3[1], selector); - w6[1] = __byte_perm (w2[3], w3[0], selector); - w6[0] = __byte_perm (w2[2], w2[3], selector); - w5[3] = __byte_perm (w2[1], w2[2], selector); - w5[2] = __byte_perm (w2[0], w2[1], selector); - w5[1] = __byte_perm (w1[3], w2[0], selector); - w5[0] = __byte_perm (w1[2], w1[3], selector); - w4[3] = __byte_perm (w1[1], w1[2], selector); - w4[2] = __byte_perm (w1[0], w1[1], selector); - w4[1] = __byte_perm (w0[3], w1[0], selector); - w4[0] = __byte_perm (w0[2], w0[3], selector); - w3[3] = __byte_perm (w0[1], w0[2], selector); - w3[2] = __byte_perm (w0[0], w0[1], selector); - w3[1] = __byte_perm ( 0, w0[0], selector); + w7[3] = hc_byte_perm (w4[1], w4[2], selector); + w7[2] = hc_byte_perm (w4[0], w4[1], selector); + w7[1] = hc_byte_perm (w3[3], w4[0], selector); + w7[0] = hc_byte_perm (w3[2], w3[3], selector); + w6[3] = hc_byte_perm (w3[1], w3[2], selector); + w6[2] = hc_byte_perm (w3[0], w3[1], selector); + w6[1] = hc_byte_perm (w2[3], w3[0], selector); + w6[0] = hc_byte_perm (w2[2], w2[3], selector); + w5[3] = hc_byte_perm (w2[1], w2[2], selector); + w5[2] = hc_byte_perm (w2[0], w2[1], selector); + w5[1] = hc_byte_perm (w1[3], w2[0], selector); + w5[0] = hc_byte_perm (w1[2], w1[3], selector); + w4[3] = hc_byte_perm (w1[1], w1[2], selector); + w4[2] = hc_byte_perm (w1[0], w1[1], selector); + w4[1] = hc_byte_perm (w0[3], w1[0], selector); + w4[0] = hc_byte_perm (w0[2], w0[3], selector); + w3[3] = hc_byte_perm (w0[1], w0[2], selector); + w3[2] = hc_byte_perm (w0[0], w0[1], selector); + w3[1] = hc_byte_perm ( 0, w0[0], selector); w3[0] = 0; w2[3] = 0; w2[2] = 0; @@ -5866,24 +5866,24 @@ DECLSPEC void switch_buffer_by_offset_8x4_le (u32x *w0, u32x *w1, u32x *w2, u32x break; case 14: - w7[3] = __byte_perm (w4[0], w4[1], selector); - w7[2] = __byte_perm (w3[3], w4[0], selector); - w7[1] = __byte_perm (w3[2], w3[3], selector); - w7[0] = __byte_perm (w3[1], w3[2], selector); - w6[3] = __byte_perm (w3[0], w3[1], selector); - w6[2] = __byte_perm (w2[3], w3[0], selector); - w6[1] = __byte_perm (w2[2], w2[3], selector); - w6[0] = __byte_perm (w2[1], w2[2], selector); - w5[3] = __byte_perm (w2[0], w2[1], selector); - w5[2] = __byte_perm (w1[3], w2[0], selector); - w5[1] = __byte_perm (w1[2], w1[3], selector); - w5[0] = __byte_perm (w1[1], w1[2], selector); - w4[3] = __byte_perm (w1[0], w1[1], selector); - w4[2] = __byte_perm (w0[3], w1[0], selector); - w4[1] = __byte_perm (w0[2], w0[3], selector); - w4[0] = __byte_perm (w0[1], w0[2], selector); - w3[3] = __byte_perm (w0[0], w0[1], selector); - w3[2] = __byte_perm ( 0, w0[0], selector); + w7[3] = hc_byte_perm (w4[0], w4[1], selector); + w7[2] = hc_byte_perm (w3[3], w4[0], selector); + w7[1] = hc_byte_perm (w3[2], w3[3], selector); + w7[0] = hc_byte_perm (w3[1], w3[2], selector); + w6[3] = hc_byte_perm (w3[0], w3[1], selector); + w6[2] = hc_byte_perm (w2[3], w3[0], selector); + w6[1] = hc_byte_perm (w2[2], w2[3], selector); + w6[0] = hc_byte_perm (w2[1], w2[2], selector); + w5[3] = hc_byte_perm (w2[0], w2[1], selector); + w5[2] = hc_byte_perm (w1[3], w2[0], selector); + w5[1] = hc_byte_perm (w1[2], w1[3], selector); + w5[0] = hc_byte_perm (w1[1], w1[2], selector); + w4[3] = hc_byte_perm (w1[0], w1[1], selector); + w4[2] = hc_byte_perm (w0[3], w1[0], selector); + w4[1] = hc_byte_perm (w0[2], w0[3], selector); + w4[0] = hc_byte_perm (w0[1], w0[2], selector); + w3[3] = hc_byte_perm (w0[0], w0[1], selector); + w3[2] = hc_byte_perm ( 0, w0[0], selector); w3[1] = 0; w3[0] = 0; w2[3] = 0; @@ -5901,23 +5901,23 @@ DECLSPEC void switch_buffer_by_offset_8x4_le (u32x *w0, u32x *w1, u32x *w2, u32x break; case 15: - w7[3] = __byte_perm (w3[3], w4[0], selector); - w7[2] = __byte_perm (w3[2], w3[3], selector); - w7[1] = __byte_perm (w3[1], w3[2], selector); - w7[0] = __byte_perm (w3[0], w3[1], selector); - w6[3] = __byte_perm (w2[3], w3[0], selector); - w6[2] = __byte_perm (w2[2], w2[3], selector); - w6[1] = __byte_perm (w2[1], w2[2], selector); - w6[0] = __byte_perm (w2[0], w2[1], selector); - w5[3] = __byte_perm (w1[3], w2[0], selector); - w5[2] = __byte_perm (w1[2], w1[3], selector); - w5[1] = __byte_perm (w1[1], w1[2], selector); - w5[0] = __byte_perm (w1[0], w1[1], selector); - w4[3] = __byte_perm (w0[3], w1[0], selector); - w4[2] = __byte_perm (w0[2], w0[3], selector); - w4[1] = __byte_perm (w0[1], w0[2], selector); - w4[0] = __byte_perm (w0[0], w0[1], selector); - w3[3] = __byte_perm ( 0, w0[0], selector); + w7[3] = hc_byte_perm (w3[3], w4[0], selector); + w7[2] = hc_byte_perm (w3[2], w3[3], selector); + w7[1] = hc_byte_perm (w3[1], w3[2], selector); + w7[0] = hc_byte_perm (w3[0], w3[1], selector); + w6[3] = hc_byte_perm (w2[3], w3[0], selector); + w6[2] = hc_byte_perm (w2[2], w2[3], selector); + w6[1] = hc_byte_perm (w2[1], w2[2], selector); + w6[0] = hc_byte_perm (w2[0], w2[1], selector); + w5[3] = hc_byte_perm (w1[3], w2[0], selector); + w5[2] = hc_byte_perm (w1[2], w1[3], selector); + w5[1] = hc_byte_perm (w1[1], w1[2], selector); + w5[0] = hc_byte_perm (w1[0], w1[1], selector); + w4[3] = hc_byte_perm (w0[3], w1[0], selector); + w4[2] = hc_byte_perm (w0[2], w0[3], selector); + w4[1] = hc_byte_perm (w0[1], w0[2], selector); + w4[0] = hc_byte_perm (w0[0], w0[1], selector); + w3[3] = hc_byte_perm ( 0, w0[0], selector); w3[2] = 0; w3[1] = 0; w3[0] = 0; @@ -5946,143 +5946,143 @@ DECLSPEC void switch_buffer_by_offset_8x4_be (u32x *w0, u32x *w1, u32x *w2, u32x switch (offset_switch) { case 0: - w7[3] = amd_bytealign (w7[2], w7[3], offset); - w7[2] = amd_bytealign (w7[1], w7[2], offset); - w7[1] = amd_bytealign (w7[0], w7[1], offset); - w7[0] = amd_bytealign (w6[3], w7[0], offset); - w6[3] = amd_bytealign (w6[2], w6[3], offset); - w6[2] = amd_bytealign (w6[1], w6[2], offset); - w6[1] = amd_bytealign (w6[0], w6[1], offset); - w6[0] = amd_bytealign (w5[3], w6[0], offset); - w5[3] = amd_bytealign (w5[2], w5[3], offset); - w5[2] = amd_bytealign (w5[1], w5[2], offset); - w5[1] = amd_bytealign (w5[0], w5[1], offset); - w5[0] = amd_bytealign (w4[3], w5[0], offset); - w4[3] = amd_bytealign (w4[2], w4[3], offset); - w4[2] = amd_bytealign (w4[1], w4[2], offset); - w4[1] = amd_bytealign (w4[0], w4[1], offset); - w4[0] = amd_bytealign (w3[3], w4[0], offset); - w3[3] = amd_bytealign (w3[2], w3[3], offset); - w3[2] = amd_bytealign (w3[1], w3[2], offset); - w3[1] = amd_bytealign (w3[0], w3[1], offset); - w3[0] = amd_bytealign (w2[3], w3[0], offset); - w2[3] = amd_bytealign (w2[2], w2[3], offset); - w2[2] = amd_bytealign (w2[1], w2[2], offset); - w2[1] = amd_bytealign (w2[0], w2[1], offset); - w2[0] = amd_bytealign (w1[3], w2[0], offset); - w1[3] = amd_bytealign (w1[2], w1[3], offset); - w1[2] = amd_bytealign (w1[1], w1[2], offset); - w1[1] = amd_bytealign (w1[0], w1[1], offset); - w1[0] = amd_bytealign (w0[3], w1[0], offset); - w0[3] = amd_bytealign (w0[2], w0[3], offset); - w0[2] = amd_bytealign (w0[1], w0[2], offset); - w0[1] = amd_bytealign (w0[0], w0[1], offset); - w0[0] = amd_bytealign ( 0, w0[0], offset); + w7[3] = hc_bytealign (w7[2], w7[3], offset); + w7[2] = hc_bytealign (w7[1], w7[2], offset); + w7[1] = hc_bytealign (w7[0], w7[1], offset); + w7[0] = hc_bytealign (w6[3], w7[0], offset); + w6[3] = hc_bytealign (w6[2], w6[3], offset); + w6[2] = hc_bytealign (w6[1], w6[2], offset); + w6[1] = hc_bytealign (w6[0], w6[1], offset); + w6[0] = hc_bytealign (w5[3], w6[0], offset); + w5[3] = hc_bytealign (w5[2], w5[3], offset); + w5[2] = hc_bytealign (w5[1], w5[2], offset); + w5[1] = hc_bytealign (w5[0], w5[1], offset); + w5[0] = hc_bytealign (w4[3], w5[0], offset); + w4[3] = hc_bytealign (w4[2], w4[3], offset); + w4[2] = hc_bytealign (w4[1], w4[2], offset); + w4[1] = hc_bytealign (w4[0], w4[1], offset); + w4[0] = hc_bytealign (w3[3], w4[0], offset); + w3[3] = hc_bytealign (w3[2], w3[3], offset); + w3[2] = hc_bytealign (w3[1], w3[2], offset); + w3[1] = hc_bytealign (w3[0], w3[1], offset); + w3[0] = hc_bytealign (w2[3], w3[0], offset); + w2[3] = hc_bytealign (w2[2], w2[3], offset); + w2[2] = hc_bytealign (w2[1], w2[2], offset); + w2[1] = hc_bytealign (w2[0], w2[1], offset); + w2[0] = hc_bytealign (w1[3], w2[0], offset); + w1[3] = hc_bytealign (w1[2], w1[3], offset); + w1[2] = hc_bytealign (w1[1], w1[2], offset); + w1[1] = hc_bytealign (w1[0], w1[1], offset); + w1[0] = hc_bytealign (w0[3], w1[0], offset); + w0[3] = hc_bytealign (w0[2], w0[3], offset); + w0[2] = hc_bytealign (w0[1], w0[2], offset); + w0[1] = hc_bytealign (w0[0], w0[1], offset); + w0[0] = hc_bytealign ( 0, w0[0], offset); break; case 1: - w7[3] = amd_bytealign (w7[1], w7[2], offset); - w7[2] = amd_bytealign (w7[0], w7[1], offset); - w7[1] = amd_bytealign (w6[3], w7[0], offset); - w7[0] = amd_bytealign (w6[2], w6[3], offset); - w6[3] = amd_bytealign (w6[1], w6[2], offset); - w6[2] = amd_bytealign (w6[0], w6[1], offset); - w6[1] = amd_bytealign (w5[3], w6[0], offset); - w6[0] = amd_bytealign (w5[2], w5[3], offset); - w5[3] = amd_bytealign (w5[1], w5[2], offset); - w5[2] = amd_bytealign (w5[0], w5[1], offset); - w5[1] = amd_bytealign (w4[3], w5[0], offset); - w5[0] = amd_bytealign (w4[2], w4[3], offset); - w4[3] = amd_bytealign (w4[1], w4[2], offset); - w4[2] = amd_bytealign (w4[0], w4[1], offset); - w4[1] = amd_bytealign (w3[3], w4[0], offset); - w4[0] = amd_bytealign (w3[2], w3[3], offset); - w3[3] = amd_bytealign (w3[1], w3[2], offset); - w3[2] = amd_bytealign (w3[0], w3[1], offset); - w3[1] = amd_bytealign (w2[3], w3[0], offset); - w3[0] = amd_bytealign (w2[2], w2[3], offset); - w2[3] = amd_bytealign (w2[1], w2[2], offset); - w2[2] = amd_bytealign (w2[0], w2[1], offset); - w2[1] = amd_bytealign (w1[3], w2[0], offset); - w2[0] = amd_bytealign (w1[2], w1[3], offset); - w1[3] = amd_bytealign (w1[1], w1[2], offset); - w1[2] = amd_bytealign (w1[0], w1[1], offset); - w1[1] = amd_bytealign (w0[3], w1[0], offset); - w1[0] = amd_bytealign (w0[2], w0[3], offset); - w0[3] = amd_bytealign (w0[1], w0[2], offset); - w0[2] = amd_bytealign (w0[0], w0[1], offset); - w0[1] = amd_bytealign ( 0, w0[0], offset); + w7[3] = hc_bytealign (w7[1], w7[2], offset); + w7[2] = hc_bytealign (w7[0], w7[1], offset); + w7[1] = hc_bytealign (w6[3], w7[0], offset); + w7[0] = hc_bytealign (w6[2], w6[3], offset); + w6[3] = hc_bytealign (w6[1], w6[2], offset); + w6[2] = hc_bytealign (w6[0], w6[1], offset); + w6[1] = hc_bytealign (w5[3], w6[0], offset); + w6[0] = hc_bytealign (w5[2], w5[3], offset); + w5[3] = hc_bytealign (w5[1], w5[2], offset); + w5[2] = hc_bytealign (w5[0], w5[1], offset); + w5[1] = hc_bytealign (w4[3], w5[0], offset); + w5[0] = hc_bytealign (w4[2], w4[3], offset); + w4[3] = hc_bytealign (w4[1], w4[2], offset); + w4[2] = hc_bytealign (w4[0], w4[1], offset); + w4[1] = hc_bytealign (w3[3], w4[0], offset); + w4[0] = hc_bytealign (w3[2], w3[3], offset); + w3[3] = hc_bytealign (w3[1], w3[2], offset); + w3[2] = hc_bytealign (w3[0], w3[1], offset); + w3[1] = hc_bytealign (w2[3], w3[0], offset); + w3[0] = hc_bytealign (w2[2], w2[3], offset); + w2[3] = hc_bytealign (w2[1], w2[2], offset); + w2[2] = hc_bytealign (w2[0], w2[1], offset); + w2[1] = hc_bytealign (w1[3], w2[0], offset); + w2[0] = hc_bytealign (w1[2], w1[3], offset); + w1[3] = hc_bytealign (w1[1], w1[2], offset); + w1[2] = hc_bytealign (w1[0], w1[1], offset); + w1[1] = hc_bytealign (w0[3], w1[0], offset); + w1[0] = hc_bytealign (w0[2], w0[3], offset); + w0[3] = hc_bytealign (w0[1], w0[2], offset); + w0[2] = hc_bytealign (w0[0], w0[1], offset); + w0[1] = hc_bytealign ( 0, w0[0], offset); w0[0] = 0; break; case 2: - w7[3] = amd_bytealign (w7[0], w7[1], offset); - w7[2] = amd_bytealign (w6[3], w7[0], offset); - w7[1] = amd_bytealign (w6[2], w6[3], offset); - w7[0] = amd_bytealign (w6[1], w6[2], offset); - w6[3] = amd_bytealign (w6[0], w6[1], offset); - w6[2] = amd_bytealign (w5[3], w6[0], offset); - w6[1] = amd_bytealign (w5[2], w5[3], offset); - w6[0] = amd_bytealign (w5[1], w5[2], offset); - w5[3] = amd_bytealign (w5[0], w5[1], offset); - w5[2] = amd_bytealign (w4[3], w5[0], offset); - w5[1] = amd_bytealign (w4[2], w4[3], offset); - w5[0] = amd_bytealign (w4[1], w4[2], offset); - w4[3] = amd_bytealign (w4[0], w4[1], offset); - w4[2] = amd_bytealign (w3[3], w4[0], offset); - w4[1] = amd_bytealign (w3[2], w3[3], offset); - w4[0] = amd_bytealign (w3[1], w3[2], offset); - w3[3] = amd_bytealign (w3[0], w3[1], offset); - w3[2] = amd_bytealign (w2[3], w3[0], offset); - w3[1] = amd_bytealign (w2[2], w2[3], offset); - w3[0] = amd_bytealign (w2[1], w2[2], offset); - w2[3] = amd_bytealign (w2[0], w2[1], offset); - w2[2] = amd_bytealign (w1[3], w2[0], offset); - w2[1] = amd_bytealign (w1[2], w1[3], offset); - w2[0] = amd_bytealign (w1[1], w1[2], offset); - w1[3] = amd_bytealign (w1[0], w1[1], offset); - w1[2] = amd_bytealign (w0[3], w1[0], offset); - w1[1] = amd_bytealign (w0[2], w0[3], offset); - w1[0] = amd_bytealign (w0[1], w0[2], offset); - w0[3] = amd_bytealign (w0[0], w0[1], offset); - w0[2] = amd_bytealign ( 0, w0[0], offset); + w7[3] = hc_bytealign (w7[0], w7[1], offset); + w7[2] = hc_bytealign (w6[3], w7[0], offset); + w7[1] = hc_bytealign (w6[2], w6[3], offset); + w7[0] = hc_bytealign (w6[1], w6[2], offset); + w6[3] = hc_bytealign (w6[0], w6[1], offset); + w6[2] = hc_bytealign (w5[3], w6[0], offset); + w6[1] = hc_bytealign (w5[2], w5[3], offset); + w6[0] = hc_bytealign (w5[1], w5[2], offset); + w5[3] = hc_bytealign (w5[0], w5[1], offset); + w5[2] = hc_bytealign (w4[3], w5[0], offset); + w5[1] = hc_bytealign (w4[2], w4[3], offset); + w5[0] = hc_bytealign (w4[1], w4[2], offset); + w4[3] = hc_bytealign (w4[0], w4[1], offset); + w4[2] = hc_bytealign (w3[3], w4[0], offset); + w4[1] = hc_bytealign (w3[2], w3[3], offset); + w4[0] = hc_bytealign (w3[1], w3[2], offset); + w3[3] = hc_bytealign (w3[0], w3[1], offset); + w3[2] = hc_bytealign (w2[3], w3[0], offset); + w3[1] = hc_bytealign (w2[2], w2[3], offset); + w3[0] = hc_bytealign (w2[1], w2[2], offset); + w2[3] = hc_bytealign (w2[0], w2[1], offset); + w2[2] = hc_bytealign (w1[3], w2[0], offset); + w2[1] = hc_bytealign (w1[2], w1[3], offset); + w2[0] = hc_bytealign (w1[1], w1[2], offset); + w1[3] = hc_bytealign (w1[0], w1[1], offset); + w1[2] = hc_bytealign (w0[3], w1[0], offset); + w1[1] = hc_bytealign (w0[2], w0[3], offset); + w1[0] = hc_bytealign (w0[1], w0[2], offset); + w0[3] = hc_bytealign (w0[0], w0[1], offset); + w0[2] = hc_bytealign ( 0, w0[0], offset); w0[1] = 0; w0[0] = 0; break; case 3: - w7[3] = amd_bytealign (w6[3], w7[0], offset); - w7[2] = amd_bytealign (w6[2], w6[3], offset); - w7[1] = amd_bytealign (w6[1], w6[2], offset); - w7[0] = amd_bytealign (w6[0], w6[1], offset); - w6[3] = amd_bytealign (w5[3], w6[0], offset); - w6[2] = amd_bytealign (w5[2], w5[3], offset); - w6[1] = amd_bytealign (w5[1], w5[2], offset); - w6[0] = amd_bytealign (w5[0], w5[1], offset); - w5[3] = amd_bytealign (w4[3], w5[0], offset); - w5[2] = amd_bytealign (w4[2], w4[3], offset); - w5[1] = amd_bytealign (w4[1], w4[2], offset); - w5[0] = amd_bytealign (w4[0], w4[1], offset); - w4[3] = amd_bytealign (w3[3], w4[0], offset); - w4[2] = amd_bytealign (w3[2], w3[3], offset); - w4[1] = amd_bytealign (w3[1], w3[2], offset); - w4[0] = amd_bytealign (w3[0], w3[1], offset); - w3[3] = amd_bytealign (w2[3], w3[0], offset); - w3[2] = amd_bytealign (w2[2], w2[3], offset); - w3[1] = amd_bytealign (w2[1], w2[2], offset); - w3[0] = amd_bytealign (w2[0], w2[1], offset); - w2[3] = amd_bytealign (w1[3], w2[0], offset); - w2[2] = amd_bytealign (w1[2], w1[3], offset); - w2[1] = amd_bytealign (w1[1], w1[2], offset); - w2[0] = amd_bytealign (w1[0], w1[1], offset); - w1[3] = amd_bytealign (w0[3], w1[0], offset); - w1[2] = amd_bytealign (w0[2], w0[3], offset); - w1[1] = amd_bytealign (w0[1], w0[2], offset); - w1[0] = amd_bytealign (w0[0], w0[1], offset); - w0[3] = amd_bytealign ( 0, w0[0], offset); + w7[3] = hc_bytealign (w6[3], w7[0], offset); + w7[2] = hc_bytealign (w6[2], w6[3], offset); + w7[1] = hc_bytealign (w6[1], w6[2], offset); + w7[0] = hc_bytealign (w6[0], w6[1], offset); + w6[3] = hc_bytealign (w5[3], w6[0], offset); + w6[2] = hc_bytealign (w5[2], w5[3], offset); + w6[1] = hc_bytealign (w5[1], w5[2], offset); + w6[0] = hc_bytealign (w5[0], w5[1], offset); + w5[3] = hc_bytealign (w4[3], w5[0], offset); + w5[2] = hc_bytealign (w4[2], w4[3], offset); + w5[1] = hc_bytealign (w4[1], w4[2], offset); + w5[0] = hc_bytealign (w4[0], w4[1], offset); + w4[3] = hc_bytealign (w3[3], w4[0], offset); + w4[2] = hc_bytealign (w3[2], w3[3], offset); + w4[1] = hc_bytealign (w3[1], w3[2], offset); + w4[0] = hc_bytealign (w3[0], w3[1], offset); + w3[3] = hc_bytealign (w2[3], w3[0], offset); + w3[2] = hc_bytealign (w2[2], w2[3], offset); + w3[1] = hc_bytealign (w2[1], w2[2], offset); + w3[0] = hc_bytealign (w2[0], w2[1], offset); + w2[3] = hc_bytealign (w1[3], w2[0], offset); + w2[2] = hc_bytealign (w1[2], w1[3], offset); + w2[1] = hc_bytealign (w1[1], w1[2], offset); + w2[0] = hc_bytealign (w1[0], w1[1], offset); + w1[3] = hc_bytealign (w0[3], w1[0], offset); + w1[2] = hc_bytealign (w0[2], w0[3], offset); + w1[1] = hc_bytealign (w0[1], w0[2], offset); + w1[0] = hc_bytealign (w0[0], w0[1], offset); + w0[3] = hc_bytealign ( 0, w0[0], offset); w0[2] = 0; w0[1] = 0; w0[0] = 0; @@ -6090,34 +6090,34 @@ DECLSPEC void switch_buffer_by_offset_8x4_be (u32x *w0, u32x *w1, u32x *w2, u32x break; case 4: - w7[3] = amd_bytealign (w6[2], w6[3], offset); - w7[2] = amd_bytealign (w6[1], w6[2], offset); - w7[1] = amd_bytealign (w6[0], w6[1], offset); - w7[0] = amd_bytealign (w5[3], w6[0], offset); - w6[3] = amd_bytealign (w5[2], w5[3], offset); - w6[2] = amd_bytealign (w5[1], w5[2], offset); - w6[1] = amd_bytealign (w5[0], w5[1], offset); - w6[0] = amd_bytealign (w4[3], w5[0], offset); - w5[3] = amd_bytealign (w4[2], w4[3], offset); - w5[2] = amd_bytealign (w4[1], w4[2], offset); - w5[1] = amd_bytealign (w4[0], w4[1], offset); - w5[0] = amd_bytealign (w3[3], w4[0], offset); - w4[3] = amd_bytealign (w3[2], w3[3], offset); - w4[2] = amd_bytealign (w3[1], w3[2], offset); - w4[1] = amd_bytealign (w3[0], w3[1], offset); - w4[0] = amd_bytealign (w2[3], w3[0], offset); - w3[3] = amd_bytealign (w2[2], w2[3], offset); - w3[2] = amd_bytealign (w2[1], w2[2], offset); - w3[1] = amd_bytealign (w2[0], w2[1], offset); - w3[0] = amd_bytealign (w1[3], w2[0], offset); - w2[3] = amd_bytealign (w1[2], w1[3], offset); - w2[2] = amd_bytealign (w1[1], w1[2], offset); - w2[1] = amd_bytealign (w1[0], w1[1], offset); - w2[0] = amd_bytealign (w0[3], w1[0], offset); - w1[3] = amd_bytealign (w0[2], w0[3], offset); - w1[2] = amd_bytealign (w0[1], w0[2], offset); - w1[1] = amd_bytealign (w0[0], w0[1], offset); - w1[0] = amd_bytealign ( 0, w0[0], offset); + w7[3] = hc_bytealign (w6[2], w6[3], offset); + w7[2] = hc_bytealign (w6[1], w6[2], offset); + w7[1] = hc_bytealign (w6[0], w6[1], offset); + w7[0] = hc_bytealign (w5[3], w6[0], offset); + w6[3] = hc_bytealign (w5[2], w5[3], offset); + w6[2] = hc_bytealign (w5[1], w5[2], offset); + w6[1] = hc_bytealign (w5[0], w5[1], offset); + w6[0] = hc_bytealign (w4[3], w5[0], offset); + w5[3] = hc_bytealign (w4[2], w4[3], offset); + w5[2] = hc_bytealign (w4[1], w4[2], offset); + w5[1] = hc_bytealign (w4[0], w4[1], offset); + w5[0] = hc_bytealign (w3[3], w4[0], offset); + w4[3] = hc_bytealign (w3[2], w3[3], offset); + w4[2] = hc_bytealign (w3[1], w3[2], offset); + w4[1] = hc_bytealign (w3[0], w3[1], offset); + w4[0] = hc_bytealign (w2[3], w3[0], offset); + w3[3] = hc_bytealign (w2[2], w2[3], offset); + w3[2] = hc_bytealign (w2[1], w2[2], offset); + w3[1] = hc_bytealign (w2[0], w2[1], offset); + w3[0] = hc_bytealign (w1[3], w2[0], offset); + w2[3] = hc_bytealign (w1[2], w1[3], offset); + w2[2] = hc_bytealign (w1[1], w1[2], offset); + w2[1] = hc_bytealign (w1[0], w1[1], offset); + w2[0] = hc_bytealign (w0[3], w1[0], offset); + w1[3] = hc_bytealign (w0[2], w0[3], offset); + w1[2] = hc_bytealign (w0[1], w0[2], offset); + w1[1] = hc_bytealign (w0[0], w0[1], offset); + w1[0] = hc_bytealign ( 0, w0[0], offset); w0[3] = 0; w0[2] = 0; w0[1] = 0; @@ -6126,33 +6126,33 @@ DECLSPEC void switch_buffer_by_offset_8x4_be (u32x *w0, u32x *w1, u32x *w2, u32x break; case 5: - w7[3] = amd_bytealign (w6[1], w6[2], offset); - w7[2] = amd_bytealign (w6[0], w6[1], offset); - w7[1] = amd_bytealign (w5[3], w6[0], offset); - w7[0] = amd_bytealign (w5[2], w5[3], offset); - w6[3] = amd_bytealign (w5[1], w5[2], offset); - w6[2] = amd_bytealign (w5[0], w5[1], offset); - w6[1] = amd_bytealign (w4[3], w5[0], offset); - w6[0] = amd_bytealign (w4[2], w4[3], offset); - w5[3] = amd_bytealign (w4[1], w4[2], offset); - w5[2] = amd_bytealign (w4[0], w4[1], offset); - w5[1] = amd_bytealign (w3[3], w4[0], offset); - w5[0] = amd_bytealign (w3[2], w3[3], offset); - w4[3] = amd_bytealign (w3[1], w3[2], offset); - w4[2] = amd_bytealign (w3[0], w3[1], offset); - w4[1] = amd_bytealign (w2[3], w3[0], offset); - w4[0] = amd_bytealign (w2[2], w2[3], offset); - w3[3] = amd_bytealign (w2[1], w2[2], offset); - w3[2] = amd_bytealign (w2[0], w2[1], offset); - w3[1] = amd_bytealign (w1[3], w2[0], offset); - w3[0] = amd_bytealign (w1[2], w1[3], offset); - w2[3] = amd_bytealign (w1[1], w1[2], offset); - w2[2] = amd_bytealign (w1[0], w1[1], offset); - w2[1] = amd_bytealign (w0[3], w1[0], offset); - w2[0] = amd_bytealign (w0[2], w0[3], offset); - w1[3] = amd_bytealign (w0[1], w0[2], offset); - w1[2] = amd_bytealign (w0[0], w0[1], offset); - w1[1] = amd_bytealign ( 0, w0[0], offset); + w7[3] = hc_bytealign (w6[1], w6[2], offset); + w7[2] = hc_bytealign (w6[0], w6[1], offset); + w7[1] = hc_bytealign (w5[3], w6[0], offset); + w7[0] = hc_bytealign (w5[2], w5[3], offset); + w6[3] = hc_bytealign (w5[1], w5[2], offset); + w6[2] = hc_bytealign (w5[0], w5[1], offset); + w6[1] = hc_bytealign (w4[3], w5[0], offset); + w6[0] = hc_bytealign (w4[2], w4[3], offset); + w5[3] = hc_bytealign (w4[1], w4[2], offset); + w5[2] = hc_bytealign (w4[0], w4[1], offset); + w5[1] = hc_bytealign (w3[3], w4[0], offset); + w5[0] = hc_bytealign (w3[2], w3[3], offset); + w4[3] = hc_bytealign (w3[1], w3[2], offset); + w4[2] = hc_bytealign (w3[0], w3[1], offset); + w4[1] = hc_bytealign (w2[3], w3[0], offset); + w4[0] = hc_bytealign (w2[2], w2[3], offset); + w3[3] = hc_bytealign (w2[1], w2[2], offset); + w3[2] = hc_bytealign (w2[0], w2[1], offset); + w3[1] = hc_bytealign (w1[3], w2[0], offset); + w3[0] = hc_bytealign (w1[2], w1[3], offset); + w2[3] = hc_bytealign (w1[1], w1[2], offset); + w2[2] = hc_bytealign (w1[0], w1[1], offset); + w2[1] = hc_bytealign (w0[3], w1[0], offset); + w2[0] = hc_bytealign (w0[2], w0[3], offset); + w1[3] = hc_bytealign (w0[1], w0[2], offset); + w1[2] = hc_bytealign (w0[0], w0[1], offset); + w1[1] = hc_bytealign ( 0, w0[0], offset); w1[0] = 0; w0[3] = 0; w0[2] = 0; @@ -6162,32 +6162,32 @@ DECLSPEC void switch_buffer_by_offset_8x4_be (u32x *w0, u32x *w1, u32x *w2, u32x break; case 6: - w7[3] = amd_bytealign (w6[0], w6[1], offset); - w7[2] = amd_bytealign (w5[3], w6[0], offset); - w7[1] = amd_bytealign (w5[2], w5[3], offset); - w7[0] = amd_bytealign (w5[1], w5[2], offset); - w6[3] = amd_bytealign (w5[0], w5[1], offset); - w6[2] = amd_bytealign (w4[3], w5[0], offset); - w6[1] = amd_bytealign (w4[2], w4[3], offset); - w6[0] = amd_bytealign (w4[1], w4[2], offset); - w5[3] = amd_bytealign (w4[0], w4[1], offset); - w5[2] = amd_bytealign (w3[3], w4[0], offset); - w5[1] = amd_bytealign (w3[2], w3[3], offset); - w5[0] = amd_bytealign (w3[1], w3[2], offset); - w4[3] = amd_bytealign (w3[0], w3[1], offset); - w4[2] = amd_bytealign (w2[3], w3[0], offset); - w4[1] = amd_bytealign (w2[2], w2[3], offset); - w4[0] = amd_bytealign (w2[1], w2[2], offset); - w3[3] = amd_bytealign (w2[0], w2[1], offset); - w3[2] = amd_bytealign (w1[3], w2[0], offset); - w3[1] = amd_bytealign (w1[2], w1[3], offset); - w3[0] = amd_bytealign (w1[1], w1[2], offset); - w2[3] = amd_bytealign (w1[0], w1[1], offset); - w2[2] = amd_bytealign (w0[3], w1[0], offset); - w2[1] = amd_bytealign (w0[2], w0[3], offset); - w2[0] = amd_bytealign (w0[1], w0[2], offset); - w1[3] = amd_bytealign (w0[0], w0[1], offset); - w1[2] = amd_bytealign ( 0, w0[0], offset); + w7[3] = hc_bytealign (w6[0], w6[1], offset); + w7[2] = hc_bytealign (w5[3], w6[0], offset); + w7[1] = hc_bytealign (w5[2], w5[3], offset); + w7[0] = hc_bytealign (w5[1], w5[2], offset); + w6[3] = hc_bytealign (w5[0], w5[1], offset); + w6[2] = hc_bytealign (w4[3], w5[0], offset); + w6[1] = hc_bytealign (w4[2], w4[3], offset); + w6[0] = hc_bytealign (w4[1], w4[2], offset); + w5[3] = hc_bytealign (w4[0], w4[1], offset); + w5[2] = hc_bytealign (w3[3], w4[0], offset); + w5[1] = hc_bytealign (w3[2], w3[3], offset); + w5[0] = hc_bytealign (w3[1], w3[2], offset); + w4[3] = hc_bytealign (w3[0], w3[1], offset); + w4[2] = hc_bytealign (w2[3], w3[0], offset); + w4[1] = hc_bytealign (w2[2], w2[3], offset); + w4[0] = hc_bytealign (w2[1], w2[2], offset); + w3[3] = hc_bytealign (w2[0], w2[1], offset); + w3[2] = hc_bytealign (w1[3], w2[0], offset); + w3[1] = hc_bytealign (w1[2], w1[3], offset); + w3[0] = hc_bytealign (w1[1], w1[2], offset); + w2[3] = hc_bytealign (w1[0], w1[1], offset); + w2[2] = hc_bytealign (w0[3], w1[0], offset); + w2[1] = hc_bytealign (w0[2], w0[3], offset); + w2[0] = hc_bytealign (w0[1], w0[2], offset); + w1[3] = hc_bytealign (w0[0], w0[1], offset); + w1[2] = hc_bytealign ( 0, w0[0], offset); w1[1] = 0; w1[0] = 0; w0[3] = 0; @@ -6198,31 +6198,31 @@ DECLSPEC void switch_buffer_by_offset_8x4_be (u32x *w0, u32x *w1, u32x *w2, u32x break; case 7: - w7[3] = amd_bytealign (w5[3], w6[0], offset); - w7[2] = amd_bytealign (w5[2], w5[3], offset); - w7[1] = amd_bytealign (w5[1], w5[2], offset); - w7[0] = amd_bytealign (w5[0], w5[1], offset); - w6[3] = amd_bytealign (w4[3], w5[0], offset); - w6[2] = amd_bytealign (w4[2], w4[3], offset); - w6[1] = amd_bytealign (w4[1], w4[2], offset); - w6[0] = amd_bytealign (w4[0], w4[1], offset); - w5[3] = amd_bytealign (w3[3], w4[0], offset); - w5[2] = amd_bytealign (w3[2], w3[3], offset); - w5[1] = amd_bytealign (w3[1], w3[2], offset); - w5[0] = amd_bytealign (w3[0], w3[1], offset); - w4[3] = amd_bytealign (w2[3], w3[0], offset); - w4[2] = amd_bytealign (w2[2], w2[3], offset); - w4[1] = amd_bytealign (w2[1], w2[2], offset); - w4[0] = amd_bytealign (w2[0], w2[1], offset); - w3[3] = amd_bytealign (w1[3], w2[0], offset); - w3[2] = amd_bytealign (w1[2], w1[3], offset); - w3[1] = amd_bytealign (w1[1], w1[2], offset); - w3[0] = amd_bytealign (w1[0], w1[1], offset); - w2[3] = amd_bytealign (w0[3], w1[0], offset); - w2[2] = amd_bytealign (w0[2], w0[3], offset); - w2[1] = amd_bytealign (w0[1], w0[2], offset); - w2[0] = amd_bytealign (w0[0], w0[1], offset); - w1[3] = amd_bytealign ( 0, w0[0], offset); + w7[3] = hc_bytealign (w5[3], w6[0], offset); + w7[2] = hc_bytealign (w5[2], w5[3], offset); + w7[1] = hc_bytealign (w5[1], w5[2], offset); + w7[0] = hc_bytealign (w5[0], w5[1], offset); + w6[3] = hc_bytealign (w4[3], w5[0], offset); + w6[2] = hc_bytealign (w4[2], w4[3], offset); + w6[1] = hc_bytealign (w4[1], w4[2], offset); + w6[0] = hc_bytealign (w4[0], w4[1], offset); + w5[3] = hc_bytealign (w3[3], w4[0], offset); + w5[2] = hc_bytealign (w3[2], w3[3], offset); + w5[1] = hc_bytealign (w3[1], w3[2], offset); + w5[0] = hc_bytealign (w3[0], w3[1], offset); + w4[3] = hc_bytealign (w2[3], w3[0], offset); + w4[2] = hc_bytealign (w2[2], w2[3], offset); + w4[1] = hc_bytealign (w2[1], w2[2], offset); + w4[0] = hc_bytealign (w2[0], w2[1], offset); + w3[3] = hc_bytealign (w1[3], w2[0], offset); + w3[2] = hc_bytealign (w1[2], w1[3], offset); + w3[1] = hc_bytealign (w1[1], w1[2], offset); + w3[0] = hc_bytealign (w1[0], w1[1], offset); + w2[3] = hc_bytealign (w0[3], w1[0], offset); + w2[2] = hc_bytealign (w0[2], w0[3], offset); + w2[1] = hc_bytealign (w0[1], w0[2], offset); + w2[0] = hc_bytealign (w0[0], w0[1], offset); + w1[3] = hc_bytealign ( 0, w0[0], offset); w1[2] = 0; w1[1] = 0; w1[0] = 0; @@ -6234,30 +6234,30 @@ DECLSPEC void switch_buffer_by_offset_8x4_be (u32x *w0, u32x *w1, u32x *w2, u32x break; case 8: - w7[3] = amd_bytealign (w5[2], w5[3], offset); - w7[2] = amd_bytealign (w5[1], w5[2], offset); - w7[1] = amd_bytealign (w5[0], w5[1], offset); - w7[0] = amd_bytealign (w4[3], w5[0], offset); - w6[3] = amd_bytealign (w4[2], w4[3], offset); - w6[2] = amd_bytealign (w4[1], w4[2], offset); - w6[1] = amd_bytealign (w4[0], w4[1], offset); - w6[0] = amd_bytealign (w3[3], w4[0], offset); - w5[3] = amd_bytealign (w3[2], w3[3], offset); - w5[2] = amd_bytealign (w3[1], w3[2], offset); - w5[1] = amd_bytealign (w3[0], w3[1], offset); - w5[0] = amd_bytealign (w2[3], w3[0], offset); - w4[3] = amd_bytealign (w2[2], w2[3], offset); - w4[2] = amd_bytealign (w2[1], w2[2], offset); - w4[1] = amd_bytealign (w2[0], w2[1], offset); - w4[0] = amd_bytealign (w1[3], w2[0], offset); - w3[3] = amd_bytealign (w1[2], w1[3], offset); - w3[2] = amd_bytealign (w1[1], w1[2], offset); - w3[1] = amd_bytealign (w1[0], w1[1], offset); - w3[0] = amd_bytealign (w0[3], w1[0], offset); - w2[3] = amd_bytealign (w0[2], w0[3], offset); - w2[2] = amd_bytealign (w0[1], w0[2], offset); - w2[1] = amd_bytealign (w0[0], w0[1], offset); - w2[0] = amd_bytealign ( 0, w0[0], offset); + w7[3] = hc_bytealign (w5[2], w5[3], offset); + w7[2] = hc_bytealign (w5[1], w5[2], offset); + w7[1] = hc_bytealign (w5[0], w5[1], offset); + w7[0] = hc_bytealign (w4[3], w5[0], offset); + w6[3] = hc_bytealign (w4[2], w4[3], offset); + w6[2] = hc_bytealign (w4[1], w4[2], offset); + w6[1] = hc_bytealign (w4[0], w4[1], offset); + w6[0] = hc_bytealign (w3[3], w4[0], offset); + w5[3] = hc_bytealign (w3[2], w3[3], offset); + w5[2] = hc_bytealign (w3[1], w3[2], offset); + w5[1] = hc_bytealign (w3[0], w3[1], offset); + w5[0] = hc_bytealign (w2[3], w3[0], offset); + w4[3] = hc_bytealign (w2[2], w2[3], offset); + w4[2] = hc_bytealign (w2[1], w2[2], offset); + w4[1] = hc_bytealign (w2[0], w2[1], offset); + w4[0] = hc_bytealign (w1[3], w2[0], offset); + w3[3] = hc_bytealign (w1[2], w1[3], offset); + w3[2] = hc_bytealign (w1[1], w1[2], offset); + w3[1] = hc_bytealign (w1[0], w1[1], offset); + w3[0] = hc_bytealign (w0[3], w1[0], offset); + w2[3] = hc_bytealign (w0[2], w0[3], offset); + w2[2] = hc_bytealign (w0[1], w0[2], offset); + w2[1] = hc_bytealign (w0[0], w0[1], offset); + w2[0] = hc_bytealign ( 0, w0[0], offset); w1[3] = 0; w1[2] = 0; w1[1] = 0; @@ -6270,29 +6270,29 @@ DECLSPEC void switch_buffer_by_offset_8x4_be (u32x *w0, u32x *w1, u32x *w2, u32x break; case 9: - w7[3] = amd_bytealign (w5[1], w5[2], offset); - w7[2] = amd_bytealign (w5[0], w5[1], offset); - w7[1] = amd_bytealign (w4[3], w5[0], offset); - w7[0] = amd_bytealign (w4[2], w4[3], offset); - w6[3] = amd_bytealign (w4[1], w4[2], offset); - w6[2] = amd_bytealign (w4[0], w4[1], offset); - w6[1] = amd_bytealign (w3[3], w4[0], offset); - w6[0] = amd_bytealign (w3[2], w3[3], offset); - w5[3] = amd_bytealign (w3[1], w3[2], offset); - w5[2] = amd_bytealign (w3[0], w3[1], offset); - w5[1] = amd_bytealign (w2[3], w3[0], offset); - w5[0] = amd_bytealign (w2[2], w2[3], offset); - w4[3] = amd_bytealign (w2[1], w2[2], offset); - w4[2] = amd_bytealign (w2[0], w2[1], offset); - w4[1] = amd_bytealign (w1[3], w2[0], offset); - w4[0] = amd_bytealign (w1[2], w1[3], offset); - w3[3] = amd_bytealign (w1[1], w1[2], offset); - w3[2] = amd_bytealign (w1[0], w1[1], offset); - w3[1] = amd_bytealign (w0[3], w1[0], offset); - w3[0] = amd_bytealign (w0[2], w0[3], offset); - w2[3] = amd_bytealign (w0[1], w0[2], offset); - w2[2] = amd_bytealign (w0[0], w0[1], offset); - w2[1] = amd_bytealign ( 0, w0[0], offset); + w7[3] = hc_bytealign (w5[1], w5[2], offset); + w7[2] = hc_bytealign (w5[0], w5[1], offset); + w7[1] = hc_bytealign (w4[3], w5[0], offset); + w7[0] = hc_bytealign (w4[2], w4[3], offset); + w6[3] = hc_bytealign (w4[1], w4[2], offset); + w6[2] = hc_bytealign (w4[0], w4[1], offset); + w6[1] = hc_bytealign (w3[3], w4[0], offset); + w6[0] = hc_bytealign (w3[2], w3[3], offset); + w5[3] = hc_bytealign (w3[1], w3[2], offset); + w5[2] = hc_bytealign (w3[0], w3[1], offset); + w5[1] = hc_bytealign (w2[3], w3[0], offset); + w5[0] = hc_bytealign (w2[2], w2[3], offset); + w4[3] = hc_bytealign (w2[1], w2[2], offset); + w4[2] = hc_bytealign (w2[0], w2[1], offset); + w4[1] = hc_bytealign (w1[3], w2[0], offset); + w4[0] = hc_bytealign (w1[2], w1[3], offset); + w3[3] = hc_bytealign (w1[1], w1[2], offset); + w3[2] = hc_bytealign (w1[0], w1[1], offset); + w3[1] = hc_bytealign (w0[3], w1[0], offset); + w3[0] = hc_bytealign (w0[2], w0[3], offset); + w2[3] = hc_bytealign (w0[1], w0[2], offset); + w2[2] = hc_bytealign (w0[0], w0[1], offset); + w2[1] = hc_bytealign ( 0, w0[0], offset); w2[0] = 0; w1[3] = 0; w1[2] = 0; @@ -6306,28 +6306,28 @@ DECLSPEC void switch_buffer_by_offset_8x4_be (u32x *w0, u32x *w1, u32x *w2, u32x break; case 10: - w7[3] = amd_bytealign (w5[0], w5[1], offset); - w7[2] = amd_bytealign (w4[3], w5[0], offset); - w7[1] = amd_bytealign (w4[2], w4[3], offset); - w7[0] = amd_bytealign (w4[1], w4[2], offset); - w6[3] = amd_bytealign (w4[0], w4[1], offset); - w6[2] = amd_bytealign (w3[3], w4[0], offset); - w6[1] = amd_bytealign (w3[2], w3[3], offset); - w6[0] = amd_bytealign (w3[1], w3[2], offset); - w5[3] = amd_bytealign (w3[0], w3[1], offset); - w5[2] = amd_bytealign (w2[3], w3[0], offset); - w5[1] = amd_bytealign (w2[2], w2[3], offset); - w5[0] = amd_bytealign (w2[1], w2[2], offset); - w4[3] = amd_bytealign (w2[0], w2[1], offset); - w4[2] = amd_bytealign (w1[3], w2[0], offset); - w4[1] = amd_bytealign (w1[2], w1[3], offset); - w4[0] = amd_bytealign (w1[1], w1[2], offset); - w3[3] = amd_bytealign (w1[0], w1[1], offset); - w3[2] = amd_bytealign (w0[3], w1[0], offset); - w3[1] = amd_bytealign (w0[2], w0[3], offset); - w3[0] = amd_bytealign (w0[1], w0[2], offset); - w2[3] = amd_bytealign (w0[0], w0[1], offset); - w2[2] = amd_bytealign ( 0, w0[0], offset); + w7[3] = hc_bytealign (w5[0], w5[1], offset); + w7[2] = hc_bytealign (w4[3], w5[0], offset); + w7[1] = hc_bytealign (w4[2], w4[3], offset); + w7[0] = hc_bytealign (w4[1], w4[2], offset); + w6[3] = hc_bytealign (w4[0], w4[1], offset); + w6[2] = hc_bytealign (w3[3], w4[0], offset); + w6[1] = hc_bytealign (w3[2], w3[3], offset); + w6[0] = hc_bytealign (w3[1], w3[2], offset); + w5[3] = hc_bytealign (w3[0], w3[1], offset); + w5[2] = hc_bytealign (w2[3], w3[0], offset); + w5[1] = hc_bytealign (w2[2], w2[3], offset); + w5[0] = hc_bytealign (w2[1], w2[2], offset); + w4[3] = hc_bytealign (w2[0], w2[1], offset); + w4[2] = hc_bytealign (w1[3], w2[0], offset); + w4[1] = hc_bytealign (w1[2], w1[3], offset); + w4[0] = hc_bytealign (w1[1], w1[2], offset); + w3[3] = hc_bytealign (w1[0], w1[1], offset); + w3[2] = hc_bytealign (w0[3], w1[0], offset); + w3[1] = hc_bytealign (w0[2], w0[3], offset); + w3[0] = hc_bytealign (w0[1], w0[2], offset); + w2[3] = hc_bytealign (w0[0], w0[1], offset); + w2[2] = hc_bytealign ( 0, w0[0], offset); w2[1] = 0; w2[0] = 0; w1[3] = 0; @@ -6342,27 +6342,27 @@ DECLSPEC void switch_buffer_by_offset_8x4_be (u32x *w0, u32x *w1, u32x *w2, u32x break; case 11: - w7[3] = amd_bytealign (w4[3], w5[0], offset); - w7[2] = amd_bytealign (w4[2], w4[3], offset); - w7[1] = amd_bytealign (w4[1], w4[2], offset); - w7[0] = amd_bytealign (w4[0], w4[1], offset); - w6[3] = amd_bytealign (w3[3], w4[0], offset); - w6[2] = amd_bytealign (w3[2], w3[3], offset); - w6[1] = amd_bytealign (w3[1], w3[2], offset); - w6[0] = amd_bytealign (w3[0], w3[1], offset); - w5[3] = amd_bytealign (w2[3], w3[0], offset); - w5[2] = amd_bytealign (w2[2], w2[3], offset); - w5[1] = amd_bytealign (w2[1], w2[2], offset); - w5[0] = amd_bytealign (w2[0], w2[1], offset); - w4[3] = amd_bytealign (w1[3], w2[0], offset); - w4[2] = amd_bytealign (w1[2], w1[3], offset); - w4[1] = amd_bytealign (w1[1], w1[2], offset); - w4[0] = amd_bytealign (w1[0], w1[1], offset); - w3[3] = amd_bytealign (w0[3], w1[0], offset); - w3[2] = amd_bytealign (w0[2], w0[3], offset); - w3[1] = amd_bytealign (w0[1], w0[2], offset); - w3[0] = amd_bytealign (w0[0], w0[1], offset); - w2[3] = amd_bytealign ( 0, w0[0], offset); + w7[3] = hc_bytealign (w4[3], w5[0], offset); + w7[2] = hc_bytealign (w4[2], w4[3], offset); + w7[1] = hc_bytealign (w4[1], w4[2], offset); + w7[0] = hc_bytealign (w4[0], w4[1], offset); + w6[3] = hc_bytealign (w3[3], w4[0], offset); + w6[2] = hc_bytealign (w3[2], w3[3], offset); + w6[1] = hc_bytealign (w3[1], w3[2], offset); + w6[0] = hc_bytealign (w3[0], w3[1], offset); + w5[3] = hc_bytealign (w2[3], w3[0], offset); + w5[2] = hc_bytealign (w2[2], w2[3], offset); + w5[1] = hc_bytealign (w2[1], w2[2], offset); + w5[0] = hc_bytealign (w2[0], w2[1], offset); + w4[3] = hc_bytealign (w1[3], w2[0], offset); + w4[2] = hc_bytealign (w1[2], w1[3], offset); + w4[1] = hc_bytealign (w1[1], w1[2], offset); + w4[0] = hc_bytealign (w1[0], w1[1], offset); + w3[3] = hc_bytealign (w0[3], w1[0], offset); + w3[2] = hc_bytealign (w0[2], w0[3], offset); + w3[1] = hc_bytealign (w0[1], w0[2], offset); + w3[0] = hc_bytealign (w0[0], w0[1], offset); + w2[3] = hc_bytealign ( 0, w0[0], offset); w2[2] = 0; w2[1] = 0; w2[0] = 0; @@ -6378,26 +6378,26 @@ DECLSPEC void switch_buffer_by_offset_8x4_be (u32x *w0, u32x *w1, u32x *w2, u32x break; case 12: - w7[3] = amd_bytealign (w4[2], w4[3], offset); - w7[2] = amd_bytealign (w4[1], w4[2], offset); - w7[1] = amd_bytealign (w4[0], w4[1], offset); - w7[0] = amd_bytealign (w3[3], w4[0], offset); - w6[3] = amd_bytealign (w3[2], w3[3], offset); - w6[2] = amd_bytealign (w3[1], w3[2], offset); - w6[1] = amd_bytealign (w3[0], w3[1], offset); - w6[0] = amd_bytealign (w2[3], w3[0], offset); - w5[3] = amd_bytealign (w2[2], w2[3], offset); - w5[2] = amd_bytealign (w2[1], w2[2], offset); - w5[1] = amd_bytealign (w2[0], w2[1], offset); - w5[0] = amd_bytealign (w1[3], w2[0], offset); - w4[3] = amd_bytealign (w1[2], w1[3], offset); - w4[2] = amd_bytealign (w1[1], w1[2], offset); - w4[1] = amd_bytealign (w1[0], w1[1], offset); - w4[0] = amd_bytealign (w0[3], w1[0], offset); - w3[3] = amd_bytealign (w0[2], w0[3], offset); - w3[2] = amd_bytealign (w0[1], w0[2], offset); - w3[1] = amd_bytealign (w0[0], w0[1], offset); - w3[0] = amd_bytealign ( 0, w0[0], offset); + w7[3] = hc_bytealign (w4[2], w4[3], offset); + w7[2] = hc_bytealign (w4[1], w4[2], offset); + w7[1] = hc_bytealign (w4[0], w4[1], offset); + w7[0] = hc_bytealign (w3[3], w4[0], offset); + w6[3] = hc_bytealign (w3[2], w3[3], offset); + w6[2] = hc_bytealign (w3[1], w3[2], offset); + w6[1] = hc_bytealign (w3[0], w3[1], offset); + w6[0] = hc_bytealign (w2[3], w3[0], offset); + w5[3] = hc_bytealign (w2[2], w2[3], offset); + w5[2] = hc_bytealign (w2[1], w2[2], offset); + w5[1] = hc_bytealign (w2[0], w2[1], offset); + w5[0] = hc_bytealign (w1[3], w2[0], offset); + w4[3] = hc_bytealign (w1[2], w1[3], offset); + w4[2] = hc_bytealign (w1[1], w1[2], offset); + w4[1] = hc_bytealign (w1[0], w1[1], offset); + w4[0] = hc_bytealign (w0[3], w1[0], offset); + w3[3] = hc_bytealign (w0[2], w0[3], offset); + w3[2] = hc_bytealign (w0[1], w0[2], offset); + w3[1] = hc_bytealign (w0[0], w0[1], offset); + w3[0] = hc_bytealign ( 0, w0[0], offset); w2[3] = 0; w2[2] = 0; w2[1] = 0; @@ -6414,25 +6414,25 @@ DECLSPEC void switch_buffer_by_offset_8x4_be (u32x *w0, u32x *w1, u32x *w2, u32x break; case 13: - w7[3] = amd_bytealign (w4[1], w4[2], offset); - w7[2] = amd_bytealign (w4[0], w4[1], offset); - w7[1] = amd_bytealign (w3[3], w4[0], offset); - w7[0] = amd_bytealign (w3[2], w3[3], offset); - w6[3] = amd_bytealign (w3[1], w3[2], offset); - w6[2] = amd_bytealign (w3[0], w3[1], offset); - w6[1] = amd_bytealign (w2[3], w3[0], offset); - w6[0] = amd_bytealign (w2[2], w2[3], offset); - w5[3] = amd_bytealign (w2[1], w2[2], offset); - w5[2] = amd_bytealign (w2[0], w2[1], offset); - w5[1] = amd_bytealign (w1[3], w2[0], offset); - w5[0] = amd_bytealign (w1[2], w1[3], offset); - w4[3] = amd_bytealign (w1[1], w1[2], offset); - w4[2] = amd_bytealign (w1[0], w1[1], offset); - w4[1] = amd_bytealign (w0[3], w1[0], offset); - w4[0] = amd_bytealign (w0[2], w0[3], offset); - w3[3] = amd_bytealign (w0[1], w0[2], offset); - w3[2] = amd_bytealign (w0[0], w0[1], offset); - w3[1] = amd_bytealign ( 0, w0[0], offset); + w7[3] = hc_bytealign (w4[1], w4[2], offset); + w7[2] = hc_bytealign (w4[0], w4[1], offset); + w7[1] = hc_bytealign (w3[3], w4[0], offset); + w7[0] = hc_bytealign (w3[2], w3[3], offset); + w6[3] = hc_bytealign (w3[1], w3[2], offset); + w6[2] = hc_bytealign (w3[0], w3[1], offset); + w6[1] = hc_bytealign (w2[3], w3[0], offset); + w6[0] = hc_bytealign (w2[2], w2[3], offset); + w5[3] = hc_bytealign (w2[1], w2[2], offset); + w5[2] = hc_bytealign (w2[0], w2[1], offset); + w5[1] = hc_bytealign (w1[3], w2[0], offset); + w5[0] = hc_bytealign (w1[2], w1[3], offset); + w4[3] = hc_bytealign (w1[1], w1[2], offset); + w4[2] = hc_bytealign (w1[0], w1[1], offset); + w4[1] = hc_bytealign (w0[3], w1[0], offset); + w4[0] = hc_bytealign (w0[2], w0[3], offset); + w3[3] = hc_bytealign (w0[1], w0[2], offset); + w3[2] = hc_bytealign (w0[0], w0[1], offset); + w3[1] = hc_bytealign ( 0, w0[0], offset); w3[0] = 0; w2[3] = 0; w2[2] = 0; @@ -6450,24 +6450,24 @@ DECLSPEC void switch_buffer_by_offset_8x4_be (u32x *w0, u32x *w1, u32x *w2, u32x break; case 14: - w7[3] = amd_bytealign (w4[0], w4[1], offset); - w7[2] = amd_bytealign (w3[3], w4[0], offset); - w7[1] = amd_bytealign (w3[2], w3[3], offset); - w7[0] = amd_bytealign (w3[1], w3[2], offset); - w6[3] = amd_bytealign (w3[0], w3[1], offset); - w6[2] = amd_bytealign (w2[3], w3[0], offset); - w6[1] = amd_bytealign (w2[2], w2[3], offset); - w6[0] = amd_bytealign (w2[1], w2[2], offset); - w5[3] = amd_bytealign (w2[0], w2[1], offset); - w5[2] = amd_bytealign (w1[3], w2[0], offset); - w5[1] = amd_bytealign (w1[2], w1[3], offset); - w5[0] = amd_bytealign (w1[1], w1[2], offset); - w4[3] = amd_bytealign (w1[0], w1[1], offset); - w4[2] = amd_bytealign (w0[3], w1[0], offset); - w4[1] = amd_bytealign (w0[2], w0[3], offset); - w4[0] = amd_bytealign (w0[1], w0[2], offset); - w3[3] = amd_bytealign (w0[0], w0[1], offset); - w3[2] = amd_bytealign ( 0, w0[0], offset); + w7[3] = hc_bytealign (w4[0], w4[1], offset); + w7[2] = hc_bytealign (w3[3], w4[0], offset); + w7[1] = hc_bytealign (w3[2], w3[3], offset); + w7[0] = hc_bytealign (w3[1], w3[2], offset); + w6[3] = hc_bytealign (w3[0], w3[1], offset); + w6[2] = hc_bytealign (w2[3], w3[0], offset); + w6[1] = hc_bytealign (w2[2], w2[3], offset); + w6[0] = hc_bytealign (w2[1], w2[2], offset); + w5[3] = hc_bytealign (w2[0], w2[1], offset); + w5[2] = hc_bytealign (w1[3], w2[0], offset); + w5[1] = hc_bytealign (w1[2], w1[3], offset); + w5[0] = hc_bytealign (w1[1], w1[2], offset); + w4[3] = hc_bytealign (w1[0], w1[1], offset); + w4[2] = hc_bytealign (w0[3], w1[0], offset); + w4[1] = hc_bytealign (w0[2], w0[3], offset); + w4[0] = hc_bytealign (w0[1], w0[2], offset); + w3[3] = hc_bytealign (w0[0], w0[1], offset); + w3[2] = hc_bytealign ( 0, w0[0], offset); w3[1] = 0; w3[0] = 0; w2[3] = 0; @@ -6486,23 +6486,23 @@ DECLSPEC void switch_buffer_by_offset_8x4_be (u32x *w0, u32x *w1, u32x *w2, u32x break; case 15: - w7[3] = amd_bytealign (w3[3], w4[0], offset); - w7[2] = amd_bytealign (w3[2], w3[3], offset); - w7[1] = amd_bytealign (w3[1], w3[2], offset); - w7[0] = amd_bytealign (w3[0], w3[1], offset); - w6[3] = amd_bytealign (w2[3], w3[0], offset); - w6[2] = amd_bytealign (w2[2], w2[3], offset); - w6[1] = amd_bytealign (w2[1], w2[2], offset); - w6[0] = amd_bytealign (w2[0], w2[1], offset); - w5[3] = amd_bytealign (w1[3], w2[0], offset); - w5[2] = amd_bytealign (w1[2], w1[3], offset); - w5[1] = amd_bytealign (w1[1], w1[2], offset); - w5[0] = amd_bytealign (w1[0], w1[1], offset); - w4[3] = amd_bytealign (w0[3], w1[0], offset); - w4[2] = amd_bytealign (w0[2], w0[3], offset); - w4[1] = amd_bytealign (w0[1], w0[2], offset); - w4[0] = amd_bytealign (w0[0], w0[1], offset); - w3[3] = amd_bytealign ( 0, w0[0], offset); + w7[3] = hc_bytealign (w3[3], w4[0], offset); + w7[2] = hc_bytealign (w3[2], w3[3], offset); + w7[1] = hc_bytealign (w3[1], w3[2], offset); + w7[0] = hc_bytealign (w3[0], w3[1], offset); + w6[3] = hc_bytealign (w2[3], w3[0], offset); + w6[2] = hc_bytealign (w2[2], w2[3], offset); + w6[1] = hc_bytealign (w2[1], w2[2], offset); + w6[0] = hc_bytealign (w2[0], w2[1], offset); + w5[3] = hc_bytealign (w1[3], w2[0], offset); + w5[2] = hc_bytealign (w1[2], w1[3], offset); + w5[1] = hc_bytealign (w1[1], w1[2], offset); + w5[0] = hc_bytealign (w1[0], w1[1], offset); + w4[3] = hc_bytealign (w0[3], w1[0], offset); + w4[2] = hc_bytealign (w0[2], w0[3], offset); + w4[1] = hc_bytealign (w0[1], w0[2], offset); + w4[0] = hc_bytealign (w0[0], w0[1], offset); + w3[3] = hc_bytealign ( 0, w0[0], offset); w3[2] = 0; w3[1] = 0; w3[0] = 0; @@ -6522,22 +6522,22 @@ DECLSPEC void switch_buffer_by_offset_8x4_be (u32x *w0, u32x *w1, u32x *w2, u32x break; case 16: - w7[3] = amd_bytealign (w3[2], w3[3], offset); - w7[2] = amd_bytealign (w3[1], w3[2], offset); - w7[1] = amd_bytealign (w3[0], w3[1], offset); - w7[0] = amd_bytealign (w2[3], w3[0], offset); - w6[3] = amd_bytealign (w2[2], w2[3], offset); - w6[2] = amd_bytealign (w2[1], w2[2], offset); - w6[1] = amd_bytealign (w2[0], w2[1], offset); - w6[0] = amd_bytealign (w1[3], w2[0], offset); - w5[3] = amd_bytealign (w1[2], w1[3], offset); - w5[2] = amd_bytealign (w1[1], w1[2], offset); - w5[1] = amd_bytealign (w1[0], w1[1], offset); - w5[0] = amd_bytealign (w0[3], w1[0], offset); - w4[3] = amd_bytealign (w0[2], w0[3], offset); - w4[2] = amd_bytealign (w0[1], w0[2], offset); - w4[1] = amd_bytealign (w0[0], w0[1], offset); - w4[0] = amd_bytealign ( 0, w0[0], offset); + w7[3] = hc_bytealign (w3[2], w3[3], offset); + w7[2] = hc_bytealign (w3[1], w3[2], offset); + w7[1] = hc_bytealign (w3[0], w3[1], offset); + w7[0] = hc_bytealign (w2[3], w3[0], offset); + w6[3] = hc_bytealign (w2[2], w2[3], offset); + w6[2] = hc_bytealign (w2[1], w2[2], offset); + w6[1] = hc_bytealign (w2[0], w2[1], offset); + w6[0] = hc_bytealign (w1[3], w2[0], offset); + w5[3] = hc_bytealign (w1[2], w1[3], offset); + w5[2] = hc_bytealign (w1[1], w1[2], offset); + w5[1] = hc_bytealign (w1[0], w1[1], offset); + w5[0] = hc_bytealign (w0[3], w1[0], offset); + w4[3] = hc_bytealign (w0[2], w0[3], offset); + w4[2] = hc_bytealign (w0[1], w0[2], offset); + w4[1] = hc_bytealign (w0[0], w0[1], offset); + w4[0] = hc_bytealign ( 0, w0[0], offset); w3[3] = 0; w3[2] = 0; w3[1] = 0; @@ -6558,21 +6558,21 @@ DECLSPEC void switch_buffer_by_offset_8x4_be (u32x *w0, u32x *w1, u32x *w2, u32x break; case 17: - w7[3] = amd_bytealign (w3[1], w3[2], offset); - w7[2] = amd_bytealign (w3[0], w3[1], offset); - w7[1] = amd_bytealign (w2[3], w3[0], offset); - w7[0] = amd_bytealign (w2[2], w2[3], offset); - w6[3] = amd_bytealign (w2[1], w2[2], offset); - w6[2] = amd_bytealign (w2[0], w2[1], offset); - w6[1] = amd_bytealign (w1[3], w2[0], offset); - w6[0] = amd_bytealign (w1[2], w1[3], offset); - w5[3] = amd_bytealign (w1[1], w1[2], offset); - w5[2] = amd_bytealign (w1[0], w1[1], offset); - w5[1] = amd_bytealign (w0[3], w1[0], offset); - w5[0] = amd_bytealign (w0[2], w0[3], offset); - w4[3] = amd_bytealign (w0[1], w0[2], offset); - w4[2] = amd_bytealign (w0[0], w0[1], offset); - w4[1] = amd_bytealign ( 0, w0[0], offset); + w7[3] = hc_bytealign (w3[1], w3[2], offset); + w7[2] = hc_bytealign (w3[0], w3[1], offset); + w7[1] = hc_bytealign (w2[3], w3[0], offset); + w7[0] = hc_bytealign (w2[2], w2[3], offset); + w6[3] = hc_bytealign (w2[1], w2[2], offset); + w6[2] = hc_bytealign (w2[0], w2[1], offset); + w6[1] = hc_bytealign (w1[3], w2[0], offset); + w6[0] = hc_bytealign (w1[2], w1[3], offset); + w5[3] = hc_bytealign (w1[1], w1[2], offset); + w5[2] = hc_bytealign (w1[0], w1[1], offset); + w5[1] = hc_bytealign (w0[3], w1[0], offset); + w5[0] = hc_bytealign (w0[2], w0[3], offset); + w4[3] = hc_bytealign (w0[1], w0[2], offset); + w4[2] = hc_bytealign (w0[0], w0[1], offset); + w4[1] = hc_bytealign ( 0, w0[0], offset); w4[0] = 0; w3[3] = 0; w3[2] = 0; @@ -6594,20 +6594,20 @@ DECLSPEC void switch_buffer_by_offset_8x4_be (u32x *w0, u32x *w1, u32x *w2, u32x break; case 18: - w7[3] = amd_bytealign (w3[0], w3[1], offset); - w7[2] = amd_bytealign (w2[3], w3[0], offset); - w7[1] = amd_bytealign (w2[2], w2[3], offset); - w7[0] = amd_bytealign (w2[1], w2[2], offset); - w6[3] = amd_bytealign (w2[0], w2[1], offset); - w6[2] = amd_bytealign (w1[3], w2[0], offset); - w6[1] = amd_bytealign (w1[2], w1[3], offset); - w6[0] = amd_bytealign (w1[1], w1[2], offset); - w5[3] = amd_bytealign (w1[0], w1[1], offset); - w5[2] = amd_bytealign (w0[3], w1[0], offset); - w5[1] = amd_bytealign (w0[2], w0[3], offset); - w5[0] = amd_bytealign (w0[1], w0[2], offset); - w4[3] = amd_bytealign (w0[0], w0[1], offset); - w4[2] = amd_bytealign ( 0, w0[0], offset); + w7[3] = hc_bytealign (w3[0], w3[1], offset); + w7[2] = hc_bytealign (w2[3], w3[0], offset); + w7[1] = hc_bytealign (w2[2], w2[3], offset); + w7[0] = hc_bytealign (w2[1], w2[2], offset); + w6[3] = hc_bytealign (w2[0], w2[1], offset); + w6[2] = hc_bytealign (w1[3], w2[0], offset); + w6[1] = hc_bytealign (w1[2], w1[3], offset); + w6[0] = hc_bytealign (w1[1], w1[2], offset); + w5[3] = hc_bytealign (w1[0], w1[1], offset); + w5[2] = hc_bytealign (w0[3], w1[0], offset); + w5[1] = hc_bytealign (w0[2], w0[3], offset); + w5[0] = hc_bytealign (w0[1], w0[2], offset); + w4[3] = hc_bytealign (w0[0], w0[1], offset); + w4[2] = hc_bytealign ( 0, w0[0], offset); w4[1] = 0; w4[0] = 0; w3[3] = 0; @@ -6630,19 +6630,19 @@ DECLSPEC void switch_buffer_by_offset_8x4_be (u32x *w0, u32x *w1, u32x *w2, u32x break; case 19: - w7[3] = amd_bytealign (w2[3], w3[0], offset); - w7[2] = amd_bytealign (w2[2], w2[3], offset); - w7[1] = amd_bytealign (w2[1], w2[2], offset); - w7[0] = amd_bytealign (w2[0], w2[1], offset); - w6[3] = amd_bytealign (w1[3], w2[0], offset); - w6[2] = amd_bytealign (w1[2], w1[3], offset); - w6[1] = amd_bytealign (w1[1], w1[2], offset); - w6[0] = amd_bytealign (w1[0], w1[1], offset); - w5[3] = amd_bytealign (w0[3], w1[0], offset); - w5[2] = amd_bytealign (w0[2], w0[3], offset); - w5[1] = amd_bytealign (w0[1], w0[2], offset); - w5[0] = amd_bytealign (w0[0], w0[1], offset); - w4[3] = amd_bytealign ( 0, w0[0], offset); + w7[3] = hc_bytealign (w2[3], w3[0], offset); + w7[2] = hc_bytealign (w2[2], w2[3], offset); + w7[1] = hc_bytealign (w2[1], w2[2], offset); + w7[0] = hc_bytealign (w2[0], w2[1], offset); + w6[3] = hc_bytealign (w1[3], w2[0], offset); + w6[2] = hc_bytealign (w1[2], w1[3], offset); + w6[1] = hc_bytealign (w1[1], w1[2], offset); + w6[0] = hc_bytealign (w1[0], w1[1], offset); + w5[3] = hc_bytealign (w0[3], w1[0], offset); + w5[2] = hc_bytealign (w0[2], w0[3], offset); + w5[1] = hc_bytealign (w0[1], w0[2], offset); + w5[0] = hc_bytealign (w0[0], w0[1], offset); + w4[3] = hc_bytealign ( 0, w0[0], offset); w4[2] = 0; w4[1] = 0; w4[0] = 0; @@ -6666,18 +6666,18 @@ DECLSPEC void switch_buffer_by_offset_8x4_be (u32x *w0, u32x *w1, u32x *w2, u32x break; case 20: - w7[3] = amd_bytealign (w2[2], w2[3], offset); - w7[2] = amd_bytealign (w2[1], w2[2], offset); - w7[1] = amd_bytealign (w2[0], w2[1], offset); - w7[0] = amd_bytealign (w1[3], w2[0], offset); - w6[3] = amd_bytealign (w1[2], w1[3], offset); - w6[2] = amd_bytealign (w1[1], w1[2], offset); - w6[1] = amd_bytealign (w1[0], w1[1], offset); - w6[0] = amd_bytealign (w0[3], w1[0], offset); - w5[3] = amd_bytealign (w0[2], w0[3], offset); - w5[2] = amd_bytealign (w0[1], w0[2], offset); - w5[1] = amd_bytealign (w0[0], w0[1], offset); - w5[0] = amd_bytealign ( 0, w0[0], offset); + w7[3] = hc_bytealign (w2[2], w2[3], offset); + w7[2] = hc_bytealign (w2[1], w2[2], offset); + w7[1] = hc_bytealign (w2[0], w2[1], offset); + w7[0] = hc_bytealign (w1[3], w2[0], offset); + w6[3] = hc_bytealign (w1[2], w1[3], offset); + w6[2] = hc_bytealign (w1[1], w1[2], offset); + w6[1] = hc_bytealign (w1[0], w1[1], offset); + w6[0] = hc_bytealign (w0[3], w1[0], offset); + w5[3] = hc_bytealign (w0[2], w0[3], offset); + w5[2] = hc_bytealign (w0[1], w0[2], offset); + w5[1] = hc_bytealign (w0[0], w0[1], offset); + w5[0] = hc_bytealign ( 0, w0[0], offset); w4[3] = 0; w4[2] = 0; w4[1] = 0; @@ -6702,17 +6702,17 @@ DECLSPEC void switch_buffer_by_offset_8x4_be (u32x *w0, u32x *w1, u32x *w2, u32x break; case 21: - w7[3] = amd_bytealign (w2[1], w2[2], offset); - w7[2] = amd_bytealign (w2[0], w2[1], offset); - w7[1] = amd_bytealign (w1[3], w2[0], offset); - w7[0] = amd_bytealign (w1[2], w1[3], offset); - w6[3] = amd_bytealign (w1[1], w1[2], offset); - w6[2] = amd_bytealign (w1[0], w1[1], offset); - w6[1] = amd_bytealign (w0[3], w1[0], offset); - w6[0] = amd_bytealign (w0[2], w0[3], offset); - w5[3] = amd_bytealign (w0[1], w0[2], offset); - w5[2] = amd_bytealign (w0[0], w0[1], offset); - w5[1] = amd_bytealign ( 0, w0[0], offset); + w7[3] = hc_bytealign (w2[1], w2[2], offset); + w7[2] = hc_bytealign (w2[0], w2[1], offset); + w7[1] = hc_bytealign (w1[3], w2[0], offset); + w7[0] = hc_bytealign (w1[2], w1[3], offset); + w6[3] = hc_bytealign (w1[1], w1[2], offset); + w6[2] = hc_bytealign (w1[0], w1[1], offset); + w6[1] = hc_bytealign (w0[3], w1[0], offset); + w6[0] = hc_bytealign (w0[2], w0[3], offset); + w5[3] = hc_bytealign (w0[1], w0[2], offset); + w5[2] = hc_bytealign (w0[0], w0[1], offset); + w5[1] = hc_bytealign ( 0, w0[0], offset); w5[0] = 0; w4[3] = 0; w4[2] = 0; @@ -6738,16 +6738,16 @@ DECLSPEC void switch_buffer_by_offset_8x4_be (u32x *w0, u32x *w1, u32x *w2, u32x break; case 22: - w7[3] = amd_bytealign (w2[0], w2[1], offset); - w7[2] = amd_bytealign (w1[3], w2[0], offset); - w7[1] = amd_bytealign (w1[2], w1[3], offset); - w7[0] = amd_bytealign (w1[1], w1[2], offset); - w6[3] = amd_bytealign (w1[0], w1[1], offset); - w6[2] = amd_bytealign (w0[3], w1[0], offset); - w6[1] = amd_bytealign (w0[2], w0[3], offset); - w6[0] = amd_bytealign (w0[1], w0[2], offset); - w5[3] = amd_bytealign (w0[0], w0[1], offset); - w5[2] = amd_bytealign ( 0, w0[0], offset); + w7[3] = hc_bytealign (w2[0], w2[1], offset); + w7[2] = hc_bytealign (w1[3], w2[0], offset); + w7[1] = hc_bytealign (w1[2], w1[3], offset); + w7[0] = hc_bytealign (w1[1], w1[2], offset); + w6[3] = hc_bytealign (w1[0], w1[1], offset); + w6[2] = hc_bytealign (w0[3], w1[0], offset); + w6[1] = hc_bytealign (w0[2], w0[3], offset); + w6[0] = hc_bytealign (w0[1], w0[2], offset); + w5[3] = hc_bytealign (w0[0], w0[1], offset); + w5[2] = hc_bytealign ( 0, w0[0], offset); w5[1] = 0; w5[0] = 0; w4[3] = 0; @@ -6774,15 +6774,15 @@ DECLSPEC void switch_buffer_by_offset_8x4_be (u32x *w0, u32x *w1, u32x *w2, u32x break; case 23: - w7[3] = amd_bytealign (w1[3], w2[0], offset); - w7[2] = amd_bytealign (w1[2], w1[3], offset); - w7[1] = amd_bytealign (w1[1], w1[2], offset); - w7[0] = amd_bytealign (w1[0], w1[1], offset); - w6[3] = amd_bytealign (w0[3], w1[0], offset); - w6[2] = amd_bytealign (w0[2], w0[3], offset); - w6[1] = amd_bytealign (w0[1], w0[2], offset); - w6[0] = amd_bytealign (w0[0], w0[1], offset); - w5[3] = amd_bytealign ( 0, w0[0], offset); + w7[3] = hc_bytealign (w1[3], w2[0], offset); + w7[2] = hc_bytealign (w1[2], w1[3], offset); + w7[1] = hc_bytealign (w1[1], w1[2], offset); + w7[0] = hc_bytealign (w1[0], w1[1], offset); + w6[3] = hc_bytealign (w0[3], w1[0], offset); + w6[2] = hc_bytealign (w0[2], w0[3], offset); + w6[1] = hc_bytealign (w0[1], w0[2], offset); + w6[0] = hc_bytealign (w0[0], w0[1], offset); + w5[3] = hc_bytealign ( 0, w0[0], offset); w5[2] = 0; w5[1] = 0; w5[0] = 0; @@ -6810,14 +6810,14 @@ DECLSPEC void switch_buffer_by_offset_8x4_be (u32x *w0, u32x *w1, u32x *w2, u32x break; case 24: - w7[3] = amd_bytealign (w1[2], w1[3], offset); - w7[2] = amd_bytealign (w1[1], w1[2], offset); - w7[1] = amd_bytealign (w1[0], w1[1], offset); - w7[0] = amd_bytealign (w0[3], w1[0], offset); - w6[3] = amd_bytealign (w0[2], w0[3], offset); - w6[2] = amd_bytealign (w0[1], w0[2], offset); - w6[1] = amd_bytealign (w0[0], w0[1], offset); - w6[0] = amd_bytealign ( 0, w0[0], offset); + w7[3] = hc_bytealign (w1[2], w1[3], offset); + w7[2] = hc_bytealign (w1[1], w1[2], offset); + w7[1] = hc_bytealign (w1[0], w1[1], offset); + w7[0] = hc_bytealign (w0[3], w1[0], offset); + w6[3] = hc_bytealign (w0[2], w0[3], offset); + w6[2] = hc_bytealign (w0[1], w0[2], offset); + w6[1] = hc_bytealign (w0[0], w0[1], offset); + w6[0] = hc_bytealign ( 0, w0[0], offset); w5[3] = 0; w5[2] = 0; w5[1] = 0; @@ -6846,13 +6846,13 @@ DECLSPEC void switch_buffer_by_offset_8x4_be (u32x *w0, u32x *w1, u32x *w2, u32x break; case 25: - w7[3] = amd_bytealign (w1[1], w1[2], offset); - w7[2] = amd_bytealign (w1[0], w1[1], offset); - w7[1] = amd_bytealign (w0[3], w1[0], offset); - w7[0] = amd_bytealign (w0[2], w0[3], offset); - w6[3] = amd_bytealign (w0[1], w0[2], offset); - w6[2] = amd_bytealign (w0[0], w0[1], offset); - w6[1] = amd_bytealign ( 0, w0[0], offset); + w7[3] = hc_bytealign (w1[1], w1[2], offset); + w7[2] = hc_bytealign (w1[0], w1[1], offset); + w7[1] = hc_bytealign (w0[3], w1[0], offset); + w7[0] = hc_bytealign (w0[2], w0[3], offset); + w6[3] = hc_bytealign (w0[1], w0[2], offset); + w6[2] = hc_bytealign (w0[0], w0[1], offset); + w6[1] = hc_bytealign ( 0, w0[0], offset); w6[0] = 0; w5[3] = 0; w5[2] = 0; @@ -6882,12 +6882,12 @@ DECLSPEC void switch_buffer_by_offset_8x4_be (u32x *w0, u32x *w1, u32x *w2, u32x break; case 26: - w7[3] = amd_bytealign (w1[0], w1[1], offset); - w7[2] = amd_bytealign (w0[3], w1[0], offset); - w7[1] = amd_bytealign (w0[2], w0[3], offset); - w7[0] = amd_bytealign (w0[1], w0[2], offset); - w6[3] = amd_bytealign (w0[0], w0[1], offset); - w6[2] = amd_bytealign ( 0, w0[0], offset); + w7[3] = hc_bytealign (w1[0], w1[1], offset); + w7[2] = hc_bytealign (w0[3], w1[0], offset); + w7[1] = hc_bytealign (w0[2], w0[3], offset); + w7[0] = hc_bytealign (w0[1], w0[2], offset); + w6[3] = hc_bytealign (w0[0], w0[1], offset); + w6[2] = hc_bytealign ( 0, w0[0], offset); w6[1] = 0; w6[0] = 0; w5[3] = 0; @@ -6918,11 +6918,11 @@ DECLSPEC void switch_buffer_by_offset_8x4_be (u32x *w0, u32x *w1, u32x *w2, u32x break; case 27: - w7[3] = amd_bytealign (w0[3], w1[0], offset); - w7[2] = amd_bytealign (w0[2], w0[3], offset); - w7[1] = amd_bytealign (w0[1], w0[2], offset); - w7[0] = amd_bytealign (w0[0], w0[1], offset); - w6[3] = amd_bytealign ( 0, w0[0], offset); + w7[3] = hc_bytealign (w0[3], w1[0], offset); + w7[2] = hc_bytealign (w0[2], w0[3], offset); + w7[1] = hc_bytealign (w0[1], w0[2], offset); + w7[0] = hc_bytealign (w0[0], w0[1], offset); + w6[3] = hc_bytealign ( 0, w0[0], offset); w6[2] = 0; w6[1] = 0; w6[0] = 0; @@ -6954,10 +6954,10 @@ DECLSPEC void switch_buffer_by_offset_8x4_be (u32x *w0, u32x *w1, u32x *w2, u32x break; case 28: - w7[3] = amd_bytealign (w0[2], w0[3], offset); - w7[2] = amd_bytealign (w0[1], w0[2], offset); - w7[1] = amd_bytealign (w0[0], w0[1], offset); - w7[0] = amd_bytealign ( 0, w0[0], offset); + w7[3] = hc_bytealign (w0[2], w0[3], offset); + w7[2] = hc_bytealign (w0[1], w0[2], offset); + w7[1] = hc_bytealign (w0[0], w0[1], offset); + w7[0] = hc_bytealign ( 0, w0[0], offset); w6[3] = 0; w6[2] = 0; w6[1] = 0; @@ -6990,9 +6990,9 @@ DECLSPEC void switch_buffer_by_offset_8x4_be (u32x *w0, u32x *w1, u32x *w2, u32x break; case 29: - w7[3] = amd_bytealign (w0[1], w0[2], offset); - w7[2] = amd_bytealign (w0[0], w0[1], offset); - w7[1] = amd_bytealign ( 0, w0[0], offset); + w7[3] = hc_bytealign (w0[1], w0[2], offset); + w7[2] = hc_bytealign (w0[0], w0[1], offset); + w7[1] = hc_bytealign ( 0, w0[0], offset); w7[0] = 0; w6[3] = 0; w6[2] = 0; @@ -7026,8 +7026,8 @@ DECLSPEC void switch_buffer_by_offset_8x4_be (u32x *w0, u32x *w1, u32x *w2, u32x break; case 30: - w7[3] = amd_bytealign (w0[0], w0[1], offset); - w7[2] = amd_bytealign ( 0, w0[0], offset); + w7[3] = hc_bytealign (w0[0], w0[1], offset); + w7[2] = hc_bytealign ( 0, w0[0], offset); w7[1] = 0; w7[0] = 0; w6[3] = 0; @@ -7062,7 +7062,7 @@ DECLSPEC void switch_buffer_by_offset_8x4_be (u32x *w0, u32x *w1, u32x *w2, u32x break; case 31: - w7[3] = amd_bytealign ( 0, w0[0], offset); + w7[3] = hc_bytealign ( 0, w0[0], offset); w7[2] = 0; w7[1] = 0; w7[0] = 0; @@ -7112,143 +7112,143 @@ DECLSPEC void switch_buffer_by_offset_8x4_be (u32x *w0, u32x *w1, u32x *w2, u32x switch (offset_switch) { case 0: - w7[3] = __byte_perm (w7[3], w7[2], selector); - w7[2] = __byte_perm (w7[2], w7[1], selector); - w7[1] = __byte_perm (w7[1], w7[0], selector); - w7[0] = __byte_perm (w7[0], w6[3], selector); - w6[3] = __byte_perm (w6[3], w6[2], selector); - w6[2] = __byte_perm (w6[2], w6[1], selector); - w6[1] = __byte_perm (w6[1], w6[0], selector); - w6[0] = __byte_perm (w6[0], w5[3], selector); - w5[3] = __byte_perm (w5[3], w5[2], selector); - w5[2] = __byte_perm (w5[2], w5[1], selector); - w5[1] = __byte_perm (w5[1], w5[0], selector); - w5[0] = __byte_perm (w5[0], w4[3], selector); - w4[3] = __byte_perm (w4[3], w4[2], selector); - w4[2] = __byte_perm (w4[2], w4[1], selector); - w4[1] = __byte_perm (w4[1], w4[0], selector); - w4[0] = __byte_perm (w4[0], w3[3], selector); - w3[3] = __byte_perm (w3[3], w3[2], selector); - w3[2] = __byte_perm (w3[2], w3[1], selector); - w3[1] = __byte_perm (w3[1], w3[0], selector); - w3[0] = __byte_perm (w3[0], w2[3], selector); - w2[3] = __byte_perm (w2[3], w2[2], selector); - w2[2] = __byte_perm (w2[2], w2[1], selector); - w2[1] = __byte_perm (w2[1], w2[0], selector); - w2[0] = __byte_perm (w2[0], w1[3], selector); - w1[3] = __byte_perm (w1[3], w1[2], selector); - w1[2] = __byte_perm (w1[2], w1[1], selector); - w1[1] = __byte_perm (w1[1], w1[0], selector); - w1[0] = __byte_perm (w1[0], w0[3], selector); - w0[3] = __byte_perm (w0[3], w0[2], selector); - w0[2] = __byte_perm (w0[2], w0[1], selector); - w0[1] = __byte_perm (w0[1], w0[0], selector); - w0[0] = __byte_perm (w0[0], 0, selector); + w7[3] = hc_byte_perm (w7[3], w7[2], selector); + w7[2] = hc_byte_perm (w7[2], w7[1], selector); + w7[1] = hc_byte_perm (w7[1], w7[0], selector); + w7[0] = hc_byte_perm (w7[0], w6[3], selector); + w6[3] = hc_byte_perm (w6[3], w6[2], selector); + w6[2] = hc_byte_perm (w6[2], w6[1], selector); + w6[1] = hc_byte_perm (w6[1], w6[0], selector); + w6[0] = hc_byte_perm (w6[0], w5[3], selector); + w5[3] = hc_byte_perm (w5[3], w5[2], selector); + w5[2] = hc_byte_perm (w5[2], w5[1], selector); + w5[1] = hc_byte_perm (w5[1], w5[0], selector); + w5[0] = hc_byte_perm (w5[0], w4[3], selector); + w4[3] = hc_byte_perm (w4[3], w4[2], selector); + w4[2] = hc_byte_perm (w4[2], w4[1], selector); + w4[1] = hc_byte_perm (w4[1], w4[0], selector); + w4[0] = hc_byte_perm (w4[0], w3[3], selector); + w3[3] = hc_byte_perm (w3[3], w3[2], selector); + w3[2] = hc_byte_perm (w3[2], w3[1], selector); + w3[1] = hc_byte_perm (w3[1], w3[0], selector); + w3[0] = hc_byte_perm (w3[0], w2[3], selector); + w2[3] = hc_byte_perm (w2[3], w2[2], selector); + w2[2] = hc_byte_perm (w2[2], w2[1], selector); + w2[1] = hc_byte_perm (w2[1], w2[0], selector); + w2[0] = hc_byte_perm (w2[0], w1[3], selector); + w1[3] = hc_byte_perm (w1[3], w1[2], selector); + w1[2] = hc_byte_perm (w1[2], w1[1], selector); + w1[1] = hc_byte_perm (w1[1], w1[0], selector); + w1[0] = hc_byte_perm (w1[0], w0[3], selector); + w0[3] = hc_byte_perm (w0[3], w0[2], selector); + w0[2] = hc_byte_perm (w0[2], w0[1], selector); + w0[1] = hc_byte_perm (w0[1], w0[0], selector); + w0[0] = hc_byte_perm (w0[0], 0, selector); break; case 1: - w7[3] = __byte_perm (w7[2], w7[1], selector); - w7[2] = __byte_perm (w7[1], w7[0], selector); - w7[1] = __byte_perm (w7[0], w6[3], selector); - w7[0] = __byte_perm (w6[3], w6[2], selector); - w6[3] = __byte_perm (w6[2], w6[1], selector); - w6[2] = __byte_perm (w6[1], w6[0], selector); - w6[1] = __byte_perm (w6[0], w5[3], selector); - w6[0] = __byte_perm (w5[3], w5[2], selector); - w5[3] = __byte_perm (w5[2], w5[1], selector); - w5[2] = __byte_perm (w5[1], w5[0], selector); - w5[1] = __byte_perm (w5[0], w4[3], selector); - w5[0] = __byte_perm (w4[3], w4[2], selector); - w4[3] = __byte_perm (w4[2], w4[1], selector); - w4[2] = __byte_perm (w4[1], w4[0], selector); - w4[1] = __byte_perm (w4[0], w3[3], selector); - w4[0] = __byte_perm (w3[3], w3[2], selector); - w3[3] = __byte_perm (w3[2], w3[1], selector); - w3[2] = __byte_perm (w3[1], w3[0], selector); - w3[1] = __byte_perm (w3[0], w2[3], selector); - w3[0] = __byte_perm (w2[3], w2[2], selector); - w2[3] = __byte_perm (w2[2], w2[1], selector); - w2[2] = __byte_perm (w2[1], w2[0], selector); - w2[1] = __byte_perm (w2[0], w1[3], selector); - w2[0] = __byte_perm (w1[3], w1[2], selector); - w1[3] = __byte_perm (w1[2], w1[1], selector); - w1[2] = __byte_perm (w1[1], w1[0], selector); - w1[1] = __byte_perm (w1[0], w0[3], selector); - w1[0] = __byte_perm (w0[3], w0[2], selector); - w0[3] = __byte_perm (w0[2], w0[1], selector); - w0[2] = __byte_perm (w0[1], w0[0], selector); - w0[1] = __byte_perm (w0[0], 0, selector); + w7[3] = hc_byte_perm (w7[2], w7[1], selector); + w7[2] = hc_byte_perm (w7[1], w7[0], selector); + w7[1] = hc_byte_perm (w7[0], w6[3], selector); + w7[0] = hc_byte_perm (w6[3], w6[2], selector); + w6[3] = hc_byte_perm (w6[2], w6[1], selector); + w6[2] = hc_byte_perm (w6[1], w6[0], selector); + w6[1] = hc_byte_perm (w6[0], w5[3], selector); + w6[0] = hc_byte_perm (w5[3], w5[2], selector); + w5[3] = hc_byte_perm (w5[2], w5[1], selector); + w5[2] = hc_byte_perm (w5[1], w5[0], selector); + w5[1] = hc_byte_perm (w5[0], w4[3], selector); + w5[0] = hc_byte_perm (w4[3], w4[2], selector); + w4[3] = hc_byte_perm (w4[2], w4[1], selector); + w4[2] = hc_byte_perm (w4[1], w4[0], selector); + w4[1] = hc_byte_perm (w4[0], w3[3], selector); + w4[0] = hc_byte_perm (w3[3], w3[2], selector); + w3[3] = hc_byte_perm (w3[2], w3[1], selector); + w3[2] = hc_byte_perm (w3[1], w3[0], selector); + w3[1] = hc_byte_perm (w3[0], w2[3], selector); + w3[0] = hc_byte_perm (w2[3], w2[2], selector); + w2[3] = hc_byte_perm (w2[2], w2[1], selector); + w2[2] = hc_byte_perm (w2[1], w2[0], selector); + w2[1] = hc_byte_perm (w2[0], w1[3], selector); + w2[0] = hc_byte_perm (w1[3], w1[2], selector); + w1[3] = hc_byte_perm (w1[2], w1[1], selector); + w1[2] = hc_byte_perm (w1[1], w1[0], selector); + w1[1] = hc_byte_perm (w1[0], w0[3], selector); + w1[0] = hc_byte_perm (w0[3], w0[2], selector); + w0[3] = hc_byte_perm (w0[2], w0[1], selector); + w0[2] = hc_byte_perm (w0[1], w0[0], selector); + w0[1] = hc_byte_perm (w0[0], 0, selector); w0[0] = 0; break; case 2: - w7[3] = __byte_perm (w7[1], w7[0], selector); - w7[2] = __byte_perm (w7[0], w6[3], selector); - w7[1] = __byte_perm (w6[3], w6[2], selector); - w7[0] = __byte_perm (w6[2], w6[1], selector); - w6[3] = __byte_perm (w6[1], w6[0], selector); - w6[2] = __byte_perm (w6[0], w5[3], selector); - w6[1] = __byte_perm (w5[3], w5[2], selector); - w6[0] = __byte_perm (w5[2], w5[1], selector); - w5[3] = __byte_perm (w5[1], w5[0], selector); - w5[2] = __byte_perm (w5[0], w4[3], selector); - w5[1] = __byte_perm (w4[3], w4[2], selector); - w5[0] = __byte_perm (w4[2], w4[1], selector); - w4[3] = __byte_perm (w4[1], w4[0], selector); - w4[2] = __byte_perm (w4[0], w3[3], selector); - w4[1] = __byte_perm (w3[3], w3[2], selector); - w4[0] = __byte_perm (w3[2], w3[1], selector); - w3[3] = __byte_perm (w3[1], w3[0], selector); - w3[2] = __byte_perm (w3[0], w2[3], selector); - w3[1] = __byte_perm (w2[3], w2[2], selector); - w3[0] = __byte_perm (w2[2], w2[1], selector); - w2[3] = __byte_perm (w2[1], w2[0], selector); - w2[2] = __byte_perm (w2[0], w1[3], selector); - w2[1] = __byte_perm (w1[3], w1[2], selector); - w2[0] = __byte_perm (w1[2], w1[1], selector); - w1[3] = __byte_perm (w1[1], w1[0], selector); - w1[2] = __byte_perm (w1[0], w0[3], selector); - w1[1] = __byte_perm (w0[3], w0[2], selector); - w1[0] = __byte_perm (w0[2], w0[1], selector); - w0[3] = __byte_perm (w0[1], w0[0], selector); - w0[2] = __byte_perm (w0[0], 0, selector); + w7[3] = hc_byte_perm (w7[1], w7[0], selector); + w7[2] = hc_byte_perm (w7[0], w6[3], selector); + w7[1] = hc_byte_perm (w6[3], w6[2], selector); + w7[0] = hc_byte_perm (w6[2], w6[1], selector); + w6[3] = hc_byte_perm (w6[1], w6[0], selector); + w6[2] = hc_byte_perm (w6[0], w5[3], selector); + w6[1] = hc_byte_perm (w5[3], w5[2], selector); + w6[0] = hc_byte_perm (w5[2], w5[1], selector); + w5[3] = hc_byte_perm (w5[1], w5[0], selector); + w5[2] = hc_byte_perm (w5[0], w4[3], selector); + w5[1] = hc_byte_perm (w4[3], w4[2], selector); + w5[0] = hc_byte_perm (w4[2], w4[1], selector); + w4[3] = hc_byte_perm (w4[1], w4[0], selector); + w4[2] = hc_byte_perm (w4[0], w3[3], selector); + w4[1] = hc_byte_perm (w3[3], w3[2], selector); + w4[0] = hc_byte_perm (w3[2], w3[1], selector); + w3[3] = hc_byte_perm (w3[1], w3[0], selector); + w3[2] = hc_byte_perm (w3[0], w2[3], selector); + w3[1] = hc_byte_perm (w2[3], w2[2], selector); + w3[0] = hc_byte_perm (w2[2], w2[1], selector); + w2[3] = hc_byte_perm (w2[1], w2[0], selector); + w2[2] = hc_byte_perm (w2[0], w1[3], selector); + w2[1] = hc_byte_perm (w1[3], w1[2], selector); + w2[0] = hc_byte_perm (w1[2], w1[1], selector); + w1[3] = hc_byte_perm (w1[1], w1[0], selector); + w1[2] = hc_byte_perm (w1[0], w0[3], selector); + w1[1] = hc_byte_perm (w0[3], w0[2], selector); + w1[0] = hc_byte_perm (w0[2], w0[1], selector); + w0[3] = hc_byte_perm (w0[1], w0[0], selector); + w0[2] = hc_byte_perm (w0[0], 0, selector); w0[1] = 0; w0[0] = 0; break; case 3: - w7[3] = __byte_perm (w7[0], w6[3], selector); - w7[2] = __byte_perm (w6[3], w6[2], selector); - w7[1] = __byte_perm (w6[2], w6[1], selector); - w7[0] = __byte_perm (w6[1], w6[0], selector); - w6[3] = __byte_perm (w6[0], w5[3], selector); - w6[2] = __byte_perm (w5[3], w5[2], selector); - w6[1] = __byte_perm (w5[2], w5[1], selector); - w6[0] = __byte_perm (w5[1], w5[0], selector); - w5[3] = __byte_perm (w5[0], w4[3], selector); - w5[2] = __byte_perm (w4[3], w4[2], selector); - w5[1] = __byte_perm (w4[2], w4[1], selector); - w5[0] = __byte_perm (w4[1], w4[0], selector); - w4[3] = __byte_perm (w4[0], w3[3], selector); - w4[2] = __byte_perm (w3[3], w3[2], selector); - w4[1] = __byte_perm (w3[2], w3[1], selector); - w4[0] = __byte_perm (w3[1], w3[0], selector); - w3[3] = __byte_perm (w3[0], w2[3], selector); - w3[2] = __byte_perm (w2[3], w2[2], selector); - w3[1] = __byte_perm (w2[2], w2[1], selector); - w3[0] = __byte_perm (w2[1], w2[0], selector); - w2[3] = __byte_perm (w2[0], w1[3], selector); - w2[2] = __byte_perm (w1[3], w1[2], selector); - w2[1] = __byte_perm (w1[2], w1[1], selector); - w2[0] = __byte_perm (w1[1], w1[0], selector); - w1[3] = __byte_perm (w1[0], w0[3], selector); - w1[2] = __byte_perm (w0[3], w0[2], selector); - w1[1] = __byte_perm (w0[2], w0[1], selector); - w1[0] = __byte_perm (w0[1], w0[0], selector); - w0[3] = __byte_perm (w0[0], 0, selector); + w7[3] = hc_byte_perm (w7[0], w6[3], selector); + w7[2] = hc_byte_perm (w6[3], w6[2], selector); + w7[1] = hc_byte_perm (w6[2], w6[1], selector); + w7[0] = hc_byte_perm (w6[1], w6[0], selector); + w6[3] = hc_byte_perm (w6[0], w5[3], selector); + w6[2] = hc_byte_perm (w5[3], w5[2], selector); + w6[1] = hc_byte_perm (w5[2], w5[1], selector); + w6[0] = hc_byte_perm (w5[1], w5[0], selector); + w5[3] = hc_byte_perm (w5[0], w4[3], selector); + w5[2] = hc_byte_perm (w4[3], w4[2], selector); + w5[1] = hc_byte_perm (w4[2], w4[1], selector); + w5[0] = hc_byte_perm (w4[1], w4[0], selector); + w4[3] = hc_byte_perm (w4[0], w3[3], selector); + w4[2] = hc_byte_perm (w3[3], w3[2], selector); + w4[1] = hc_byte_perm (w3[2], w3[1], selector); + w4[0] = hc_byte_perm (w3[1], w3[0], selector); + w3[3] = hc_byte_perm (w3[0], w2[3], selector); + w3[2] = hc_byte_perm (w2[3], w2[2], selector); + w3[1] = hc_byte_perm (w2[2], w2[1], selector); + w3[0] = hc_byte_perm (w2[1], w2[0], selector); + w2[3] = hc_byte_perm (w2[0], w1[3], selector); + w2[2] = hc_byte_perm (w1[3], w1[2], selector); + w2[1] = hc_byte_perm (w1[2], w1[1], selector); + w2[0] = hc_byte_perm (w1[1], w1[0], selector); + w1[3] = hc_byte_perm (w1[0], w0[3], selector); + w1[2] = hc_byte_perm (w0[3], w0[2], selector); + w1[1] = hc_byte_perm (w0[2], w0[1], selector); + w1[0] = hc_byte_perm (w0[1], w0[0], selector); + w0[3] = hc_byte_perm (w0[0], 0, selector); w0[2] = 0; w0[1] = 0; w0[0] = 0; @@ -7256,34 +7256,34 @@ DECLSPEC void switch_buffer_by_offset_8x4_be (u32x *w0, u32x *w1, u32x *w2, u32x break; case 4: - w7[3] = __byte_perm (w6[3], w6[2], selector); - w7[2] = __byte_perm (w6[2], w6[1], selector); - w7[1] = __byte_perm (w6[1], w6[0], selector); - w7[0] = __byte_perm (w6[0], w5[3], selector); - w6[3] = __byte_perm (w5[3], w5[2], selector); - w6[2] = __byte_perm (w5[2], w5[1], selector); - w6[1] = __byte_perm (w5[1], w5[0], selector); - w6[0] = __byte_perm (w5[0], w4[3], selector); - w5[3] = __byte_perm (w4[3], w4[2], selector); - w5[2] = __byte_perm (w4[2], w4[1], selector); - w5[1] = __byte_perm (w4[1], w4[0], selector); - w5[0] = __byte_perm (w4[0], w3[3], selector); - w4[3] = __byte_perm (w3[3], w3[2], selector); - w4[2] = __byte_perm (w3[2], w3[1], selector); - w4[1] = __byte_perm (w3[1], w3[0], selector); - w4[0] = __byte_perm (w3[0], w2[3], selector); - w3[3] = __byte_perm (w2[3], w2[2], selector); - w3[2] = __byte_perm (w2[2], w2[1], selector); - w3[1] = __byte_perm (w2[1], w2[0], selector); - w3[0] = __byte_perm (w2[0], w1[3], selector); - w2[3] = __byte_perm (w1[3], w1[2], selector); - w2[2] = __byte_perm (w1[2], w1[1], selector); - w2[1] = __byte_perm (w1[1], w1[0], selector); - w2[0] = __byte_perm (w1[0], w0[3], selector); - w1[3] = __byte_perm (w0[3], w0[2], selector); - w1[2] = __byte_perm (w0[2], w0[1], selector); - w1[1] = __byte_perm (w0[1], w0[0], selector); - w1[0] = __byte_perm (w0[0], 0, selector); + w7[3] = hc_byte_perm (w6[3], w6[2], selector); + w7[2] = hc_byte_perm (w6[2], w6[1], selector); + w7[1] = hc_byte_perm (w6[1], w6[0], selector); + w7[0] = hc_byte_perm (w6[0], w5[3], selector); + w6[3] = hc_byte_perm (w5[3], w5[2], selector); + w6[2] = hc_byte_perm (w5[2], w5[1], selector); + w6[1] = hc_byte_perm (w5[1], w5[0], selector); + w6[0] = hc_byte_perm (w5[0], w4[3], selector); + w5[3] = hc_byte_perm (w4[3], w4[2], selector); + w5[2] = hc_byte_perm (w4[2], w4[1], selector); + w5[1] = hc_byte_perm (w4[1], w4[0], selector); + w5[0] = hc_byte_perm (w4[0], w3[3], selector); + w4[3] = hc_byte_perm (w3[3], w3[2], selector); + w4[2] = hc_byte_perm (w3[2], w3[1], selector); + w4[1] = hc_byte_perm (w3[1], w3[0], selector); + w4[0] = hc_byte_perm (w3[0], w2[3], selector); + w3[3] = hc_byte_perm (w2[3], w2[2], selector); + w3[2] = hc_byte_perm (w2[2], w2[1], selector); + w3[1] = hc_byte_perm (w2[1], w2[0], selector); + w3[0] = hc_byte_perm (w2[0], w1[3], selector); + w2[3] = hc_byte_perm (w1[3], w1[2], selector); + w2[2] = hc_byte_perm (w1[2], w1[1], selector); + w2[1] = hc_byte_perm (w1[1], w1[0], selector); + w2[0] = hc_byte_perm (w1[0], w0[3], selector); + w1[3] = hc_byte_perm (w0[3], w0[2], selector); + w1[2] = hc_byte_perm (w0[2], w0[1], selector); + w1[1] = hc_byte_perm (w0[1], w0[0], selector); + w1[0] = hc_byte_perm (w0[0], 0, selector); w0[3] = 0; w0[2] = 0; w0[1] = 0; @@ -7292,33 +7292,33 @@ DECLSPEC void switch_buffer_by_offset_8x4_be (u32x *w0, u32x *w1, u32x *w2, u32x break; case 5: - w7[3] = __byte_perm (w6[2], w6[1], selector); - w7[2] = __byte_perm (w6[1], w6[0], selector); - w7[1] = __byte_perm (w6[0], w5[3], selector); - w7[0] = __byte_perm (w5[3], w5[2], selector); - w6[3] = __byte_perm (w5[2], w5[1], selector); - w6[2] = __byte_perm (w5[1], w5[0], selector); - w6[1] = __byte_perm (w5[0], w4[3], selector); - w6[0] = __byte_perm (w4[3], w4[2], selector); - w5[3] = __byte_perm (w4[2], w4[1], selector); - w5[2] = __byte_perm (w4[1], w4[0], selector); - w5[1] = __byte_perm (w4[0], w3[3], selector); - w5[0] = __byte_perm (w3[3], w3[2], selector); - w4[3] = __byte_perm (w3[2], w3[1], selector); - w4[2] = __byte_perm (w3[1], w3[0], selector); - w4[1] = __byte_perm (w3[0], w2[3], selector); - w4[0] = __byte_perm (w2[3], w2[2], selector); - w3[3] = __byte_perm (w2[2], w2[1], selector); - w3[2] = __byte_perm (w2[1], w2[0], selector); - w3[1] = __byte_perm (w2[0], w1[3], selector); - w3[0] = __byte_perm (w1[3], w1[2], selector); - w2[3] = __byte_perm (w1[2], w1[1], selector); - w2[2] = __byte_perm (w1[1], w1[0], selector); - w2[1] = __byte_perm (w1[0], w0[3], selector); - w2[0] = __byte_perm (w0[3], w0[2], selector); - w1[3] = __byte_perm (w0[2], w0[1], selector); - w1[2] = __byte_perm (w0[1], w0[0], selector); - w1[1] = __byte_perm (w0[0], 0, selector); + w7[3] = hc_byte_perm (w6[2], w6[1], selector); + w7[2] = hc_byte_perm (w6[1], w6[0], selector); + w7[1] = hc_byte_perm (w6[0], w5[3], selector); + w7[0] = hc_byte_perm (w5[3], w5[2], selector); + w6[3] = hc_byte_perm (w5[2], w5[1], selector); + w6[2] = hc_byte_perm (w5[1], w5[0], selector); + w6[1] = hc_byte_perm (w5[0], w4[3], selector); + w6[0] = hc_byte_perm (w4[3], w4[2], selector); + w5[3] = hc_byte_perm (w4[2], w4[1], selector); + w5[2] = hc_byte_perm (w4[1], w4[0], selector); + w5[1] = hc_byte_perm (w4[0], w3[3], selector); + w5[0] = hc_byte_perm (w3[3], w3[2], selector); + w4[3] = hc_byte_perm (w3[2], w3[1], selector); + w4[2] = hc_byte_perm (w3[1], w3[0], selector); + w4[1] = hc_byte_perm (w3[0], w2[3], selector); + w4[0] = hc_byte_perm (w2[3], w2[2], selector); + w3[3] = hc_byte_perm (w2[2], w2[1], selector); + w3[2] = hc_byte_perm (w2[1], w2[0], selector); + w3[1] = hc_byte_perm (w2[0], w1[3], selector); + w3[0] = hc_byte_perm (w1[3], w1[2], selector); + w2[3] = hc_byte_perm (w1[2], w1[1], selector); + w2[2] = hc_byte_perm (w1[1], w1[0], selector); + w2[1] = hc_byte_perm (w1[0], w0[3], selector); + w2[0] = hc_byte_perm (w0[3], w0[2], selector); + w1[3] = hc_byte_perm (w0[2], w0[1], selector); + w1[2] = hc_byte_perm (w0[1], w0[0], selector); + w1[1] = hc_byte_perm (w0[0], 0, selector); w1[0] = 0; w0[3] = 0; w0[2] = 0; @@ -7328,32 +7328,32 @@ DECLSPEC void switch_buffer_by_offset_8x4_be (u32x *w0, u32x *w1, u32x *w2, u32x break; case 6: - w7[3] = __byte_perm (w6[1], w6[0], selector); - w7[2] = __byte_perm (w6[0], w5[3], selector); - w7[1] = __byte_perm (w5[3], w5[2], selector); - w7[0] = __byte_perm (w5[2], w5[1], selector); - w6[3] = __byte_perm (w5[1], w5[0], selector); - w6[2] = __byte_perm (w5[0], w4[3], selector); - w6[1] = __byte_perm (w4[3], w4[2], selector); - w6[0] = __byte_perm (w4[2], w4[1], selector); - w5[3] = __byte_perm (w4[1], w4[0], selector); - w5[2] = __byte_perm (w4[0], w3[3], selector); - w5[1] = __byte_perm (w3[3], w3[2], selector); - w5[0] = __byte_perm (w3[2], w3[1], selector); - w4[3] = __byte_perm (w3[1], w3[0], selector); - w4[2] = __byte_perm (w3[0], w2[3], selector); - w4[1] = __byte_perm (w2[3], w2[2], selector); - w4[0] = __byte_perm (w2[2], w2[1], selector); - w3[3] = __byte_perm (w2[1], w2[0], selector); - w3[2] = __byte_perm (w2[0], w1[3], selector); - w3[1] = __byte_perm (w1[3], w1[2], selector); - w3[0] = __byte_perm (w1[2], w1[1], selector); - w2[3] = __byte_perm (w1[1], w1[0], selector); - w2[2] = __byte_perm (w1[0], w0[3], selector); - w2[1] = __byte_perm (w0[3], w0[2], selector); - w2[0] = __byte_perm (w0[2], w0[1], selector); - w1[3] = __byte_perm (w0[1], w0[0], selector); - w1[2] = __byte_perm (w0[0], 0, selector); + w7[3] = hc_byte_perm (w6[1], w6[0], selector); + w7[2] = hc_byte_perm (w6[0], w5[3], selector); + w7[1] = hc_byte_perm (w5[3], w5[2], selector); + w7[0] = hc_byte_perm (w5[2], w5[1], selector); + w6[3] = hc_byte_perm (w5[1], w5[0], selector); + w6[2] = hc_byte_perm (w5[0], w4[3], selector); + w6[1] = hc_byte_perm (w4[3], w4[2], selector); + w6[0] = hc_byte_perm (w4[2], w4[1], selector); + w5[3] = hc_byte_perm (w4[1], w4[0], selector); + w5[2] = hc_byte_perm (w4[0], w3[3], selector); + w5[1] = hc_byte_perm (w3[3], w3[2], selector); + w5[0] = hc_byte_perm (w3[2], w3[1], selector); + w4[3] = hc_byte_perm (w3[1], w3[0], selector); + w4[2] = hc_byte_perm (w3[0], w2[3], selector); + w4[1] = hc_byte_perm (w2[3], w2[2], selector); + w4[0] = hc_byte_perm (w2[2], w2[1], selector); + w3[3] = hc_byte_perm (w2[1], w2[0], selector); + w3[2] = hc_byte_perm (w2[0], w1[3], selector); + w3[1] = hc_byte_perm (w1[3], w1[2], selector); + w3[0] = hc_byte_perm (w1[2], w1[1], selector); + w2[3] = hc_byte_perm (w1[1], w1[0], selector); + w2[2] = hc_byte_perm (w1[0], w0[3], selector); + w2[1] = hc_byte_perm (w0[3], w0[2], selector); + w2[0] = hc_byte_perm (w0[2], w0[1], selector); + w1[3] = hc_byte_perm (w0[1], w0[0], selector); + w1[2] = hc_byte_perm (w0[0], 0, selector); w1[1] = 0; w1[0] = 0; w0[3] = 0; @@ -7364,31 +7364,31 @@ DECLSPEC void switch_buffer_by_offset_8x4_be (u32x *w0, u32x *w1, u32x *w2, u32x break; case 7: - w7[3] = __byte_perm (w6[0], w5[3], selector); - w7[2] = __byte_perm (w5[3], w5[2], selector); - w7[1] = __byte_perm (w5[2], w5[1], selector); - w7[0] = __byte_perm (w5[1], w5[0], selector); - w6[3] = __byte_perm (w5[0], w4[3], selector); - w6[2] = __byte_perm (w4[3], w4[2], selector); - w6[1] = __byte_perm (w4[2], w4[1], selector); - w6[0] = __byte_perm (w4[1], w4[0], selector); - w5[3] = __byte_perm (w4[0], w3[3], selector); - w5[2] = __byte_perm (w3[3], w3[2], selector); - w5[1] = __byte_perm (w3[2], w3[1], selector); - w5[0] = __byte_perm (w3[1], w3[0], selector); - w4[3] = __byte_perm (w3[0], w2[3], selector); - w4[2] = __byte_perm (w2[3], w2[2], selector); - w4[1] = __byte_perm (w2[2], w2[1], selector); - w4[0] = __byte_perm (w2[1], w2[0], selector); - w3[3] = __byte_perm (w2[0], w1[3], selector); - w3[2] = __byte_perm (w1[3], w1[2], selector); - w3[1] = __byte_perm (w1[2], w1[1], selector); - w3[0] = __byte_perm (w1[1], w1[0], selector); - w2[3] = __byte_perm (w1[0], w0[3], selector); - w2[2] = __byte_perm (w0[3], w0[2], selector); - w2[1] = __byte_perm (w0[2], w0[1], selector); - w2[0] = __byte_perm (w0[1], w0[0], selector); - w1[3] = __byte_perm (w0[0], 0, selector); + w7[3] = hc_byte_perm (w6[0], w5[3], selector); + w7[2] = hc_byte_perm (w5[3], w5[2], selector); + w7[1] = hc_byte_perm (w5[2], w5[1], selector); + w7[0] = hc_byte_perm (w5[1], w5[0], selector); + w6[3] = hc_byte_perm (w5[0], w4[3], selector); + w6[2] = hc_byte_perm (w4[3], w4[2], selector); + w6[1] = hc_byte_perm (w4[2], w4[1], selector); + w6[0] = hc_byte_perm (w4[1], w4[0], selector); + w5[3] = hc_byte_perm (w4[0], w3[3], selector); + w5[2] = hc_byte_perm (w3[3], w3[2], selector); + w5[1] = hc_byte_perm (w3[2], w3[1], selector); + w5[0] = hc_byte_perm (w3[1], w3[0], selector); + w4[3] = hc_byte_perm (w3[0], w2[3], selector); + w4[2] = hc_byte_perm (w2[3], w2[2], selector); + w4[1] = hc_byte_perm (w2[2], w2[1], selector); + w4[0] = hc_byte_perm (w2[1], w2[0], selector); + w3[3] = hc_byte_perm (w2[0], w1[3], selector); + w3[2] = hc_byte_perm (w1[3], w1[2], selector); + w3[1] = hc_byte_perm (w1[2], w1[1], selector); + w3[0] = hc_byte_perm (w1[1], w1[0], selector); + w2[3] = hc_byte_perm (w1[0], w0[3], selector); + w2[2] = hc_byte_perm (w0[3], w0[2], selector); + w2[1] = hc_byte_perm (w0[2], w0[1], selector); + w2[0] = hc_byte_perm (w0[1], w0[0], selector); + w1[3] = hc_byte_perm (w0[0], 0, selector); w1[2] = 0; w1[1] = 0; w1[0] = 0; @@ -7400,30 +7400,30 @@ DECLSPEC void switch_buffer_by_offset_8x4_be (u32x *w0, u32x *w1, u32x *w2, u32x break; case 8: - w7[3] = __byte_perm (w5[3], w5[2], selector); - w7[2] = __byte_perm (w5[2], w5[1], selector); - w7[1] = __byte_perm (w5[1], w5[0], selector); - w7[0] = __byte_perm (w5[0], w4[3], selector); - w6[3] = __byte_perm (w4[3], w4[2], selector); - w6[2] = __byte_perm (w4[2], w4[1], selector); - w6[1] = __byte_perm (w4[1], w4[0], selector); - w6[0] = __byte_perm (w4[0], w3[3], selector); - w5[3] = __byte_perm (w3[3], w3[2], selector); - w5[2] = __byte_perm (w3[2], w3[1], selector); - w5[1] = __byte_perm (w3[1], w3[0], selector); - w5[0] = __byte_perm (w3[0], w2[3], selector); - w4[3] = __byte_perm (w2[3], w2[2], selector); - w4[2] = __byte_perm (w2[2], w2[1], selector); - w4[1] = __byte_perm (w2[1], w2[0], selector); - w4[0] = __byte_perm (w2[0], w1[3], selector); - w3[3] = __byte_perm (w1[3], w1[2], selector); - w3[2] = __byte_perm (w1[2], w1[1], selector); - w3[1] = __byte_perm (w1[1], w1[0], selector); - w3[0] = __byte_perm (w1[0], w0[3], selector); - w2[3] = __byte_perm (w0[3], w0[2], selector); - w2[2] = __byte_perm (w0[2], w0[1], selector); - w2[1] = __byte_perm (w0[1], w0[0], selector); - w2[0] = __byte_perm (w0[0], 0, selector); + w7[3] = hc_byte_perm (w5[3], w5[2], selector); + w7[2] = hc_byte_perm (w5[2], w5[1], selector); + w7[1] = hc_byte_perm (w5[1], w5[0], selector); + w7[0] = hc_byte_perm (w5[0], w4[3], selector); + w6[3] = hc_byte_perm (w4[3], w4[2], selector); + w6[2] = hc_byte_perm (w4[2], w4[1], selector); + w6[1] = hc_byte_perm (w4[1], w4[0], selector); + w6[0] = hc_byte_perm (w4[0], w3[3], selector); + w5[3] = hc_byte_perm (w3[3], w3[2], selector); + w5[2] = hc_byte_perm (w3[2], w3[1], selector); + w5[1] = hc_byte_perm (w3[1], w3[0], selector); + w5[0] = hc_byte_perm (w3[0], w2[3], selector); + w4[3] = hc_byte_perm (w2[3], w2[2], selector); + w4[2] = hc_byte_perm (w2[2], w2[1], selector); + w4[1] = hc_byte_perm (w2[1], w2[0], selector); + w4[0] = hc_byte_perm (w2[0], w1[3], selector); + w3[3] = hc_byte_perm (w1[3], w1[2], selector); + w3[2] = hc_byte_perm (w1[2], w1[1], selector); + w3[1] = hc_byte_perm (w1[1], w1[0], selector); + w3[0] = hc_byte_perm (w1[0], w0[3], selector); + w2[3] = hc_byte_perm (w0[3], w0[2], selector); + w2[2] = hc_byte_perm (w0[2], w0[1], selector); + w2[1] = hc_byte_perm (w0[1], w0[0], selector); + w2[0] = hc_byte_perm (w0[0], 0, selector); w1[3] = 0; w1[2] = 0; w1[1] = 0; @@ -7436,29 +7436,29 @@ DECLSPEC void switch_buffer_by_offset_8x4_be (u32x *w0, u32x *w1, u32x *w2, u32x break; case 9: - w7[3] = __byte_perm (w5[2], w5[1], selector); - w7[2] = __byte_perm (w5[1], w5[0], selector); - w7[1] = __byte_perm (w5[0], w4[3], selector); - w7[0] = __byte_perm (w4[3], w4[2], selector); - w6[3] = __byte_perm (w4[2], w4[1], selector); - w6[2] = __byte_perm (w4[1], w4[0], selector); - w6[1] = __byte_perm (w4[0], w3[3], selector); - w6[0] = __byte_perm (w3[3], w3[2], selector); - w5[3] = __byte_perm (w3[2], w3[1], selector); - w5[2] = __byte_perm (w3[1], w3[0], selector); - w5[1] = __byte_perm (w3[0], w2[3], selector); - w5[0] = __byte_perm (w2[3], w2[2], selector); - w4[3] = __byte_perm (w2[2], w2[1], selector); - w4[2] = __byte_perm (w2[1], w2[0], selector); - w4[1] = __byte_perm (w2[0], w1[3], selector); - w4[0] = __byte_perm (w1[3], w1[2], selector); - w3[3] = __byte_perm (w1[2], w1[1], selector); - w3[2] = __byte_perm (w1[1], w1[0], selector); - w3[1] = __byte_perm (w1[0], w0[3], selector); - w3[0] = __byte_perm (w0[3], w0[2], selector); - w2[3] = __byte_perm (w0[2], w0[1], selector); - w2[2] = __byte_perm (w0[1], w0[0], selector); - w2[1] = __byte_perm (w0[0], 0, selector); + w7[3] = hc_byte_perm (w5[2], w5[1], selector); + w7[2] = hc_byte_perm (w5[1], w5[0], selector); + w7[1] = hc_byte_perm (w5[0], w4[3], selector); + w7[0] = hc_byte_perm (w4[3], w4[2], selector); + w6[3] = hc_byte_perm (w4[2], w4[1], selector); + w6[2] = hc_byte_perm (w4[1], w4[0], selector); + w6[1] = hc_byte_perm (w4[0], w3[3], selector); + w6[0] = hc_byte_perm (w3[3], w3[2], selector); + w5[3] = hc_byte_perm (w3[2], w3[1], selector); + w5[2] = hc_byte_perm (w3[1], w3[0], selector); + w5[1] = hc_byte_perm (w3[0], w2[3], selector); + w5[0] = hc_byte_perm (w2[3], w2[2], selector); + w4[3] = hc_byte_perm (w2[2], w2[1], selector); + w4[2] = hc_byte_perm (w2[1], w2[0], selector); + w4[1] = hc_byte_perm (w2[0], w1[3], selector); + w4[0] = hc_byte_perm (w1[3], w1[2], selector); + w3[3] = hc_byte_perm (w1[2], w1[1], selector); + w3[2] = hc_byte_perm (w1[1], w1[0], selector); + w3[1] = hc_byte_perm (w1[0], w0[3], selector); + w3[0] = hc_byte_perm (w0[3], w0[2], selector); + w2[3] = hc_byte_perm (w0[2], w0[1], selector); + w2[2] = hc_byte_perm (w0[1], w0[0], selector); + w2[1] = hc_byte_perm (w0[0], 0, selector); w2[0] = 0; w1[3] = 0; w1[2] = 0; @@ -7472,28 +7472,28 @@ DECLSPEC void switch_buffer_by_offset_8x4_be (u32x *w0, u32x *w1, u32x *w2, u32x break; case 10: - w7[3] = __byte_perm (w5[1], w5[0], selector); - w7[2] = __byte_perm (w5[0], w4[3], selector); - w7[1] = __byte_perm (w4[3], w4[2], selector); - w7[0] = __byte_perm (w4[2], w4[1], selector); - w6[3] = __byte_perm (w4[1], w4[0], selector); - w6[2] = __byte_perm (w4[0], w3[3], selector); - w6[1] = __byte_perm (w3[3], w3[2], selector); - w6[0] = __byte_perm (w3[2], w3[1], selector); - w5[3] = __byte_perm (w3[1], w3[0], selector); - w5[2] = __byte_perm (w3[0], w2[3], selector); - w5[1] = __byte_perm (w2[3], w2[2], selector); - w5[0] = __byte_perm (w2[2], w2[1], selector); - w4[3] = __byte_perm (w2[1], w2[0], selector); - w4[2] = __byte_perm (w2[0], w1[3], selector); - w4[1] = __byte_perm (w1[3], w1[2], selector); - w4[0] = __byte_perm (w1[2], w1[1], selector); - w3[3] = __byte_perm (w1[1], w1[0], selector); - w3[2] = __byte_perm (w1[0], w0[3], selector); - w3[1] = __byte_perm (w0[3], w0[2], selector); - w3[0] = __byte_perm (w0[2], w0[1], selector); - w2[3] = __byte_perm (w0[1], w0[0], selector); - w2[2] = __byte_perm (w0[0], 0, selector); + w7[3] = hc_byte_perm (w5[1], w5[0], selector); + w7[2] = hc_byte_perm (w5[0], w4[3], selector); + w7[1] = hc_byte_perm (w4[3], w4[2], selector); + w7[0] = hc_byte_perm (w4[2], w4[1], selector); + w6[3] = hc_byte_perm (w4[1], w4[0], selector); + w6[2] = hc_byte_perm (w4[0], w3[3], selector); + w6[1] = hc_byte_perm (w3[3], w3[2], selector); + w6[0] = hc_byte_perm (w3[2], w3[1], selector); + w5[3] = hc_byte_perm (w3[1], w3[0], selector); + w5[2] = hc_byte_perm (w3[0], w2[3], selector); + w5[1] = hc_byte_perm (w2[3], w2[2], selector); + w5[0] = hc_byte_perm (w2[2], w2[1], selector); + w4[3] = hc_byte_perm (w2[1], w2[0], selector); + w4[2] = hc_byte_perm (w2[0], w1[3], selector); + w4[1] = hc_byte_perm (w1[3], w1[2], selector); + w4[0] = hc_byte_perm (w1[2], w1[1], selector); + w3[3] = hc_byte_perm (w1[1], w1[0], selector); + w3[2] = hc_byte_perm (w1[0], w0[3], selector); + w3[1] = hc_byte_perm (w0[3], w0[2], selector); + w3[0] = hc_byte_perm (w0[2], w0[1], selector); + w2[3] = hc_byte_perm (w0[1], w0[0], selector); + w2[2] = hc_byte_perm (w0[0], 0, selector); w2[1] = 0; w2[0] = 0; w1[3] = 0; @@ -7508,27 +7508,27 @@ DECLSPEC void switch_buffer_by_offset_8x4_be (u32x *w0, u32x *w1, u32x *w2, u32x break; case 11: - w7[3] = __byte_perm (w5[0], w4[3], selector); - w7[2] = __byte_perm (w4[3], w4[2], selector); - w7[1] = __byte_perm (w4[2], w4[1], selector); - w7[0] = __byte_perm (w4[1], w4[0], selector); - w6[3] = __byte_perm (w4[0], w3[3], selector); - w6[2] = __byte_perm (w3[3], w3[2], selector); - w6[1] = __byte_perm (w3[2], w3[1], selector); - w6[0] = __byte_perm (w3[1], w3[0], selector); - w5[3] = __byte_perm (w3[0], w2[3], selector); - w5[2] = __byte_perm (w2[3], w2[2], selector); - w5[1] = __byte_perm (w2[2], w2[1], selector); - w5[0] = __byte_perm (w2[1], w2[0], selector); - w4[3] = __byte_perm (w2[0], w1[3], selector); - w4[2] = __byte_perm (w1[3], w1[2], selector); - w4[1] = __byte_perm (w1[2], w1[1], selector); - w4[0] = __byte_perm (w1[1], w1[0], selector); - w3[3] = __byte_perm (w1[0], w0[3], selector); - w3[2] = __byte_perm (w0[3], w0[2], selector); - w3[1] = __byte_perm (w0[2], w0[1], selector); - w3[0] = __byte_perm (w0[1], w0[0], selector); - w2[3] = __byte_perm (w0[0], 0, selector); + w7[3] = hc_byte_perm (w5[0], w4[3], selector); + w7[2] = hc_byte_perm (w4[3], w4[2], selector); + w7[1] = hc_byte_perm (w4[2], w4[1], selector); + w7[0] = hc_byte_perm (w4[1], w4[0], selector); + w6[3] = hc_byte_perm (w4[0], w3[3], selector); + w6[2] = hc_byte_perm (w3[3], w3[2], selector); + w6[1] = hc_byte_perm (w3[2], w3[1], selector); + w6[0] = hc_byte_perm (w3[1], w3[0], selector); + w5[3] = hc_byte_perm (w3[0], w2[3], selector); + w5[2] = hc_byte_perm (w2[3], w2[2], selector); + w5[1] = hc_byte_perm (w2[2], w2[1], selector); + w5[0] = hc_byte_perm (w2[1], w2[0], selector); + w4[3] = hc_byte_perm (w2[0], w1[3], selector); + w4[2] = hc_byte_perm (w1[3], w1[2], selector); + w4[1] = hc_byte_perm (w1[2], w1[1], selector); + w4[0] = hc_byte_perm (w1[1], w1[0], selector); + w3[3] = hc_byte_perm (w1[0], w0[3], selector); + w3[2] = hc_byte_perm (w0[3], w0[2], selector); + w3[1] = hc_byte_perm (w0[2], w0[1], selector); + w3[0] = hc_byte_perm (w0[1], w0[0], selector); + w2[3] = hc_byte_perm (w0[0], 0, selector); w2[2] = 0; w2[1] = 0; w2[0] = 0; @@ -7544,26 +7544,26 @@ DECLSPEC void switch_buffer_by_offset_8x4_be (u32x *w0, u32x *w1, u32x *w2, u32x break; case 12: - w7[3] = __byte_perm (w4[3], w4[2], selector); - w7[2] = __byte_perm (w4[2], w4[1], selector); - w7[1] = __byte_perm (w4[1], w4[0], selector); - w7[0] = __byte_perm (w4[0], w3[3], selector); - w6[3] = __byte_perm (w3[3], w3[2], selector); - w6[2] = __byte_perm (w3[2], w3[1], selector); - w6[1] = __byte_perm (w3[1], w3[0], selector); - w6[0] = __byte_perm (w3[0], w2[3], selector); - w5[3] = __byte_perm (w2[3], w2[2], selector); - w5[2] = __byte_perm (w2[2], w2[1], selector); - w5[1] = __byte_perm (w2[1], w2[0], selector); - w5[0] = __byte_perm (w2[0], w1[3], selector); - w4[3] = __byte_perm (w1[3], w1[2], selector); - w4[2] = __byte_perm (w1[2], w1[1], selector); - w4[1] = __byte_perm (w1[1], w1[0], selector); - w4[0] = __byte_perm (w1[0], w0[3], selector); - w3[3] = __byte_perm (w0[3], w0[2], selector); - w3[2] = __byte_perm (w0[2], w0[1], selector); - w3[1] = __byte_perm (w0[1], w0[0], selector); - w3[0] = __byte_perm (w0[0], 0, selector); + w7[3] = hc_byte_perm (w4[3], w4[2], selector); + w7[2] = hc_byte_perm (w4[2], w4[1], selector); + w7[1] = hc_byte_perm (w4[1], w4[0], selector); + w7[0] = hc_byte_perm (w4[0], w3[3], selector); + w6[3] = hc_byte_perm (w3[3], w3[2], selector); + w6[2] = hc_byte_perm (w3[2], w3[1], selector); + w6[1] = hc_byte_perm (w3[1], w3[0], selector); + w6[0] = hc_byte_perm (w3[0], w2[3], selector); + w5[3] = hc_byte_perm (w2[3], w2[2], selector); + w5[2] = hc_byte_perm (w2[2], w2[1], selector); + w5[1] = hc_byte_perm (w2[1], w2[0], selector); + w5[0] = hc_byte_perm (w2[0], w1[3], selector); + w4[3] = hc_byte_perm (w1[3], w1[2], selector); + w4[2] = hc_byte_perm (w1[2], w1[1], selector); + w4[1] = hc_byte_perm (w1[1], w1[0], selector); + w4[0] = hc_byte_perm (w1[0], w0[3], selector); + w3[3] = hc_byte_perm (w0[3], w0[2], selector); + w3[2] = hc_byte_perm (w0[2], w0[1], selector); + w3[1] = hc_byte_perm (w0[1], w0[0], selector); + w3[0] = hc_byte_perm (w0[0], 0, selector); w2[3] = 0; w2[2] = 0; w2[1] = 0; @@ -7580,25 +7580,25 @@ DECLSPEC void switch_buffer_by_offset_8x4_be (u32x *w0, u32x *w1, u32x *w2, u32x break; case 13: - w7[3] = __byte_perm (w4[2], w4[1], selector); - w7[2] = __byte_perm (w4[1], w4[0], selector); - w7[1] = __byte_perm (w4[0], w3[3], selector); - w7[0] = __byte_perm (w3[3], w3[2], selector); - w6[3] = __byte_perm (w3[2], w3[1], selector); - w6[2] = __byte_perm (w3[1], w3[0], selector); - w6[1] = __byte_perm (w3[0], w2[3], selector); - w6[0] = __byte_perm (w2[3], w2[2], selector); - w5[3] = __byte_perm (w2[2], w2[1], selector); - w5[2] = __byte_perm (w2[1], w2[0], selector); - w5[1] = __byte_perm (w2[0], w1[3], selector); - w5[0] = __byte_perm (w1[3], w1[2], selector); - w4[3] = __byte_perm (w1[2], w1[1], selector); - w4[2] = __byte_perm (w1[1], w1[0], selector); - w4[1] = __byte_perm (w1[0], w0[3], selector); - w4[0] = __byte_perm (w0[3], w0[2], selector); - w3[3] = __byte_perm (w0[2], w0[1], selector); - w3[2] = __byte_perm (w0[1], w0[0], selector); - w3[1] = __byte_perm (w0[0], 0, selector); + w7[3] = hc_byte_perm (w4[2], w4[1], selector); + w7[2] = hc_byte_perm (w4[1], w4[0], selector); + w7[1] = hc_byte_perm (w4[0], w3[3], selector); + w7[0] = hc_byte_perm (w3[3], w3[2], selector); + w6[3] = hc_byte_perm (w3[2], w3[1], selector); + w6[2] = hc_byte_perm (w3[1], w3[0], selector); + w6[1] = hc_byte_perm (w3[0], w2[3], selector); + w6[0] = hc_byte_perm (w2[3], w2[2], selector); + w5[3] = hc_byte_perm (w2[2], w2[1], selector); + w5[2] = hc_byte_perm (w2[1], w2[0], selector); + w5[1] = hc_byte_perm (w2[0], w1[3], selector); + w5[0] = hc_byte_perm (w1[3], w1[2], selector); + w4[3] = hc_byte_perm (w1[2], w1[1], selector); + w4[2] = hc_byte_perm (w1[1], w1[0], selector); + w4[1] = hc_byte_perm (w1[0], w0[3], selector); + w4[0] = hc_byte_perm (w0[3], w0[2], selector); + w3[3] = hc_byte_perm (w0[2], w0[1], selector); + w3[2] = hc_byte_perm (w0[1], w0[0], selector); + w3[1] = hc_byte_perm (w0[0], 0, selector); w3[0] = 0; w2[3] = 0; w2[2] = 0; @@ -7616,24 +7616,24 @@ DECLSPEC void switch_buffer_by_offset_8x4_be (u32x *w0, u32x *w1, u32x *w2, u32x break; case 14: - w7[3] = __byte_perm (w4[1], w4[0], selector); - w7[2] = __byte_perm (w4[0], w3[3], selector); - w7[1] = __byte_perm (w3[3], w3[2], selector); - w7[0] = __byte_perm (w3[2], w3[1], selector); - w6[3] = __byte_perm (w3[1], w3[0], selector); - w6[2] = __byte_perm (w3[0], w2[3], selector); - w6[1] = __byte_perm (w2[3], w2[2], selector); - w6[0] = __byte_perm (w2[2], w2[1], selector); - w5[3] = __byte_perm (w2[1], w2[0], selector); - w5[2] = __byte_perm (w2[0], w1[3], selector); - w5[1] = __byte_perm (w1[3], w1[2], selector); - w5[0] = __byte_perm (w1[2], w1[1], selector); - w4[3] = __byte_perm (w1[1], w1[0], selector); - w4[2] = __byte_perm (w1[0], w0[3], selector); - w4[1] = __byte_perm (w0[3], w0[2], selector); - w4[0] = __byte_perm (w0[2], w0[1], selector); - w3[3] = __byte_perm (w0[1], w0[0], selector); - w3[2] = __byte_perm (w0[0], 0, selector); + w7[3] = hc_byte_perm (w4[1], w4[0], selector); + w7[2] = hc_byte_perm (w4[0], w3[3], selector); + w7[1] = hc_byte_perm (w3[3], w3[2], selector); + w7[0] = hc_byte_perm (w3[2], w3[1], selector); + w6[3] = hc_byte_perm (w3[1], w3[0], selector); + w6[2] = hc_byte_perm (w3[0], w2[3], selector); + w6[1] = hc_byte_perm (w2[3], w2[2], selector); + w6[0] = hc_byte_perm (w2[2], w2[1], selector); + w5[3] = hc_byte_perm (w2[1], w2[0], selector); + w5[2] = hc_byte_perm (w2[0], w1[3], selector); + w5[1] = hc_byte_perm (w1[3], w1[2], selector); + w5[0] = hc_byte_perm (w1[2], w1[1], selector); + w4[3] = hc_byte_perm (w1[1], w1[0], selector); + w4[2] = hc_byte_perm (w1[0], w0[3], selector); + w4[1] = hc_byte_perm (w0[3], w0[2], selector); + w4[0] = hc_byte_perm (w0[2], w0[1], selector); + w3[3] = hc_byte_perm (w0[1], w0[0], selector); + w3[2] = hc_byte_perm (w0[0], 0, selector); w3[1] = 0; w3[0] = 0; w2[3] = 0; @@ -7652,23 +7652,23 @@ DECLSPEC void switch_buffer_by_offset_8x4_be (u32x *w0, u32x *w1, u32x *w2, u32x break; case 15: - w7[3] = __byte_perm (w4[0], w3[3], selector); - w7[2] = __byte_perm (w3[3], w3[2], selector); - w7[1] = __byte_perm (w3[2], w3[1], selector); - w7[0] = __byte_perm (w3[1], w3[0], selector); - w6[3] = __byte_perm (w3[0], w2[3], selector); - w6[2] = __byte_perm (w2[3], w2[2], selector); - w6[1] = __byte_perm (w2[2], w2[1], selector); - w6[0] = __byte_perm (w2[1], w2[0], selector); - w5[3] = __byte_perm (w2[0], w1[3], selector); - w5[2] = __byte_perm (w1[3], w1[2], selector); - w5[1] = __byte_perm (w1[2], w1[1], selector); - w5[0] = __byte_perm (w1[1], w1[0], selector); - w4[3] = __byte_perm (w1[0], w0[3], selector); - w4[2] = __byte_perm (w0[3], w0[2], selector); - w4[1] = __byte_perm (w0[2], w0[1], selector); - w4[0] = __byte_perm (w0[1], w0[0], selector); - w3[3] = __byte_perm (w0[0], 0, selector); + w7[3] = hc_byte_perm (w4[0], w3[3], selector); + w7[2] = hc_byte_perm (w3[3], w3[2], selector); + w7[1] = hc_byte_perm (w3[2], w3[1], selector); + w7[0] = hc_byte_perm (w3[1], w3[0], selector); + w6[3] = hc_byte_perm (w3[0], w2[3], selector); + w6[2] = hc_byte_perm (w2[3], w2[2], selector); + w6[1] = hc_byte_perm (w2[2], w2[1], selector); + w6[0] = hc_byte_perm (w2[1], w2[0], selector); + w5[3] = hc_byte_perm (w2[0], w1[3], selector); + w5[2] = hc_byte_perm (w1[3], w1[2], selector); + w5[1] = hc_byte_perm (w1[2], w1[1], selector); + w5[0] = hc_byte_perm (w1[1], w1[0], selector); + w4[3] = hc_byte_perm (w1[0], w0[3], selector); + w4[2] = hc_byte_perm (w0[3], w0[2], selector); + w4[1] = hc_byte_perm (w0[2], w0[1], selector); + w4[0] = hc_byte_perm (w0[1], w0[0], selector); + w3[3] = hc_byte_perm (w0[0], 0, selector); w3[2] = 0; w3[1] = 0; w3[0] = 0; @@ -7688,22 +7688,22 @@ DECLSPEC void switch_buffer_by_offset_8x4_be (u32x *w0, u32x *w1, u32x *w2, u32x break; case 16: - w7[3] = __byte_perm (w3[3], w3[2], selector); - w7[2] = __byte_perm (w3[2], w3[1], selector); - w7[1] = __byte_perm (w3[1], w3[0], selector); - w7[0] = __byte_perm (w3[0], w2[3], selector); - w6[3] = __byte_perm (w2[3], w2[2], selector); - w6[2] = __byte_perm (w2[2], w2[1], selector); - w6[1] = __byte_perm (w2[1], w2[0], selector); - w6[0] = __byte_perm (w2[0], w1[3], selector); - w5[3] = __byte_perm (w1[3], w1[2], selector); - w5[2] = __byte_perm (w1[2], w1[1], selector); - w5[1] = __byte_perm (w1[1], w1[0], selector); - w5[0] = __byte_perm (w1[0], w0[3], selector); - w4[3] = __byte_perm (w0[3], w0[2], selector); - w4[2] = __byte_perm (w0[2], w0[1], selector); - w4[1] = __byte_perm (w0[1], w0[0], selector); - w4[0] = __byte_perm (w0[0], 0, selector); + w7[3] = hc_byte_perm (w3[3], w3[2], selector); + w7[2] = hc_byte_perm (w3[2], w3[1], selector); + w7[1] = hc_byte_perm (w3[1], w3[0], selector); + w7[0] = hc_byte_perm (w3[0], w2[3], selector); + w6[3] = hc_byte_perm (w2[3], w2[2], selector); + w6[2] = hc_byte_perm (w2[2], w2[1], selector); + w6[1] = hc_byte_perm (w2[1], w2[0], selector); + w6[0] = hc_byte_perm (w2[0], w1[3], selector); + w5[3] = hc_byte_perm (w1[3], w1[2], selector); + w5[2] = hc_byte_perm (w1[2], w1[1], selector); + w5[1] = hc_byte_perm (w1[1], w1[0], selector); + w5[0] = hc_byte_perm (w1[0], w0[3], selector); + w4[3] = hc_byte_perm (w0[3], w0[2], selector); + w4[2] = hc_byte_perm (w0[2], w0[1], selector); + w4[1] = hc_byte_perm (w0[1], w0[0], selector); + w4[0] = hc_byte_perm (w0[0], 0, selector); w3[3] = 0; w3[2] = 0; w3[1] = 0; @@ -7724,21 +7724,21 @@ DECLSPEC void switch_buffer_by_offset_8x4_be (u32x *w0, u32x *w1, u32x *w2, u32x break; case 17: - w7[3] = __byte_perm (w3[2], w3[1], selector); - w7[2] = __byte_perm (w3[1], w3[0], selector); - w7[1] = __byte_perm (w3[0], w2[3], selector); - w7[0] = __byte_perm (w2[3], w2[2], selector); - w6[3] = __byte_perm (w2[2], w2[1], selector); - w6[2] = __byte_perm (w2[1], w2[0], selector); - w6[1] = __byte_perm (w2[0], w1[3], selector); - w6[0] = __byte_perm (w1[3], w1[2], selector); - w5[3] = __byte_perm (w1[2], w1[1], selector); - w5[2] = __byte_perm (w1[1], w1[0], selector); - w5[1] = __byte_perm (w1[0], w0[3], selector); - w5[0] = __byte_perm (w0[3], w0[2], selector); - w4[3] = __byte_perm (w0[2], w0[1], selector); - w4[2] = __byte_perm (w0[1], w0[0], selector); - w4[1] = __byte_perm (w0[0], 0, selector); + w7[3] = hc_byte_perm (w3[2], w3[1], selector); + w7[2] = hc_byte_perm (w3[1], w3[0], selector); + w7[1] = hc_byte_perm (w3[0], w2[3], selector); + w7[0] = hc_byte_perm (w2[3], w2[2], selector); + w6[3] = hc_byte_perm (w2[2], w2[1], selector); + w6[2] = hc_byte_perm (w2[1], w2[0], selector); + w6[1] = hc_byte_perm (w2[0], w1[3], selector); + w6[0] = hc_byte_perm (w1[3], w1[2], selector); + w5[3] = hc_byte_perm (w1[2], w1[1], selector); + w5[2] = hc_byte_perm (w1[1], w1[0], selector); + w5[1] = hc_byte_perm (w1[0], w0[3], selector); + w5[0] = hc_byte_perm (w0[3], w0[2], selector); + w4[3] = hc_byte_perm (w0[2], w0[1], selector); + w4[2] = hc_byte_perm (w0[1], w0[0], selector); + w4[1] = hc_byte_perm (w0[0], 0, selector); w4[0] = 0; w3[3] = 0; w3[2] = 0; @@ -7760,20 +7760,20 @@ DECLSPEC void switch_buffer_by_offset_8x4_be (u32x *w0, u32x *w1, u32x *w2, u32x break; case 18: - w7[3] = __byte_perm (w3[1], w3[0], selector); - w7[2] = __byte_perm (w3[0], w2[3], selector); - w7[1] = __byte_perm (w2[3], w2[2], selector); - w7[0] = __byte_perm (w2[2], w2[1], selector); - w6[3] = __byte_perm (w2[1], w2[0], selector); - w6[2] = __byte_perm (w2[0], w1[3], selector); - w6[1] = __byte_perm (w1[3], w1[2], selector); - w6[0] = __byte_perm (w1[2], w1[1], selector); - w5[3] = __byte_perm (w1[1], w1[0], selector); - w5[2] = __byte_perm (w1[0], w0[3], selector); - w5[1] = __byte_perm (w0[3], w0[2], selector); - w5[0] = __byte_perm (w0[2], w0[1], selector); - w4[3] = __byte_perm (w0[1], w0[0], selector); - w4[2] = __byte_perm (w0[0], 0, selector); + w7[3] = hc_byte_perm (w3[1], w3[0], selector); + w7[2] = hc_byte_perm (w3[0], w2[3], selector); + w7[1] = hc_byte_perm (w2[3], w2[2], selector); + w7[0] = hc_byte_perm (w2[2], w2[1], selector); + w6[3] = hc_byte_perm (w2[1], w2[0], selector); + w6[2] = hc_byte_perm (w2[0], w1[3], selector); + w6[1] = hc_byte_perm (w1[3], w1[2], selector); + w6[0] = hc_byte_perm (w1[2], w1[1], selector); + w5[3] = hc_byte_perm (w1[1], w1[0], selector); + w5[2] = hc_byte_perm (w1[0], w0[3], selector); + w5[1] = hc_byte_perm (w0[3], w0[2], selector); + w5[0] = hc_byte_perm (w0[2], w0[1], selector); + w4[3] = hc_byte_perm (w0[1], w0[0], selector); + w4[2] = hc_byte_perm (w0[0], 0, selector); w4[1] = 0; w4[0] = 0; w3[3] = 0; @@ -7796,19 +7796,19 @@ DECLSPEC void switch_buffer_by_offset_8x4_be (u32x *w0, u32x *w1, u32x *w2, u32x break; case 19: - w7[3] = __byte_perm (w3[0], w2[3], selector); - w7[2] = __byte_perm (w2[3], w2[2], selector); - w7[1] = __byte_perm (w2[2], w2[1], selector); - w7[0] = __byte_perm (w2[1], w2[0], selector); - w6[3] = __byte_perm (w2[0], w1[3], selector); - w6[2] = __byte_perm (w1[3], w1[2], selector); - w6[1] = __byte_perm (w1[2], w1[1], selector); - w6[0] = __byte_perm (w1[1], w1[0], selector); - w5[3] = __byte_perm (w1[0], w0[3], selector); - w5[2] = __byte_perm (w0[3], w0[2], selector); - w5[1] = __byte_perm (w0[2], w0[1], selector); - w5[0] = __byte_perm (w0[1], w0[0], selector); - w4[3] = __byte_perm (w0[0], 0, selector); + w7[3] = hc_byte_perm (w3[0], w2[3], selector); + w7[2] = hc_byte_perm (w2[3], w2[2], selector); + w7[1] = hc_byte_perm (w2[2], w2[1], selector); + w7[0] = hc_byte_perm (w2[1], w2[0], selector); + w6[3] = hc_byte_perm (w2[0], w1[3], selector); + w6[2] = hc_byte_perm (w1[3], w1[2], selector); + w6[1] = hc_byte_perm (w1[2], w1[1], selector); + w6[0] = hc_byte_perm (w1[1], w1[0], selector); + w5[3] = hc_byte_perm (w1[0], w0[3], selector); + w5[2] = hc_byte_perm (w0[3], w0[2], selector); + w5[1] = hc_byte_perm (w0[2], w0[1], selector); + w5[0] = hc_byte_perm (w0[1], w0[0], selector); + w4[3] = hc_byte_perm (w0[0], 0, selector); w4[2] = 0; w4[1] = 0; w4[0] = 0; @@ -7832,18 +7832,18 @@ DECLSPEC void switch_buffer_by_offset_8x4_be (u32x *w0, u32x *w1, u32x *w2, u32x break; case 20: - w7[3] = __byte_perm (w2[3], w2[2], selector); - w7[2] = __byte_perm (w2[2], w2[1], selector); - w7[1] = __byte_perm (w2[1], w2[0], selector); - w7[0] = __byte_perm (w2[0], w1[3], selector); - w6[3] = __byte_perm (w1[3], w1[2], selector); - w6[2] = __byte_perm (w1[2], w1[1], selector); - w6[1] = __byte_perm (w1[1], w1[0], selector); - w6[0] = __byte_perm (w1[0], w0[3], selector); - w5[3] = __byte_perm (w0[3], w0[2], selector); - w5[2] = __byte_perm (w0[2], w0[1], selector); - w5[1] = __byte_perm (w0[1], w0[0], selector); - w5[0] = __byte_perm (w0[0], 0, selector); + w7[3] = hc_byte_perm (w2[3], w2[2], selector); + w7[2] = hc_byte_perm (w2[2], w2[1], selector); + w7[1] = hc_byte_perm (w2[1], w2[0], selector); + w7[0] = hc_byte_perm (w2[0], w1[3], selector); + w6[3] = hc_byte_perm (w1[3], w1[2], selector); + w6[2] = hc_byte_perm (w1[2], w1[1], selector); + w6[1] = hc_byte_perm (w1[1], w1[0], selector); + w6[0] = hc_byte_perm (w1[0], w0[3], selector); + w5[3] = hc_byte_perm (w0[3], w0[2], selector); + w5[2] = hc_byte_perm (w0[2], w0[1], selector); + w5[1] = hc_byte_perm (w0[1], w0[0], selector); + w5[0] = hc_byte_perm (w0[0], 0, selector); w4[3] = 0; w4[2] = 0; w4[1] = 0; @@ -7868,17 +7868,17 @@ DECLSPEC void switch_buffer_by_offset_8x4_be (u32x *w0, u32x *w1, u32x *w2, u32x break; case 21: - w7[3] = __byte_perm (w2[2], w2[1], selector); - w7[2] = __byte_perm (w2[1], w2[0], selector); - w7[1] = __byte_perm (w2[0], w1[3], selector); - w7[0] = __byte_perm (w1[3], w1[2], selector); - w6[3] = __byte_perm (w1[2], w1[1], selector); - w6[2] = __byte_perm (w1[1], w1[0], selector); - w6[1] = __byte_perm (w1[0], w0[3], selector); - w6[0] = __byte_perm (w0[3], w0[2], selector); - w5[3] = __byte_perm (w0[2], w0[1], selector); - w5[2] = __byte_perm (w0[1], w0[0], selector); - w5[1] = __byte_perm (w0[0], 0, selector); + w7[3] = hc_byte_perm (w2[2], w2[1], selector); + w7[2] = hc_byte_perm (w2[1], w2[0], selector); + w7[1] = hc_byte_perm (w2[0], w1[3], selector); + w7[0] = hc_byte_perm (w1[3], w1[2], selector); + w6[3] = hc_byte_perm (w1[2], w1[1], selector); + w6[2] = hc_byte_perm (w1[1], w1[0], selector); + w6[1] = hc_byte_perm (w1[0], w0[3], selector); + w6[0] = hc_byte_perm (w0[3], w0[2], selector); + w5[3] = hc_byte_perm (w0[2], w0[1], selector); + w5[2] = hc_byte_perm (w0[1], w0[0], selector); + w5[1] = hc_byte_perm (w0[0], 0, selector); w5[0] = 0; w4[3] = 0; w4[2] = 0; @@ -7904,16 +7904,16 @@ DECLSPEC void switch_buffer_by_offset_8x4_be (u32x *w0, u32x *w1, u32x *w2, u32x break; case 22: - w7[3] = __byte_perm (w2[1], w2[0], selector); - w7[2] = __byte_perm (w2[0], w1[3], selector); - w7[1] = __byte_perm (w1[3], w1[2], selector); - w7[0] = __byte_perm (w1[2], w1[1], selector); - w6[3] = __byte_perm (w1[1], w1[0], selector); - w6[2] = __byte_perm (w1[0], w0[3], selector); - w6[1] = __byte_perm (w0[3], w0[2], selector); - w6[0] = __byte_perm (w0[2], w0[1], selector); - w5[3] = __byte_perm (w0[1], w0[0], selector); - w5[2] = __byte_perm (w0[0], 0, selector); + w7[3] = hc_byte_perm (w2[1], w2[0], selector); + w7[2] = hc_byte_perm (w2[0], w1[3], selector); + w7[1] = hc_byte_perm (w1[3], w1[2], selector); + w7[0] = hc_byte_perm (w1[2], w1[1], selector); + w6[3] = hc_byte_perm (w1[1], w1[0], selector); + w6[2] = hc_byte_perm (w1[0], w0[3], selector); + w6[1] = hc_byte_perm (w0[3], w0[2], selector); + w6[0] = hc_byte_perm (w0[2], w0[1], selector); + w5[3] = hc_byte_perm (w0[1], w0[0], selector); + w5[2] = hc_byte_perm (w0[0], 0, selector); w5[1] = 0; w5[0] = 0; w4[3] = 0; @@ -7940,15 +7940,15 @@ DECLSPEC void switch_buffer_by_offset_8x4_be (u32x *w0, u32x *w1, u32x *w2, u32x break; case 23: - w7[3] = __byte_perm (w2[0], w1[3], selector); - w7[2] = __byte_perm (w1[3], w1[2], selector); - w7[1] = __byte_perm (w1[2], w1[1], selector); - w7[0] = __byte_perm (w1[1], w1[0], selector); - w6[3] = __byte_perm (w1[0], w0[3], selector); - w6[2] = __byte_perm (w0[3], w0[2], selector); - w6[1] = __byte_perm (w0[2], w0[1], selector); - w6[0] = __byte_perm (w0[1], w0[0], selector); - w5[3] = __byte_perm (w0[0], 0, selector); + w7[3] = hc_byte_perm (w2[0], w1[3], selector); + w7[2] = hc_byte_perm (w1[3], w1[2], selector); + w7[1] = hc_byte_perm (w1[2], w1[1], selector); + w7[0] = hc_byte_perm (w1[1], w1[0], selector); + w6[3] = hc_byte_perm (w1[0], w0[3], selector); + w6[2] = hc_byte_perm (w0[3], w0[2], selector); + w6[1] = hc_byte_perm (w0[2], w0[1], selector); + w6[0] = hc_byte_perm (w0[1], w0[0], selector); + w5[3] = hc_byte_perm (w0[0], 0, selector); w5[2] = 0; w5[1] = 0; w5[0] = 0; @@ -7976,14 +7976,14 @@ DECLSPEC void switch_buffer_by_offset_8x4_be (u32x *w0, u32x *w1, u32x *w2, u32x break; case 24: - w7[3] = __byte_perm (w1[3], w1[2], selector); - w7[2] = __byte_perm (w1[2], w1[1], selector); - w7[1] = __byte_perm (w1[1], w1[0], selector); - w7[0] = __byte_perm (w1[0], w0[3], selector); - w6[3] = __byte_perm (w0[3], w0[2], selector); - w6[2] = __byte_perm (w0[2], w0[1], selector); - w6[1] = __byte_perm (w0[1], w0[0], selector); - w6[0] = __byte_perm (w0[0], 0, selector); + w7[3] = hc_byte_perm (w1[3], w1[2], selector); + w7[2] = hc_byte_perm (w1[2], w1[1], selector); + w7[1] = hc_byte_perm (w1[1], w1[0], selector); + w7[0] = hc_byte_perm (w1[0], w0[3], selector); + w6[3] = hc_byte_perm (w0[3], w0[2], selector); + w6[2] = hc_byte_perm (w0[2], w0[1], selector); + w6[1] = hc_byte_perm (w0[1], w0[0], selector); + w6[0] = hc_byte_perm (w0[0], 0, selector); w5[3] = 0; w5[2] = 0; w5[1] = 0; @@ -8012,13 +8012,13 @@ DECLSPEC void switch_buffer_by_offset_8x4_be (u32x *w0, u32x *w1, u32x *w2, u32x break; case 25: - w7[3] = __byte_perm (w1[2], w1[1], selector); - w7[2] = __byte_perm (w1[1], w1[0], selector); - w7[1] = __byte_perm (w1[0], w0[3], selector); - w7[0] = __byte_perm (w0[3], w0[2], selector); - w6[3] = __byte_perm (w0[2], w0[1], selector); - w6[2] = __byte_perm (w0[1], w0[0], selector); - w6[1] = __byte_perm (w0[0], 0, selector); + w7[3] = hc_byte_perm (w1[2], w1[1], selector); + w7[2] = hc_byte_perm (w1[1], w1[0], selector); + w7[1] = hc_byte_perm (w1[0], w0[3], selector); + w7[0] = hc_byte_perm (w0[3], w0[2], selector); + w6[3] = hc_byte_perm (w0[2], w0[1], selector); + w6[2] = hc_byte_perm (w0[1], w0[0], selector); + w6[1] = hc_byte_perm (w0[0], 0, selector); w6[0] = 0; w5[3] = 0; w5[2] = 0; @@ -8048,12 +8048,12 @@ DECLSPEC void switch_buffer_by_offset_8x4_be (u32x *w0, u32x *w1, u32x *w2, u32x break; case 26: - w7[3] = __byte_perm (w1[1], w1[0], selector); - w7[2] = __byte_perm (w1[0], w0[3], selector); - w7[1] = __byte_perm (w0[3], w0[2], selector); - w7[0] = __byte_perm (w0[2], w0[1], selector); - w6[3] = __byte_perm (w0[1], w0[0], selector); - w6[2] = __byte_perm (w0[0], 0, selector); + w7[3] = hc_byte_perm (w1[1], w1[0], selector); + w7[2] = hc_byte_perm (w1[0], w0[3], selector); + w7[1] = hc_byte_perm (w0[3], w0[2], selector); + w7[0] = hc_byte_perm (w0[2], w0[1], selector); + w6[3] = hc_byte_perm (w0[1], w0[0], selector); + w6[2] = hc_byte_perm (w0[0], 0, selector); w6[1] = 0; w6[0] = 0; w5[3] = 0; @@ -8084,11 +8084,11 @@ DECLSPEC void switch_buffer_by_offset_8x4_be (u32x *w0, u32x *w1, u32x *w2, u32x break; case 27: - w7[3] = __byte_perm (w1[0], w0[3], selector); - w7[2] = __byte_perm (w0[3], w0[2], selector); - w7[1] = __byte_perm (w0[2], w0[1], selector); - w7[0] = __byte_perm (w0[1], w0[0], selector); - w6[3] = __byte_perm (w0[0], 0, selector); + w7[3] = hc_byte_perm (w1[0], w0[3], selector); + w7[2] = hc_byte_perm (w0[3], w0[2], selector); + w7[1] = hc_byte_perm (w0[2], w0[1], selector); + w7[0] = hc_byte_perm (w0[1], w0[0], selector); + w6[3] = hc_byte_perm (w0[0], 0, selector); w6[2] = 0; w6[1] = 0; w6[0] = 0; @@ -8120,10 +8120,10 @@ DECLSPEC void switch_buffer_by_offset_8x4_be (u32x *w0, u32x *w1, u32x *w2, u32x break; case 28: - w7[3] = __byte_perm (w0[3], w0[2], selector); - w7[2] = __byte_perm (w0[2], w0[1], selector); - w7[1] = __byte_perm (w0[1], w0[0], selector); - w7[0] = __byte_perm (w0[0], 0, selector); + w7[3] = hc_byte_perm (w0[3], w0[2], selector); + w7[2] = hc_byte_perm (w0[2], w0[1], selector); + w7[1] = hc_byte_perm (w0[1], w0[0], selector); + w7[0] = hc_byte_perm (w0[0], 0, selector); w6[3] = 0; w6[2] = 0; w6[1] = 0; @@ -8156,9 +8156,9 @@ DECLSPEC void switch_buffer_by_offset_8x4_be (u32x *w0, u32x *w1, u32x *w2, u32x break; case 29: - w7[3] = __byte_perm (w0[2], w0[1], selector); - w7[2] = __byte_perm (w0[1], w0[0], selector); - w7[1] = __byte_perm (w0[0], 0, selector); + w7[3] = hc_byte_perm (w0[2], w0[1], selector); + w7[2] = hc_byte_perm (w0[1], w0[0], selector); + w7[1] = hc_byte_perm (w0[0], 0, selector); w7[0] = 0; w6[3] = 0; w6[2] = 0; @@ -8192,8 +8192,8 @@ DECLSPEC void switch_buffer_by_offset_8x4_be (u32x *w0, u32x *w1, u32x *w2, u32x break; case 30: - w7[3] = __byte_perm (w0[1], w0[0], selector); - w7[2] = __byte_perm (w0[0], 0, selector); + w7[3] = hc_byte_perm (w0[1], w0[0], selector); + w7[2] = hc_byte_perm (w0[0], 0, selector); w7[1] = 0; w7[0] = 0; w6[3] = 0; @@ -8228,7 +8228,7 @@ DECLSPEC void switch_buffer_by_offset_8x4_be (u32x *w0, u32x *w1, u32x *w2, u32x break; case 31: - w7[3] = __byte_perm (w0[0], 0, selector); + w7[3] = hc_byte_perm (w0[0], 0, selector); w7[2] = 0; w7[1] = 0; w7[0] = 0; @@ -8274,153 +8274,153 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be (u32x *w0, u32x *w1, u32x *w2 switch (offset_switch) { case 0: - c0[0] = amd_bytealign (w7[3], 0, offset); - w7[3] = amd_bytealign (w7[2], w7[3], offset); - w7[2] = amd_bytealign (w7[1], w7[2], offset); - w7[1] = amd_bytealign (w7[0], w7[1], offset); - w7[0] = amd_bytealign (w6[3], w7[0], offset); - w6[3] = amd_bytealign (w6[2], w6[3], offset); - w6[2] = amd_bytealign (w6[1], w6[2], offset); - w6[1] = amd_bytealign (w6[0], w6[1], offset); - w6[0] = amd_bytealign (w5[3], w6[0], offset); - w5[3] = amd_bytealign (w5[2], w5[3], offset); - w5[2] = amd_bytealign (w5[1], w5[2], offset); - w5[1] = amd_bytealign (w5[0], w5[1], offset); - w5[0] = amd_bytealign (w4[3], w5[0], offset); - w4[3] = amd_bytealign (w4[2], w4[3], offset); - w4[2] = amd_bytealign (w4[1], w4[2], offset); - w4[1] = amd_bytealign (w4[0], w4[1], offset); - w4[0] = amd_bytealign (w3[3], w4[0], offset); - w3[3] = amd_bytealign (w3[2], w3[3], offset); - w3[2] = amd_bytealign (w3[1], w3[2], offset); - w3[1] = amd_bytealign (w3[0], w3[1], offset); - w3[0] = amd_bytealign (w2[3], w3[0], offset); - w2[3] = amd_bytealign (w2[2], w2[3], offset); - w2[2] = amd_bytealign (w2[1], w2[2], offset); - w2[1] = amd_bytealign (w2[0], w2[1], offset); - w2[0] = amd_bytealign (w1[3], w2[0], offset); - w1[3] = amd_bytealign (w1[2], w1[3], offset); - w1[2] = amd_bytealign (w1[1], w1[2], offset); - w1[1] = amd_bytealign (w1[0], w1[1], offset); - w1[0] = amd_bytealign (w0[3], w1[0], offset); - w0[3] = amd_bytealign (w0[2], w0[3], offset); - w0[2] = amd_bytealign (w0[1], w0[2], offset); - w0[1] = amd_bytealign (w0[0], w0[1], offset); - w0[0] = amd_bytealign ( 0, w0[0], offset); + c0[0] = hc_bytealign (w7[3], 0, offset); + w7[3] = hc_bytealign (w7[2], w7[3], offset); + w7[2] = hc_bytealign (w7[1], w7[2], offset); + w7[1] = hc_bytealign (w7[0], w7[1], offset); + w7[0] = hc_bytealign (w6[3], w7[0], offset); + w6[3] = hc_bytealign (w6[2], w6[3], offset); + w6[2] = hc_bytealign (w6[1], w6[2], offset); + w6[1] = hc_bytealign (w6[0], w6[1], offset); + w6[0] = hc_bytealign (w5[3], w6[0], offset); + w5[3] = hc_bytealign (w5[2], w5[3], offset); + w5[2] = hc_bytealign (w5[1], w5[2], offset); + w5[1] = hc_bytealign (w5[0], w5[1], offset); + w5[0] = hc_bytealign (w4[3], w5[0], offset); + w4[3] = hc_bytealign (w4[2], w4[3], offset); + w4[2] = hc_bytealign (w4[1], w4[2], offset); + w4[1] = hc_bytealign (w4[0], w4[1], offset); + w4[0] = hc_bytealign (w3[3], w4[0], offset); + w3[3] = hc_bytealign (w3[2], w3[3], offset); + w3[2] = hc_bytealign (w3[1], w3[2], offset); + w3[1] = hc_bytealign (w3[0], w3[1], offset); + w3[0] = hc_bytealign (w2[3], w3[0], offset); + w2[3] = hc_bytealign (w2[2], w2[3], offset); + w2[2] = hc_bytealign (w2[1], w2[2], offset); + w2[1] = hc_bytealign (w2[0], w2[1], offset); + w2[0] = hc_bytealign (w1[3], w2[0], offset); + w1[3] = hc_bytealign (w1[2], w1[3], offset); + w1[2] = hc_bytealign (w1[1], w1[2], offset); + w1[1] = hc_bytealign (w1[0], w1[1], offset); + w1[0] = hc_bytealign (w0[3], w1[0], offset); + w0[3] = hc_bytealign (w0[2], w0[3], offset); + w0[2] = hc_bytealign (w0[1], w0[2], offset); + w0[1] = hc_bytealign (w0[0], w0[1], offset); + w0[0] = hc_bytealign ( 0, w0[0], offset); break; case 1: - c0[1] = amd_bytealign (w7[3], 0, offset); - c0[0] = amd_bytealign (w7[2], w7[3], offset); - w7[3] = amd_bytealign (w7[1], w7[2], offset); - w7[2] = amd_bytealign (w7[0], w7[1], offset); - w7[1] = amd_bytealign (w6[3], w7[0], offset); - w7[0] = amd_bytealign (w6[2], w6[3], offset); - w6[3] = amd_bytealign (w6[1], w6[2], offset); - w6[2] = amd_bytealign (w6[0], w6[1], offset); - w6[1] = amd_bytealign (w5[3], w6[0], offset); - w6[0] = amd_bytealign (w5[2], w5[3], offset); - w5[3] = amd_bytealign (w5[1], w5[2], offset); - w5[2] = amd_bytealign (w5[0], w5[1], offset); - w5[1] = amd_bytealign (w4[3], w5[0], offset); - w5[0] = amd_bytealign (w4[2], w4[3], offset); - w4[3] = amd_bytealign (w4[1], w4[2], offset); - w4[2] = amd_bytealign (w4[0], w4[1], offset); - w4[1] = amd_bytealign (w3[3], w4[0], offset); - w4[0] = amd_bytealign (w3[2], w3[3], offset); - w3[3] = amd_bytealign (w3[1], w3[2], offset); - w3[2] = amd_bytealign (w3[0], w3[1], offset); - w3[1] = amd_bytealign (w2[3], w3[0], offset); - w3[0] = amd_bytealign (w2[2], w2[3], offset); - w2[3] = amd_bytealign (w2[1], w2[2], offset); - w2[2] = amd_bytealign (w2[0], w2[1], offset); - w2[1] = amd_bytealign (w1[3], w2[0], offset); - w2[0] = amd_bytealign (w1[2], w1[3], offset); - w1[3] = amd_bytealign (w1[1], w1[2], offset); - w1[2] = amd_bytealign (w1[0], w1[1], offset); - w1[1] = amd_bytealign (w0[3], w1[0], offset); - w1[0] = amd_bytealign (w0[2], w0[3], offset); - w0[3] = amd_bytealign (w0[1], w0[2], offset); - w0[2] = amd_bytealign (w0[0], w0[1], offset); - w0[1] = amd_bytealign ( 0, w0[0], offset); + c0[1] = hc_bytealign (w7[3], 0, offset); + c0[0] = hc_bytealign (w7[2], w7[3], offset); + w7[3] = hc_bytealign (w7[1], w7[2], offset); + w7[2] = hc_bytealign (w7[0], w7[1], offset); + w7[1] = hc_bytealign (w6[3], w7[0], offset); + w7[0] = hc_bytealign (w6[2], w6[3], offset); + w6[3] = hc_bytealign (w6[1], w6[2], offset); + w6[2] = hc_bytealign (w6[0], w6[1], offset); + w6[1] = hc_bytealign (w5[3], w6[0], offset); + w6[0] = hc_bytealign (w5[2], w5[3], offset); + w5[3] = hc_bytealign (w5[1], w5[2], offset); + w5[2] = hc_bytealign (w5[0], w5[1], offset); + w5[1] = hc_bytealign (w4[3], w5[0], offset); + w5[0] = hc_bytealign (w4[2], w4[3], offset); + w4[3] = hc_bytealign (w4[1], w4[2], offset); + w4[2] = hc_bytealign (w4[0], w4[1], offset); + w4[1] = hc_bytealign (w3[3], w4[0], offset); + w4[0] = hc_bytealign (w3[2], w3[3], offset); + w3[3] = hc_bytealign (w3[1], w3[2], offset); + w3[2] = hc_bytealign (w3[0], w3[1], offset); + w3[1] = hc_bytealign (w2[3], w3[0], offset); + w3[0] = hc_bytealign (w2[2], w2[3], offset); + w2[3] = hc_bytealign (w2[1], w2[2], offset); + w2[2] = hc_bytealign (w2[0], w2[1], offset); + w2[1] = hc_bytealign (w1[3], w2[0], offset); + w2[0] = hc_bytealign (w1[2], w1[3], offset); + w1[3] = hc_bytealign (w1[1], w1[2], offset); + w1[2] = hc_bytealign (w1[0], w1[1], offset); + w1[1] = hc_bytealign (w0[3], w1[0], offset); + w1[0] = hc_bytealign (w0[2], w0[3], offset); + w0[3] = hc_bytealign (w0[1], w0[2], offset); + w0[2] = hc_bytealign (w0[0], w0[1], offset); + w0[1] = hc_bytealign ( 0, w0[0], offset); w0[0] = 0; break; case 2: - c0[2] = amd_bytealign (w7[3], 0, offset); - c0[1] = amd_bytealign (w7[2], w7[3], offset); - c0[0] = amd_bytealign (w7[1], w7[2], offset); - w7[3] = amd_bytealign (w7[0], w7[1], offset); - w7[2] = amd_bytealign (w6[3], w7[0], offset); - w7[1] = amd_bytealign (w6[2], w6[3], offset); - w7[0] = amd_bytealign (w6[1], w6[2], offset); - w6[3] = amd_bytealign (w6[0], w6[1], offset); - w6[2] = amd_bytealign (w5[3], w6[0], offset); - w6[1] = amd_bytealign (w5[2], w5[3], offset); - w6[0] = amd_bytealign (w5[1], w5[2], offset); - w5[3] = amd_bytealign (w5[0], w5[1], offset); - w5[2] = amd_bytealign (w4[3], w5[0], offset); - w5[1] = amd_bytealign (w4[2], w4[3], offset); - w5[0] = amd_bytealign (w4[1], w4[2], offset); - w4[3] = amd_bytealign (w4[0], w4[1], offset); - w4[2] = amd_bytealign (w3[3], w4[0], offset); - w4[1] = amd_bytealign (w3[2], w3[3], offset); - w4[0] = amd_bytealign (w3[1], w3[2], offset); - w3[3] = amd_bytealign (w3[0], w3[1], offset); - w3[2] = amd_bytealign (w2[3], w3[0], offset); - w3[1] = amd_bytealign (w2[2], w2[3], offset); - w3[0] = amd_bytealign (w2[1], w2[2], offset); - w2[3] = amd_bytealign (w2[0], w2[1], offset); - w2[2] = amd_bytealign (w1[3], w2[0], offset); - w2[1] = amd_bytealign (w1[2], w1[3], offset); - w2[0] = amd_bytealign (w1[1], w1[2], offset); - w1[3] = amd_bytealign (w1[0], w1[1], offset); - w1[2] = amd_bytealign (w0[3], w1[0], offset); - w1[1] = amd_bytealign (w0[2], w0[3], offset); - w1[0] = amd_bytealign (w0[1], w0[2], offset); - w0[3] = amd_bytealign (w0[0], w0[1], offset); - w0[2] = amd_bytealign ( 0, w0[0], offset); + c0[2] = hc_bytealign (w7[3], 0, offset); + c0[1] = hc_bytealign (w7[2], w7[3], offset); + c0[0] = hc_bytealign (w7[1], w7[2], offset); + w7[3] = hc_bytealign (w7[0], w7[1], offset); + w7[2] = hc_bytealign (w6[3], w7[0], offset); + w7[1] = hc_bytealign (w6[2], w6[3], offset); + w7[0] = hc_bytealign (w6[1], w6[2], offset); + w6[3] = hc_bytealign (w6[0], w6[1], offset); + w6[2] = hc_bytealign (w5[3], w6[0], offset); + w6[1] = hc_bytealign (w5[2], w5[3], offset); + w6[0] = hc_bytealign (w5[1], w5[2], offset); + w5[3] = hc_bytealign (w5[0], w5[1], offset); + w5[2] = hc_bytealign (w4[3], w5[0], offset); + w5[1] = hc_bytealign (w4[2], w4[3], offset); + w5[0] = hc_bytealign (w4[1], w4[2], offset); + w4[3] = hc_bytealign (w4[0], w4[1], offset); + w4[2] = hc_bytealign (w3[3], w4[0], offset); + w4[1] = hc_bytealign (w3[2], w3[3], offset); + w4[0] = hc_bytealign (w3[1], w3[2], offset); + w3[3] = hc_bytealign (w3[0], w3[1], offset); + w3[2] = hc_bytealign (w2[3], w3[0], offset); + w3[1] = hc_bytealign (w2[2], w2[3], offset); + w3[0] = hc_bytealign (w2[1], w2[2], offset); + w2[3] = hc_bytealign (w2[0], w2[1], offset); + w2[2] = hc_bytealign (w1[3], w2[0], offset); + w2[1] = hc_bytealign (w1[2], w1[3], offset); + w2[0] = hc_bytealign (w1[1], w1[2], offset); + w1[3] = hc_bytealign (w1[0], w1[1], offset); + w1[2] = hc_bytealign (w0[3], w1[0], offset); + w1[1] = hc_bytealign (w0[2], w0[3], offset); + w1[0] = hc_bytealign (w0[1], w0[2], offset); + w0[3] = hc_bytealign (w0[0], w0[1], offset); + w0[2] = hc_bytealign ( 0, w0[0], offset); w0[1] = 0; w0[0] = 0; break; case 3: - c0[3] = amd_bytealign (w7[3], 0, offset); - c0[2] = amd_bytealign (w7[2], w7[3], offset); - c0[1] = amd_bytealign (w7[1], w7[2], offset); - c0[0] = amd_bytealign (w7[0], w7[1], offset); - w7[3] = amd_bytealign (w6[3], w7[0], offset); - w7[2] = amd_bytealign (w6[2], w6[3], offset); - w7[1] = amd_bytealign (w6[1], w6[2], offset); - w7[0] = amd_bytealign (w6[0], w6[1], offset); - w6[3] = amd_bytealign (w5[3], w6[0], offset); - w6[2] = amd_bytealign (w5[2], w5[3], offset); - w6[1] = amd_bytealign (w5[1], w5[2], offset); - w6[0] = amd_bytealign (w5[0], w5[1], offset); - w5[3] = amd_bytealign (w4[3], w5[0], offset); - w5[2] = amd_bytealign (w4[2], w4[3], offset); - w5[1] = amd_bytealign (w4[1], w4[2], offset); - w5[0] = amd_bytealign (w4[0], w4[1], offset); - w4[3] = amd_bytealign (w3[3], w4[0], offset); - w4[2] = amd_bytealign (w3[2], w3[3], offset); - w4[1] = amd_bytealign (w3[1], w3[2], offset); - w4[0] = amd_bytealign (w3[0], w3[1], offset); - w3[3] = amd_bytealign (w2[3], w3[0], offset); - w3[2] = amd_bytealign (w2[2], w2[3], offset); - w3[1] = amd_bytealign (w2[1], w2[2], offset); - w3[0] = amd_bytealign (w2[0], w2[1], offset); - w2[3] = amd_bytealign (w1[3], w2[0], offset); - w2[2] = amd_bytealign (w1[2], w1[3], offset); - w2[1] = amd_bytealign (w1[1], w1[2], offset); - w2[0] = amd_bytealign (w1[0], w1[1], offset); - w1[3] = amd_bytealign (w0[3], w1[0], offset); - w1[2] = amd_bytealign (w0[2], w0[3], offset); - w1[1] = amd_bytealign (w0[1], w0[2], offset); - w1[0] = amd_bytealign (w0[0], w0[1], offset); - w0[3] = amd_bytealign ( 0, w0[0], offset); + c0[3] = hc_bytealign (w7[3], 0, offset); + c0[2] = hc_bytealign (w7[2], w7[3], offset); + c0[1] = hc_bytealign (w7[1], w7[2], offset); + c0[0] = hc_bytealign (w7[0], w7[1], offset); + w7[3] = hc_bytealign (w6[3], w7[0], offset); + w7[2] = hc_bytealign (w6[2], w6[3], offset); + w7[1] = hc_bytealign (w6[1], w6[2], offset); + w7[0] = hc_bytealign (w6[0], w6[1], offset); + w6[3] = hc_bytealign (w5[3], w6[0], offset); + w6[2] = hc_bytealign (w5[2], w5[3], offset); + w6[1] = hc_bytealign (w5[1], w5[2], offset); + w6[0] = hc_bytealign (w5[0], w5[1], offset); + w5[3] = hc_bytealign (w4[3], w5[0], offset); + w5[2] = hc_bytealign (w4[2], w4[3], offset); + w5[1] = hc_bytealign (w4[1], w4[2], offset); + w5[0] = hc_bytealign (w4[0], w4[1], offset); + w4[3] = hc_bytealign (w3[3], w4[0], offset); + w4[2] = hc_bytealign (w3[2], w3[3], offset); + w4[1] = hc_bytealign (w3[1], w3[2], offset); + w4[0] = hc_bytealign (w3[0], w3[1], offset); + w3[3] = hc_bytealign (w2[3], w3[0], offset); + w3[2] = hc_bytealign (w2[2], w2[3], offset); + w3[1] = hc_bytealign (w2[1], w2[2], offset); + w3[0] = hc_bytealign (w2[0], w2[1], offset); + w2[3] = hc_bytealign (w1[3], w2[0], offset); + w2[2] = hc_bytealign (w1[2], w1[3], offset); + w2[1] = hc_bytealign (w1[1], w1[2], offset); + w2[0] = hc_bytealign (w1[0], w1[1], offset); + w1[3] = hc_bytealign (w0[3], w1[0], offset); + w1[2] = hc_bytealign (w0[2], w0[3], offset); + w1[1] = hc_bytealign (w0[1], w0[2], offset); + w1[0] = hc_bytealign (w0[0], w0[1], offset); + w0[3] = hc_bytealign ( 0, w0[0], offset); w0[2] = 0; w0[1] = 0; w0[0] = 0; @@ -8428,39 +8428,39 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be (u32x *w0, u32x *w1, u32x *w2 break; case 4: - c1[0] = amd_bytealign (w7[3], 0, offset); - c0[3] = amd_bytealign (w7[2], w7[3], offset); - c0[2] = amd_bytealign (w7[1], w7[2], offset); - c0[1] = amd_bytealign (w7[0], w7[1], offset); - c0[0] = amd_bytealign (w6[3], w7[0], offset); - w7[3] = amd_bytealign (w6[2], w6[3], offset); - w7[2] = amd_bytealign (w6[1], w6[2], offset); - w7[1] = amd_bytealign (w6[0], w6[1], offset); - w7[0] = amd_bytealign (w5[3], w6[0], offset); - w6[3] = amd_bytealign (w5[2], w5[3], offset); - w6[2] = amd_bytealign (w5[1], w5[2], offset); - w6[1] = amd_bytealign (w5[0], w5[1], offset); - w6[0] = amd_bytealign (w4[3], w5[0], offset); - w5[3] = amd_bytealign (w4[2], w4[3], offset); - w5[2] = amd_bytealign (w4[1], w4[2], offset); - w5[1] = amd_bytealign (w4[0], w4[1], offset); - w5[0] = amd_bytealign (w3[3], w4[0], offset); - w4[3] = amd_bytealign (w3[2], w3[3], offset); - w4[2] = amd_bytealign (w3[1], w3[2], offset); - w4[1] = amd_bytealign (w3[0], w3[1], offset); - w4[0] = amd_bytealign (w2[3], w3[0], offset); - w3[3] = amd_bytealign (w2[2], w2[3], offset); - w3[2] = amd_bytealign (w2[1], w2[2], offset); - w3[1] = amd_bytealign (w2[0], w2[1], offset); - w3[0] = amd_bytealign (w1[3], w2[0], offset); - w2[3] = amd_bytealign (w1[2], w1[3], offset); - w2[2] = amd_bytealign (w1[1], w1[2], offset); - w2[1] = amd_bytealign (w1[0], w1[1], offset); - w2[0] = amd_bytealign (w0[3], w1[0], offset); - w1[3] = amd_bytealign (w0[2], w0[3], offset); - w1[2] = amd_bytealign (w0[1], w0[2], offset); - w1[1] = amd_bytealign (w0[0], w0[1], offset); - w1[0] = amd_bytealign ( 0, w0[0], offset); + c1[0] = hc_bytealign (w7[3], 0, offset); + c0[3] = hc_bytealign (w7[2], w7[3], offset); + c0[2] = hc_bytealign (w7[1], w7[2], offset); + c0[1] = hc_bytealign (w7[0], w7[1], offset); + c0[0] = hc_bytealign (w6[3], w7[0], offset); + w7[3] = hc_bytealign (w6[2], w6[3], offset); + w7[2] = hc_bytealign (w6[1], w6[2], offset); + w7[1] = hc_bytealign (w6[0], w6[1], offset); + w7[0] = hc_bytealign (w5[3], w6[0], offset); + w6[3] = hc_bytealign (w5[2], w5[3], offset); + w6[2] = hc_bytealign (w5[1], w5[2], offset); + w6[1] = hc_bytealign (w5[0], w5[1], offset); + w6[0] = hc_bytealign (w4[3], w5[0], offset); + w5[3] = hc_bytealign (w4[2], w4[3], offset); + w5[2] = hc_bytealign (w4[1], w4[2], offset); + w5[1] = hc_bytealign (w4[0], w4[1], offset); + w5[0] = hc_bytealign (w3[3], w4[0], offset); + w4[3] = hc_bytealign (w3[2], w3[3], offset); + w4[2] = hc_bytealign (w3[1], w3[2], offset); + w4[1] = hc_bytealign (w3[0], w3[1], offset); + w4[0] = hc_bytealign (w2[3], w3[0], offset); + w3[3] = hc_bytealign (w2[2], w2[3], offset); + w3[2] = hc_bytealign (w2[1], w2[2], offset); + w3[1] = hc_bytealign (w2[0], w2[1], offset); + w3[0] = hc_bytealign (w1[3], w2[0], offset); + w2[3] = hc_bytealign (w1[2], w1[3], offset); + w2[2] = hc_bytealign (w1[1], w1[2], offset); + w2[1] = hc_bytealign (w1[0], w1[1], offset); + w2[0] = hc_bytealign (w0[3], w1[0], offset); + w1[3] = hc_bytealign (w0[2], w0[3], offset); + w1[2] = hc_bytealign (w0[1], w0[2], offset); + w1[1] = hc_bytealign (w0[0], w0[1], offset); + w1[0] = hc_bytealign ( 0, w0[0], offset); w0[3] = 0; w0[2] = 0; w0[1] = 0; @@ -8469,39 +8469,39 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be (u32x *w0, u32x *w1, u32x *w2 break; case 5: - c1[1] = amd_bytealign (w7[3], 0, offset); - c1[0] = amd_bytealign (w7[2], w7[3], offset); - c0[3] = amd_bytealign (w7[1], w7[2], offset); - c0[2] = amd_bytealign (w7[0], w7[1], offset); - c0[1] = amd_bytealign (w6[3], w7[0], offset); - c0[0] = amd_bytealign (w6[2], w6[3], offset); - w7[3] = amd_bytealign (w6[1], w6[2], offset); - w7[2] = amd_bytealign (w6[0], w6[1], offset); - w7[1] = amd_bytealign (w5[3], w6[0], offset); - w7[0] = amd_bytealign (w5[2], w5[3], offset); - w6[3] = amd_bytealign (w5[1], w5[2], offset); - w6[2] = amd_bytealign (w5[0], w5[1], offset); - w6[1] = amd_bytealign (w4[3], w5[0], offset); - w6[0] = amd_bytealign (w4[2], w4[3], offset); - w5[3] = amd_bytealign (w4[1], w4[2], offset); - w5[2] = amd_bytealign (w4[0], w4[1], offset); - w5[1] = amd_bytealign (w3[3], w4[0], offset); - w5[0] = amd_bytealign (w3[2], w3[3], offset); - w4[3] = amd_bytealign (w3[1], w3[2], offset); - w4[2] = amd_bytealign (w3[0], w3[1], offset); - w4[1] = amd_bytealign (w2[3], w3[0], offset); - w4[0] = amd_bytealign (w2[2], w2[3], offset); - w3[3] = amd_bytealign (w2[1], w2[2], offset); - w3[2] = amd_bytealign (w2[0], w2[1], offset); - w3[1] = amd_bytealign (w1[3], w2[0], offset); - w3[0] = amd_bytealign (w1[2], w1[3], offset); - w2[3] = amd_bytealign (w1[1], w1[2], offset); - w2[2] = amd_bytealign (w1[0], w1[1], offset); - w2[1] = amd_bytealign (w0[3], w1[0], offset); - w2[0] = amd_bytealign (w0[2], w0[3], offset); - w1[3] = amd_bytealign (w0[1], w0[2], offset); - w1[2] = amd_bytealign (w0[0], w0[1], offset); - w1[1] = amd_bytealign ( 0, w0[0], offset); + c1[1] = hc_bytealign (w7[3], 0, offset); + c1[0] = hc_bytealign (w7[2], w7[3], offset); + c0[3] = hc_bytealign (w7[1], w7[2], offset); + c0[2] = hc_bytealign (w7[0], w7[1], offset); + c0[1] = hc_bytealign (w6[3], w7[0], offset); + c0[0] = hc_bytealign (w6[2], w6[3], offset); + w7[3] = hc_bytealign (w6[1], w6[2], offset); + w7[2] = hc_bytealign (w6[0], w6[1], offset); + w7[1] = hc_bytealign (w5[3], w6[0], offset); + w7[0] = hc_bytealign (w5[2], w5[3], offset); + w6[3] = hc_bytealign (w5[1], w5[2], offset); + w6[2] = hc_bytealign (w5[0], w5[1], offset); + w6[1] = hc_bytealign (w4[3], w5[0], offset); + w6[0] = hc_bytealign (w4[2], w4[3], offset); + w5[3] = hc_bytealign (w4[1], w4[2], offset); + w5[2] = hc_bytealign (w4[0], w4[1], offset); + w5[1] = hc_bytealign (w3[3], w4[0], offset); + w5[0] = hc_bytealign (w3[2], w3[3], offset); + w4[3] = hc_bytealign (w3[1], w3[2], offset); + w4[2] = hc_bytealign (w3[0], w3[1], offset); + w4[1] = hc_bytealign (w2[3], w3[0], offset); + w4[0] = hc_bytealign (w2[2], w2[3], offset); + w3[3] = hc_bytealign (w2[1], w2[2], offset); + w3[2] = hc_bytealign (w2[0], w2[1], offset); + w3[1] = hc_bytealign (w1[3], w2[0], offset); + w3[0] = hc_bytealign (w1[2], w1[3], offset); + w2[3] = hc_bytealign (w1[1], w1[2], offset); + w2[2] = hc_bytealign (w1[0], w1[1], offset); + w2[1] = hc_bytealign (w0[3], w1[0], offset); + w2[0] = hc_bytealign (w0[2], w0[3], offset); + w1[3] = hc_bytealign (w0[1], w0[2], offset); + w1[2] = hc_bytealign (w0[0], w0[1], offset); + w1[1] = hc_bytealign ( 0, w0[0], offset); w1[0] = 0; w0[3] = 0; w0[2] = 0; @@ -8511,39 +8511,39 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be (u32x *w0, u32x *w1, u32x *w2 break; case 6: - c1[2] = amd_bytealign (w7[3], 0, offset); - c1[1] = amd_bytealign (w7[2], w7[3], offset); - c1[0] = amd_bytealign (w7[1], w7[2], offset); - c0[3] = amd_bytealign (w7[0], w7[1], offset); - c0[2] = amd_bytealign (w6[3], w7[0], offset); - c0[1] = amd_bytealign (w6[2], w6[3], offset); - c0[0] = amd_bytealign (w6[1], w6[2], offset); - w7[3] = amd_bytealign (w6[0], w6[1], offset); - w7[2] = amd_bytealign (w5[3], w6[0], offset); - w7[1] = amd_bytealign (w5[2], w5[3], offset); - w7[0] = amd_bytealign (w5[1], w5[2], offset); - w6[3] = amd_bytealign (w5[0], w5[1], offset); - w6[2] = amd_bytealign (w4[3], w5[0], offset); - w6[1] = amd_bytealign (w4[2], w4[3], offset); - w6[0] = amd_bytealign (w4[1], w4[2], offset); - w5[3] = amd_bytealign (w4[0], w4[1], offset); - w5[2] = amd_bytealign (w3[3], w4[0], offset); - w5[1] = amd_bytealign (w3[2], w3[3], offset); - w5[0] = amd_bytealign (w3[1], w3[2], offset); - w4[3] = amd_bytealign (w3[0], w3[1], offset); - w4[2] = amd_bytealign (w2[3], w3[0], offset); - w4[1] = amd_bytealign (w2[2], w2[3], offset); - w4[0] = amd_bytealign (w2[1], w2[2], offset); - w3[3] = amd_bytealign (w2[0], w2[1], offset); - w3[2] = amd_bytealign (w1[3], w2[0], offset); - w3[1] = amd_bytealign (w1[2], w1[3], offset); - w3[0] = amd_bytealign (w1[1], w1[2], offset); - w2[3] = amd_bytealign (w1[0], w1[1], offset); - w2[2] = amd_bytealign (w0[3], w1[0], offset); - w2[1] = amd_bytealign (w0[2], w0[3], offset); - w2[0] = amd_bytealign (w0[1], w0[2], offset); - w1[3] = amd_bytealign (w0[0], w0[1], offset); - w1[2] = amd_bytealign ( 0, w0[0], offset); + c1[2] = hc_bytealign (w7[3], 0, offset); + c1[1] = hc_bytealign (w7[2], w7[3], offset); + c1[0] = hc_bytealign (w7[1], w7[2], offset); + c0[3] = hc_bytealign (w7[0], w7[1], offset); + c0[2] = hc_bytealign (w6[3], w7[0], offset); + c0[1] = hc_bytealign (w6[2], w6[3], offset); + c0[0] = hc_bytealign (w6[1], w6[2], offset); + w7[3] = hc_bytealign (w6[0], w6[1], offset); + w7[2] = hc_bytealign (w5[3], w6[0], offset); + w7[1] = hc_bytealign (w5[2], w5[3], offset); + w7[0] = hc_bytealign (w5[1], w5[2], offset); + w6[3] = hc_bytealign (w5[0], w5[1], offset); + w6[2] = hc_bytealign (w4[3], w5[0], offset); + w6[1] = hc_bytealign (w4[2], w4[3], offset); + w6[0] = hc_bytealign (w4[1], w4[2], offset); + w5[3] = hc_bytealign (w4[0], w4[1], offset); + w5[2] = hc_bytealign (w3[3], w4[0], offset); + w5[1] = hc_bytealign (w3[2], w3[3], offset); + w5[0] = hc_bytealign (w3[1], w3[2], offset); + w4[3] = hc_bytealign (w3[0], w3[1], offset); + w4[2] = hc_bytealign (w2[3], w3[0], offset); + w4[1] = hc_bytealign (w2[2], w2[3], offset); + w4[0] = hc_bytealign (w2[1], w2[2], offset); + w3[3] = hc_bytealign (w2[0], w2[1], offset); + w3[2] = hc_bytealign (w1[3], w2[0], offset); + w3[1] = hc_bytealign (w1[2], w1[3], offset); + w3[0] = hc_bytealign (w1[1], w1[2], offset); + w2[3] = hc_bytealign (w1[0], w1[1], offset); + w2[2] = hc_bytealign (w0[3], w1[0], offset); + w2[1] = hc_bytealign (w0[2], w0[3], offset); + w2[0] = hc_bytealign (w0[1], w0[2], offset); + w1[3] = hc_bytealign (w0[0], w0[1], offset); + w1[2] = hc_bytealign ( 0, w0[0], offset); w1[1] = 0; w1[0] = 0; w0[3] = 0; @@ -8554,39 +8554,39 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be (u32x *w0, u32x *w1, u32x *w2 break; case 7: - c1[3] = amd_bytealign (w7[3], 0, offset); - c1[2] = amd_bytealign (w7[2], w7[3], offset); - c1[1] = amd_bytealign (w7[1], w7[2], offset); - c1[0] = amd_bytealign (w7[0], w7[1], offset); - c0[3] = amd_bytealign (w6[3], w7[0], offset); - c0[2] = amd_bytealign (w6[2], w6[3], offset); - c0[1] = amd_bytealign (w6[1], w6[2], offset); - c0[0] = amd_bytealign (w6[0], w6[1], offset); - w7[3] = amd_bytealign (w5[3], w6[0], offset); - w7[2] = amd_bytealign (w5[2], w5[3], offset); - w7[1] = amd_bytealign (w5[1], w5[2], offset); - w7[0] = amd_bytealign (w5[0], w5[1], offset); - w6[3] = amd_bytealign (w4[3], w5[0], offset); - w6[2] = amd_bytealign (w4[2], w4[3], offset); - w6[1] = amd_bytealign (w4[1], w4[2], offset); - w6[0] = amd_bytealign (w4[0], w4[1], offset); - w5[3] = amd_bytealign (w3[3], w4[0], offset); - w5[2] = amd_bytealign (w3[2], w3[3], offset); - w5[1] = amd_bytealign (w3[1], w3[2], offset); - w5[0] = amd_bytealign (w3[0], w3[1], offset); - w4[3] = amd_bytealign (w2[3], w3[0], offset); - w4[2] = amd_bytealign (w2[2], w2[3], offset); - w4[1] = amd_bytealign (w2[1], w2[2], offset); - w4[0] = amd_bytealign (w2[0], w2[1], offset); - w3[3] = amd_bytealign (w1[3], w2[0], offset); - w3[2] = amd_bytealign (w1[2], w1[3], offset); - w3[1] = amd_bytealign (w1[1], w1[2], offset); - w3[0] = amd_bytealign (w1[0], w1[1], offset); - w2[3] = amd_bytealign (w0[3], w1[0], offset); - w2[2] = amd_bytealign (w0[2], w0[3], offset); - w2[1] = amd_bytealign (w0[1], w0[2], offset); - w2[0] = amd_bytealign (w0[0], w0[1], offset); - w1[3] = amd_bytealign ( 0, w0[0], offset); + c1[3] = hc_bytealign (w7[3], 0, offset); + c1[2] = hc_bytealign (w7[2], w7[3], offset); + c1[1] = hc_bytealign (w7[1], w7[2], offset); + c1[0] = hc_bytealign (w7[0], w7[1], offset); + c0[3] = hc_bytealign (w6[3], w7[0], offset); + c0[2] = hc_bytealign (w6[2], w6[3], offset); + c0[1] = hc_bytealign (w6[1], w6[2], offset); + c0[0] = hc_bytealign (w6[0], w6[1], offset); + w7[3] = hc_bytealign (w5[3], w6[0], offset); + w7[2] = hc_bytealign (w5[2], w5[3], offset); + w7[1] = hc_bytealign (w5[1], w5[2], offset); + w7[0] = hc_bytealign (w5[0], w5[1], offset); + w6[3] = hc_bytealign (w4[3], w5[0], offset); + w6[2] = hc_bytealign (w4[2], w4[3], offset); + w6[1] = hc_bytealign (w4[1], w4[2], offset); + w6[0] = hc_bytealign (w4[0], w4[1], offset); + w5[3] = hc_bytealign (w3[3], w4[0], offset); + w5[2] = hc_bytealign (w3[2], w3[3], offset); + w5[1] = hc_bytealign (w3[1], w3[2], offset); + w5[0] = hc_bytealign (w3[0], w3[1], offset); + w4[3] = hc_bytealign (w2[3], w3[0], offset); + w4[2] = hc_bytealign (w2[2], w2[3], offset); + w4[1] = hc_bytealign (w2[1], w2[2], offset); + w4[0] = hc_bytealign (w2[0], w2[1], offset); + w3[3] = hc_bytealign (w1[3], w2[0], offset); + w3[2] = hc_bytealign (w1[2], w1[3], offset); + w3[1] = hc_bytealign (w1[1], w1[2], offset); + w3[0] = hc_bytealign (w1[0], w1[1], offset); + w2[3] = hc_bytealign (w0[3], w1[0], offset); + w2[2] = hc_bytealign (w0[2], w0[3], offset); + w2[1] = hc_bytealign (w0[1], w0[2], offset); + w2[0] = hc_bytealign (w0[0], w0[1], offset); + w1[3] = hc_bytealign ( 0, w0[0], offset); w1[2] = 0; w1[1] = 0; w1[0] = 0; @@ -8598,39 +8598,39 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be (u32x *w0, u32x *w1, u32x *w2 break; case 8: - c2[0] = amd_bytealign (w7[3], 0, offset); - c1[3] = amd_bytealign (w7[2], w7[3], offset); - c1[2] = amd_bytealign (w7[1], w7[2], offset); - c1[1] = amd_bytealign (w7[0], w7[1], offset); - c1[0] = amd_bytealign (w6[3], w7[0], offset); - c0[3] = amd_bytealign (w6[2], w6[3], offset); - c0[2] = amd_bytealign (w6[1], w6[2], offset); - c0[1] = amd_bytealign (w6[0], w6[1], offset); - c0[0] = amd_bytealign (w5[3], w6[0], offset); - w7[3] = amd_bytealign (w5[2], w5[3], offset); - w7[2] = amd_bytealign (w5[1], w5[2], offset); - w7[1] = amd_bytealign (w5[0], w5[1], offset); - w7[0] = amd_bytealign (w4[3], w5[0], offset); - w6[3] = amd_bytealign (w4[2], w4[3], offset); - w6[2] = amd_bytealign (w4[1], w4[2], offset); - w6[1] = amd_bytealign (w4[0], w4[1], offset); - w6[0] = amd_bytealign (w3[3], w4[0], offset); - w5[3] = amd_bytealign (w3[2], w3[3], offset); - w5[2] = amd_bytealign (w3[1], w3[2], offset); - w5[1] = amd_bytealign (w3[0], w3[1], offset); - w5[0] = amd_bytealign (w2[3], w3[0], offset); - w4[3] = amd_bytealign (w2[2], w2[3], offset); - w4[2] = amd_bytealign (w2[1], w2[2], offset); - w4[1] = amd_bytealign (w2[0], w2[1], offset); - w4[0] = amd_bytealign (w1[3], w2[0], offset); - w3[3] = amd_bytealign (w1[2], w1[3], offset); - w3[2] = amd_bytealign (w1[1], w1[2], offset); - w3[1] = amd_bytealign (w1[0], w1[1], offset); - w3[0] = amd_bytealign (w0[3], w1[0], offset); - w2[3] = amd_bytealign (w0[2], w0[3], offset); - w2[2] = amd_bytealign (w0[1], w0[2], offset); - w2[1] = amd_bytealign (w0[0], w0[1], offset); - w2[0] = amd_bytealign ( 0, w0[0], offset); + c2[0] = hc_bytealign (w7[3], 0, offset); + c1[3] = hc_bytealign (w7[2], w7[3], offset); + c1[2] = hc_bytealign (w7[1], w7[2], offset); + c1[1] = hc_bytealign (w7[0], w7[1], offset); + c1[0] = hc_bytealign (w6[3], w7[0], offset); + c0[3] = hc_bytealign (w6[2], w6[3], offset); + c0[2] = hc_bytealign (w6[1], w6[2], offset); + c0[1] = hc_bytealign (w6[0], w6[1], offset); + c0[0] = hc_bytealign (w5[3], w6[0], offset); + w7[3] = hc_bytealign (w5[2], w5[3], offset); + w7[2] = hc_bytealign (w5[1], w5[2], offset); + w7[1] = hc_bytealign (w5[0], w5[1], offset); + w7[0] = hc_bytealign (w4[3], w5[0], offset); + w6[3] = hc_bytealign (w4[2], w4[3], offset); + w6[2] = hc_bytealign (w4[1], w4[2], offset); + w6[1] = hc_bytealign (w4[0], w4[1], offset); + w6[0] = hc_bytealign (w3[3], w4[0], offset); + w5[3] = hc_bytealign (w3[2], w3[3], offset); + w5[2] = hc_bytealign (w3[1], w3[2], offset); + w5[1] = hc_bytealign (w3[0], w3[1], offset); + w5[0] = hc_bytealign (w2[3], w3[0], offset); + w4[3] = hc_bytealign (w2[2], w2[3], offset); + w4[2] = hc_bytealign (w2[1], w2[2], offset); + w4[1] = hc_bytealign (w2[0], w2[1], offset); + w4[0] = hc_bytealign (w1[3], w2[0], offset); + w3[3] = hc_bytealign (w1[2], w1[3], offset); + w3[2] = hc_bytealign (w1[1], w1[2], offset); + w3[1] = hc_bytealign (w1[0], w1[1], offset); + w3[0] = hc_bytealign (w0[3], w1[0], offset); + w2[3] = hc_bytealign (w0[2], w0[3], offset); + w2[2] = hc_bytealign (w0[1], w0[2], offset); + w2[1] = hc_bytealign (w0[0], w0[1], offset); + w2[0] = hc_bytealign ( 0, w0[0], offset); w1[3] = 0; w1[2] = 0; w1[1] = 0; @@ -8643,39 +8643,39 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be (u32x *w0, u32x *w1, u32x *w2 break; case 9: - c2[1] = amd_bytealign (w7[3], 0, offset); - c2[0] = amd_bytealign (w7[2], w7[3], offset); - c1[3] = amd_bytealign (w7[1], w7[2], offset); - c1[2] = amd_bytealign (w7[0], w7[1], offset); - c1[1] = amd_bytealign (w6[3], w7[0], offset); - c1[0] = amd_bytealign (w6[2], w6[3], offset); - c0[3] = amd_bytealign (w6[1], w6[2], offset); - c0[2] = amd_bytealign (w6[0], w6[1], offset); - c0[1] = amd_bytealign (w5[3], w6[0], offset); - c0[0] = amd_bytealign (w5[2], w5[3], offset); - w7[3] = amd_bytealign (w5[1], w5[2], offset); - w7[2] = amd_bytealign (w5[0], w5[1], offset); - w7[1] = amd_bytealign (w4[3], w5[0], offset); - w7[0] = amd_bytealign (w4[2], w4[3], offset); - w6[3] = amd_bytealign (w4[1], w4[2], offset); - w6[2] = amd_bytealign (w4[0], w4[1], offset); - w6[1] = amd_bytealign (w3[3], w4[0], offset); - w6[0] = amd_bytealign (w3[2], w3[3], offset); - w5[3] = amd_bytealign (w3[1], w3[2], offset); - w5[2] = amd_bytealign (w3[0], w3[1], offset); - w5[1] = amd_bytealign (w2[3], w3[0], offset); - w5[0] = amd_bytealign (w2[2], w2[3], offset); - w4[3] = amd_bytealign (w2[1], w2[2], offset); - w4[2] = amd_bytealign (w2[0], w2[1], offset); - w4[1] = amd_bytealign (w1[3], w2[0], offset); - w4[0] = amd_bytealign (w1[2], w1[3], offset); - w3[3] = amd_bytealign (w1[1], w1[2], offset); - w3[2] = amd_bytealign (w1[0], w1[1], offset); - w3[1] = amd_bytealign (w0[3], w1[0], offset); - w3[0] = amd_bytealign (w0[2], w0[3], offset); - w2[3] = amd_bytealign (w0[1], w0[2], offset); - w2[2] = amd_bytealign (w0[0], w0[1], offset); - w2[1] = amd_bytealign ( 0, w0[0], offset); + c2[1] = hc_bytealign (w7[3], 0, offset); + c2[0] = hc_bytealign (w7[2], w7[3], offset); + c1[3] = hc_bytealign (w7[1], w7[2], offset); + c1[2] = hc_bytealign (w7[0], w7[1], offset); + c1[1] = hc_bytealign (w6[3], w7[0], offset); + c1[0] = hc_bytealign (w6[2], w6[3], offset); + c0[3] = hc_bytealign (w6[1], w6[2], offset); + c0[2] = hc_bytealign (w6[0], w6[1], offset); + c0[1] = hc_bytealign (w5[3], w6[0], offset); + c0[0] = hc_bytealign (w5[2], w5[3], offset); + w7[3] = hc_bytealign (w5[1], w5[2], offset); + w7[2] = hc_bytealign (w5[0], w5[1], offset); + w7[1] = hc_bytealign (w4[3], w5[0], offset); + w7[0] = hc_bytealign (w4[2], w4[3], offset); + w6[3] = hc_bytealign (w4[1], w4[2], offset); + w6[2] = hc_bytealign (w4[0], w4[1], offset); + w6[1] = hc_bytealign (w3[3], w4[0], offset); + w6[0] = hc_bytealign (w3[2], w3[3], offset); + w5[3] = hc_bytealign (w3[1], w3[2], offset); + w5[2] = hc_bytealign (w3[0], w3[1], offset); + w5[1] = hc_bytealign (w2[3], w3[0], offset); + w5[0] = hc_bytealign (w2[2], w2[3], offset); + w4[3] = hc_bytealign (w2[1], w2[2], offset); + w4[2] = hc_bytealign (w2[0], w2[1], offset); + w4[1] = hc_bytealign (w1[3], w2[0], offset); + w4[0] = hc_bytealign (w1[2], w1[3], offset); + w3[3] = hc_bytealign (w1[1], w1[2], offset); + w3[2] = hc_bytealign (w1[0], w1[1], offset); + w3[1] = hc_bytealign (w0[3], w1[0], offset); + w3[0] = hc_bytealign (w0[2], w0[3], offset); + w2[3] = hc_bytealign (w0[1], w0[2], offset); + w2[2] = hc_bytealign (w0[0], w0[1], offset); + w2[1] = hc_bytealign ( 0, w0[0], offset); w2[0] = 0; w1[3] = 0; w1[2] = 0; @@ -8689,39 +8689,39 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be (u32x *w0, u32x *w1, u32x *w2 break; case 10: - c2[2] = amd_bytealign (w7[3], 0, offset); - c2[1] = amd_bytealign (w7[2], w7[3], offset); - c2[0] = amd_bytealign (w7[1], w7[2], offset); - c1[3] = amd_bytealign (w7[0], w7[1], offset); - c1[2] = amd_bytealign (w6[3], w7[0], offset); - c1[1] = amd_bytealign (w6[2], w6[3], offset); - c1[0] = amd_bytealign (w6[1], w6[2], offset); - c0[3] = amd_bytealign (w6[0], w6[1], offset); - c0[2] = amd_bytealign (w5[3], w6[0], offset); - c0[1] = amd_bytealign (w5[2], w5[3], offset); - c0[0] = amd_bytealign (w5[1], w5[2], offset); - w7[3] = amd_bytealign (w5[0], w5[1], offset); - w7[2] = amd_bytealign (w4[3], w5[0], offset); - w7[1] = amd_bytealign (w4[2], w4[3], offset); - w7[0] = amd_bytealign (w4[1], w4[2], offset); - w6[3] = amd_bytealign (w4[0], w4[1], offset); - w6[2] = amd_bytealign (w3[3], w4[0], offset); - w6[1] = amd_bytealign (w3[2], w3[3], offset); - w6[0] = amd_bytealign (w3[1], w3[2], offset); - w5[3] = amd_bytealign (w3[0], w3[1], offset); - w5[2] = amd_bytealign (w2[3], w3[0], offset); - w5[1] = amd_bytealign (w2[2], w2[3], offset); - w5[0] = amd_bytealign (w2[1], w2[2], offset); - w4[3] = amd_bytealign (w2[0], w2[1], offset); - w4[2] = amd_bytealign (w1[3], w2[0], offset); - w4[1] = amd_bytealign (w1[2], w1[3], offset); - w4[0] = amd_bytealign (w1[1], w1[2], offset); - w3[3] = amd_bytealign (w1[0], w1[1], offset); - w3[2] = amd_bytealign (w0[3], w1[0], offset); - w3[1] = amd_bytealign (w0[2], w0[3], offset); - w3[0] = amd_bytealign (w0[1], w0[2], offset); - w2[3] = amd_bytealign (w0[0], w0[1], offset); - w2[2] = amd_bytealign ( 0, w0[0], offset); + c2[2] = hc_bytealign (w7[3], 0, offset); + c2[1] = hc_bytealign (w7[2], w7[3], offset); + c2[0] = hc_bytealign (w7[1], w7[2], offset); + c1[3] = hc_bytealign (w7[0], w7[1], offset); + c1[2] = hc_bytealign (w6[3], w7[0], offset); + c1[1] = hc_bytealign (w6[2], w6[3], offset); + c1[0] = hc_bytealign (w6[1], w6[2], offset); + c0[3] = hc_bytealign (w6[0], w6[1], offset); + c0[2] = hc_bytealign (w5[3], w6[0], offset); + c0[1] = hc_bytealign (w5[2], w5[3], offset); + c0[0] = hc_bytealign (w5[1], w5[2], offset); + w7[3] = hc_bytealign (w5[0], w5[1], offset); + w7[2] = hc_bytealign (w4[3], w5[0], offset); + w7[1] = hc_bytealign (w4[2], w4[3], offset); + w7[0] = hc_bytealign (w4[1], w4[2], offset); + w6[3] = hc_bytealign (w4[0], w4[1], offset); + w6[2] = hc_bytealign (w3[3], w4[0], offset); + w6[1] = hc_bytealign (w3[2], w3[3], offset); + w6[0] = hc_bytealign (w3[1], w3[2], offset); + w5[3] = hc_bytealign (w3[0], w3[1], offset); + w5[2] = hc_bytealign (w2[3], w3[0], offset); + w5[1] = hc_bytealign (w2[2], w2[3], offset); + w5[0] = hc_bytealign (w2[1], w2[2], offset); + w4[3] = hc_bytealign (w2[0], w2[1], offset); + w4[2] = hc_bytealign (w1[3], w2[0], offset); + w4[1] = hc_bytealign (w1[2], w1[3], offset); + w4[0] = hc_bytealign (w1[1], w1[2], offset); + w3[3] = hc_bytealign (w1[0], w1[1], offset); + w3[2] = hc_bytealign (w0[3], w1[0], offset); + w3[1] = hc_bytealign (w0[2], w0[3], offset); + w3[0] = hc_bytealign (w0[1], w0[2], offset); + w2[3] = hc_bytealign (w0[0], w0[1], offset); + w2[2] = hc_bytealign ( 0, w0[0], offset); w2[1] = 0; w2[0] = 0; w1[3] = 0; @@ -8736,39 +8736,39 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be (u32x *w0, u32x *w1, u32x *w2 break; case 11: - c2[3] = amd_bytealign (w7[3], 0, offset); - c2[2] = amd_bytealign (w7[2], w7[3], offset); - c2[1] = amd_bytealign (w7[1], w7[2], offset); - c2[0] = amd_bytealign (w7[0], w7[1], offset); - c1[3] = amd_bytealign (w6[3], w7[0], offset); - c1[2] = amd_bytealign (w6[2], w6[3], offset); - c1[1] = amd_bytealign (w6[1], w6[2], offset); - c1[0] = amd_bytealign (w6[0], w6[1], offset); - c0[3] = amd_bytealign (w5[3], w6[0], offset); - c0[2] = amd_bytealign (w5[2], w5[3], offset); - c0[1] = amd_bytealign (w5[1], w5[2], offset); - c0[0] = amd_bytealign (w5[0], w5[1], offset); - w7[3] = amd_bytealign (w4[3], w5[0], offset); - w7[2] = amd_bytealign (w4[2], w4[3], offset); - w7[1] = amd_bytealign (w4[1], w4[2], offset); - w7[0] = amd_bytealign (w4[0], w4[1], offset); - w6[3] = amd_bytealign (w3[3], w4[0], offset); - w6[2] = amd_bytealign (w3[2], w3[3], offset); - w6[1] = amd_bytealign (w3[1], w3[2], offset); - w6[0] = amd_bytealign (w3[0], w3[1], offset); - w5[3] = amd_bytealign (w2[3], w3[0], offset); - w5[2] = amd_bytealign (w2[2], w2[3], offset); - w5[1] = amd_bytealign (w2[1], w2[2], offset); - w5[0] = amd_bytealign (w2[0], w2[1], offset); - w4[3] = amd_bytealign (w1[3], w2[0], offset); - w4[2] = amd_bytealign (w1[2], w1[3], offset); - w4[1] = amd_bytealign (w1[1], w1[2], offset); - w4[0] = amd_bytealign (w1[0], w1[1], offset); - w3[3] = amd_bytealign (w0[3], w1[0], offset); - w3[2] = amd_bytealign (w0[2], w0[3], offset); - w3[1] = amd_bytealign (w0[1], w0[2], offset); - w3[0] = amd_bytealign (w0[0], w0[1], offset); - w2[3] = amd_bytealign ( 0, w0[0], offset); + c2[3] = hc_bytealign (w7[3], 0, offset); + c2[2] = hc_bytealign (w7[2], w7[3], offset); + c2[1] = hc_bytealign (w7[1], w7[2], offset); + c2[0] = hc_bytealign (w7[0], w7[1], offset); + c1[3] = hc_bytealign (w6[3], w7[0], offset); + c1[2] = hc_bytealign (w6[2], w6[3], offset); + c1[1] = hc_bytealign (w6[1], w6[2], offset); + c1[0] = hc_bytealign (w6[0], w6[1], offset); + c0[3] = hc_bytealign (w5[3], w6[0], offset); + c0[2] = hc_bytealign (w5[2], w5[3], offset); + c0[1] = hc_bytealign (w5[1], w5[2], offset); + c0[0] = hc_bytealign (w5[0], w5[1], offset); + w7[3] = hc_bytealign (w4[3], w5[0], offset); + w7[2] = hc_bytealign (w4[2], w4[3], offset); + w7[1] = hc_bytealign (w4[1], w4[2], offset); + w7[0] = hc_bytealign (w4[0], w4[1], offset); + w6[3] = hc_bytealign (w3[3], w4[0], offset); + w6[2] = hc_bytealign (w3[2], w3[3], offset); + w6[1] = hc_bytealign (w3[1], w3[2], offset); + w6[0] = hc_bytealign (w3[0], w3[1], offset); + w5[3] = hc_bytealign (w2[3], w3[0], offset); + w5[2] = hc_bytealign (w2[2], w2[3], offset); + w5[1] = hc_bytealign (w2[1], w2[2], offset); + w5[0] = hc_bytealign (w2[0], w2[1], offset); + w4[3] = hc_bytealign (w1[3], w2[0], offset); + w4[2] = hc_bytealign (w1[2], w1[3], offset); + w4[1] = hc_bytealign (w1[1], w1[2], offset); + w4[0] = hc_bytealign (w1[0], w1[1], offset); + w3[3] = hc_bytealign (w0[3], w1[0], offset); + w3[2] = hc_bytealign (w0[2], w0[3], offset); + w3[1] = hc_bytealign (w0[1], w0[2], offset); + w3[0] = hc_bytealign (w0[0], w0[1], offset); + w2[3] = hc_bytealign ( 0, w0[0], offset); w2[2] = 0; w2[1] = 0; w2[0] = 0; @@ -8784,39 +8784,39 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be (u32x *w0, u32x *w1, u32x *w2 break; case 12: - c3[0] = amd_bytealign (w7[3], 0, offset); - c2[3] = amd_bytealign (w7[2], w7[3], offset); - c2[2] = amd_bytealign (w7[1], w7[2], offset); - c2[1] = amd_bytealign (w7[0], w7[1], offset); - c2[0] = amd_bytealign (w6[3], w7[0], offset); - c1[3] = amd_bytealign (w6[2], w6[3], offset); - c1[2] = amd_bytealign (w6[1], w6[2], offset); - c1[1] = amd_bytealign (w6[0], w6[1], offset); - c1[0] = amd_bytealign (w5[3], w6[0], offset); - c0[3] = amd_bytealign (w5[2], w5[3], offset); - c0[2] = amd_bytealign (w5[1], w5[2], offset); - c0[1] = amd_bytealign (w5[0], w5[1], offset); - c0[0] = amd_bytealign (w4[3], w5[0], offset); - w7[3] = amd_bytealign (w4[2], w4[3], offset); - w7[2] = amd_bytealign (w4[1], w4[2], offset); - w7[1] = amd_bytealign (w4[0], w4[1], offset); - w7[0] = amd_bytealign (w3[3], w4[0], offset); - w6[3] = amd_bytealign (w3[2], w3[3], offset); - w6[2] = amd_bytealign (w3[1], w3[2], offset); - w6[1] = amd_bytealign (w3[0], w3[1], offset); - w6[0] = amd_bytealign (w2[3], w3[0], offset); - w5[3] = amd_bytealign (w2[2], w2[3], offset); - w5[2] = amd_bytealign (w2[1], w2[2], offset); - w5[1] = amd_bytealign (w2[0], w2[1], offset); - w5[0] = amd_bytealign (w1[3], w2[0], offset); - w4[3] = amd_bytealign (w1[2], w1[3], offset); - w4[2] = amd_bytealign (w1[1], w1[2], offset); - w4[1] = amd_bytealign (w1[0], w1[1], offset); - w4[0] = amd_bytealign (w0[3], w1[0], offset); - w3[3] = amd_bytealign (w0[2], w0[3], offset); - w3[2] = amd_bytealign (w0[1], w0[2], offset); - w3[1] = amd_bytealign (w0[0], w0[1], offset); - w3[0] = amd_bytealign ( 0, w0[0], offset); + c3[0] = hc_bytealign (w7[3], 0, offset); + c2[3] = hc_bytealign (w7[2], w7[3], offset); + c2[2] = hc_bytealign (w7[1], w7[2], offset); + c2[1] = hc_bytealign (w7[0], w7[1], offset); + c2[0] = hc_bytealign (w6[3], w7[0], offset); + c1[3] = hc_bytealign (w6[2], w6[3], offset); + c1[2] = hc_bytealign (w6[1], w6[2], offset); + c1[1] = hc_bytealign (w6[0], w6[1], offset); + c1[0] = hc_bytealign (w5[3], w6[0], offset); + c0[3] = hc_bytealign (w5[2], w5[3], offset); + c0[2] = hc_bytealign (w5[1], w5[2], offset); + c0[1] = hc_bytealign (w5[0], w5[1], offset); + c0[0] = hc_bytealign (w4[3], w5[0], offset); + w7[3] = hc_bytealign (w4[2], w4[3], offset); + w7[2] = hc_bytealign (w4[1], w4[2], offset); + w7[1] = hc_bytealign (w4[0], w4[1], offset); + w7[0] = hc_bytealign (w3[3], w4[0], offset); + w6[3] = hc_bytealign (w3[2], w3[3], offset); + w6[2] = hc_bytealign (w3[1], w3[2], offset); + w6[1] = hc_bytealign (w3[0], w3[1], offset); + w6[0] = hc_bytealign (w2[3], w3[0], offset); + w5[3] = hc_bytealign (w2[2], w2[3], offset); + w5[2] = hc_bytealign (w2[1], w2[2], offset); + w5[1] = hc_bytealign (w2[0], w2[1], offset); + w5[0] = hc_bytealign (w1[3], w2[0], offset); + w4[3] = hc_bytealign (w1[2], w1[3], offset); + w4[2] = hc_bytealign (w1[1], w1[2], offset); + w4[1] = hc_bytealign (w1[0], w1[1], offset); + w4[0] = hc_bytealign (w0[3], w1[0], offset); + w3[3] = hc_bytealign (w0[2], w0[3], offset); + w3[2] = hc_bytealign (w0[1], w0[2], offset); + w3[1] = hc_bytealign (w0[0], w0[1], offset); + w3[0] = hc_bytealign ( 0, w0[0], offset); w2[3] = 0; w2[2] = 0; w2[1] = 0; @@ -8833,39 +8833,39 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be (u32x *w0, u32x *w1, u32x *w2 break; case 13: - c3[1] = amd_bytealign (w7[3], 0, offset); - c3[0] = amd_bytealign (w7[2], w7[3], offset); - c2[3] = amd_bytealign (w7[1], w7[2], offset); - c2[2] = amd_bytealign (w7[0], w7[1], offset); - c2[1] = amd_bytealign (w6[3], w7[0], offset); - c2[0] = amd_bytealign (w6[2], w6[3], offset); - c1[3] = amd_bytealign (w6[1], w6[2], offset); - c1[2] = amd_bytealign (w6[0], w6[1], offset); - c1[1] = amd_bytealign (w5[3], w6[0], offset); - c1[0] = amd_bytealign (w5[2], w5[3], offset); - c0[3] = amd_bytealign (w5[1], w5[2], offset); - c0[2] = amd_bytealign (w5[0], w5[1], offset); - c0[1] = amd_bytealign (w4[3], w5[0], offset); - c0[0] = amd_bytealign (w4[2], w4[3], offset); - w7[3] = amd_bytealign (w4[1], w4[2], offset); - w7[2] = amd_bytealign (w4[0], w4[1], offset); - w7[1] = amd_bytealign (w3[3], w4[0], offset); - w7[0] = amd_bytealign (w3[2], w3[3], offset); - w6[3] = amd_bytealign (w3[1], w3[2], offset); - w6[2] = amd_bytealign (w3[0], w3[1], offset); - w6[1] = amd_bytealign (w2[3], w3[0], offset); - w6[0] = amd_bytealign (w2[2], w2[3], offset); - w5[3] = amd_bytealign (w2[1], w2[2], offset); - w5[2] = amd_bytealign (w2[0], w2[1], offset); - w5[1] = amd_bytealign (w1[3], w2[0], offset); - w5[0] = amd_bytealign (w1[2], w1[3], offset); - w4[3] = amd_bytealign (w1[1], w1[2], offset); - w4[2] = amd_bytealign (w1[0], w1[1], offset); - w4[1] = amd_bytealign (w0[3], w1[0], offset); - w4[0] = amd_bytealign (w0[2], w0[3], offset); - w3[3] = amd_bytealign (w0[1], w0[2], offset); - w3[2] = amd_bytealign (w0[0], w0[1], offset); - w3[1] = amd_bytealign ( 0, w0[0], offset); + c3[1] = hc_bytealign (w7[3], 0, offset); + c3[0] = hc_bytealign (w7[2], w7[3], offset); + c2[3] = hc_bytealign (w7[1], w7[2], offset); + c2[2] = hc_bytealign (w7[0], w7[1], offset); + c2[1] = hc_bytealign (w6[3], w7[0], offset); + c2[0] = hc_bytealign (w6[2], w6[3], offset); + c1[3] = hc_bytealign (w6[1], w6[2], offset); + c1[2] = hc_bytealign (w6[0], w6[1], offset); + c1[1] = hc_bytealign (w5[3], w6[0], offset); + c1[0] = hc_bytealign (w5[2], w5[3], offset); + c0[3] = hc_bytealign (w5[1], w5[2], offset); + c0[2] = hc_bytealign (w5[0], w5[1], offset); + c0[1] = hc_bytealign (w4[3], w5[0], offset); + c0[0] = hc_bytealign (w4[2], w4[3], offset); + w7[3] = hc_bytealign (w4[1], w4[2], offset); + w7[2] = hc_bytealign (w4[0], w4[1], offset); + w7[1] = hc_bytealign (w3[3], w4[0], offset); + w7[0] = hc_bytealign (w3[2], w3[3], offset); + w6[3] = hc_bytealign (w3[1], w3[2], offset); + w6[2] = hc_bytealign (w3[0], w3[1], offset); + w6[1] = hc_bytealign (w2[3], w3[0], offset); + w6[0] = hc_bytealign (w2[2], w2[3], offset); + w5[3] = hc_bytealign (w2[1], w2[2], offset); + w5[2] = hc_bytealign (w2[0], w2[1], offset); + w5[1] = hc_bytealign (w1[3], w2[0], offset); + w5[0] = hc_bytealign (w1[2], w1[3], offset); + w4[3] = hc_bytealign (w1[1], w1[2], offset); + w4[2] = hc_bytealign (w1[0], w1[1], offset); + w4[1] = hc_bytealign (w0[3], w1[0], offset); + w4[0] = hc_bytealign (w0[2], w0[3], offset); + w3[3] = hc_bytealign (w0[1], w0[2], offset); + w3[2] = hc_bytealign (w0[0], w0[1], offset); + w3[1] = hc_bytealign ( 0, w0[0], offset); w3[0] = 0; w2[3] = 0; w2[2] = 0; @@ -8883,39 +8883,39 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be (u32x *w0, u32x *w1, u32x *w2 break; case 14: - c3[2] = amd_bytealign (w7[3], 0, offset); - c3[1] = amd_bytealign (w7[2], w7[3], offset); - c3[0] = amd_bytealign (w7[1], w7[2], offset); - c2[3] = amd_bytealign (w7[0], w7[1], offset); - c2[2] = amd_bytealign (w6[3], w7[0], offset); - c2[1] = amd_bytealign (w6[2], w6[3], offset); - c2[0] = amd_bytealign (w6[1], w6[2], offset); - c1[3] = amd_bytealign (w6[0], w6[1], offset); - c1[2] = amd_bytealign (w5[3], w6[0], offset); - c1[1] = amd_bytealign (w5[2], w5[3], offset); - c1[0] = amd_bytealign (w5[1], w5[2], offset); - c0[3] = amd_bytealign (w5[0], w5[1], offset); - c0[2] = amd_bytealign (w4[3], w5[0], offset); - c0[1] = amd_bytealign (w4[2], w4[3], offset); - c0[0] = amd_bytealign (w4[1], w4[2], offset); - w7[3] = amd_bytealign (w4[0], w4[1], offset); - w7[2] = amd_bytealign (w3[3], w4[0], offset); - w7[1] = amd_bytealign (w3[2], w3[3], offset); - w7[0] = amd_bytealign (w3[1], w3[2], offset); - w6[3] = amd_bytealign (w3[0], w3[1], offset); - w6[2] = amd_bytealign (w2[3], w3[0], offset); - w6[1] = amd_bytealign (w2[2], w2[3], offset); - w6[0] = amd_bytealign (w2[1], w2[2], offset); - w5[3] = amd_bytealign (w2[0], w2[1], offset); - w5[2] = amd_bytealign (w1[3], w2[0], offset); - w5[1] = amd_bytealign (w1[2], w1[3], offset); - w5[0] = amd_bytealign (w1[1], w1[2], offset); - w4[3] = amd_bytealign (w1[0], w1[1], offset); - w4[2] = amd_bytealign (w0[3], w1[0], offset); - w4[1] = amd_bytealign (w0[2], w0[3], offset); - w4[0] = amd_bytealign (w0[1], w0[2], offset); - w3[3] = amd_bytealign (w0[0], w0[1], offset); - w3[2] = amd_bytealign ( 0, w0[0], offset); + c3[2] = hc_bytealign (w7[3], 0, offset); + c3[1] = hc_bytealign (w7[2], w7[3], offset); + c3[0] = hc_bytealign (w7[1], w7[2], offset); + c2[3] = hc_bytealign (w7[0], w7[1], offset); + c2[2] = hc_bytealign (w6[3], w7[0], offset); + c2[1] = hc_bytealign (w6[2], w6[3], offset); + c2[0] = hc_bytealign (w6[1], w6[2], offset); + c1[3] = hc_bytealign (w6[0], w6[1], offset); + c1[2] = hc_bytealign (w5[3], w6[0], offset); + c1[1] = hc_bytealign (w5[2], w5[3], offset); + c1[0] = hc_bytealign (w5[1], w5[2], offset); + c0[3] = hc_bytealign (w5[0], w5[1], offset); + c0[2] = hc_bytealign (w4[3], w5[0], offset); + c0[1] = hc_bytealign (w4[2], w4[3], offset); + c0[0] = hc_bytealign (w4[1], w4[2], offset); + w7[3] = hc_bytealign (w4[0], w4[1], offset); + w7[2] = hc_bytealign (w3[3], w4[0], offset); + w7[1] = hc_bytealign (w3[2], w3[3], offset); + w7[0] = hc_bytealign (w3[1], w3[2], offset); + w6[3] = hc_bytealign (w3[0], w3[1], offset); + w6[2] = hc_bytealign (w2[3], w3[0], offset); + w6[1] = hc_bytealign (w2[2], w2[3], offset); + w6[0] = hc_bytealign (w2[1], w2[2], offset); + w5[3] = hc_bytealign (w2[0], w2[1], offset); + w5[2] = hc_bytealign (w1[3], w2[0], offset); + w5[1] = hc_bytealign (w1[2], w1[3], offset); + w5[0] = hc_bytealign (w1[1], w1[2], offset); + w4[3] = hc_bytealign (w1[0], w1[1], offset); + w4[2] = hc_bytealign (w0[3], w1[0], offset); + w4[1] = hc_bytealign (w0[2], w0[3], offset); + w4[0] = hc_bytealign (w0[1], w0[2], offset); + w3[3] = hc_bytealign (w0[0], w0[1], offset); + w3[2] = hc_bytealign ( 0, w0[0], offset); w3[1] = 0; w3[0] = 0; w2[3] = 0; @@ -8934,39 +8934,39 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be (u32x *w0, u32x *w1, u32x *w2 break; case 15: - c3[3] = amd_bytealign (w7[3], 0, offset); - c3[2] = amd_bytealign (w7[2], w7[3], offset); - c3[1] = amd_bytealign (w7[1], w7[2], offset); - c3[0] = amd_bytealign (w7[0], w7[1], offset); - c2[3] = amd_bytealign (w6[3], w7[0], offset); - c2[2] = amd_bytealign (w6[2], w6[3], offset); - c2[1] = amd_bytealign (w6[1], w6[2], offset); - c2[0] = amd_bytealign (w6[0], w6[1], offset); - c1[3] = amd_bytealign (w5[3], w6[0], offset); - c1[2] = amd_bytealign (w5[2], w5[3], offset); - c1[1] = amd_bytealign (w5[1], w5[2], offset); - c1[0] = amd_bytealign (w5[0], w5[1], offset); - c0[3] = amd_bytealign (w4[3], w5[0], offset); - c0[2] = amd_bytealign (w4[2], w4[3], offset); - c0[1] = amd_bytealign (w4[1], w4[2], offset); - c0[0] = amd_bytealign (w4[0], w4[1], offset); - w7[3] = amd_bytealign (w3[3], w4[0], offset); - w7[2] = amd_bytealign (w3[2], w3[3], offset); - w7[1] = amd_bytealign (w3[1], w3[2], offset); - w7[0] = amd_bytealign (w3[0], w3[1], offset); - w6[3] = amd_bytealign (w2[3], w3[0], offset); - w6[2] = amd_bytealign (w2[2], w2[3], offset); - w6[1] = amd_bytealign (w2[1], w2[2], offset); - w6[0] = amd_bytealign (w2[0], w2[1], offset); - w5[3] = amd_bytealign (w1[3], w2[0], offset); - w5[2] = amd_bytealign (w1[2], w1[3], offset); - w5[1] = amd_bytealign (w1[1], w1[2], offset); - w5[0] = amd_bytealign (w1[0], w1[1], offset); - w4[3] = amd_bytealign (w0[3], w1[0], offset); - w4[2] = amd_bytealign (w0[2], w0[3], offset); - w4[1] = amd_bytealign (w0[1], w0[2], offset); - w4[0] = amd_bytealign (w0[0], w0[1], offset); - w3[3] = amd_bytealign ( 0, w0[0], offset); + c3[3] = hc_bytealign (w7[3], 0, offset); + c3[2] = hc_bytealign (w7[2], w7[3], offset); + c3[1] = hc_bytealign (w7[1], w7[2], offset); + c3[0] = hc_bytealign (w7[0], w7[1], offset); + c2[3] = hc_bytealign (w6[3], w7[0], offset); + c2[2] = hc_bytealign (w6[2], w6[3], offset); + c2[1] = hc_bytealign (w6[1], w6[2], offset); + c2[0] = hc_bytealign (w6[0], w6[1], offset); + c1[3] = hc_bytealign (w5[3], w6[0], offset); + c1[2] = hc_bytealign (w5[2], w5[3], offset); + c1[1] = hc_bytealign (w5[1], w5[2], offset); + c1[0] = hc_bytealign (w5[0], w5[1], offset); + c0[3] = hc_bytealign (w4[3], w5[0], offset); + c0[2] = hc_bytealign (w4[2], w4[3], offset); + c0[1] = hc_bytealign (w4[1], w4[2], offset); + c0[0] = hc_bytealign (w4[0], w4[1], offset); + w7[3] = hc_bytealign (w3[3], w4[0], offset); + w7[2] = hc_bytealign (w3[2], w3[3], offset); + w7[1] = hc_bytealign (w3[1], w3[2], offset); + w7[0] = hc_bytealign (w3[0], w3[1], offset); + w6[3] = hc_bytealign (w2[3], w3[0], offset); + w6[2] = hc_bytealign (w2[2], w2[3], offset); + w6[1] = hc_bytealign (w2[1], w2[2], offset); + w6[0] = hc_bytealign (w2[0], w2[1], offset); + w5[3] = hc_bytealign (w1[3], w2[0], offset); + w5[2] = hc_bytealign (w1[2], w1[3], offset); + w5[1] = hc_bytealign (w1[1], w1[2], offset); + w5[0] = hc_bytealign (w1[0], w1[1], offset); + w4[3] = hc_bytealign (w0[3], w1[0], offset); + w4[2] = hc_bytealign (w0[2], w0[3], offset); + w4[1] = hc_bytealign (w0[1], w0[2], offset); + w4[0] = hc_bytealign (w0[0], w0[1], offset); + w3[3] = hc_bytealign ( 0, w0[0], offset); w3[2] = 0; w3[1] = 0; w3[0] = 0; @@ -8986,39 +8986,39 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be (u32x *w0, u32x *w1, u32x *w2 break; case 16: - c4[0] = amd_bytealign (w7[3], 0, offset); - c3[3] = amd_bytealign (w7[2], w7[3], offset); - c3[2] = amd_bytealign (w7[1], w7[2], offset); - c3[1] = amd_bytealign (w7[0], w7[1], offset); - c3[0] = amd_bytealign (w6[3], w7[0], offset); - c2[3] = amd_bytealign (w6[2], w6[3], offset); - c2[2] = amd_bytealign (w6[1], w6[2], offset); - c2[1] = amd_bytealign (w6[0], w6[1], offset); - c2[0] = amd_bytealign (w5[3], w6[0], offset); - c1[3] = amd_bytealign (w5[2], w5[3], offset); - c1[2] = amd_bytealign (w5[1], w5[2], offset); - c1[1] = amd_bytealign (w5[0], w5[1], offset); - c1[0] = amd_bytealign (w4[3], w5[0], offset); - c0[3] = amd_bytealign (w4[2], w4[3], offset); - c0[2] = amd_bytealign (w4[1], w4[2], offset); - c0[1] = amd_bytealign (w4[0], w4[1], offset); - c0[0] = amd_bytealign (w3[3], w4[0], offset); - w7[3] = amd_bytealign (w3[2], w3[3], offset); - w7[2] = amd_bytealign (w3[1], w3[2], offset); - w7[1] = amd_bytealign (w3[0], w3[1], offset); - w7[0] = amd_bytealign (w2[3], w3[0], offset); - w6[3] = amd_bytealign (w2[2], w2[3], offset); - w6[2] = amd_bytealign (w2[1], w2[2], offset); - w6[1] = amd_bytealign (w2[0], w2[1], offset); - w6[0] = amd_bytealign (w1[3], w2[0], offset); - w5[3] = amd_bytealign (w1[2], w1[3], offset); - w5[2] = amd_bytealign (w1[1], w1[2], offset); - w5[1] = amd_bytealign (w1[0], w1[1], offset); - w5[0] = amd_bytealign (w0[3], w1[0], offset); - w4[3] = amd_bytealign (w0[2], w0[3], offset); - w4[2] = amd_bytealign (w0[1], w0[2], offset); - w4[1] = amd_bytealign (w0[0], w0[1], offset); - w4[0] = amd_bytealign ( 0, w0[0], offset); + c4[0] = hc_bytealign (w7[3], 0, offset); + c3[3] = hc_bytealign (w7[2], w7[3], offset); + c3[2] = hc_bytealign (w7[1], w7[2], offset); + c3[1] = hc_bytealign (w7[0], w7[1], offset); + c3[0] = hc_bytealign (w6[3], w7[0], offset); + c2[3] = hc_bytealign (w6[2], w6[3], offset); + c2[2] = hc_bytealign (w6[1], w6[2], offset); + c2[1] = hc_bytealign (w6[0], w6[1], offset); + c2[0] = hc_bytealign (w5[3], w6[0], offset); + c1[3] = hc_bytealign (w5[2], w5[3], offset); + c1[2] = hc_bytealign (w5[1], w5[2], offset); + c1[1] = hc_bytealign (w5[0], w5[1], offset); + c1[0] = hc_bytealign (w4[3], w5[0], offset); + c0[3] = hc_bytealign (w4[2], w4[3], offset); + c0[2] = hc_bytealign (w4[1], w4[2], offset); + c0[1] = hc_bytealign (w4[0], w4[1], offset); + c0[0] = hc_bytealign (w3[3], w4[0], offset); + w7[3] = hc_bytealign (w3[2], w3[3], offset); + w7[2] = hc_bytealign (w3[1], w3[2], offset); + w7[1] = hc_bytealign (w3[0], w3[1], offset); + w7[0] = hc_bytealign (w2[3], w3[0], offset); + w6[3] = hc_bytealign (w2[2], w2[3], offset); + w6[2] = hc_bytealign (w2[1], w2[2], offset); + w6[1] = hc_bytealign (w2[0], w2[1], offset); + w6[0] = hc_bytealign (w1[3], w2[0], offset); + w5[3] = hc_bytealign (w1[2], w1[3], offset); + w5[2] = hc_bytealign (w1[1], w1[2], offset); + w5[1] = hc_bytealign (w1[0], w1[1], offset); + w5[0] = hc_bytealign (w0[3], w1[0], offset); + w4[3] = hc_bytealign (w0[2], w0[3], offset); + w4[2] = hc_bytealign (w0[1], w0[2], offset); + w4[1] = hc_bytealign (w0[0], w0[1], offset); + w4[0] = hc_bytealign ( 0, w0[0], offset); w3[3] = 0; w3[2] = 0; w3[1] = 0; @@ -9039,39 +9039,39 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be (u32x *w0, u32x *w1, u32x *w2 break; case 17: - c4[1] = amd_bytealign (w7[3], 0, offset); - c4[0] = amd_bytealign (w7[2], w7[3], offset); - c3[3] = amd_bytealign (w7[1], w7[2], offset); - c3[2] = amd_bytealign (w7[0], w7[1], offset); - c3[1] = amd_bytealign (w6[3], w7[0], offset); - c3[0] = amd_bytealign (w6[2], w6[3], offset); - c2[3] = amd_bytealign (w6[1], w6[2], offset); - c2[2] = amd_bytealign (w6[0], w6[1], offset); - c2[1] = amd_bytealign (w5[3], w6[0], offset); - c2[0] = amd_bytealign (w5[2], w5[3], offset); - c1[3] = amd_bytealign (w5[1], w5[2], offset); - c1[2] = amd_bytealign (w5[0], w5[1], offset); - c1[1] = amd_bytealign (w4[3], w5[0], offset); - c1[0] = amd_bytealign (w4[2], w4[3], offset); - c0[3] = amd_bytealign (w4[1], w4[2], offset); - c0[2] = amd_bytealign (w4[0], w4[1], offset); - c0[1] = amd_bytealign (w3[3], w4[0], offset); - c0[0] = amd_bytealign (w3[2], w3[3], offset); - w7[3] = amd_bytealign (w3[1], w3[2], offset); - w7[2] = amd_bytealign (w3[0], w3[1], offset); - w7[1] = amd_bytealign (w2[3], w3[0], offset); - w7[0] = amd_bytealign (w2[2], w2[3], offset); - w6[3] = amd_bytealign (w2[1], w2[2], offset); - w6[2] = amd_bytealign (w2[0], w2[1], offset); - w6[1] = amd_bytealign (w1[3], w2[0], offset); - w6[0] = amd_bytealign (w1[2], w1[3], offset); - w5[3] = amd_bytealign (w1[1], w1[2], offset); - w5[2] = amd_bytealign (w1[0], w1[1], offset); - w5[1] = amd_bytealign (w0[3], w1[0], offset); - w5[0] = amd_bytealign (w0[2], w0[3], offset); - w4[3] = amd_bytealign (w0[1], w0[2], offset); - w4[2] = amd_bytealign (w0[0], w0[1], offset); - w4[1] = amd_bytealign ( 0, w0[0], offset); + c4[1] = hc_bytealign (w7[3], 0, offset); + c4[0] = hc_bytealign (w7[2], w7[3], offset); + c3[3] = hc_bytealign (w7[1], w7[2], offset); + c3[2] = hc_bytealign (w7[0], w7[1], offset); + c3[1] = hc_bytealign (w6[3], w7[0], offset); + c3[0] = hc_bytealign (w6[2], w6[3], offset); + c2[3] = hc_bytealign (w6[1], w6[2], offset); + c2[2] = hc_bytealign (w6[0], w6[1], offset); + c2[1] = hc_bytealign (w5[3], w6[0], offset); + c2[0] = hc_bytealign (w5[2], w5[3], offset); + c1[3] = hc_bytealign (w5[1], w5[2], offset); + c1[2] = hc_bytealign (w5[0], w5[1], offset); + c1[1] = hc_bytealign (w4[3], w5[0], offset); + c1[0] = hc_bytealign (w4[2], w4[3], offset); + c0[3] = hc_bytealign (w4[1], w4[2], offset); + c0[2] = hc_bytealign (w4[0], w4[1], offset); + c0[1] = hc_bytealign (w3[3], w4[0], offset); + c0[0] = hc_bytealign (w3[2], w3[3], offset); + w7[3] = hc_bytealign (w3[1], w3[2], offset); + w7[2] = hc_bytealign (w3[0], w3[1], offset); + w7[1] = hc_bytealign (w2[3], w3[0], offset); + w7[0] = hc_bytealign (w2[2], w2[3], offset); + w6[3] = hc_bytealign (w2[1], w2[2], offset); + w6[2] = hc_bytealign (w2[0], w2[1], offset); + w6[1] = hc_bytealign (w1[3], w2[0], offset); + w6[0] = hc_bytealign (w1[2], w1[3], offset); + w5[3] = hc_bytealign (w1[1], w1[2], offset); + w5[2] = hc_bytealign (w1[0], w1[1], offset); + w5[1] = hc_bytealign (w0[3], w1[0], offset); + w5[0] = hc_bytealign (w0[2], w0[3], offset); + w4[3] = hc_bytealign (w0[1], w0[2], offset); + w4[2] = hc_bytealign (w0[0], w0[1], offset); + w4[1] = hc_bytealign ( 0, w0[0], offset); w4[0] = 0; w3[3] = 0; w3[2] = 0; @@ -9093,39 +9093,39 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be (u32x *w0, u32x *w1, u32x *w2 break; case 18: - c4[2] = amd_bytealign (w7[3], 0, offset); - c4[1] = amd_bytealign (w7[2], w7[3], offset); - c4[0] = amd_bytealign (w7[1], w7[2], offset); - c3[3] = amd_bytealign (w7[0], w7[1], offset); - c3[2] = amd_bytealign (w6[3], w7[0], offset); - c3[1] = amd_bytealign (w6[2], w6[3], offset); - c3[0] = amd_bytealign (w6[1], w6[2], offset); - c2[3] = amd_bytealign (w6[0], w6[1], offset); - c2[2] = amd_bytealign (w5[3], w6[0], offset); - c2[1] = amd_bytealign (w5[2], w5[3], offset); - c2[0] = amd_bytealign (w5[1], w5[2], offset); - c1[3] = amd_bytealign (w5[0], w5[1], offset); - c1[2] = amd_bytealign (w4[3], w5[0], offset); - c1[1] = amd_bytealign (w4[2], w4[3], offset); - c1[0] = amd_bytealign (w4[1], w4[2], offset); - c0[3] = amd_bytealign (w4[0], w4[1], offset); - c0[2] = amd_bytealign (w3[3], w4[0], offset); - c0[1] = amd_bytealign (w3[2], w3[3], offset); - c0[0] = amd_bytealign (w3[1], w3[2], offset); - w7[3] = amd_bytealign (w3[0], w3[1], offset); - w7[2] = amd_bytealign (w2[3], w3[0], offset); - w7[1] = amd_bytealign (w2[2], w2[3], offset); - w7[0] = amd_bytealign (w2[1], w2[2], offset); - w6[3] = amd_bytealign (w2[0], w2[1], offset); - w6[2] = amd_bytealign (w1[3], w2[0], offset); - w6[1] = amd_bytealign (w1[2], w1[3], offset); - w6[0] = amd_bytealign (w1[1], w1[2], offset); - w5[3] = amd_bytealign (w1[0], w1[1], offset); - w5[2] = amd_bytealign (w0[3], w1[0], offset); - w5[1] = amd_bytealign (w0[2], w0[3], offset); - w5[0] = amd_bytealign (w0[1], w0[2], offset); - w4[3] = amd_bytealign (w0[0], w0[1], offset); - w4[2] = amd_bytealign ( 0, w0[0], offset); + c4[2] = hc_bytealign (w7[3], 0, offset); + c4[1] = hc_bytealign (w7[2], w7[3], offset); + c4[0] = hc_bytealign (w7[1], w7[2], offset); + c3[3] = hc_bytealign (w7[0], w7[1], offset); + c3[2] = hc_bytealign (w6[3], w7[0], offset); + c3[1] = hc_bytealign (w6[2], w6[3], offset); + c3[0] = hc_bytealign (w6[1], w6[2], offset); + c2[3] = hc_bytealign (w6[0], w6[1], offset); + c2[2] = hc_bytealign (w5[3], w6[0], offset); + c2[1] = hc_bytealign (w5[2], w5[3], offset); + c2[0] = hc_bytealign (w5[1], w5[2], offset); + c1[3] = hc_bytealign (w5[0], w5[1], offset); + c1[2] = hc_bytealign (w4[3], w5[0], offset); + c1[1] = hc_bytealign (w4[2], w4[3], offset); + c1[0] = hc_bytealign (w4[1], w4[2], offset); + c0[3] = hc_bytealign (w4[0], w4[1], offset); + c0[2] = hc_bytealign (w3[3], w4[0], offset); + c0[1] = hc_bytealign (w3[2], w3[3], offset); + c0[0] = hc_bytealign (w3[1], w3[2], offset); + w7[3] = hc_bytealign (w3[0], w3[1], offset); + w7[2] = hc_bytealign (w2[3], w3[0], offset); + w7[1] = hc_bytealign (w2[2], w2[3], offset); + w7[0] = hc_bytealign (w2[1], w2[2], offset); + w6[3] = hc_bytealign (w2[0], w2[1], offset); + w6[2] = hc_bytealign (w1[3], w2[0], offset); + w6[1] = hc_bytealign (w1[2], w1[3], offset); + w6[0] = hc_bytealign (w1[1], w1[2], offset); + w5[3] = hc_bytealign (w1[0], w1[1], offset); + w5[2] = hc_bytealign (w0[3], w1[0], offset); + w5[1] = hc_bytealign (w0[2], w0[3], offset); + w5[0] = hc_bytealign (w0[1], w0[2], offset); + w4[3] = hc_bytealign (w0[0], w0[1], offset); + w4[2] = hc_bytealign ( 0, w0[0], offset); w4[1] = 0; w4[0] = 0; w3[3] = 0; @@ -9148,39 +9148,39 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be (u32x *w0, u32x *w1, u32x *w2 break; case 19: - c4[3] = amd_bytealign (w7[3], 0, offset); - c4[2] = amd_bytealign (w7[2], w7[3], offset); - c4[1] = amd_bytealign (w7[1], w7[2], offset); - c4[0] = amd_bytealign (w7[0], w7[1], offset); - c3[3] = amd_bytealign (w6[3], w7[0], offset); - c3[2] = amd_bytealign (w6[2], w6[3], offset); - c3[1] = amd_bytealign (w6[1], w6[2], offset); - c3[0] = amd_bytealign (w6[0], w6[1], offset); - c2[3] = amd_bytealign (w5[3], w6[0], offset); - c2[2] = amd_bytealign (w5[2], w5[3], offset); - c2[1] = amd_bytealign (w5[1], w5[2], offset); - c2[0] = amd_bytealign (w5[0], w5[1], offset); - c1[3] = amd_bytealign (w4[3], w5[0], offset); - c1[2] = amd_bytealign (w4[2], w4[3], offset); - c1[1] = amd_bytealign (w4[1], w4[2], offset); - c1[0] = amd_bytealign (w4[0], w4[1], offset); - c0[3] = amd_bytealign (w3[3], w4[0], offset); - c0[2] = amd_bytealign (w3[2], w3[3], offset); - c0[1] = amd_bytealign (w3[1], w3[2], offset); - c0[0] = amd_bytealign (w3[0], w3[1], offset); - w7[3] = amd_bytealign (w2[3], w3[0], offset); - w7[2] = amd_bytealign (w2[2], w2[3], offset); - w7[1] = amd_bytealign (w2[1], w2[2], offset); - w7[0] = amd_bytealign (w2[0], w2[1], offset); - w6[3] = amd_bytealign (w1[3], w2[0], offset); - w6[2] = amd_bytealign (w1[2], w1[3], offset); - w6[1] = amd_bytealign (w1[1], w1[2], offset); - w6[0] = amd_bytealign (w1[0], w1[1], offset); - w5[3] = amd_bytealign (w0[3], w1[0], offset); - w5[2] = amd_bytealign (w0[2], w0[3], offset); - w5[1] = amd_bytealign (w0[1], w0[2], offset); - w5[0] = amd_bytealign (w0[0], w0[1], offset); - w4[3] = amd_bytealign ( 0, w0[0], offset); + c4[3] = hc_bytealign (w7[3], 0, offset); + c4[2] = hc_bytealign (w7[2], w7[3], offset); + c4[1] = hc_bytealign (w7[1], w7[2], offset); + c4[0] = hc_bytealign (w7[0], w7[1], offset); + c3[3] = hc_bytealign (w6[3], w7[0], offset); + c3[2] = hc_bytealign (w6[2], w6[3], offset); + c3[1] = hc_bytealign (w6[1], w6[2], offset); + c3[0] = hc_bytealign (w6[0], w6[1], offset); + c2[3] = hc_bytealign (w5[3], w6[0], offset); + c2[2] = hc_bytealign (w5[2], w5[3], offset); + c2[1] = hc_bytealign (w5[1], w5[2], offset); + c2[0] = hc_bytealign (w5[0], w5[1], offset); + c1[3] = hc_bytealign (w4[3], w5[0], offset); + c1[2] = hc_bytealign (w4[2], w4[3], offset); + c1[1] = hc_bytealign (w4[1], w4[2], offset); + c1[0] = hc_bytealign (w4[0], w4[1], offset); + c0[3] = hc_bytealign (w3[3], w4[0], offset); + c0[2] = hc_bytealign (w3[2], w3[3], offset); + c0[1] = hc_bytealign (w3[1], w3[2], offset); + c0[0] = hc_bytealign (w3[0], w3[1], offset); + w7[3] = hc_bytealign (w2[3], w3[0], offset); + w7[2] = hc_bytealign (w2[2], w2[3], offset); + w7[1] = hc_bytealign (w2[1], w2[2], offset); + w7[0] = hc_bytealign (w2[0], w2[1], offset); + w6[3] = hc_bytealign (w1[3], w2[0], offset); + w6[2] = hc_bytealign (w1[2], w1[3], offset); + w6[1] = hc_bytealign (w1[1], w1[2], offset); + w6[0] = hc_bytealign (w1[0], w1[1], offset); + w5[3] = hc_bytealign (w0[3], w1[0], offset); + w5[2] = hc_bytealign (w0[2], w0[3], offset); + w5[1] = hc_bytealign (w0[1], w0[2], offset); + w5[0] = hc_bytealign (w0[0], w0[1], offset); + w4[3] = hc_bytealign ( 0, w0[0], offset); w4[2] = 0; w4[1] = 0; w4[0] = 0; @@ -9204,39 +9204,39 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be (u32x *w0, u32x *w1, u32x *w2 break; case 20: - c5[0] = amd_bytealign (w7[3], 0, offset); - c4[3] = amd_bytealign (w7[2], w7[3], offset); - c4[2] = amd_bytealign (w7[1], w7[2], offset); - c4[1] = amd_bytealign (w7[0], w7[1], offset); - c4[0] = amd_bytealign (w6[3], w7[0], offset); - c3[3] = amd_bytealign (w6[2], w6[3], offset); - c3[2] = amd_bytealign (w6[1], w6[2], offset); - c3[1] = amd_bytealign (w6[0], w6[1], offset); - c3[0] = amd_bytealign (w5[3], w6[0], offset); - c2[3] = amd_bytealign (w5[2], w5[3], offset); - c2[2] = amd_bytealign (w5[1], w5[2], offset); - c2[1] = amd_bytealign (w5[0], w5[1], offset); - c2[0] = amd_bytealign (w4[3], w5[0], offset); - c1[3] = amd_bytealign (w4[2], w4[3], offset); - c1[2] = amd_bytealign (w4[1], w4[2], offset); - c1[1] = amd_bytealign (w4[0], w4[1], offset); - c1[0] = amd_bytealign (w3[3], w4[0], offset); - c0[3] = amd_bytealign (w3[2], w3[3], offset); - c0[2] = amd_bytealign (w3[1], w3[2], offset); - c0[1] = amd_bytealign (w3[0], w3[1], offset); - c0[0] = amd_bytealign (w2[3], w3[0], offset); - w7[3] = amd_bytealign (w2[2], w2[3], offset); - w7[2] = amd_bytealign (w2[1], w2[2], offset); - w7[1] = amd_bytealign (w2[0], w2[1], offset); - w7[0] = amd_bytealign (w1[3], w2[0], offset); - w6[3] = amd_bytealign (w1[2], w1[3], offset); - w6[2] = amd_bytealign (w1[1], w1[2], offset); - w6[1] = amd_bytealign (w1[0], w1[1], offset); - w6[0] = amd_bytealign (w0[3], w1[0], offset); - w5[3] = amd_bytealign (w0[2], w0[3], offset); - w5[2] = amd_bytealign (w0[1], w0[2], offset); - w5[1] = amd_bytealign (w0[0], w0[1], offset); - w5[0] = amd_bytealign ( 0, w0[0], offset); + c5[0] = hc_bytealign (w7[3], 0, offset); + c4[3] = hc_bytealign (w7[2], w7[3], offset); + c4[2] = hc_bytealign (w7[1], w7[2], offset); + c4[1] = hc_bytealign (w7[0], w7[1], offset); + c4[0] = hc_bytealign (w6[3], w7[0], offset); + c3[3] = hc_bytealign (w6[2], w6[3], offset); + c3[2] = hc_bytealign (w6[1], w6[2], offset); + c3[1] = hc_bytealign (w6[0], w6[1], offset); + c3[0] = hc_bytealign (w5[3], w6[0], offset); + c2[3] = hc_bytealign (w5[2], w5[3], offset); + c2[2] = hc_bytealign (w5[1], w5[2], offset); + c2[1] = hc_bytealign (w5[0], w5[1], offset); + c2[0] = hc_bytealign (w4[3], w5[0], offset); + c1[3] = hc_bytealign (w4[2], w4[3], offset); + c1[2] = hc_bytealign (w4[1], w4[2], offset); + c1[1] = hc_bytealign (w4[0], w4[1], offset); + c1[0] = hc_bytealign (w3[3], w4[0], offset); + c0[3] = hc_bytealign (w3[2], w3[3], offset); + c0[2] = hc_bytealign (w3[1], w3[2], offset); + c0[1] = hc_bytealign (w3[0], w3[1], offset); + c0[0] = hc_bytealign (w2[3], w3[0], offset); + w7[3] = hc_bytealign (w2[2], w2[3], offset); + w7[2] = hc_bytealign (w2[1], w2[2], offset); + w7[1] = hc_bytealign (w2[0], w2[1], offset); + w7[0] = hc_bytealign (w1[3], w2[0], offset); + w6[3] = hc_bytealign (w1[2], w1[3], offset); + w6[2] = hc_bytealign (w1[1], w1[2], offset); + w6[1] = hc_bytealign (w1[0], w1[1], offset); + w6[0] = hc_bytealign (w0[3], w1[0], offset); + w5[3] = hc_bytealign (w0[2], w0[3], offset); + w5[2] = hc_bytealign (w0[1], w0[2], offset); + w5[1] = hc_bytealign (w0[0], w0[1], offset); + w5[0] = hc_bytealign ( 0, w0[0], offset); w4[3] = 0; w4[2] = 0; w4[1] = 0; @@ -9261,39 +9261,39 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be (u32x *w0, u32x *w1, u32x *w2 break; case 21: - c5[1] = amd_bytealign (w7[3], 0, offset); - c5[0] = amd_bytealign (w7[2], w7[3], offset); - c4[3] = amd_bytealign (w7[1], w7[2], offset); - c4[2] = amd_bytealign (w7[0], w7[1], offset); - c4[1] = amd_bytealign (w6[3], w7[0], offset); - c4[0] = amd_bytealign (w6[2], w6[3], offset); - c3[3] = amd_bytealign (w6[1], w6[2], offset); - c3[2] = amd_bytealign (w6[0], w6[1], offset); - c3[1] = amd_bytealign (w5[3], w6[0], offset); - c3[0] = amd_bytealign (w5[2], w5[3], offset); - c2[3] = amd_bytealign (w5[1], w5[2], offset); - c2[2] = amd_bytealign (w5[0], w5[1], offset); - c2[1] = amd_bytealign (w4[3], w5[0], offset); - c2[0] = amd_bytealign (w4[2], w4[3], offset); - c1[3] = amd_bytealign (w4[1], w4[2], offset); - c1[2] = amd_bytealign (w4[0], w4[1], offset); - c1[1] = amd_bytealign (w3[3], w4[0], offset); - c1[0] = amd_bytealign (w3[2], w3[3], offset); - c0[3] = amd_bytealign (w3[1], w3[2], offset); - c0[2] = amd_bytealign (w3[0], w3[1], offset); - c0[1] = amd_bytealign (w2[3], w3[0], offset); - c0[0] = amd_bytealign (w2[2], w2[3], offset); - w7[3] = amd_bytealign (w2[1], w2[2], offset); - w7[2] = amd_bytealign (w2[0], w2[1], offset); - w7[1] = amd_bytealign (w1[3], w2[0], offset); - w7[0] = amd_bytealign (w1[2], w1[3], offset); - w6[3] = amd_bytealign (w1[1], w1[2], offset); - w6[2] = amd_bytealign (w1[0], w1[1], offset); - w6[1] = amd_bytealign (w0[3], w1[0], offset); - w6[0] = amd_bytealign (w0[2], w0[3], offset); - w5[3] = amd_bytealign (w0[1], w0[2], offset); - w5[2] = amd_bytealign (w0[0], w0[1], offset); - w5[1] = amd_bytealign ( 0, w0[0], offset); + c5[1] = hc_bytealign (w7[3], 0, offset); + c5[0] = hc_bytealign (w7[2], w7[3], offset); + c4[3] = hc_bytealign (w7[1], w7[2], offset); + c4[2] = hc_bytealign (w7[0], w7[1], offset); + c4[1] = hc_bytealign (w6[3], w7[0], offset); + c4[0] = hc_bytealign (w6[2], w6[3], offset); + c3[3] = hc_bytealign (w6[1], w6[2], offset); + c3[2] = hc_bytealign (w6[0], w6[1], offset); + c3[1] = hc_bytealign (w5[3], w6[0], offset); + c3[0] = hc_bytealign (w5[2], w5[3], offset); + c2[3] = hc_bytealign (w5[1], w5[2], offset); + c2[2] = hc_bytealign (w5[0], w5[1], offset); + c2[1] = hc_bytealign (w4[3], w5[0], offset); + c2[0] = hc_bytealign (w4[2], w4[3], offset); + c1[3] = hc_bytealign (w4[1], w4[2], offset); + c1[2] = hc_bytealign (w4[0], w4[1], offset); + c1[1] = hc_bytealign (w3[3], w4[0], offset); + c1[0] = hc_bytealign (w3[2], w3[3], offset); + c0[3] = hc_bytealign (w3[1], w3[2], offset); + c0[2] = hc_bytealign (w3[0], w3[1], offset); + c0[1] = hc_bytealign (w2[3], w3[0], offset); + c0[0] = hc_bytealign (w2[2], w2[3], offset); + w7[3] = hc_bytealign (w2[1], w2[2], offset); + w7[2] = hc_bytealign (w2[0], w2[1], offset); + w7[1] = hc_bytealign (w1[3], w2[0], offset); + w7[0] = hc_bytealign (w1[2], w1[3], offset); + w6[3] = hc_bytealign (w1[1], w1[2], offset); + w6[2] = hc_bytealign (w1[0], w1[1], offset); + w6[1] = hc_bytealign (w0[3], w1[0], offset); + w6[0] = hc_bytealign (w0[2], w0[3], offset); + w5[3] = hc_bytealign (w0[1], w0[2], offset); + w5[2] = hc_bytealign (w0[0], w0[1], offset); + w5[1] = hc_bytealign ( 0, w0[0], offset); w5[0] = 0; w4[3] = 0; w4[2] = 0; @@ -9319,39 +9319,39 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be (u32x *w0, u32x *w1, u32x *w2 break; case 22: - c5[2] = amd_bytealign (w7[3], 0, offset); - c5[1] = amd_bytealign (w7[2], w7[3], offset); - c5[0] = amd_bytealign (w7[1], w7[2], offset); - c4[3] = amd_bytealign (w7[0], w7[1], offset); - c4[2] = amd_bytealign (w6[3], w7[0], offset); - c4[1] = amd_bytealign (w6[2], w6[3], offset); - c4[0] = amd_bytealign (w6[1], w6[2], offset); - c3[3] = amd_bytealign (w6[0], w6[1], offset); - c3[2] = amd_bytealign (w5[3], w6[0], offset); - c3[1] = amd_bytealign (w5[2], w5[3], offset); - c3[0] = amd_bytealign (w5[1], w5[2], offset); - c2[3] = amd_bytealign (w5[0], w5[1], offset); - c2[2] = amd_bytealign (w4[3], w5[0], offset); - c2[1] = amd_bytealign (w4[2], w4[3], offset); - c2[0] = amd_bytealign (w4[1], w4[2], offset); - c1[3] = amd_bytealign (w4[0], w4[1], offset); - c1[2] = amd_bytealign (w3[3], w4[0], offset); - c1[1] = amd_bytealign (w3[2], w3[3], offset); - c1[0] = amd_bytealign (w3[1], w3[2], offset); - c0[3] = amd_bytealign (w3[0], w3[1], offset); - c0[2] = amd_bytealign (w2[3], w3[0], offset); - c0[1] = amd_bytealign (w2[2], w2[3], offset); - c0[0] = amd_bytealign (w2[1], w2[2], offset); - w7[3] = amd_bytealign (w2[0], w2[1], offset); - w7[2] = amd_bytealign (w1[3], w2[0], offset); - w7[1] = amd_bytealign (w1[2], w1[3], offset); - w7[0] = amd_bytealign (w1[1], w1[2], offset); - w6[3] = amd_bytealign (w1[0], w1[1], offset); - w6[2] = amd_bytealign (w0[3], w1[0], offset); - w6[1] = amd_bytealign (w0[2], w0[3], offset); - w6[0] = amd_bytealign (w0[1], w0[2], offset); - w5[3] = amd_bytealign (w0[0], w0[1], offset); - w5[2] = amd_bytealign ( 0, w0[0], offset); + c5[2] = hc_bytealign (w7[3], 0, offset); + c5[1] = hc_bytealign (w7[2], w7[3], offset); + c5[0] = hc_bytealign (w7[1], w7[2], offset); + c4[3] = hc_bytealign (w7[0], w7[1], offset); + c4[2] = hc_bytealign (w6[3], w7[0], offset); + c4[1] = hc_bytealign (w6[2], w6[3], offset); + c4[0] = hc_bytealign (w6[1], w6[2], offset); + c3[3] = hc_bytealign (w6[0], w6[1], offset); + c3[2] = hc_bytealign (w5[3], w6[0], offset); + c3[1] = hc_bytealign (w5[2], w5[3], offset); + c3[0] = hc_bytealign (w5[1], w5[2], offset); + c2[3] = hc_bytealign (w5[0], w5[1], offset); + c2[2] = hc_bytealign (w4[3], w5[0], offset); + c2[1] = hc_bytealign (w4[2], w4[3], offset); + c2[0] = hc_bytealign (w4[1], w4[2], offset); + c1[3] = hc_bytealign (w4[0], w4[1], offset); + c1[2] = hc_bytealign (w3[3], w4[0], offset); + c1[1] = hc_bytealign (w3[2], w3[3], offset); + c1[0] = hc_bytealign (w3[1], w3[2], offset); + c0[3] = hc_bytealign (w3[0], w3[1], offset); + c0[2] = hc_bytealign (w2[3], w3[0], offset); + c0[1] = hc_bytealign (w2[2], w2[3], offset); + c0[0] = hc_bytealign (w2[1], w2[2], offset); + w7[3] = hc_bytealign (w2[0], w2[1], offset); + w7[2] = hc_bytealign (w1[3], w2[0], offset); + w7[1] = hc_bytealign (w1[2], w1[3], offset); + w7[0] = hc_bytealign (w1[1], w1[2], offset); + w6[3] = hc_bytealign (w1[0], w1[1], offset); + w6[2] = hc_bytealign (w0[3], w1[0], offset); + w6[1] = hc_bytealign (w0[2], w0[3], offset); + w6[0] = hc_bytealign (w0[1], w0[2], offset); + w5[3] = hc_bytealign (w0[0], w0[1], offset); + w5[2] = hc_bytealign ( 0, w0[0], offset); w5[1] = 0; w5[0] = 0; w4[3] = 0; @@ -9378,39 +9378,39 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be (u32x *w0, u32x *w1, u32x *w2 break; case 23: - c5[3] = amd_bytealign (w7[3], 0, offset); - c5[2] = amd_bytealign (w7[2], w7[3], offset); - c5[1] = amd_bytealign (w7[1], w7[2], offset); - c5[0] = amd_bytealign (w7[0], w7[1], offset); - c4[3] = amd_bytealign (w6[3], w7[0], offset); - c4[2] = amd_bytealign (w6[2], w6[3], offset); - c4[1] = amd_bytealign (w6[1], w6[2], offset); - c4[0] = amd_bytealign (w6[0], w6[1], offset); - c3[3] = amd_bytealign (w5[3], w6[0], offset); - c3[2] = amd_bytealign (w5[2], w5[3], offset); - c3[1] = amd_bytealign (w5[1], w5[2], offset); - c3[0] = amd_bytealign (w5[0], w5[1], offset); - c2[3] = amd_bytealign (w4[3], w5[0], offset); - c2[2] = amd_bytealign (w4[2], w4[3], offset); - c2[1] = amd_bytealign (w4[1], w4[2], offset); - c2[0] = amd_bytealign (w4[0], w4[1], offset); - c1[3] = amd_bytealign (w3[3], w4[0], offset); - c1[2] = amd_bytealign (w3[2], w3[3], offset); - c1[1] = amd_bytealign (w3[1], w3[2], offset); - c1[0] = amd_bytealign (w3[0], w3[1], offset); - c0[3] = amd_bytealign (w2[3], w3[0], offset); - c0[2] = amd_bytealign (w2[2], w2[3], offset); - c0[1] = amd_bytealign (w2[1], w2[2], offset); - c0[0] = amd_bytealign (w2[0], w2[1], offset); - w7[3] = amd_bytealign (w1[3], w2[0], offset); - w7[2] = amd_bytealign (w1[2], w1[3], offset); - w7[1] = amd_bytealign (w1[1], w1[2], offset); - w7[0] = amd_bytealign (w1[0], w1[1], offset); - w6[3] = amd_bytealign (w0[3], w1[0], offset); - w6[2] = amd_bytealign (w0[2], w0[3], offset); - w6[1] = amd_bytealign (w0[1], w0[2], offset); - w6[0] = amd_bytealign (w0[0], w0[1], offset); - w5[3] = amd_bytealign ( 0, w0[0], offset); + c5[3] = hc_bytealign (w7[3], 0, offset); + c5[2] = hc_bytealign (w7[2], w7[3], offset); + c5[1] = hc_bytealign (w7[1], w7[2], offset); + c5[0] = hc_bytealign (w7[0], w7[1], offset); + c4[3] = hc_bytealign (w6[3], w7[0], offset); + c4[2] = hc_bytealign (w6[2], w6[3], offset); + c4[1] = hc_bytealign (w6[1], w6[2], offset); + c4[0] = hc_bytealign (w6[0], w6[1], offset); + c3[3] = hc_bytealign (w5[3], w6[0], offset); + c3[2] = hc_bytealign (w5[2], w5[3], offset); + c3[1] = hc_bytealign (w5[1], w5[2], offset); + c3[0] = hc_bytealign (w5[0], w5[1], offset); + c2[3] = hc_bytealign (w4[3], w5[0], offset); + c2[2] = hc_bytealign (w4[2], w4[3], offset); + c2[1] = hc_bytealign (w4[1], w4[2], offset); + c2[0] = hc_bytealign (w4[0], w4[1], offset); + c1[3] = hc_bytealign (w3[3], w4[0], offset); + c1[2] = hc_bytealign (w3[2], w3[3], offset); + c1[1] = hc_bytealign (w3[1], w3[2], offset); + c1[0] = hc_bytealign (w3[0], w3[1], offset); + c0[3] = hc_bytealign (w2[3], w3[0], offset); + c0[2] = hc_bytealign (w2[2], w2[3], offset); + c0[1] = hc_bytealign (w2[1], w2[2], offset); + c0[0] = hc_bytealign (w2[0], w2[1], offset); + w7[3] = hc_bytealign (w1[3], w2[0], offset); + w7[2] = hc_bytealign (w1[2], w1[3], offset); + w7[1] = hc_bytealign (w1[1], w1[2], offset); + w7[0] = hc_bytealign (w1[0], w1[1], offset); + w6[3] = hc_bytealign (w0[3], w1[0], offset); + w6[2] = hc_bytealign (w0[2], w0[3], offset); + w6[1] = hc_bytealign (w0[1], w0[2], offset); + w6[0] = hc_bytealign (w0[0], w0[1], offset); + w5[3] = hc_bytealign ( 0, w0[0], offset); w5[2] = 0; w5[1] = 0; w5[0] = 0; @@ -9438,39 +9438,39 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be (u32x *w0, u32x *w1, u32x *w2 break; case 24: - c6[0] = amd_bytealign (w7[3], 0, offset); - c5[3] = amd_bytealign (w7[2], w7[3], offset); - c5[2] = amd_bytealign (w7[1], w7[2], offset); - c5[1] = amd_bytealign (w7[0], w7[1], offset); - c5[0] = amd_bytealign (w6[3], w7[0], offset); - c4[3] = amd_bytealign (w6[2], w6[3], offset); - c4[2] = amd_bytealign (w6[1], w6[2], offset); - c4[1] = amd_bytealign (w6[0], w6[1], offset); - c4[0] = amd_bytealign (w5[3], w6[0], offset); - c3[3] = amd_bytealign (w5[2], w5[3], offset); - c3[2] = amd_bytealign (w5[1], w5[2], offset); - c3[1] = amd_bytealign (w5[0], w5[1], offset); - c3[0] = amd_bytealign (w4[3], w5[0], offset); - c2[3] = amd_bytealign (w4[2], w4[3], offset); - c2[2] = amd_bytealign (w4[1], w4[2], offset); - c2[1] = amd_bytealign (w4[0], w4[1], offset); - c2[0] = amd_bytealign (w3[3], w4[0], offset); - c1[3] = amd_bytealign (w3[2], w3[3], offset); - c1[2] = amd_bytealign (w3[1], w3[2], offset); - c1[1] = amd_bytealign (w3[0], w3[1], offset); - c1[0] = amd_bytealign (w2[3], w3[0], offset); - c0[3] = amd_bytealign (w2[2], w2[3], offset); - c0[2] = amd_bytealign (w2[1], w2[2], offset); - c0[1] = amd_bytealign (w2[0], w2[1], offset); - c0[0] = amd_bytealign (w1[3], w2[0], offset); - w7[3] = amd_bytealign (w1[2], w1[3], offset); - w7[2] = amd_bytealign (w1[1], w1[2], offset); - w7[1] = amd_bytealign (w1[0], w1[1], offset); - w7[0] = amd_bytealign (w0[3], w1[0], offset); - w6[3] = amd_bytealign (w0[2], w0[3], offset); - w6[2] = amd_bytealign (w0[1], w0[2], offset); - w6[1] = amd_bytealign (w0[0], w0[1], offset); - w6[0] = amd_bytealign ( 0, w0[0], offset); + c6[0] = hc_bytealign (w7[3], 0, offset); + c5[3] = hc_bytealign (w7[2], w7[3], offset); + c5[2] = hc_bytealign (w7[1], w7[2], offset); + c5[1] = hc_bytealign (w7[0], w7[1], offset); + c5[0] = hc_bytealign (w6[3], w7[0], offset); + c4[3] = hc_bytealign (w6[2], w6[3], offset); + c4[2] = hc_bytealign (w6[1], w6[2], offset); + c4[1] = hc_bytealign (w6[0], w6[1], offset); + c4[0] = hc_bytealign (w5[3], w6[0], offset); + c3[3] = hc_bytealign (w5[2], w5[3], offset); + c3[2] = hc_bytealign (w5[1], w5[2], offset); + c3[1] = hc_bytealign (w5[0], w5[1], offset); + c3[0] = hc_bytealign (w4[3], w5[0], offset); + c2[3] = hc_bytealign (w4[2], w4[3], offset); + c2[2] = hc_bytealign (w4[1], w4[2], offset); + c2[1] = hc_bytealign (w4[0], w4[1], offset); + c2[0] = hc_bytealign (w3[3], w4[0], offset); + c1[3] = hc_bytealign (w3[2], w3[3], offset); + c1[2] = hc_bytealign (w3[1], w3[2], offset); + c1[1] = hc_bytealign (w3[0], w3[1], offset); + c1[0] = hc_bytealign (w2[3], w3[0], offset); + c0[3] = hc_bytealign (w2[2], w2[3], offset); + c0[2] = hc_bytealign (w2[1], w2[2], offset); + c0[1] = hc_bytealign (w2[0], w2[1], offset); + c0[0] = hc_bytealign (w1[3], w2[0], offset); + w7[3] = hc_bytealign (w1[2], w1[3], offset); + w7[2] = hc_bytealign (w1[1], w1[2], offset); + w7[1] = hc_bytealign (w1[0], w1[1], offset); + w7[0] = hc_bytealign (w0[3], w1[0], offset); + w6[3] = hc_bytealign (w0[2], w0[3], offset); + w6[2] = hc_bytealign (w0[1], w0[2], offset); + w6[1] = hc_bytealign (w0[0], w0[1], offset); + w6[0] = hc_bytealign ( 0, w0[0], offset); w5[3] = 0; w5[2] = 0; w5[1] = 0; @@ -9499,39 +9499,39 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be (u32x *w0, u32x *w1, u32x *w2 break; case 25: - c6[1] = amd_bytealign (w7[3], 0, offset); - c6[0] = amd_bytealign (w7[2], w7[3], offset); - c5[3] = amd_bytealign (w7[1], w7[2], offset); - c5[2] = amd_bytealign (w7[0], w7[1], offset); - c5[1] = amd_bytealign (w6[3], w7[0], offset); - c5[0] = amd_bytealign (w6[2], w6[3], offset); - c4[3] = amd_bytealign (w6[1], w6[2], offset); - c4[2] = amd_bytealign (w6[0], w6[1], offset); - c4[1] = amd_bytealign (w5[3], w6[0], offset); - c4[0] = amd_bytealign (w5[2], w5[3], offset); - c3[3] = amd_bytealign (w5[1], w5[2], offset); - c3[2] = amd_bytealign (w5[0], w5[1], offset); - c3[1] = amd_bytealign (w4[3], w5[0], offset); - c3[0] = amd_bytealign (w4[2], w4[3], offset); - c2[3] = amd_bytealign (w4[1], w4[2], offset); - c2[2] = amd_bytealign (w4[0], w4[1], offset); - c2[1] = amd_bytealign (w3[3], w4[0], offset); - c2[0] = amd_bytealign (w3[2], w3[3], offset); - c1[3] = amd_bytealign (w3[1], w3[2], offset); - c1[2] = amd_bytealign (w3[0], w3[1], offset); - c1[1] = amd_bytealign (w2[3], w3[0], offset); - c1[0] = amd_bytealign (w2[2], w2[3], offset); - c0[3] = amd_bytealign (w2[1], w2[2], offset); - c0[2] = amd_bytealign (w2[0], w2[1], offset); - c0[1] = amd_bytealign (w1[3], w2[0], offset); - c0[0] = amd_bytealign (w1[2], w1[3], offset); - w7[3] = amd_bytealign (w1[1], w1[2], offset); - w7[2] = amd_bytealign (w1[0], w1[1], offset); - w7[1] = amd_bytealign (w0[3], w1[0], offset); - w7[0] = amd_bytealign (w0[2], w0[3], offset); - w6[3] = amd_bytealign (w0[1], w0[2], offset); - w6[2] = amd_bytealign (w0[0], w0[1], offset); - w6[1] = amd_bytealign ( 0, w0[0], offset); + c6[1] = hc_bytealign (w7[3], 0, offset); + c6[0] = hc_bytealign (w7[2], w7[3], offset); + c5[3] = hc_bytealign (w7[1], w7[2], offset); + c5[2] = hc_bytealign (w7[0], w7[1], offset); + c5[1] = hc_bytealign (w6[3], w7[0], offset); + c5[0] = hc_bytealign (w6[2], w6[3], offset); + c4[3] = hc_bytealign (w6[1], w6[2], offset); + c4[2] = hc_bytealign (w6[0], w6[1], offset); + c4[1] = hc_bytealign (w5[3], w6[0], offset); + c4[0] = hc_bytealign (w5[2], w5[3], offset); + c3[3] = hc_bytealign (w5[1], w5[2], offset); + c3[2] = hc_bytealign (w5[0], w5[1], offset); + c3[1] = hc_bytealign (w4[3], w5[0], offset); + c3[0] = hc_bytealign (w4[2], w4[3], offset); + c2[3] = hc_bytealign (w4[1], w4[2], offset); + c2[2] = hc_bytealign (w4[0], w4[1], offset); + c2[1] = hc_bytealign (w3[3], w4[0], offset); + c2[0] = hc_bytealign (w3[2], w3[3], offset); + c1[3] = hc_bytealign (w3[1], w3[2], offset); + c1[2] = hc_bytealign (w3[0], w3[1], offset); + c1[1] = hc_bytealign (w2[3], w3[0], offset); + c1[0] = hc_bytealign (w2[2], w2[3], offset); + c0[3] = hc_bytealign (w2[1], w2[2], offset); + c0[2] = hc_bytealign (w2[0], w2[1], offset); + c0[1] = hc_bytealign (w1[3], w2[0], offset); + c0[0] = hc_bytealign (w1[2], w1[3], offset); + w7[3] = hc_bytealign (w1[1], w1[2], offset); + w7[2] = hc_bytealign (w1[0], w1[1], offset); + w7[1] = hc_bytealign (w0[3], w1[0], offset); + w7[0] = hc_bytealign (w0[2], w0[3], offset); + w6[3] = hc_bytealign (w0[1], w0[2], offset); + w6[2] = hc_bytealign (w0[0], w0[1], offset); + w6[1] = hc_bytealign ( 0, w0[0], offset); w6[0] = 0; w5[3] = 0; w5[2] = 0; @@ -9561,39 +9561,39 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be (u32x *w0, u32x *w1, u32x *w2 break; case 26: - c6[2] = amd_bytealign (w7[3], 0, offset); - c6[1] = amd_bytealign (w7[2], w7[3], offset); - c6[0] = amd_bytealign (w7[1], w7[2], offset); - c5[3] = amd_bytealign (w7[0], w7[1], offset); - c5[2] = amd_bytealign (w6[3], w7[0], offset); - c5[1] = amd_bytealign (w6[2], w6[3], offset); - c5[0] = amd_bytealign (w6[1], w6[2], offset); - c4[3] = amd_bytealign (w6[0], w6[1], offset); - c4[2] = amd_bytealign (w5[3], w6[0], offset); - c4[1] = amd_bytealign (w5[2], w5[3], offset); - c4[0] = amd_bytealign (w5[1], w5[2], offset); - c3[3] = amd_bytealign (w5[0], w5[1], offset); - c3[2] = amd_bytealign (w4[3], w5[0], offset); - c3[1] = amd_bytealign (w4[2], w4[3], offset); - c3[0] = amd_bytealign (w4[1], w4[2], offset); - c2[3] = amd_bytealign (w4[0], w4[1], offset); - c2[2] = amd_bytealign (w3[3], w4[0], offset); - c2[1] = amd_bytealign (w3[2], w3[3], offset); - c2[0] = amd_bytealign (w3[1], w3[2], offset); - c1[3] = amd_bytealign (w3[0], w3[1], offset); - c1[2] = amd_bytealign (w2[3], w3[0], offset); - c1[1] = amd_bytealign (w2[2], w2[3], offset); - c1[0] = amd_bytealign (w2[1], w2[2], offset); - c0[3] = amd_bytealign (w2[0], w2[1], offset); - c0[2] = amd_bytealign (w1[3], w2[0], offset); - c0[1] = amd_bytealign (w1[2], w1[3], offset); - c0[0] = amd_bytealign (w1[1], w1[2], offset); - w7[3] = amd_bytealign (w1[0], w1[1], offset); - w7[2] = amd_bytealign (w0[3], w1[0], offset); - w7[1] = amd_bytealign (w0[2], w0[3], offset); - w7[0] = amd_bytealign (w0[1], w0[2], offset); - w6[3] = amd_bytealign (w0[0], w0[1], offset); - w6[2] = amd_bytealign ( 0, w0[0], offset); + c6[2] = hc_bytealign (w7[3], 0, offset); + c6[1] = hc_bytealign (w7[2], w7[3], offset); + c6[0] = hc_bytealign (w7[1], w7[2], offset); + c5[3] = hc_bytealign (w7[0], w7[1], offset); + c5[2] = hc_bytealign (w6[3], w7[0], offset); + c5[1] = hc_bytealign (w6[2], w6[3], offset); + c5[0] = hc_bytealign (w6[1], w6[2], offset); + c4[3] = hc_bytealign (w6[0], w6[1], offset); + c4[2] = hc_bytealign (w5[3], w6[0], offset); + c4[1] = hc_bytealign (w5[2], w5[3], offset); + c4[0] = hc_bytealign (w5[1], w5[2], offset); + c3[3] = hc_bytealign (w5[0], w5[1], offset); + c3[2] = hc_bytealign (w4[3], w5[0], offset); + c3[1] = hc_bytealign (w4[2], w4[3], offset); + c3[0] = hc_bytealign (w4[1], w4[2], offset); + c2[3] = hc_bytealign (w4[0], w4[1], offset); + c2[2] = hc_bytealign (w3[3], w4[0], offset); + c2[1] = hc_bytealign (w3[2], w3[3], offset); + c2[0] = hc_bytealign (w3[1], w3[2], offset); + c1[3] = hc_bytealign (w3[0], w3[1], offset); + c1[2] = hc_bytealign (w2[3], w3[0], offset); + c1[1] = hc_bytealign (w2[2], w2[3], offset); + c1[0] = hc_bytealign (w2[1], w2[2], offset); + c0[3] = hc_bytealign (w2[0], w2[1], offset); + c0[2] = hc_bytealign (w1[3], w2[0], offset); + c0[1] = hc_bytealign (w1[2], w1[3], offset); + c0[0] = hc_bytealign (w1[1], w1[2], offset); + w7[3] = hc_bytealign (w1[0], w1[1], offset); + w7[2] = hc_bytealign (w0[3], w1[0], offset); + w7[1] = hc_bytealign (w0[2], w0[3], offset); + w7[0] = hc_bytealign (w0[1], w0[2], offset); + w6[3] = hc_bytealign (w0[0], w0[1], offset); + w6[2] = hc_bytealign ( 0, w0[0], offset); w6[1] = 0; w6[0] = 0; w5[3] = 0; @@ -9624,39 +9624,39 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be (u32x *w0, u32x *w1, u32x *w2 break; case 27: - c6[3] = amd_bytealign (w7[3], 0, offset); - c6[2] = amd_bytealign (w7[2], w7[3], offset); - c6[1] = amd_bytealign (w7[1], w7[2], offset); - c6[0] = amd_bytealign (w7[0], w7[1], offset); - c5[3] = amd_bytealign (w6[3], w7[0], offset); - c5[2] = amd_bytealign (w6[2], w6[3], offset); - c5[1] = amd_bytealign (w6[1], w6[2], offset); - c5[0] = amd_bytealign (w6[0], w6[1], offset); - c4[3] = amd_bytealign (w5[3], w6[0], offset); - c4[2] = amd_bytealign (w5[2], w5[3], offset); - c4[1] = amd_bytealign (w5[1], w5[2], offset); - c4[0] = amd_bytealign (w5[0], w5[1], offset); - c3[3] = amd_bytealign (w4[3], w5[0], offset); - c3[2] = amd_bytealign (w4[2], w4[3], offset); - c3[1] = amd_bytealign (w4[1], w4[2], offset); - c3[0] = amd_bytealign (w4[0], w4[1], offset); - c2[3] = amd_bytealign (w3[3], w4[0], offset); - c2[2] = amd_bytealign (w3[2], w3[3], offset); - c2[1] = amd_bytealign (w3[1], w3[2], offset); - c2[0] = amd_bytealign (w3[0], w3[1], offset); - c1[3] = amd_bytealign (w2[3], w3[0], offset); - c1[2] = amd_bytealign (w2[2], w2[3], offset); - c1[1] = amd_bytealign (w2[1], w2[2], offset); - c1[0] = amd_bytealign (w2[0], w2[1], offset); - c0[3] = amd_bytealign (w1[3], w2[0], offset); - c0[2] = amd_bytealign (w1[2], w1[3], offset); - c0[1] = amd_bytealign (w1[1], w1[2], offset); - c0[0] = amd_bytealign (w1[0], w1[1], offset); - w7[3] = amd_bytealign (w0[3], w1[0], offset); - w7[2] = amd_bytealign (w0[2], w0[3], offset); - w7[1] = amd_bytealign (w0[1], w0[2], offset); - w7[0] = amd_bytealign (w0[0], w0[1], offset); - w6[3] = amd_bytealign ( 0, w0[0], offset); + c6[3] = hc_bytealign (w7[3], 0, offset); + c6[2] = hc_bytealign (w7[2], w7[3], offset); + c6[1] = hc_bytealign (w7[1], w7[2], offset); + c6[0] = hc_bytealign (w7[0], w7[1], offset); + c5[3] = hc_bytealign (w6[3], w7[0], offset); + c5[2] = hc_bytealign (w6[2], w6[3], offset); + c5[1] = hc_bytealign (w6[1], w6[2], offset); + c5[0] = hc_bytealign (w6[0], w6[1], offset); + c4[3] = hc_bytealign (w5[3], w6[0], offset); + c4[2] = hc_bytealign (w5[2], w5[3], offset); + c4[1] = hc_bytealign (w5[1], w5[2], offset); + c4[0] = hc_bytealign (w5[0], w5[1], offset); + c3[3] = hc_bytealign (w4[3], w5[0], offset); + c3[2] = hc_bytealign (w4[2], w4[3], offset); + c3[1] = hc_bytealign (w4[1], w4[2], offset); + c3[0] = hc_bytealign (w4[0], w4[1], offset); + c2[3] = hc_bytealign (w3[3], w4[0], offset); + c2[2] = hc_bytealign (w3[2], w3[3], offset); + c2[1] = hc_bytealign (w3[1], w3[2], offset); + c2[0] = hc_bytealign (w3[0], w3[1], offset); + c1[3] = hc_bytealign (w2[3], w3[0], offset); + c1[2] = hc_bytealign (w2[2], w2[3], offset); + c1[1] = hc_bytealign (w2[1], w2[2], offset); + c1[0] = hc_bytealign (w2[0], w2[1], offset); + c0[3] = hc_bytealign (w1[3], w2[0], offset); + c0[2] = hc_bytealign (w1[2], w1[3], offset); + c0[1] = hc_bytealign (w1[1], w1[2], offset); + c0[0] = hc_bytealign (w1[0], w1[1], offset); + w7[3] = hc_bytealign (w0[3], w1[0], offset); + w7[2] = hc_bytealign (w0[2], w0[3], offset); + w7[1] = hc_bytealign (w0[1], w0[2], offset); + w7[0] = hc_bytealign (w0[0], w0[1], offset); + w6[3] = hc_bytealign ( 0, w0[0], offset); w6[2] = 0; w6[1] = 0; w6[0] = 0; @@ -9688,39 +9688,39 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be (u32x *w0, u32x *w1, u32x *w2 break; case 28: - c7[0] = amd_bytealign (w7[3], 0, offset); - c6[3] = amd_bytealign (w7[2], w7[3], offset); - c6[2] = amd_bytealign (w7[1], w7[2], offset); - c6[1] = amd_bytealign (w7[0], w7[1], offset); - c6[0] = amd_bytealign (w6[3], w7[0], offset); - c5[3] = amd_bytealign (w6[2], w6[3], offset); - c5[2] = amd_bytealign (w6[1], w6[2], offset); - c5[1] = amd_bytealign (w6[0], w6[1], offset); - c5[0] = amd_bytealign (w5[3], w6[0], offset); - c4[3] = amd_bytealign (w5[2], w5[3], offset); - c4[2] = amd_bytealign (w5[1], w5[2], offset); - c4[1] = amd_bytealign (w5[0], w5[1], offset); - c4[0] = amd_bytealign (w4[3], w5[0], offset); - c3[3] = amd_bytealign (w4[2], w4[3], offset); - c3[2] = amd_bytealign (w4[1], w4[2], offset); - c3[1] = amd_bytealign (w4[0], w4[1], offset); - c3[0] = amd_bytealign (w3[3], w4[0], offset); - c2[3] = amd_bytealign (w3[2], w3[3], offset); - c2[2] = amd_bytealign (w3[1], w3[2], offset); - c2[1] = amd_bytealign (w3[0], w3[1], offset); - c2[0] = amd_bytealign (w2[3], w3[0], offset); - c1[3] = amd_bytealign (w2[2], w2[3], offset); - c1[2] = amd_bytealign (w2[1], w2[2], offset); - c1[1] = amd_bytealign (w2[0], w2[1], offset); - c1[0] = amd_bytealign (w1[3], w2[0], offset); - c0[3] = amd_bytealign (w1[2], w1[3], offset); - c0[2] = amd_bytealign (w1[1], w1[2], offset); - c0[1] = amd_bytealign (w1[0], w1[1], offset); - c0[0] = amd_bytealign (w0[3], w1[0], offset); - w7[3] = amd_bytealign (w0[2], w0[3], offset); - w7[2] = amd_bytealign (w0[1], w0[2], offset); - w7[1] = amd_bytealign (w0[0], w0[1], offset); - w7[0] = amd_bytealign ( 0, w0[0], offset); + c7[0] = hc_bytealign (w7[3], 0, offset); + c6[3] = hc_bytealign (w7[2], w7[3], offset); + c6[2] = hc_bytealign (w7[1], w7[2], offset); + c6[1] = hc_bytealign (w7[0], w7[1], offset); + c6[0] = hc_bytealign (w6[3], w7[0], offset); + c5[3] = hc_bytealign (w6[2], w6[3], offset); + c5[2] = hc_bytealign (w6[1], w6[2], offset); + c5[1] = hc_bytealign (w6[0], w6[1], offset); + c5[0] = hc_bytealign (w5[3], w6[0], offset); + c4[3] = hc_bytealign (w5[2], w5[3], offset); + c4[2] = hc_bytealign (w5[1], w5[2], offset); + c4[1] = hc_bytealign (w5[0], w5[1], offset); + c4[0] = hc_bytealign (w4[3], w5[0], offset); + c3[3] = hc_bytealign (w4[2], w4[3], offset); + c3[2] = hc_bytealign (w4[1], w4[2], offset); + c3[1] = hc_bytealign (w4[0], w4[1], offset); + c3[0] = hc_bytealign (w3[3], w4[0], offset); + c2[3] = hc_bytealign (w3[2], w3[3], offset); + c2[2] = hc_bytealign (w3[1], w3[2], offset); + c2[1] = hc_bytealign (w3[0], w3[1], offset); + c2[0] = hc_bytealign (w2[3], w3[0], offset); + c1[3] = hc_bytealign (w2[2], w2[3], offset); + c1[2] = hc_bytealign (w2[1], w2[2], offset); + c1[1] = hc_bytealign (w2[0], w2[1], offset); + c1[0] = hc_bytealign (w1[3], w2[0], offset); + c0[3] = hc_bytealign (w1[2], w1[3], offset); + c0[2] = hc_bytealign (w1[1], w1[2], offset); + c0[1] = hc_bytealign (w1[0], w1[1], offset); + c0[0] = hc_bytealign (w0[3], w1[0], offset); + w7[3] = hc_bytealign (w0[2], w0[3], offset); + w7[2] = hc_bytealign (w0[1], w0[2], offset); + w7[1] = hc_bytealign (w0[0], w0[1], offset); + w7[0] = hc_bytealign ( 0, w0[0], offset); w6[3] = 0; w6[2] = 0; w6[1] = 0; @@ -9753,39 +9753,39 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be (u32x *w0, u32x *w1, u32x *w2 break; case 29: - c7[1] = amd_bytealign (w7[3], 0, offset); - c7[0] = amd_bytealign (w7[2], w7[3], offset); - c6[3] = amd_bytealign (w7[1], w7[2], offset); - c6[2] = amd_bytealign (w7[0], w7[1], offset); - c6[1] = amd_bytealign (w6[3], w7[0], offset); - c6[0] = amd_bytealign (w6[2], w6[3], offset); - c5[3] = amd_bytealign (w6[1], w6[2], offset); - c5[2] = amd_bytealign (w6[0], w6[1], offset); - c5[1] = amd_bytealign (w5[3], w6[0], offset); - c5[0] = amd_bytealign (w5[2], w5[3], offset); - c4[3] = amd_bytealign (w5[1], w5[2], offset); - c4[2] = amd_bytealign (w5[0], w5[1], offset); - c4[1] = amd_bytealign (w4[3], w5[0], offset); - c4[0] = amd_bytealign (w4[2], w4[3], offset); - c3[3] = amd_bytealign (w4[1], w4[2], offset); - c3[2] = amd_bytealign (w4[0], w4[1], offset); - c3[1] = amd_bytealign (w3[3], w4[0], offset); - c3[0] = amd_bytealign (w3[2], w3[3], offset); - c2[3] = amd_bytealign (w3[1], w3[2], offset); - c2[2] = amd_bytealign (w3[0], w3[1], offset); - c2[1] = amd_bytealign (w2[3], w3[0], offset); - c2[0] = amd_bytealign (w2[2], w2[3], offset); - c1[3] = amd_bytealign (w2[1], w2[2], offset); - c1[2] = amd_bytealign (w2[0], w2[1], offset); - c1[1] = amd_bytealign (w1[3], w2[0], offset); - c1[0] = amd_bytealign (w1[2], w1[3], offset); - c0[3] = amd_bytealign (w1[1], w1[2], offset); - c0[2] = amd_bytealign (w1[0], w1[1], offset); - c0[1] = amd_bytealign (w0[3], w1[0], offset); - c0[0] = amd_bytealign (w0[2], w0[3], offset); - w7[3] = amd_bytealign (w0[1], w0[2], offset); - w7[2] = amd_bytealign (w0[0], w0[1], offset); - w7[1] = amd_bytealign ( 0, w0[0], offset); + c7[1] = hc_bytealign (w7[3], 0, offset); + c7[0] = hc_bytealign (w7[2], w7[3], offset); + c6[3] = hc_bytealign (w7[1], w7[2], offset); + c6[2] = hc_bytealign (w7[0], w7[1], offset); + c6[1] = hc_bytealign (w6[3], w7[0], offset); + c6[0] = hc_bytealign (w6[2], w6[3], offset); + c5[3] = hc_bytealign (w6[1], w6[2], offset); + c5[2] = hc_bytealign (w6[0], w6[1], offset); + c5[1] = hc_bytealign (w5[3], w6[0], offset); + c5[0] = hc_bytealign (w5[2], w5[3], offset); + c4[3] = hc_bytealign (w5[1], w5[2], offset); + c4[2] = hc_bytealign (w5[0], w5[1], offset); + c4[1] = hc_bytealign (w4[3], w5[0], offset); + c4[0] = hc_bytealign (w4[2], w4[3], offset); + c3[3] = hc_bytealign (w4[1], w4[2], offset); + c3[2] = hc_bytealign (w4[0], w4[1], offset); + c3[1] = hc_bytealign (w3[3], w4[0], offset); + c3[0] = hc_bytealign (w3[2], w3[3], offset); + c2[3] = hc_bytealign (w3[1], w3[2], offset); + c2[2] = hc_bytealign (w3[0], w3[1], offset); + c2[1] = hc_bytealign (w2[3], w3[0], offset); + c2[0] = hc_bytealign (w2[2], w2[3], offset); + c1[3] = hc_bytealign (w2[1], w2[2], offset); + c1[2] = hc_bytealign (w2[0], w2[1], offset); + c1[1] = hc_bytealign (w1[3], w2[0], offset); + c1[0] = hc_bytealign (w1[2], w1[3], offset); + c0[3] = hc_bytealign (w1[1], w1[2], offset); + c0[2] = hc_bytealign (w1[0], w1[1], offset); + c0[1] = hc_bytealign (w0[3], w1[0], offset); + c0[0] = hc_bytealign (w0[2], w0[3], offset); + w7[3] = hc_bytealign (w0[1], w0[2], offset); + w7[2] = hc_bytealign (w0[0], w0[1], offset); + w7[1] = hc_bytealign ( 0, w0[0], offset); w7[0] = 0; w6[3] = 0; w6[2] = 0; @@ -9819,39 +9819,39 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be (u32x *w0, u32x *w1, u32x *w2 break; case 30: - c7[2] = amd_bytealign (w7[3], 0, offset); - c7[1] = amd_bytealign (w7[2], w7[3], offset); - c7[0] = amd_bytealign (w7[1], w7[2], offset); - c6[3] = amd_bytealign (w7[0], w7[1], offset); - c6[2] = amd_bytealign (w6[3], w7[0], offset); - c6[1] = amd_bytealign (w6[2], w6[3], offset); - c6[0] = amd_bytealign (w6[1], w6[2], offset); - c5[3] = amd_bytealign (w6[0], w6[1], offset); - c5[2] = amd_bytealign (w5[3], w6[0], offset); - c5[1] = amd_bytealign (w5[2], w5[3], offset); - c5[0] = amd_bytealign (w5[1], w5[2], offset); - c4[3] = amd_bytealign (w5[0], w5[1], offset); - c4[2] = amd_bytealign (w4[3], w5[0], offset); - c4[1] = amd_bytealign (w4[2], w4[3], offset); - c4[0] = amd_bytealign (w4[1], w4[2], offset); - c3[3] = amd_bytealign (w4[0], w4[1], offset); - c3[2] = amd_bytealign (w3[3], w4[0], offset); - c3[1] = amd_bytealign (w3[2], w3[3], offset); - c3[0] = amd_bytealign (w3[1], w3[2], offset); - c2[3] = amd_bytealign (w3[0], w3[1], offset); - c2[2] = amd_bytealign (w2[3], w3[0], offset); - c2[1] = amd_bytealign (w2[2], w2[3], offset); - c2[0] = amd_bytealign (w2[1], w2[2], offset); - c1[3] = amd_bytealign (w2[0], w2[1], offset); - c1[2] = amd_bytealign (w1[3], w2[0], offset); - c1[1] = amd_bytealign (w1[2], w1[3], offset); - c1[0] = amd_bytealign (w1[1], w1[2], offset); - c0[3] = amd_bytealign (w1[0], w1[1], offset); - c0[2] = amd_bytealign (w0[3], w1[0], offset); - c0[1] = amd_bytealign (w0[2], w0[3], offset); - c0[0] = amd_bytealign (w0[1], w0[2], offset); - w7[3] = amd_bytealign (w0[0], w0[1], offset); - w7[2] = amd_bytealign ( 0, w0[0], offset); + c7[2] = hc_bytealign (w7[3], 0, offset); + c7[1] = hc_bytealign (w7[2], w7[3], offset); + c7[0] = hc_bytealign (w7[1], w7[2], offset); + c6[3] = hc_bytealign (w7[0], w7[1], offset); + c6[2] = hc_bytealign (w6[3], w7[0], offset); + c6[1] = hc_bytealign (w6[2], w6[3], offset); + c6[0] = hc_bytealign (w6[1], w6[2], offset); + c5[3] = hc_bytealign (w6[0], w6[1], offset); + c5[2] = hc_bytealign (w5[3], w6[0], offset); + c5[1] = hc_bytealign (w5[2], w5[3], offset); + c5[0] = hc_bytealign (w5[1], w5[2], offset); + c4[3] = hc_bytealign (w5[0], w5[1], offset); + c4[2] = hc_bytealign (w4[3], w5[0], offset); + c4[1] = hc_bytealign (w4[2], w4[3], offset); + c4[0] = hc_bytealign (w4[1], w4[2], offset); + c3[3] = hc_bytealign (w4[0], w4[1], offset); + c3[2] = hc_bytealign (w3[3], w4[0], offset); + c3[1] = hc_bytealign (w3[2], w3[3], offset); + c3[0] = hc_bytealign (w3[1], w3[2], offset); + c2[3] = hc_bytealign (w3[0], w3[1], offset); + c2[2] = hc_bytealign (w2[3], w3[0], offset); + c2[1] = hc_bytealign (w2[2], w2[3], offset); + c2[0] = hc_bytealign (w2[1], w2[2], offset); + c1[3] = hc_bytealign (w2[0], w2[1], offset); + c1[2] = hc_bytealign (w1[3], w2[0], offset); + c1[1] = hc_bytealign (w1[2], w1[3], offset); + c1[0] = hc_bytealign (w1[1], w1[2], offset); + c0[3] = hc_bytealign (w1[0], w1[1], offset); + c0[2] = hc_bytealign (w0[3], w1[0], offset); + c0[1] = hc_bytealign (w0[2], w0[3], offset); + c0[0] = hc_bytealign (w0[1], w0[2], offset); + w7[3] = hc_bytealign (w0[0], w0[1], offset); + w7[2] = hc_bytealign ( 0, w0[0], offset); w7[1] = 0; w7[0] = 0; w6[3] = 0; @@ -9886,39 +9886,39 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be (u32x *w0, u32x *w1, u32x *w2 break; case 31: - c7[3] = amd_bytealign (w7[3], 0, offset); - c7[2] = amd_bytealign (w7[2], w7[3], offset); - c7[1] = amd_bytealign (w7[1], w7[2], offset); - c7[0] = amd_bytealign (w7[0], w7[1], offset); - c6[3] = amd_bytealign (w6[3], w7[0], offset); - c6[2] = amd_bytealign (w6[2], w6[3], offset); - c6[1] = amd_bytealign (w6[1], w6[2], offset); - c6[0] = amd_bytealign (w6[0], w6[1], offset); - c5[3] = amd_bytealign (w5[3], w6[0], offset); - c5[2] = amd_bytealign (w5[2], w5[3], offset); - c5[1] = amd_bytealign (w5[1], w5[2], offset); - c5[0] = amd_bytealign (w5[0], w5[1], offset); - c4[3] = amd_bytealign (w4[3], w5[0], offset); - c4[2] = amd_bytealign (w4[2], w4[3], offset); - c4[1] = amd_bytealign (w4[1], w4[2], offset); - c4[0] = amd_bytealign (w4[0], w4[1], offset); - c3[3] = amd_bytealign (w3[3], w4[0], offset); - c3[2] = amd_bytealign (w3[2], w3[3], offset); - c3[1] = amd_bytealign (w3[1], w3[2], offset); - c3[0] = amd_bytealign (w3[0], w3[1], offset); - c2[3] = amd_bytealign (w2[3], w3[0], offset); - c2[2] = amd_bytealign (w2[2], w2[3], offset); - c2[1] = amd_bytealign (w2[1], w2[2], offset); - c2[0] = amd_bytealign (w2[0], w2[1], offset); - c1[3] = amd_bytealign (w1[3], w2[0], offset); - c1[2] = amd_bytealign (w1[2], w1[3], offset); - c1[1] = amd_bytealign (w1[1], w1[2], offset); - c1[0] = amd_bytealign (w1[0], w1[1], offset); - c0[3] = amd_bytealign (w0[3], w1[0], offset); - c0[2] = amd_bytealign (w0[2], w0[3], offset); - c0[1] = amd_bytealign (w0[1], w0[2], offset); - c0[0] = amd_bytealign (w0[0], w0[1], offset); - w7[3] = amd_bytealign ( 0, w0[0], offset); + c7[3] = hc_bytealign (w7[3], 0, offset); + c7[2] = hc_bytealign (w7[2], w7[3], offset); + c7[1] = hc_bytealign (w7[1], w7[2], offset); + c7[0] = hc_bytealign (w7[0], w7[1], offset); + c6[3] = hc_bytealign (w6[3], w7[0], offset); + c6[2] = hc_bytealign (w6[2], w6[3], offset); + c6[1] = hc_bytealign (w6[1], w6[2], offset); + c6[0] = hc_bytealign (w6[0], w6[1], offset); + c5[3] = hc_bytealign (w5[3], w6[0], offset); + c5[2] = hc_bytealign (w5[2], w5[3], offset); + c5[1] = hc_bytealign (w5[1], w5[2], offset); + c5[0] = hc_bytealign (w5[0], w5[1], offset); + c4[3] = hc_bytealign (w4[3], w5[0], offset); + c4[2] = hc_bytealign (w4[2], w4[3], offset); + c4[1] = hc_bytealign (w4[1], w4[2], offset); + c4[0] = hc_bytealign (w4[0], w4[1], offset); + c3[3] = hc_bytealign (w3[3], w4[0], offset); + c3[2] = hc_bytealign (w3[2], w3[3], offset); + c3[1] = hc_bytealign (w3[1], w3[2], offset); + c3[0] = hc_bytealign (w3[0], w3[1], offset); + c2[3] = hc_bytealign (w2[3], w3[0], offset); + c2[2] = hc_bytealign (w2[2], w2[3], offset); + c2[1] = hc_bytealign (w2[1], w2[2], offset); + c2[0] = hc_bytealign (w2[0], w2[1], offset); + c1[3] = hc_bytealign (w1[3], w2[0], offset); + c1[2] = hc_bytealign (w1[2], w1[3], offset); + c1[1] = hc_bytealign (w1[1], w1[2], offset); + c1[0] = hc_bytealign (w1[0], w1[1], offset); + c0[3] = hc_bytealign (w0[3], w1[0], offset); + c0[2] = hc_bytealign (w0[2], w0[3], offset); + c0[1] = hc_bytealign (w0[1], w0[2], offset); + c0[0] = hc_bytealign (w0[0], w0[1], offset); + w7[3] = hc_bytealign ( 0, w0[0], offset); w7[2] = 0; w7[1] = 0; w7[0] = 0; @@ -9968,153 +9968,153 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be (u32x *w0, u32x *w1, u32x *w2 switch (offset_switch) { case 0: - c0[0] = __byte_perm ( 0, w7[3], selector); - w7[3] = __byte_perm (w7[3], w7[2], selector); - w7[2] = __byte_perm (w7[2], w7[1], selector); - w7[1] = __byte_perm (w7[1], w7[0], selector); - w7[0] = __byte_perm (w7[0], w6[3], selector); - w6[3] = __byte_perm (w6[3], w6[2], selector); - w6[2] = __byte_perm (w6[2], w6[1], selector); - w6[1] = __byte_perm (w6[1], w6[0], selector); - w6[0] = __byte_perm (w6[0], w5[3], selector); - w5[3] = __byte_perm (w5[3], w5[2], selector); - w5[2] = __byte_perm (w5[2], w5[1], selector); - w5[1] = __byte_perm (w5[1], w5[0], selector); - w5[0] = __byte_perm (w5[0], w4[3], selector); - w4[3] = __byte_perm (w4[3], w4[2], selector); - w4[2] = __byte_perm (w4[2], w4[1], selector); - w4[1] = __byte_perm (w4[1], w4[0], selector); - w4[0] = __byte_perm (w4[0], w3[3], selector); - w3[3] = __byte_perm (w3[3], w3[2], selector); - w3[2] = __byte_perm (w3[2], w3[1], selector); - w3[1] = __byte_perm (w3[1], w3[0], selector); - w3[0] = __byte_perm (w3[0], w2[3], selector); - w2[3] = __byte_perm (w2[3], w2[2], selector); - w2[2] = __byte_perm (w2[2], w2[1], selector); - w2[1] = __byte_perm (w2[1], w2[0], selector); - w2[0] = __byte_perm (w2[0], w1[3], selector); - w1[3] = __byte_perm (w1[3], w1[2], selector); - w1[2] = __byte_perm (w1[2], w1[1], selector); - w1[1] = __byte_perm (w1[1], w1[0], selector); - w1[0] = __byte_perm (w1[0], w0[3], selector); - w0[3] = __byte_perm (w0[3], w0[2], selector); - w0[2] = __byte_perm (w0[2], w0[1], selector); - w0[1] = __byte_perm (w0[1], w0[0], selector); - w0[0] = __byte_perm (w0[0], 0, selector); + c0[0] = hc_byte_perm ( 0, w7[3], selector); + w7[3] = hc_byte_perm (w7[3], w7[2], selector); + w7[2] = hc_byte_perm (w7[2], w7[1], selector); + w7[1] = hc_byte_perm (w7[1], w7[0], selector); + w7[0] = hc_byte_perm (w7[0], w6[3], selector); + w6[3] = hc_byte_perm (w6[3], w6[2], selector); + w6[2] = hc_byte_perm (w6[2], w6[1], selector); + w6[1] = hc_byte_perm (w6[1], w6[0], selector); + w6[0] = hc_byte_perm (w6[0], w5[3], selector); + w5[3] = hc_byte_perm (w5[3], w5[2], selector); + w5[2] = hc_byte_perm (w5[2], w5[1], selector); + w5[1] = hc_byte_perm (w5[1], w5[0], selector); + w5[0] = hc_byte_perm (w5[0], w4[3], selector); + w4[3] = hc_byte_perm (w4[3], w4[2], selector); + w4[2] = hc_byte_perm (w4[2], w4[1], selector); + w4[1] = hc_byte_perm (w4[1], w4[0], selector); + w4[0] = hc_byte_perm (w4[0], w3[3], selector); + w3[3] = hc_byte_perm (w3[3], w3[2], selector); + w3[2] = hc_byte_perm (w3[2], w3[1], selector); + w3[1] = hc_byte_perm (w3[1], w3[0], selector); + w3[0] = hc_byte_perm (w3[0], w2[3], selector); + w2[3] = hc_byte_perm (w2[3], w2[2], selector); + w2[2] = hc_byte_perm (w2[2], w2[1], selector); + w2[1] = hc_byte_perm (w2[1], w2[0], selector); + w2[0] = hc_byte_perm (w2[0], w1[3], selector); + w1[3] = hc_byte_perm (w1[3], w1[2], selector); + w1[2] = hc_byte_perm (w1[2], w1[1], selector); + w1[1] = hc_byte_perm (w1[1], w1[0], selector); + w1[0] = hc_byte_perm (w1[0], w0[3], selector); + w0[3] = hc_byte_perm (w0[3], w0[2], selector); + w0[2] = hc_byte_perm (w0[2], w0[1], selector); + w0[1] = hc_byte_perm (w0[1], w0[0], selector); + w0[0] = hc_byte_perm (w0[0], 0, selector); break; case 1: - c0[1] = __byte_perm ( 0, w7[3], selector); - c0[0] = __byte_perm (w7[3], w7[2], selector); - w7[3] = __byte_perm (w7[2], w7[1], selector); - w7[2] = __byte_perm (w7[1], w7[0], selector); - w7[1] = __byte_perm (w7[0], w6[3], selector); - w7[0] = __byte_perm (w6[3], w6[2], selector); - w6[3] = __byte_perm (w6[2], w6[1], selector); - w6[2] = __byte_perm (w6[1], w6[0], selector); - w6[1] = __byte_perm (w6[0], w5[3], selector); - w6[0] = __byte_perm (w5[3], w5[2], selector); - w5[3] = __byte_perm (w5[2], w5[1], selector); - w5[2] = __byte_perm (w5[1], w5[0], selector); - w5[1] = __byte_perm (w5[0], w4[3], selector); - w5[0] = __byte_perm (w4[3], w4[2], selector); - w4[3] = __byte_perm (w4[2], w4[1], selector); - w4[2] = __byte_perm (w4[1], w4[0], selector); - w4[1] = __byte_perm (w4[0], w3[3], selector); - w4[0] = __byte_perm (w3[3], w3[2], selector); - w3[3] = __byte_perm (w3[2], w3[1], selector); - w3[2] = __byte_perm (w3[1], w3[0], selector); - w3[1] = __byte_perm (w3[0], w2[3], selector); - w3[0] = __byte_perm (w2[3], w2[2], selector); - w2[3] = __byte_perm (w2[2], w2[1], selector); - w2[2] = __byte_perm (w2[1], w2[0], selector); - w2[1] = __byte_perm (w2[0], w1[3], selector); - w2[0] = __byte_perm (w1[3], w1[2], selector); - w1[3] = __byte_perm (w1[2], w1[1], selector); - w1[2] = __byte_perm (w1[1], w1[0], selector); - w1[1] = __byte_perm (w1[0], w0[3], selector); - w1[0] = __byte_perm (w0[3], w0[2], selector); - w0[3] = __byte_perm (w0[2], w0[1], selector); - w0[2] = __byte_perm (w0[1], w0[0], selector); - w0[1] = __byte_perm (w0[0], 0, selector); + c0[1] = hc_byte_perm ( 0, w7[3], selector); + c0[0] = hc_byte_perm (w7[3], w7[2], selector); + w7[3] = hc_byte_perm (w7[2], w7[1], selector); + w7[2] = hc_byte_perm (w7[1], w7[0], selector); + w7[1] = hc_byte_perm (w7[0], w6[3], selector); + w7[0] = hc_byte_perm (w6[3], w6[2], selector); + w6[3] = hc_byte_perm (w6[2], w6[1], selector); + w6[2] = hc_byte_perm (w6[1], w6[0], selector); + w6[1] = hc_byte_perm (w6[0], w5[3], selector); + w6[0] = hc_byte_perm (w5[3], w5[2], selector); + w5[3] = hc_byte_perm (w5[2], w5[1], selector); + w5[2] = hc_byte_perm (w5[1], w5[0], selector); + w5[1] = hc_byte_perm (w5[0], w4[3], selector); + w5[0] = hc_byte_perm (w4[3], w4[2], selector); + w4[3] = hc_byte_perm (w4[2], w4[1], selector); + w4[2] = hc_byte_perm (w4[1], w4[0], selector); + w4[1] = hc_byte_perm (w4[0], w3[3], selector); + w4[0] = hc_byte_perm (w3[3], w3[2], selector); + w3[3] = hc_byte_perm (w3[2], w3[1], selector); + w3[2] = hc_byte_perm (w3[1], w3[0], selector); + w3[1] = hc_byte_perm (w3[0], w2[3], selector); + w3[0] = hc_byte_perm (w2[3], w2[2], selector); + w2[3] = hc_byte_perm (w2[2], w2[1], selector); + w2[2] = hc_byte_perm (w2[1], w2[0], selector); + w2[1] = hc_byte_perm (w2[0], w1[3], selector); + w2[0] = hc_byte_perm (w1[3], w1[2], selector); + w1[3] = hc_byte_perm (w1[2], w1[1], selector); + w1[2] = hc_byte_perm (w1[1], w1[0], selector); + w1[1] = hc_byte_perm (w1[0], w0[3], selector); + w1[0] = hc_byte_perm (w0[3], w0[2], selector); + w0[3] = hc_byte_perm (w0[2], w0[1], selector); + w0[2] = hc_byte_perm (w0[1], w0[0], selector); + w0[1] = hc_byte_perm (w0[0], 0, selector); w0[0] = 0; break; case 2: - c0[2] = __byte_perm ( 0, w7[3], selector); - c0[1] = __byte_perm (w7[3], w7[2], selector); - c0[0] = __byte_perm (w7[2], w7[1], selector); - w7[3] = __byte_perm (w7[1], w7[0], selector); - w7[2] = __byte_perm (w7[0], w6[3], selector); - w7[1] = __byte_perm (w6[3], w6[2], selector); - w7[0] = __byte_perm (w6[2], w6[1], selector); - w6[3] = __byte_perm (w6[1], w6[0], selector); - w6[2] = __byte_perm (w6[0], w5[3], selector); - w6[1] = __byte_perm (w5[3], w5[2], selector); - w6[0] = __byte_perm (w5[2], w5[1], selector); - w5[3] = __byte_perm (w5[1], w5[0], selector); - w5[2] = __byte_perm (w5[0], w4[3], selector); - w5[1] = __byte_perm (w4[3], w4[2], selector); - w5[0] = __byte_perm (w4[2], w4[1], selector); - w4[3] = __byte_perm (w4[1], w4[0], selector); - w4[2] = __byte_perm (w4[0], w3[3], selector); - w4[1] = __byte_perm (w3[3], w3[2], selector); - w4[0] = __byte_perm (w3[2], w3[1], selector); - w3[3] = __byte_perm (w3[1], w3[0], selector); - w3[2] = __byte_perm (w3[0], w2[3], selector); - w3[1] = __byte_perm (w2[3], w2[2], selector); - w3[0] = __byte_perm (w2[2], w2[1], selector); - w2[3] = __byte_perm (w2[1], w2[0], selector); - w2[2] = __byte_perm (w2[0], w1[3], selector); - w2[1] = __byte_perm (w1[3], w1[2], selector); - w2[0] = __byte_perm (w1[2], w1[1], selector); - w1[3] = __byte_perm (w1[1], w1[0], selector); - w1[2] = __byte_perm (w1[0], w0[3], selector); - w1[1] = __byte_perm (w0[3], w0[2], selector); - w1[0] = __byte_perm (w0[2], w0[1], selector); - w0[3] = __byte_perm (w0[1], w0[0], selector); - w0[2] = __byte_perm (w0[0], 0, selector); + c0[2] = hc_byte_perm ( 0, w7[3], selector); + c0[1] = hc_byte_perm (w7[3], w7[2], selector); + c0[0] = hc_byte_perm (w7[2], w7[1], selector); + w7[3] = hc_byte_perm (w7[1], w7[0], selector); + w7[2] = hc_byte_perm (w7[0], w6[3], selector); + w7[1] = hc_byte_perm (w6[3], w6[2], selector); + w7[0] = hc_byte_perm (w6[2], w6[1], selector); + w6[3] = hc_byte_perm (w6[1], w6[0], selector); + w6[2] = hc_byte_perm (w6[0], w5[3], selector); + w6[1] = hc_byte_perm (w5[3], w5[2], selector); + w6[0] = hc_byte_perm (w5[2], w5[1], selector); + w5[3] = hc_byte_perm (w5[1], w5[0], selector); + w5[2] = hc_byte_perm (w5[0], w4[3], selector); + w5[1] = hc_byte_perm (w4[3], w4[2], selector); + w5[0] = hc_byte_perm (w4[2], w4[1], selector); + w4[3] = hc_byte_perm (w4[1], w4[0], selector); + w4[2] = hc_byte_perm (w4[0], w3[3], selector); + w4[1] = hc_byte_perm (w3[3], w3[2], selector); + w4[0] = hc_byte_perm (w3[2], w3[1], selector); + w3[3] = hc_byte_perm (w3[1], w3[0], selector); + w3[2] = hc_byte_perm (w3[0], w2[3], selector); + w3[1] = hc_byte_perm (w2[3], w2[2], selector); + w3[0] = hc_byte_perm (w2[2], w2[1], selector); + w2[3] = hc_byte_perm (w2[1], w2[0], selector); + w2[2] = hc_byte_perm (w2[0], w1[3], selector); + w2[1] = hc_byte_perm (w1[3], w1[2], selector); + w2[0] = hc_byte_perm (w1[2], w1[1], selector); + w1[3] = hc_byte_perm (w1[1], w1[0], selector); + w1[2] = hc_byte_perm (w1[0], w0[3], selector); + w1[1] = hc_byte_perm (w0[3], w0[2], selector); + w1[0] = hc_byte_perm (w0[2], w0[1], selector); + w0[3] = hc_byte_perm (w0[1], w0[0], selector); + w0[2] = hc_byte_perm (w0[0], 0, selector); w0[1] = 0; w0[0] = 0; break; case 3: - c0[3] = __byte_perm ( 0, w7[3], selector); - c0[2] = __byte_perm (w7[3], w7[2], selector); - c0[1] = __byte_perm (w7[2], w7[1], selector); - c0[0] = __byte_perm (w7[1], w7[0], selector); - w7[3] = __byte_perm (w7[0], w6[3], selector); - w7[2] = __byte_perm (w6[3], w6[2], selector); - w7[1] = __byte_perm (w6[2], w6[1], selector); - w7[0] = __byte_perm (w6[1], w6[0], selector); - w6[3] = __byte_perm (w6[0], w5[3], selector); - w6[2] = __byte_perm (w5[3], w5[2], selector); - w6[1] = __byte_perm (w5[2], w5[1], selector); - w6[0] = __byte_perm (w5[1], w5[0], selector); - w5[3] = __byte_perm (w5[0], w4[3], selector); - w5[2] = __byte_perm (w4[3], w4[2], selector); - w5[1] = __byte_perm (w4[2], w4[1], selector); - w5[0] = __byte_perm (w4[1], w4[0], selector); - w4[3] = __byte_perm (w4[0], w3[3], selector); - w4[2] = __byte_perm (w3[3], w3[2], selector); - w4[1] = __byte_perm (w3[2], w3[1], selector); - w4[0] = __byte_perm (w3[1], w3[0], selector); - w3[3] = __byte_perm (w3[0], w2[3], selector); - w3[2] = __byte_perm (w2[3], w2[2], selector); - w3[1] = __byte_perm (w2[2], w2[1], selector); - w3[0] = __byte_perm (w2[1], w2[0], selector); - w2[3] = __byte_perm (w2[0], w1[3], selector); - w2[2] = __byte_perm (w1[3], w1[2], selector); - w2[1] = __byte_perm (w1[2], w1[1], selector); - w2[0] = __byte_perm (w1[1], w1[0], selector); - w1[3] = __byte_perm (w1[0], w0[3], selector); - w1[2] = __byte_perm (w0[3], w0[2], selector); - w1[1] = __byte_perm (w0[2], w0[1], selector); - w1[0] = __byte_perm (w0[1], w0[0], selector); - w0[3] = __byte_perm (w0[0], 0, selector); + c0[3] = hc_byte_perm ( 0, w7[3], selector); + c0[2] = hc_byte_perm (w7[3], w7[2], selector); + c0[1] = hc_byte_perm (w7[2], w7[1], selector); + c0[0] = hc_byte_perm (w7[1], w7[0], selector); + w7[3] = hc_byte_perm (w7[0], w6[3], selector); + w7[2] = hc_byte_perm (w6[3], w6[2], selector); + w7[1] = hc_byte_perm (w6[2], w6[1], selector); + w7[0] = hc_byte_perm (w6[1], w6[0], selector); + w6[3] = hc_byte_perm (w6[0], w5[3], selector); + w6[2] = hc_byte_perm (w5[3], w5[2], selector); + w6[1] = hc_byte_perm (w5[2], w5[1], selector); + w6[0] = hc_byte_perm (w5[1], w5[0], selector); + w5[3] = hc_byte_perm (w5[0], w4[3], selector); + w5[2] = hc_byte_perm (w4[3], w4[2], selector); + w5[1] = hc_byte_perm (w4[2], w4[1], selector); + w5[0] = hc_byte_perm (w4[1], w4[0], selector); + w4[3] = hc_byte_perm (w4[0], w3[3], selector); + w4[2] = hc_byte_perm (w3[3], w3[2], selector); + w4[1] = hc_byte_perm (w3[2], w3[1], selector); + w4[0] = hc_byte_perm (w3[1], w3[0], selector); + w3[3] = hc_byte_perm (w3[0], w2[3], selector); + w3[2] = hc_byte_perm (w2[3], w2[2], selector); + w3[1] = hc_byte_perm (w2[2], w2[1], selector); + w3[0] = hc_byte_perm (w2[1], w2[0], selector); + w2[3] = hc_byte_perm (w2[0], w1[3], selector); + w2[2] = hc_byte_perm (w1[3], w1[2], selector); + w2[1] = hc_byte_perm (w1[2], w1[1], selector); + w2[0] = hc_byte_perm (w1[1], w1[0], selector); + w1[3] = hc_byte_perm (w1[0], w0[3], selector); + w1[2] = hc_byte_perm (w0[3], w0[2], selector); + w1[1] = hc_byte_perm (w0[2], w0[1], selector); + w1[0] = hc_byte_perm (w0[1], w0[0], selector); + w0[3] = hc_byte_perm (w0[0], 0, selector); w0[2] = 0; w0[1] = 0; w0[0] = 0; @@ -10122,39 +10122,39 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be (u32x *w0, u32x *w1, u32x *w2 break; case 4: - c1[0] = __byte_perm ( 0, w7[3], selector); - c0[3] = __byte_perm (w7[3], w7[2], selector); - c0[2] = __byte_perm (w7[2], w7[1], selector); - c0[1] = __byte_perm (w7[1], w7[0], selector); - c0[0] = __byte_perm (w7[0], w6[3], selector); - w7[3] = __byte_perm (w6[3], w6[2], selector); - w7[2] = __byte_perm (w6[2], w6[1], selector); - w7[1] = __byte_perm (w6[1], w6[0], selector); - w7[0] = __byte_perm (w6[0], w5[3], selector); - w6[3] = __byte_perm (w5[3], w5[2], selector); - w6[2] = __byte_perm (w5[2], w5[1], selector); - w6[1] = __byte_perm (w5[1], w5[0], selector); - w6[0] = __byte_perm (w5[0], w4[3], selector); - w5[3] = __byte_perm (w4[3], w4[2], selector); - w5[2] = __byte_perm (w4[2], w4[1], selector); - w5[1] = __byte_perm (w4[1], w4[0], selector); - w5[0] = __byte_perm (w4[0], w3[3], selector); - w4[3] = __byte_perm (w3[3], w3[2], selector); - w4[2] = __byte_perm (w3[2], w3[1], selector); - w4[1] = __byte_perm (w3[1], w3[0], selector); - w4[0] = __byte_perm (w3[0], w2[3], selector); - w3[3] = __byte_perm (w2[3], w2[2], selector); - w3[2] = __byte_perm (w2[2], w2[1], selector); - w3[1] = __byte_perm (w2[1], w2[0], selector); - w3[0] = __byte_perm (w2[0], w1[3], selector); - w2[3] = __byte_perm (w1[3], w1[2], selector); - w2[2] = __byte_perm (w1[2], w1[1], selector); - w2[1] = __byte_perm (w1[1], w1[0], selector); - w2[0] = __byte_perm (w1[0], w0[3], selector); - w1[3] = __byte_perm (w0[3], w0[2], selector); - w1[2] = __byte_perm (w0[2], w0[1], selector); - w1[1] = __byte_perm (w0[1], w0[0], selector); - w1[0] = __byte_perm (w0[0], 0, selector); + c1[0] = hc_byte_perm ( 0, w7[3], selector); + c0[3] = hc_byte_perm (w7[3], w7[2], selector); + c0[2] = hc_byte_perm (w7[2], w7[1], selector); + c0[1] = hc_byte_perm (w7[1], w7[0], selector); + c0[0] = hc_byte_perm (w7[0], w6[3], selector); + w7[3] = hc_byte_perm (w6[3], w6[2], selector); + w7[2] = hc_byte_perm (w6[2], w6[1], selector); + w7[1] = hc_byte_perm (w6[1], w6[0], selector); + w7[0] = hc_byte_perm (w6[0], w5[3], selector); + w6[3] = hc_byte_perm (w5[3], w5[2], selector); + w6[2] = hc_byte_perm (w5[2], w5[1], selector); + w6[1] = hc_byte_perm (w5[1], w5[0], selector); + w6[0] = hc_byte_perm (w5[0], w4[3], selector); + w5[3] = hc_byte_perm (w4[3], w4[2], selector); + w5[2] = hc_byte_perm (w4[2], w4[1], selector); + w5[1] = hc_byte_perm (w4[1], w4[0], selector); + w5[0] = hc_byte_perm (w4[0], w3[3], selector); + w4[3] = hc_byte_perm (w3[3], w3[2], selector); + w4[2] = hc_byte_perm (w3[2], w3[1], selector); + w4[1] = hc_byte_perm (w3[1], w3[0], selector); + w4[0] = hc_byte_perm (w3[0], w2[3], selector); + w3[3] = hc_byte_perm (w2[3], w2[2], selector); + w3[2] = hc_byte_perm (w2[2], w2[1], selector); + w3[1] = hc_byte_perm (w2[1], w2[0], selector); + w3[0] = hc_byte_perm (w2[0], w1[3], selector); + w2[3] = hc_byte_perm (w1[3], w1[2], selector); + w2[2] = hc_byte_perm (w1[2], w1[1], selector); + w2[1] = hc_byte_perm (w1[1], w1[0], selector); + w2[0] = hc_byte_perm (w1[0], w0[3], selector); + w1[3] = hc_byte_perm (w0[3], w0[2], selector); + w1[2] = hc_byte_perm (w0[2], w0[1], selector); + w1[1] = hc_byte_perm (w0[1], w0[0], selector); + w1[0] = hc_byte_perm (w0[0], 0, selector); w0[3] = 0; w0[2] = 0; w0[1] = 0; @@ -10163,39 +10163,39 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be (u32x *w0, u32x *w1, u32x *w2 break; case 5: - c1[1] = __byte_perm ( 0, w7[3], selector); - c1[0] = __byte_perm (w7[3], w7[2], selector); - c0[3] = __byte_perm (w7[2], w7[1], selector); - c0[2] = __byte_perm (w7[1], w7[0], selector); - c0[1] = __byte_perm (w7[0], w6[3], selector); - c0[0] = __byte_perm (w6[3], w6[2], selector); - w7[3] = __byte_perm (w6[2], w6[1], selector); - w7[2] = __byte_perm (w6[1], w6[0], selector); - w7[1] = __byte_perm (w6[0], w5[3], selector); - w7[0] = __byte_perm (w5[3], w5[2], selector); - w6[3] = __byte_perm (w5[2], w5[1], selector); - w6[2] = __byte_perm (w5[1], w5[0], selector); - w6[1] = __byte_perm (w5[0], w4[3], selector); - w6[0] = __byte_perm (w4[3], w4[2], selector); - w5[3] = __byte_perm (w4[2], w4[1], selector); - w5[2] = __byte_perm (w4[1], w4[0], selector); - w5[1] = __byte_perm (w4[0], w3[3], selector); - w5[0] = __byte_perm (w3[3], w3[2], selector); - w4[3] = __byte_perm (w3[2], w3[1], selector); - w4[2] = __byte_perm (w3[1], w3[0], selector); - w4[1] = __byte_perm (w3[0], w2[3], selector); - w4[0] = __byte_perm (w2[3], w2[2], selector); - w3[3] = __byte_perm (w2[2], w2[1], selector); - w3[2] = __byte_perm (w2[1], w2[0], selector); - w3[1] = __byte_perm (w2[0], w1[3], selector); - w3[0] = __byte_perm (w1[3], w1[2], selector); - w2[3] = __byte_perm (w1[2], w1[1], selector); - w2[2] = __byte_perm (w1[1], w1[0], selector); - w2[1] = __byte_perm (w1[0], w0[3], selector); - w2[0] = __byte_perm (w0[3], w0[2], selector); - w1[3] = __byte_perm (w0[2], w0[1], selector); - w1[2] = __byte_perm (w0[1], w0[0], selector); - w1[1] = __byte_perm (w0[0], 0, selector); + c1[1] = hc_byte_perm ( 0, w7[3], selector); + c1[0] = hc_byte_perm (w7[3], w7[2], selector); + c0[3] = hc_byte_perm (w7[2], w7[1], selector); + c0[2] = hc_byte_perm (w7[1], w7[0], selector); + c0[1] = hc_byte_perm (w7[0], w6[3], selector); + c0[0] = hc_byte_perm (w6[3], w6[2], selector); + w7[3] = hc_byte_perm (w6[2], w6[1], selector); + w7[2] = hc_byte_perm (w6[1], w6[0], selector); + w7[1] = hc_byte_perm (w6[0], w5[3], selector); + w7[0] = hc_byte_perm (w5[3], w5[2], selector); + w6[3] = hc_byte_perm (w5[2], w5[1], selector); + w6[2] = hc_byte_perm (w5[1], w5[0], selector); + w6[1] = hc_byte_perm (w5[0], w4[3], selector); + w6[0] = hc_byte_perm (w4[3], w4[2], selector); + w5[3] = hc_byte_perm (w4[2], w4[1], selector); + w5[2] = hc_byte_perm (w4[1], w4[0], selector); + w5[1] = hc_byte_perm (w4[0], w3[3], selector); + w5[0] = hc_byte_perm (w3[3], w3[2], selector); + w4[3] = hc_byte_perm (w3[2], w3[1], selector); + w4[2] = hc_byte_perm (w3[1], w3[0], selector); + w4[1] = hc_byte_perm (w3[0], w2[3], selector); + w4[0] = hc_byte_perm (w2[3], w2[2], selector); + w3[3] = hc_byte_perm (w2[2], w2[1], selector); + w3[2] = hc_byte_perm (w2[1], w2[0], selector); + w3[1] = hc_byte_perm (w2[0], w1[3], selector); + w3[0] = hc_byte_perm (w1[3], w1[2], selector); + w2[3] = hc_byte_perm (w1[2], w1[1], selector); + w2[2] = hc_byte_perm (w1[1], w1[0], selector); + w2[1] = hc_byte_perm (w1[0], w0[3], selector); + w2[0] = hc_byte_perm (w0[3], w0[2], selector); + w1[3] = hc_byte_perm (w0[2], w0[1], selector); + w1[2] = hc_byte_perm (w0[1], w0[0], selector); + w1[1] = hc_byte_perm (w0[0], 0, selector); w1[0] = 0; w0[3] = 0; w0[2] = 0; @@ -10205,39 +10205,39 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be (u32x *w0, u32x *w1, u32x *w2 break; case 6: - c1[2] = __byte_perm ( 0, w7[3], selector); - c1[1] = __byte_perm (w7[3], w7[2], selector); - c1[0] = __byte_perm (w7[2], w7[1], selector); - c0[3] = __byte_perm (w7[1], w7[0], selector); - c0[2] = __byte_perm (w7[0], w6[3], selector); - c0[1] = __byte_perm (w6[3], w6[2], selector); - c0[0] = __byte_perm (w6[2], w6[1], selector); - w7[3] = __byte_perm (w6[1], w6[0], selector); - w7[2] = __byte_perm (w6[0], w5[3], selector); - w7[1] = __byte_perm (w5[3], w5[2], selector); - w7[0] = __byte_perm (w5[2], w5[1], selector); - w6[3] = __byte_perm (w5[1], w5[0], selector); - w6[2] = __byte_perm (w5[0], w4[3], selector); - w6[1] = __byte_perm (w4[3], w4[2], selector); - w6[0] = __byte_perm (w4[2], w4[1], selector); - w5[3] = __byte_perm (w4[1], w4[0], selector); - w5[2] = __byte_perm (w4[0], w3[3], selector); - w5[1] = __byte_perm (w3[3], w3[2], selector); - w5[0] = __byte_perm (w3[2], w3[1], selector); - w4[3] = __byte_perm (w3[1], w3[0], selector); - w4[2] = __byte_perm (w3[0], w2[3], selector); - w4[1] = __byte_perm (w2[3], w2[2], selector); - w4[0] = __byte_perm (w2[2], w2[1], selector); - w3[3] = __byte_perm (w2[1], w2[0], selector); - w3[2] = __byte_perm (w2[0], w1[3], selector); - w3[1] = __byte_perm (w1[3], w1[2], selector); - w3[0] = __byte_perm (w1[2], w1[1], selector); - w2[3] = __byte_perm (w1[1], w1[0], selector); - w2[2] = __byte_perm (w1[0], w0[3], selector); - w2[1] = __byte_perm (w0[3], w0[2], selector); - w2[0] = __byte_perm (w0[2], w0[1], selector); - w1[3] = __byte_perm (w0[1], w0[0], selector); - w1[2] = __byte_perm (w0[0], 0, selector); + c1[2] = hc_byte_perm ( 0, w7[3], selector); + c1[1] = hc_byte_perm (w7[3], w7[2], selector); + c1[0] = hc_byte_perm (w7[2], w7[1], selector); + c0[3] = hc_byte_perm (w7[1], w7[0], selector); + c0[2] = hc_byte_perm (w7[0], w6[3], selector); + c0[1] = hc_byte_perm (w6[3], w6[2], selector); + c0[0] = hc_byte_perm (w6[2], w6[1], selector); + w7[3] = hc_byte_perm (w6[1], w6[0], selector); + w7[2] = hc_byte_perm (w6[0], w5[3], selector); + w7[1] = hc_byte_perm (w5[3], w5[2], selector); + w7[0] = hc_byte_perm (w5[2], w5[1], selector); + w6[3] = hc_byte_perm (w5[1], w5[0], selector); + w6[2] = hc_byte_perm (w5[0], w4[3], selector); + w6[1] = hc_byte_perm (w4[3], w4[2], selector); + w6[0] = hc_byte_perm (w4[2], w4[1], selector); + w5[3] = hc_byte_perm (w4[1], w4[0], selector); + w5[2] = hc_byte_perm (w4[0], w3[3], selector); + w5[1] = hc_byte_perm (w3[3], w3[2], selector); + w5[0] = hc_byte_perm (w3[2], w3[1], selector); + w4[3] = hc_byte_perm (w3[1], w3[0], selector); + w4[2] = hc_byte_perm (w3[0], w2[3], selector); + w4[1] = hc_byte_perm (w2[3], w2[2], selector); + w4[0] = hc_byte_perm (w2[2], w2[1], selector); + w3[3] = hc_byte_perm (w2[1], w2[0], selector); + w3[2] = hc_byte_perm (w2[0], w1[3], selector); + w3[1] = hc_byte_perm (w1[3], w1[2], selector); + w3[0] = hc_byte_perm (w1[2], w1[1], selector); + w2[3] = hc_byte_perm (w1[1], w1[0], selector); + w2[2] = hc_byte_perm (w1[0], w0[3], selector); + w2[1] = hc_byte_perm (w0[3], w0[2], selector); + w2[0] = hc_byte_perm (w0[2], w0[1], selector); + w1[3] = hc_byte_perm (w0[1], w0[0], selector); + w1[2] = hc_byte_perm (w0[0], 0, selector); w1[1] = 0; w1[0] = 0; w0[3] = 0; @@ -10248,39 +10248,39 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be (u32x *w0, u32x *w1, u32x *w2 break; case 7: - c1[3] = __byte_perm ( 0, w7[3], selector); - c1[2] = __byte_perm (w7[3], w7[2], selector); - c1[1] = __byte_perm (w7[2], w7[1], selector); - c1[0] = __byte_perm (w7[1], w7[0], selector); - c0[3] = __byte_perm (w7[0], w6[3], selector); - c0[2] = __byte_perm (w6[3], w6[2], selector); - c0[1] = __byte_perm (w6[2], w6[1], selector); - c0[0] = __byte_perm (w6[1], w6[0], selector); - w7[3] = __byte_perm (w6[0], w5[3], selector); - w7[2] = __byte_perm (w5[3], w5[2], selector); - w7[1] = __byte_perm (w5[2], w5[1], selector); - w7[0] = __byte_perm (w5[1], w5[0], selector); - w6[3] = __byte_perm (w5[0], w4[3], selector); - w6[2] = __byte_perm (w4[3], w4[2], selector); - w6[1] = __byte_perm (w4[2], w4[1], selector); - w6[0] = __byte_perm (w4[1], w4[0], selector); - w5[3] = __byte_perm (w4[0], w3[3], selector); - w5[2] = __byte_perm (w3[3], w3[2], selector); - w5[1] = __byte_perm (w3[2], w3[1], selector); - w5[0] = __byte_perm (w3[1], w3[0], selector); - w4[3] = __byte_perm (w3[0], w2[3], selector); - w4[2] = __byte_perm (w2[3], w2[2], selector); - w4[1] = __byte_perm (w2[2], w2[1], selector); - w4[0] = __byte_perm (w2[1], w2[0], selector); - w3[3] = __byte_perm (w2[0], w1[3], selector); - w3[2] = __byte_perm (w1[3], w1[2], selector); - w3[1] = __byte_perm (w1[2], w1[1], selector); - w3[0] = __byte_perm (w1[1], w1[0], selector); - w2[3] = __byte_perm (w1[0], w0[3], selector); - w2[2] = __byte_perm (w0[3], w0[2], selector); - w2[1] = __byte_perm (w0[2], w0[1], selector); - w2[0] = __byte_perm (w0[1], w0[0], selector); - w1[3] = __byte_perm (w0[0], 0, selector); + c1[3] = hc_byte_perm ( 0, w7[3], selector); + c1[2] = hc_byte_perm (w7[3], w7[2], selector); + c1[1] = hc_byte_perm (w7[2], w7[1], selector); + c1[0] = hc_byte_perm (w7[1], w7[0], selector); + c0[3] = hc_byte_perm (w7[0], w6[3], selector); + c0[2] = hc_byte_perm (w6[3], w6[2], selector); + c0[1] = hc_byte_perm (w6[2], w6[1], selector); + c0[0] = hc_byte_perm (w6[1], w6[0], selector); + w7[3] = hc_byte_perm (w6[0], w5[3], selector); + w7[2] = hc_byte_perm (w5[3], w5[2], selector); + w7[1] = hc_byte_perm (w5[2], w5[1], selector); + w7[0] = hc_byte_perm (w5[1], w5[0], selector); + w6[3] = hc_byte_perm (w5[0], w4[3], selector); + w6[2] = hc_byte_perm (w4[3], w4[2], selector); + w6[1] = hc_byte_perm (w4[2], w4[1], selector); + w6[0] = hc_byte_perm (w4[1], w4[0], selector); + w5[3] = hc_byte_perm (w4[0], w3[3], selector); + w5[2] = hc_byte_perm (w3[3], w3[2], selector); + w5[1] = hc_byte_perm (w3[2], w3[1], selector); + w5[0] = hc_byte_perm (w3[1], w3[0], selector); + w4[3] = hc_byte_perm (w3[0], w2[3], selector); + w4[2] = hc_byte_perm (w2[3], w2[2], selector); + w4[1] = hc_byte_perm (w2[2], w2[1], selector); + w4[0] = hc_byte_perm (w2[1], w2[0], selector); + w3[3] = hc_byte_perm (w2[0], w1[3], selector); + w3[2] = hc_byte_perm (w1[3], w1[2], selector); + w3[1] = hc_byte_perm (w1[2], w1[1], selector); + w3[0] = hc_byte_perm (w1[1], w1[0], selector); + w2[3] = hc_byte_perm (w1[0], w0[3], selector); + w2[2] = hc_byte_perm (w0[3], w0[2], selector); + w2[1] = hc_byte_perm (w0[2], w0[1], selector); + w2[0] = hc_byte_perm (w0[1], w0[0], selector); + w1[3] = hc_byte_perm (w0[0], 0, selector); w1[2] = 0; w1[1] = 0; w1[0] = 0; @@ -10292,39 +10292,39 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be (u32x *w0, u32x *w1, u32x *w2 break; case 8: - c2[0] = __byte_perm ( 0, w7[3], selector); - c1[3] = __byte_perm (w7[3], w7[2], selector); - c1[2] = __byte_perm (w7[2], w7[1], selector); - c1[1] = __byte_perm (w7[1], w7[0], selector); - c1[0] = __byte_perm (w7[0], w6[3], selector); - c0[3] = __byte_perm (w6[3], w6[2], selector); - c0[2] = __byte_perm (w6[2], w6[1], selector); - c0[1] = __byte_perm (w6[1], w6[0], selector); - c0[0] = __byte_perm (w6[0], w5[3], selector); - w7[3] = __byte_perm (w5[3], w5[2], selector); - w7[2] = __byte_perm (w5[2], w5[1], selector); - w7[1] = __byte_perm (w5[1], w5[0], selector); - w7[0] = __byte_perm (w5[0], w4[3], selector); - w6[3] = __byte_perm (w4[3], w4[2], selector); - w6[2] = __byte_perm (w4[2], w4[1], selector); - w6[1] = __byte_perm (w4[1], w4[0], selector); - w6[0] = __byte_perm (w4[0], w3[3], selector); - w5[3] = __byte_perm (w3[3], w3[2], selector); - w5[2] = __byte_perm (w3[2], w3[1], selector); - w5[1] = __byte_perm (w3[1], w3[0], selector); - w5[0] = __byte_perm (w3[0], w2[3], selector); - w4[3] = __byte_perm (w2[3], w2[2], selector); - w4[2] = __byte_perm (w2[2], w2[1], selector); - w4[1] = __byte_perm (w2[1], w2[0], selector); - w4[0] = __byte_perm (w2[0], w1[3], selector); - w3[3] = __byte_perm (w1[3], w1[2], selector); - w3[2] = __byte_perm (w1[2], w1[1], selector); - w3[1] = __byte_perm (w1[1], w1[0], selector); - w3[0] = __byte_perm (w1[0], w0[3], selector); - w2[3] = __byte_perm (w0[3], w0[2], selector); - w2[2] = __byte_perm (w0[2], w0[1], selector); - w2[1] = __byte_perm (w0[1], w0[0], selector); - w2[0] = __byte_perm (w0[0], 0, selector); + c2[0] = hc_byte_perm ( 0, w7[3], selector); + c1[3] = hc_byte_perm (w7[3], w7[2], selector); + c1[2] = hc_byte_perm (w7[2], w7[1], selector); + c1[1] = hc_byte_perm (w7[1], w7[0], selector); + c1[0] = hc_byte_perm (w7[0], w6[3], selector); + c0[3] = hc_byte_perm (w6[3], w6[2], selector); + c0[2] = hc_byte_perm (w6[2], w6[1], selector); + c0[1] = hc_byte_perm (w6[1], w6[0], selector); + c0[0] = hc_byte_perm (w6[0], w5[3], selector); + w7[3] = hc_byte_perm (w5[3], w5[2], selector); + w7[2] = hc_byte_perm (w5[2], w5[1], selector); + w7[1] = hc_byte_perm (w5[1], w5[0], selector); + w7[0] = hc_byte_perm (w5[0], w4[3], selector); + w6[3] = hc_byte_perm (w4[3], w4[2], selector); + w6[2] = hc_byte_perm (w4[2], w4[1], selector); + w6[1] = hc_byte_perm (w4[1], w4[0], selector); + w6[0] = hc_byte_perm (w4[0], w3[3], selector); + w5[3] = hc_byte_perm (w3[3], w3[2], selector); + w5[2] = hc_byte_perm (w3[2], w3[1], selector); + w5[1] = hc_byte_perm (w3[1], w3[0], selector); + w5[0] = hc_byte_perm (w3[0], w2[3], selector); + w4[3] = hc_byte_perm (w2[3], w2[2], selector); + w4[2] = hc_byte_perm (w2[2], w2[1], selector); + w4[1] = hc_byte_perm (w2[1], w2[0], selector); + w4[0] = hc_byte_perm (w2[0], w1[3], selector); + w3[3] = hc_byte_perm (w1[3], w1[2], selector); + w3[2] = hc_byte_perm (w1[2], w1[1], selector); + w3[1] = hc_byte_perm (w1[1], w1[0], selector); + w3[0] = hc_byte_perm (w1[0], w0[3], selector); + w2[3] = hc_byte_perm (w0[3], w0[2], selector); + w2[2] = hc_byte_perm (w0[2], w0[1], selector); + w2[1] = hc_byte_perm (w0[1], w0[0], selector); + w2[0] = hc_byte_perm (w0[0], 0, selector); w1[3] = 0; w1[2] = 0; w1[1] = 0; @@ -10337,39 +10337,39 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be (u32x *w0, u32x *w1, u32x *w2 break; case 9: - c2[1] = __byte_perm ( 0, w7[3], selector); - c2[0] = __byte_perm (w7[3], w7[2], selector); - c1[3] = __byte_perm (w7[2], w7[1], selector); - c1[2] = __byte_perm (w7[1], w7[0], selector); - c1[1] = __byte_perm (w7[0], w6[3], selector); - c1[0] = __byte_perm (w6[3], w6[2], selector); - c0[3] = __byte_perm (w6[2], w6[1], selector); - c0[2] = __byte_perm (w6[1], w6[0], selector); - c0[1] = __byte_perm (w6[0], w5[3], selector); - c0[0] = __byte_perm (w5[3], w5[2], selector); - w7[3] = __byte_perm (w5[2], w5[1], selector); - w7[2] = __byte_perm (w5[1], w5[0], selector); - w7[1] = __byte_perm (w5[0], w4[3], selector); - w7[0] = __byte_perm (w4[3], w4[2], selector); - w6[3] = __byte_perm (w4[2], w4[1], selector); - w6[2] = __byte_perm (w4[1], w4[0], selector); - w6[1] = __byte_perm (w4[0], w3[3], selector); - w6[0] = __byte_perm (w3[3], w3[2], selector); - w5[3] = __byte_perm (w3[2], w3[1], selector); - w5[2] = __byte_perm (w3[1], w3[0], selector); - w5[1] = __byte_perm (w3[0], w2[3], selector); - w5[0] = __byte_perm (w2[3], w2[2], selector); - w4[3] = __byte_perm (w2[2], w2[1], selector); - w4[2] = __byte_perm (w2[1], w2[0], selector); - w4[1] = __byte_perm (w2[0], w1[3], selector); - w4[0] = __byte_perm (w1[3], w1[2], selector); - w3[3] = __byte_perm (w1[2], w1[1], selector); - w3[2] = __byte_perm (w1[1], w1[0], selector); - w3[1] = __byte_perm (w1[0], w0[3], selector); - w3[0] = __byte_perm (w0[3], w0[2], selector); - w2[3] = __byte_perm (w0[2], w0[1], selector); - w2[2] = __byte_perm (w0[1], w0[0], selector); - w2[1] = __byte_perm (w0[0], 0, selector); + c2[1] = hc_byte_perm ( 0, w7[3], selector); + c2[0] = hc_byte_perm (w7[3], w7[2], selector); + c1[3] = hc_byte_perm (w7[2], w7[1], selector); + c1[2] = hc_byte_perm (w7[1], w7[0], selector); + c1[1] = hc_byte_perm (w7[0], w6[3], selector); + c1[0] = hc_byte_perm (w6[3], w6[2], selector); + c0[3] = hc_byte_perm (w6[2], w6[1], selector); + c0[2] = hc_byte_perm (w6[1], w6[0], selector); + c0[1] = hc_byte_perm (w6[0], w5[3], selector); + c0[0] = hc_byte_perm (w5[3], w5[2], selector); + w7[3] = hc_byte_perm (w5[2], w5[1], selector); + w7[2] = hc_byte_perm (w5[1], w5[0], selector); + w7[1] = hc_byte_perm (w5[0], w4[3], selector); + w7[0] = hc_byte_perm (w4[3], w4[2], selector); + w6[3] = hc_byte_perm (w4[2], w4[1], selector); + w6[2] = hc_byte_perm (w4[1], w4[0], selector); + w6[1] = hc_byte_perm (w4[0], w3[3], selector); + w6[0] = hc_byte_perm (w3[3], w3[2], selector); + w5[3] = hc_byte_perm (w3[2], w3[1], selector); + w5[2] = hc_byte_perm (w3[1], w3[0], selector); + w5[1] = hc_byte_perm (w3[0], w2[3], selector); + w5[0] = hc_byte_perm (w2[3], w2[2], selector); + w4[3] = hc_byte_perm (w2[2], w2[1], selector); + w4[2] = hc_byte_perm (w2[1], w2[0], selector); + w4[1] = hc_byte_perm (w2[0], w1[3], selector); + w4[0] = hc_byte_perm (w1[3], w1[2], selector); + w3[3] = hc_byte_perm (w1[2], w1[1], selector); + w3[2] = hc_byte_perm (w1[1], w1[0], selector); + w3[1] = hc_byte_perm (w1[0], w0[3], selector); + w3[0] = hc_byte_perm (w0[3], w0[2], selector); + w2[3] = hc_byte_perm (w0[2], w0[1], selector); + w2[2] = hc_byte_perm (w0[1], w0[0], selector); + w2[1] = hc_byte_perm (w0[0], 0, selector); w2[0] = 0; w1[3] = 0; w1[2] = 0; @@ -10383,39 +10383,39 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be (u32x *w0, u32x *w1, u32x *w2 break; case 10: - c2[2] = __byte_perm ( 0, w7[3], selector); - c2[1] = __byte_perm (w7[3], w7[2], selector); - c2[0] = __byte_perm (w7[2], w7[1], selector); - c1[3] = __byte_perm (w7[1], w7[0], selector); - c1[2] = __byte_perm (w7[0], w6[3], selector); - c1[1] = __byte_perm (w6[3], w6[2], selector); - c1[0] = __byte_perm (w6[2], w6[1], selector); - c0[3] = __byte_perm (w6[1], w6[0], selector); - c0[2] = __byte_perm (w6[0], w5[3], selector); - c0[1] = __byte_perm (w5[3], w5[2], selector); - c0[0] = __byte_perm (w5[2], w5[1], selector); - w7[3] = __byte_perm (w5[1], w5[0], selector); - w7[2] = __byte_perm (w5[0], w4[3], selector); - w7[1] = __byte_perm (w4[3], w4[2], selector); - w7[0] = __byte_perm (w4[2], w4[1], selector); - w6[3] = __byte_perm (w4[1], w4[0], selector); - w6[2] = __byte_perm (w4[0], w3[3], selector); - w6[1] = __byte_perm (w3[3], w3[2], selector); - w6[0] = __byte_perm (w3[2], w3[1], selector); - w5[3] = __byte_perm (w3[1], w3[0], selector); - w5[2] = __byte_perm (w3[0], w2[3], selector); - w5[1] = __byte_perm (w2[3], w2[2], selector); - w5[0] = __byte_perm (w2[2], w2[1], selector); - w4[3] = __byte_perm (w2[1], w2[0], selector); - w4[2] = __byte_perm (w2[0], w1[3], selector); - w4[1] = __byte_perm (w1[3], w1[2], selector); - w4[0] = __byte_perm (w1[2], w1[1], selector); - w3[3] = __byte_perm (w1[1], w1[0], selector); - w3[2] = __byte_perm (w1[0], w0[3], selector); - w3[1] = __byte_perm (w0[3], w0[2], selector); - w3[0] = __byte_perm (w0[2], w0[1], selector); - w2[3] = __byte_perm (w0[1], w0[0], selector); - w2[2] = __byte_perm (w0[0], 0, selector); + c2[2] = hc_byte_perm ( 0, w7[3], selector); + c2[1] = hc_byte_perm (w7[3], w7[2], selector); + c2[0] = hc_byte_perm (w7[2], w7[1], selector); + c1[3] = hc_byte_perm (w7[1], w7[0], selector); + c1[2] = hc_byte_perm (w7[0], w6[3], selector); + c1[1] = hc_byte_perm (w6[3], w6[2], selector); + c1[0] = hc_byte_perm (w6[2], w6[1], selector); + c0[3] = hc_byte_perm (w6[1], w6[0], selector); + c0[2] = hc_byte_perm (w6[0], w5[3], selector); + c0[1] = hc_byte_perm (w5[3], w5[2], selector); + c0[0] = hc_byte_perm (w5[2], w5[1], selector); + w7[3] = hc_byte_perm (w5[1], w5[0], selector); + w7[2] = hc_byte_perm (w5[0], w4[3], selector); + w7[1] = hc_byte_perm (w4[3], w4[2], selector); + w7[0] = hc_byte_perm (w4[2], w4[1], selector); + w6[3] = hc_byte_perm (w4[1], w4[0], selector); + w6[2] = hc_byte_perm (w4[0], w3[3], selector); + w6[1] = hc_byte_perm (w3[3], w3[2], selector); + w6[0] = hc_byte_perm (w3[2], w3[1], selector); + w5[3] = hc_byte_perm (w3[1], w3[0], selector); + w5[2] = hc_byte_perm (w3[0], w2[3], selector); + w5[1] = hc_byte_perm (w2[3], w2[2], selector); + w5[0] = hc_byte_perm (w2[2], w2[1], selector); + w4[3] = hc_byte_perm (w2[1], w2[0], selector); + w4[2] = hc_byte_perm (w2[0], w1[3], selector); + w4[1] = hc_byte_perm (w1[3], w1[2], selector); + w4[0] = hc_byte_perm (w1[2], w1[1], selector); + w3[3] = hc_byte_perm (w1[1], w1[0], selector); + w3[2] = hc_byte_perm (w1[0], w0[3], selector); + w3[1] = hc_byte_perm (w0[3], w0[2], selector); + w3[0] = hc_byte_perm (w0[2], w0[1], selector); + w2[3] = hc_byte_perm (w0[1], w0[0], selector); + w2[2] = hc_byte_perm (w0[0], 0, selector); w2[1] = 0; w2[0] = 0; w1[3] = 0; @@ -10430,39 +10430,39 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be (u32x *w0, u32x *w1, u32x *w2 break; case 11: - c2[3] = __byte_perm ( 0, w7[3], selector); - c2[2] = __byte_perm (w7[3], w7[2], selector); - c2[1] = __byte_perm (w7[2], w7[1], selector); - c2[0] = __byte_perm (w7[1], w7[0], selector); - c1[3] = __byte_perm (w7[0], w6[3], selector); - c1[2] = __byte_perm (w6[3], w6[2], selector); - c1[1] = __byte_perm (w6[2], w6[1], selector); - c1[0] = __byte_perm (w6[1], w6[0], selector); - c0[3] = __byte_perm (w6[0], w5[3], selector); - c0[2] = __byte_perm (w5[3], w5[2], selector); - c0[1] = __byte_perm (w5[2], w5[1], selector); - c0[0] = __byte_perm (w5[1], w5[0], selector); - w7[3] = __byte_perm (w5[0], w4[3], selector); - w7[2] = __byte_perm (w4[3], w4[2], selector); - w7[1] = __byte_perm (w4[2], w4[1], selector); - w7[0] = __byte_perm (w4[1], w4[0], selector); - w6[3] = __byte_perm (w4[0], w3[3], selector); - w6[2] = __byte_perm (w3[3], w3[2], selector); - w6[1] = __byte_perm (w3[2], w3[1], selector); - w6[0] = __byte_perm (w3[1], w3[0], selector); - w5[3] = __byte_perm (w3[0], w2[3], selector); - w5[2] = __byte_perm (w2[3], w2[2], selector); - w5[1] = __byte_perm (w2[2], w2[1], selector); - w5[0] = __byte_perm (w2[1], w2[0], selector); - w4[3] = __byte_perm (w2[0], w1[3], selector); - w4[2] = __byte_perm (w1[3], w1[2], selector); - w4[1] = __byte_perm (w1[2], w1[1], selector); - w4[0] = __byte_perm (w1[1], w1[0], selector); - w3[3] = __byte_perm (w1[0], w0[3], selector); - w3[2] = __byte_perm (w0[3], w0[2], selector); - w3[1] = __byte_perm (w0[2], w0[1], selector); - w3[0] = __byte_perm (w0[1], w0[0], selector); - w2[3] = __byte_perm (w0[0], 0, selector); + c2[3] = hc_byte_perm ( 0, w7[3], selector); + c2[2] = hc_byte_perm (w7[3], w7[2], selector); + c2[1] = hc_byte_perm (w7[2], w7[1], selector); + c2[0] = hc_byte_perm (w7[1], w7[0], selector); + c1[3] = hc_byte_perm (w7[0], w6[3], selector); + c1[2] = hc_byte_perm (w6[3], w6[2], selector); + c1[1] = hc_byte_perm (w6[2], w6[1], selector); + c1[0] = hc_byte_perm (w6[1], w6[0], selector); + c0[3] = hc_byte_perm (w6[0], w5[3], selector); + c0[2] = hc_byte_perm (w5[3], w5[2], selector); + c0[1] = hc_byte_perm (w5[2], w5[1], selector); + c0[0] = hc_byte_perm (w5[1], w5[0], selector); + w7[3] = hc_byte_perm (w5[0], w4[3], selector); + w7[2] = hc_byte_perm (w4[3], w4[2], selector); + w7[1] = hc_byte_perm (w4[2], w4[1], selector); + w7[0] = hc_byte_perm (w4[1], w4[0], selector); + w6[3] = hc_byte_perm (w4[0], w3[3], selector); + w6[2] = hc_byte_perm (w3[3], w3[2], selector); + w6[1] = hc_byte_perm (w3[2], w3[1], selector); + w6[0] = hc_byte_perm (w3[1], w3[0], selector); + w5[3] = hc_byte_perm (w3[0], w2[3], selector); + w5[2] = hc_byte_perm (w2[3], w2[2], selector); + w5[1] = hc_byte_perm (w2[2], w2[1], selector); + w5[0] = hc_byte_perm (w2[1], w2[0], selector); + w4[3] = hc_byte_perm (w2[0], w1[3], selector); + w4[2] = hc_byte_perm (w1[3], w1[2], selector); + w4[1] = hc_byte_perm (w1[2], w1[1], selector); + w4[0] = hc_byte_perm (w1[1], w1[0], selector); + w3[3] = hc_byte_perm (w1[0], w0[3], selector); + w3[2] = hc_byte_perm (w0[3], w0[2], selector); + w3[1] = hc_byte_perm (w0[2], w0[1], selector); + w3[0] = hc_byte_perm (w0[1], w0[0], selector); + w2[3] = hc_byte_perm (w0[0], 0, selector); w2[2] = 0; w2[1] = 0; w2[0] = 0; @@ -10478,39 +10478,39 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be (u32x *w0, u32x *w1, u32x *w2 break; case 12: - c3[0] = __byte_perm ( 0, w7[3], selector); - c2[3] = __byte_perm (w7[3], w7[2], selector); - c2[2] = __byte_perm (w7[2], w7[1], selector); - c2[1] = __byte_perm (w7[1], w7[0], selector); - c2[0] = __byte_perm (w7[0], w6[3], selector); - c1[3] = __byte_perm (w6[3], w6[2], selector); - c1[2] = __byte_perm (w6[2], w6[1], selector); - c1[1] = __byte_perm (w6[1], w6[0], selector); - c1[0] = __byte_perm (w6[0], w5[3], selector); - c0[3] = __byte_perm (w5[3], w5[2], selector); - c0[2] = __byte_perm (w5[2], w5[1], selector); - c0[1] = __byte_perm (w5[1], w5[0], selector); - c0[0] = __byte_perm (w5[0], w4[3], selector); - w7[3] = __byte_perm (w4[3], w4[2], selector); - w7[2] = __byte_perm (w4[2], w4[1], selector); - w7[1] = __byte_perm (w4[1], w4[0], selector); - w7[0] = __byte_perm (w4[0], w3[3], selector); - w6[3] = __byte_perm (w3[3], w3[2], selector); - w6[2] = __byte_perm (w3[2], w3[1], selector); - w6[1] = __byte_perm (w3[1], w3[0], selector); - w6[0] = __byte_perm (w3[0], w2[3], selector); - w5[3] = __byte_perm (w2[3], w2[2], selector); - w5[2] = __byte_perm (w2[2], w2[1], selector); - w5[1] = __byte_perm (w2[1], w2[0], selector); - w5[0] = __byte_perm (w2[0], w1[3], selector); - w4[3] = __byte_perm (w1[3], w1[2], selector); - w4[2] = __byte_perm (w1[2], w1[1], selector); - w4[1] = __byte_perm (w1[1], w1[0], selector); - w4[0] = __byte_perm (w1[0], w0[3], selector); - w3[3] = __byte_perm (w0[3], w0[2], selector); - w3[2] = __byte_perm (w0[2], w0[1], selector); - w3[1] = __byte_perm (w0[1], w0[0], selector); - w3[0] = __byte_perm (w0[0], 0, selector); + c3[0] = hc_byte_perm ( 0, w7[3], selector); + c2[3] = hc_byte_perm (w7[3], w7[2], selector); + c2[2] = hc_byte_perm (w7[2], w7[1], selector); + c2[1] = hc_byte_perm (w7[1], w7[0], selector); + c2[0] = hc_byte_perm (w7[0], w6[3], selector); + c1[3] = hc_byte_perm (w6[3], w6[2], selector); + c1[2] = hc_byte_perm (w6[2], w6[1], selector); + c1[1] = hc_byte_perm (w6[1], w6[0], selector); + c1[0] = hc_byte_perm (w6[0], w5[3], selector); + c0[3] = hc_byte_perm (w5[3], w5[2], selector); + c0[2] = hc_byte_perm (w5[2], w5[1], selector); + c0[1] = hc_byte_perm (w5[1], w5[0], selector); + c0[0] = hc_byte_perm (w5[0], w4[3], selector); + w7[3] = hc_byte_perm (w4[3], w4[2], selector); + w7[2] = hc_byte_perm (w4[2], w4[1], selector); + w7[1] = hc_byte_perm (w4[1], w4[0], selector); + w7[0] = hc_byte_perm (w4[0], w3[3], selector); + w6[3] = hc_byte_perm (w3[3], w3[2], selector); + w6[2] = hc_byte_perm (w3[2], w3[1], selector); + w6[1] = hc_byte_perm (w3[1], w3[0], selector); + w6[0] = hc_byte_perm (w3[0], w2[3], selector); + w5[3] = hc_byte_perm (w2[3], w2[2], selector); + w5[2] = hc_byte_perm (w2[2], w2[1], selector); + w5[1] = hc_byte_perm (w2[1], w2[0], selector); + w5[0] = hc_byte_perm (w2[0], w1[3], selector); + w4[3] = hc_byte_perm (w1[3], w1[2], selector); + w4[2] = hc_byte_perm (w1[2], w1[1], selector); + w4[1] = hc_byte_perm (w1[1], w1[0], selector); + w4[0] = hc_byte_perm (w1[0], w0[3], selector); + w3[3] = hc_byte_perm (w0[3], w0[2], selector); + w3[2] = hc_byte_perm (w0[2], w0[1], selector); + w3[1] = hc_byte_perm (w0[1], w0[0], selector); + w3[0] = hc_byte_perm (w0[0], 0, selector); w2[3] = 0; w2[2] = 0; w2[1] = 0; @@ -10527,39 +10527,39 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be (u32x *w0, u32x *w1, u32x *w2 break; case 13: - c3[1] = __byte_perm ( 0, w7[3], selector); - c3[0] = __byte_perm (w7[3], w7[2], selector); - c2[3] = __byte_perm (w7[2], w7[1], selector); - c2[2] = __byte_perm (w7[1], w7[0], selector); - c2[1] = __byte_perm (w7[0], w6[3], selector); - c2[0] = __byte_perm (w6[3], w6[2], selector); - c1[3] = __byte_perm (w6[2], w6[1], selector); - c1[2] = __byte_perm (w6[1], w6[0], selector); - c1[1] = __byte_perm (w6[0], w5[3], selector); - c1[0] = __byte_perm (w5[3], w5[2], selector); - c0[3] = __byte_perm (w5[2], w5[1], selector); - c0[2] = __byte_perm (w5[1], w5[0], selector); - c0[1] = __byte_perm (w5[0], w4[3], selector); - c0[0] = __byte_perm (w4[3], w4[2], selector); - w7[3] = __byte_perm (w4[2], w4[1], selector); - w7[2] = __byte_perm (w4[1], w4[0], selector); - w7[1] = __byte_perm (w4[0], w3[3], selector); - w7[0] = __byte_perm (w3[3], w3[2], selector); - w6[3] = __byte_perm (w3[2], w3[1], selector); - w6[2] = __byte_perm (w3[1], w3[0], selector); - w6[1] = __byte_perm (w3[0], w2[3], selector); - w6[0] = __byte_perm (w2[3], w2[2], selector); - w5[3] = __byte_perm (w2[2], w2[1], selector); - w5[2] = __byte_perm (w2[1], w2[0], selector); - w5[1] = __byte_perm (w2[0], w1[3], selector); - w5[0] = __byte_perm (w1[3], w1[2], selector); - w4[3] = __byte_perm (w1[2], w1[1], selector); - w4[2] = __byte_perm (w1[1], w1[0], selector); - w4[1] = __byte_perm (w1[0], w0[3], selector); - w4[0] = __byte_perm (w0[3], w0[2], selector); - w3[3] = __byte_perm (w0[2], w0[1], selector); - w3[2] = __byte_perm (w0[1], w0[0], selector); - w3[1] = __byte_perm (w0[0], 0, selector); + c3[1] = hc_byte_perm ( 0, w7[3], selector); + c3[0] = hc_byte_perm (w7[3], w7[2], selector); + c2[3] = hc_byte_perm (w7[2], w7[1], selector); + c2[2] = hc_byte_perm (w7[1], w7[0], selector); + c2[1] = hc_byte_perm (w7[0], w6[3], selector); + c2[0] = hc_byte_perm (w6[3], w6[2], selector); + c1[3] = hc_byte_perm (w6[2], w6[1], selector); + c1[2] = hc_byte_perm (w6[1], w6[0], selector); + c1[1] = hc_byte_perm (w6[0], w5[3], selector); + c1[0] = hc_byte_perm (w5[3], w5[2], selector); + c0[3] = hc_byte_perm (w5[2], w5[1], selector); + c0[2] = hc_byte_perm (w5[1], w5[0], selector); + c0[1] = hc_byte_perm (w5[0], w4[3], selector); + c0[0] = hc_byte_perm (w4[3], w4[2], selector); + w7[3] = hc_byte_perm (w4[2], w4[1], selector); + w7[2] = hc_byte_perm (w4[1], w4[0], selector); + w7[1] = hc_byte_perm (w4[0], w3[3], selector); + w7[0] = hc_byte_perm (w3[3], w3[2], selector); + w6[3] = hc_byte_perm (w3[2], w3[1], selector); + w6[2] = hc_byte_perm (w3[1], w3[0], selector); + w6[1] = hc_byte_perm (w3[0], w2[3], selector); + w6[0] = hc_byte_perm (w2[3], w2[2], selector); + w5[3] = hc_byte_perm (w2[2], w2[1], selector); + w5[2] = hc_byte_perm (w2[1], w2[0], selector); + w5[1] = hc_byte_perm (w2[0], w1[3], selector); + w5[0] = hc_byte_perm (w1[3], w1[2], selector); + w4[3] = hc_byte_perm (w1[2], w1[1], selector); + w4[2] = hc_byte_perm (w1[1], w1[0], selector); + w4[1] = hc_byte_perm (w1[0], w0[3], selector); + w4[0] = hc_byte_perm (w0[3], w0[2], selector); + w3[3] = hc_byte_perm (w0[2], w0[1], selector); + w3[2] = hc_byte_perm (w0[1], w0[0], selector); + w3[1] = hc_byte_perm (w0[0], 0, selector); w3[0] = 0; w2[3] = 0; w2[2] = 0; @@ -10577,39 +10577,39 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be (u32x *w0, u32x *w1, u32x *w2 break; case 14: - c3[2] = __byte_perm ( 0, w7[3], selector); - c3[1] = __byte_perm (w7[3], w7[2], selector); - c3[0] = __byte_perm (w7[2], w7[1], selector); - c2[3] = __byte_perm (w7[1], w7[0], selector); - c2[2] = __byte_perm (w7[0], w6[3], selector); - c2[1] = __byte_perm (w6[3], w6[2], selector); - c2[0] = __byte_perm (w6[2], w6[1], selector); - c1[3] = __byte_perm (w6[1], w6[0], selector); - c1[2] = __byte_perm (w6[0], w5[3], selector); - c1[1] = __byte_perm (w5[3], w5[2], selector); - c1[0] = __byte_perm (w5[2], w5[1], selector); - c0[3] = __byte_perm (w5[1], w5[0], selector); - c0[2] = __byte_perm (w5[0], w4[3], selector); - c0[1] = __byte_perm (w4[3], w4[2], selector); - c0[0] = __byte_perm (w4[2], w4[1], selector); - w7[3] = __byte_perm (w4[1], w4[0], selector); - w7[2] = __byte_perm (w4[0], w3[3], selector); - w7[1] = __byte_perm (w3[3], w3[2], selector); - w7[0] = __byte_perm (w3[2], w3[1], selector); - w6[3] = __byte_perm (w3[1], w3[0], selector); - w6[2] = __byte_perm (w3[0], w2[3], selector); - w6[1] = __byte_perm (w2[3], w2[2], selector); - w6[0] = __byte_perm (w2[2], w2[1], selector); - w5[3] = __byte_perm (w2[1], w2[0], selector); - w5[2] = __byte_perm (w2[0], w1[3], selector); - w5[1] = __byte_perm (w1[3], w1[2], selector); - w5[0] = __byte_perm (w1[2], w1[1], selector); - w4[3] = __byte_perm (w1[1], w1[0], selector); - w4[2] = __byte_perm (w1[0], w0[3], selector); - w4[1] = __byte_perm (w0[3], w0[2], selector); - w4[0] = __byte_perm (w0[2], w0[1], selector); - w3[3] = __byte_perm (w0[1], w0[0], selector); - w3[2] = __byte_perm (w0[0], 0, selector); + c3[2] = hc_byte_perm ( 0, w7[3], selector); + c3[1] = hc_byte_perm (w7[3], w7[2], selector); + c3[0] = hc_byte_perm (w7[2], w7[1], selector); + c2[3] = hc_byte_perm (w7[1], w7[0], selector); + c2[2] = hc_byte_perm (w7[0], w6[3], selector); + c2[1] = hc_byte_perm (w6[3], w6[2], selector); + c2[0] = hc_byte_perm (w6[2], w6[1], selector); + c1[3] = hc_byte_perm (w6[1], w6[0], selector); + c1[2] = hc_byte_perm (w6[0], w5[3], selector); + c1[1] = hc_byte_perm (w5[3], w5[2], selector); + c1[0] = hc_byte_perm (w5[2], w5[1], selector); + c0[3] = hc_byte_perm (w5[1], w5[0], selector); + c0[2] = hc_byte_perm (w5[0], w4[3], selector); + c0[1] = hc_byte_perm (w4[3], w4[2], selector); + c0[0] = hc_byte_perm (w4[2], w4[1], selector); + w7[3] = hc_byte_perm (w4[1], w4[0], selector); + w7[2] = hc_byte_perm (w4[0], w3[3], selector); + w7[1] = hc_byte_perm (w3[3], w3[2], selector); + w7[0] = hc_byte_perm (w3[2], w3[1], selector); + w6[3] = hc_byte_perm (w3[1], w3[0], selector); + w6[2] = hc_byte_perm (w3[0], w2[3], selector); + w6[1] = hc_byte_perm (w2[3], w2[2], selector); + w6[0] = hc_byte_perm (w2[2], w2[1], selector); + w5[3] = hc_byte_perm (w2[1], w2[0], selector); + w5[2] = hc_byte_perm (w2[0], w1[3], selector); + w5[1] = hc_byte_perm (w1[3], w1[2], selector); + w5[0] = hc_byte_perm (w1[2], w1[1], selector); + w4[3] = hc_byte_perm (w1[1], w1[0], selector); + w4[2] = hc_byte_perm (w1[0], w0[3], selector); + w4[1] = hc_byte_perm (w0[3], w0[2], selector); + w4[0] = hc_byte_perm (w0[2], w0[1], selector); + w3[3] = hc_byte_perm (w0[1], w0[0], selector); + w3[2] = hc_byte_perm (w0[0], 0, selector); w3[1] = 0; w3[0] = 0; w2[3] = 0; @@ -10628,39 +10628,39 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be (u32x *w0, u32x *w1, u32x *w2 break; case 15: - c3[3] = __byte_perm ( 0, w7[3], selector); - c3[2] = __byte_perm (w7[3], w7[2], selector); - c3[1] = __byte_perm (w7[2], w7[1], selector); - c3[0] = __byte_perm (w7[1], w7[0], selector); - c2[3] = __byte_perm (w7[0], w6[3], selector); - c2[2] = __byte_perm (w6[3], w6[2], selector); - c2[1] = __byte_perm (w6[2], w6[1], selector); - c2[0] = __byte_perm (w6[1], w6[0], selector); - c1[3] = __byte_perm (w6[0], w5[3], selector); - c1[2] = __byte_perm (w5[3], w5[2], selector); - c1[1] = __byte_perm (w5[2], w5[1], selector); - c1[0] = __byte_perm (w5[1], w5[0], selector); - c0[3] = __byte_perm (w5[0], w4[3], selector); - c0[2] = __byte_perm (w4[3], w4[2], selector); - c0[1] = __byte_perm (w4[2], w4[1], selector); - c0[0] = __byte_perm (w4[1], w4[0], selector); - w7[3] = __byte_perm (w4[0], w3[3], selector); - w7[2] = __byte_perm (w3[3], w3[2], selector); - w7[1] = __byte_perm (w3[2], w3[1], selector); - w7[0] = __byte_perm (w3[1], w3[0], selector); - w6[3] = __byte_perm (w3[0], w2[3], selector); - w6[2] = __byte_perm (w2[3], w2[2], selector); - w6[1] = __byte_perm (w2[2], w2[1], selector); - w6[0] = __byte_perm (w2[1], w2[0], selector); - w5[3] = __byte_perm (w2[0], w1[3], selector); - w5[2] = __byte_perm (w1[3], w1[2], selector); - w5[1] = __byte_perm (w1[2], w1[1], selector); - w5[0] = __byte_perm (w1[1], w1[0], selector); - w4[3] = __byte_perm (w1[0], w0[3], selector); - w4[2] = __byte_perm (w0[3], w0[2], selector); - w4[1] = __byte_perm (w0[2], w0[1], selector); - w4[0] = __byte_perm (w0[1], w0[0], selector); - w3[3] = __byte_perm (w0[0], 0, selector); + c3[3] = hc_byte_perm ( 0, w7[3], selector); + c3[2] = hc_byte_perm (w7[3], w7[2], selector); + c3[1] = hc_byte_perm (w7[2], w7[1], selector); + c3[0] = hc_byte_perm (w7[1], w7[0], selector); + c2[3] = hc_byte_perm (w7[0], w6[3], selector); + c2[2] = hc_byte_perm (w6[3], w6[2], selector); + c2[1] = hc_byte_perm (w6[2], w6[1], selector); + c2[0] = hc_byte_perm (w6[1], w6[0], selector); + c1[3] = hc_byte_perm (w6[0], w5[3], selector); + c1[2] = hc_byte_perm (w5[3], w5[2], selector); + c1[1] = hc_byte_perm (w5[2], w5[1], selector); + c1[0] = hc_byte_perm (w5[1], w5[0], selector); + c0[3] = hc_byte_perm (w5[0], w4[3], selector); + c0[2] = hc_byte_perm (w4[3], w4[2], selector); + c0[1] = hc_byte_perm (w4[2], w4[1], selector); + c0[0] = hc_byte_perm (w4[1], w4[0], selector); + w7[3] = hc_byte_perm (w4[0], w3[3], selector); + w7[2] = hc_byte_perm (w3[3], w3[2], selector); + w7[1] = hc_byte_perm (w3[2], w3[1], selector); + w7[0] = hc_byte_perm (w3[1], w3[0], selector); + w6[3] = hc_byte_perm (w3[0], w2[3], selector); + w6[2] = hc_byte_perm (w2[3], w2[2], selector); + w6[1] = hc_byte_perm (w2[2], w2[1], selector); + w6[0] = hc_byte_perm (w2[1], w2[0], selector); + w5[3] = hc_byte_perm (w2[0], w1[3], selector); + w5[2] = hc_byte_perm (w1[3], w1[2], selector); + w5[1] = hc_byte_perm (w1[2], w1[1], selector); + w5[0] = hc_byte_perm (w1[1], w1[0], selector); + w4[3] = hc_byte_perm (w1[0], w0[3], selector); + w4[2] = hc_byte_perm (w0[3], w0[2], selector); + w4[1] = hc_byte_perm (w0[2], w0[1], selector); + w4[0] = hc_byte_perm (w0[1], w0[0], selector); + w3[3] = hc_byte_perm (w0[0], 0, selector); w3[2] = 0; w3[1] = 0; w3[0] = 0; @@ -10680,39 +10680,39 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be (u32x *w0, u32x *w1, u32x *w2 break; case 16: - c4[0] = __byte_perm ( 0, w7[3], selector); - c3[3] = __byte_perm (w7[3], w7[2], selector); - c3[2] = __byte_perm (w7[2], w7[1], selector); - c3[1] = __byte_perm (w7[1], w7[0], selector); - c3[0] = __byte_perm (w7[0], w6[3], selector); - c2[3] = __byte_perm (w6[3], w6[2], selector); - c2[2] = __byte_perm (w6[2], w6[1], selector); - c2[1] = __byte_perm (w6[1], w6[0], selector); - c2[0] = __byte_perm (w6[0], w5[3], selector); - c1[3] = __byte_perm (w5[3], w5[2], selector); - c1[2] = __byte_perm (w5[2], w5[1], selector); - c1[1] = __byte_perm (w5[1], w5[0], selector); - c1[0] = __byte_perm (w5[0], w4[3], selector); - c0[3] = __byte_perm (w4[3], w4[2], selector); - c0[2] = __byte_perm (w4[2], w4[1], selector); - c0[1] = __byte_perm (w4[1], w4[0], selector); - c0[0] = __byte_perm (w4[0], w3[3], selector); - w7[3] = __byte_perm (w3[3], w3[2], selector); - w7[2] = __byte_perm (w3[2], w3[1], selector); - w7[1] = __byte_perm (w3[1], w3[0], selector); - w7[0] = __byte_perm (w3[0], w2[3], selector); - w6[3] = __byte_perm (w2[3], w2[2], selector); - w6[2] = __byte_perm (w2[2], w2[1], selector); - w6[1] = __byte_perm (w2[1], w2[0], selector); - w6[0] = __byte_perm (w2[0], w1[3], selector); - w5[3] = __byte_perm (w1[3], w1[2], selector); - w5[2] = __byte_perm (w1[2], w1[1], selector); - w5[1] = __byte_perm (w1[1], w1[0], selector); - w5[0] = __byte_perm (w1[0], w0[3], selector); - w4[3] = __byte_perm (w0[3], w0[2], selector); - w4[2] = __byte_perm (w0[2], w0[1], selector); - w4[1] = __byte_perm (w0[1], w0[0], selector); - w4[0] = __byte_perm (w0[0], 0, selector); + c4[0] = hc_byte_perm ( 0, w7[3], selector); + c3[3] = hc_byte_perm (w7[3], w7[2], selector); + c3[2] = hc_byte_perm (w7[2], w7[1], selector); + c3[1] = hc_byte_perm (w7[1], w7[0], selector); + c3[0] = hc_byte_perm (w7[0], w6[3], selector); + c2[3] = hc_byte_perm (w6[3], w6[2], selector); + c2[2] = hc_byte_perm (w6[2], w6[1], selector); + c2[1] = hc_byte_perm (w6[1], w6[0], selector); + c2[0] = hc_byte_perm (w6[0], w5[3], selector); + c1[3] = hc_byte_perm (w5[3], w5[2], selector); + c1[2] = hc_byte_perm (w5[2], w5[1], selector); + c1[1] = hc_byte_perm (w5[1], w5[0], selector); + c1[0] = hc_byte_perm (w5[0], w4[3], selector); + c0[3] = hc_byte_perm (w4[3], w4[2], selector); + c0[2] = hc_byte_perm (w4[2], w4[1], selector); + c0[1] = hc_byte_perm (w4[1], w4[0], selector); + c0[0] = hc_byte_perm (w4[0], w3[3], selector); + w7[3] = hc_byte_perm (w3[3], w3[2], selector); + w7[2] = hc_byte_perm (w3[2], w3[1], selector); + w7[1] = hc_byte_perm (w3[1], w3[0], selector); + w7[0] = hc_byte_perm (w3[0], w2[3], selector); + w6[3] = hc_byte_perm (w2[3], w2[2], selector); + w6[2] = hc_byte_perm (w2[2], w2[1], selector); + w6[1] = hc_byte_perm (w2[1], w2[0], selector); + w6[0] = hc_byte_perm (w2[0], w1[3], selector); + w5[3] = hc_byte_perm (w1[3], w1[2], selector); + w5[2] = hc_byte_perm (w1[2], w1[1], selector); + w5[1] = hc_byte_perm (w1[1], w1[0], selector); + w5[0] = hc_byte_perm (w1[0], w0[3], selector); + w4[3] = hc_byte_perm (w0[3], w0[2], selector); + w4[2] = hc_byte_perm (w0[2], w0[1], selector); + w4[1] = hc_byte_perm (w0[1], w0[0], selector); + w4[0] = hc_byte_perm (w0[0], 0, selector); w3[3] = 0; w3[2] = 0; w3[1] = 0; @@ -10733,39 +10733,39 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be (u32x *w0, u32x *w1, u32x *w2 break; case 17: - c4[1] = __byte_perm ( 0, w7[3], selector); - c4[0] = __byte_perm (w7[3], w7[2], selector); - c3[3] = __byte_perm (w7[2], w7[1], selector); - c3[2] = __byte_perm (w7[1], w7[0], selector); - c3[1] = __byte_perm (w7[0], w6[3], selector); - c3[0] = __byte_perm (w6[3], w6[2], selector); - c2[3] = __byte_perm (w6[2], w6[1], selector); - c2[2] = __byte_perm (w6[1], w6[0], selector); - c2[1] = __byte_perm (w6[0], w5[3], selector); - c2[0] = __byte_perm (w5[3], w5[2], selector); - c1[3] = __byte_perm (w5[2], w5[1], selector); - c1[2] = __byte_perm (w5[1], w5[0], selector); - c1[1] = __byte_perm (w5[0], w4[3], selector); - c1[0] = __byte_perm (w4[3], w4[2], selector); - c0[3] = __byte_perm (w4[2], w4[1], selector); - c0[2] = __byte_perm (w4[1], w4[0], selector); - c0[1] = __byte_perm (w4[0], w3[3], selector); - c0[0] = __byte_perm (w3[3], w3[2], selector); - w7[3] = __byte_perm (w3[2], w3[1], selector); - w7[2] = __byte_perm (w3[1], w3[0], selector); - w7[1] = __byte_perm (w3[0], w2[3], selector); - w7[0] = __byte_perm (w2[3], w2[2], selector); - w6[3] = __byte_perm (w2[2], w2[1], selector); - w6[2] = __byte_perm (w2[1], w2[0], selector); - w6[1] = __byte_perm (w2[0], w1[3], selector); - w6[0] = __byte_perm (w1[3], w1[2], selector); - w5[3] = __byte_perm (w1[2], w1[1], selector); - w5[2] = __byte_perm (w1[1], w1[0], selector); - w5[1] = __byte_perm (w1[0], w0[3], selector); - w5[0] = __byte_perm (w0[3], w0[2], selector); - w4[3] = __byte_perm (w0[2], w0[1], selector); - w4[2] = __byte_perm (w0[1], w0[0], selector); - w4[1] = __byte_perm (w0[0], 0, selector); + c4[1] = hc_byte_perm ( 0, w7[3], selector); + c4[0] = hc_byte_perm (w7[3], w7[2], selector); + c3[3] = hc_byte_perm (w7[2], w7[1], selector); + c3[2] = hc_byte_perm (w7[1], w7[0], selector); + c3[1] = hc_byte_perm (w7[0], w6[3], selector); + c3[0] = hc_byte_perm (w6[3], w6[2], selector); + c2[3] = hc_byte_perm (w6[2], w6[1], selector); + c2[2] = hc_byte_perm (w6[1], w6[0], selector); + c2[1] = hc_byte_perm (w6[0], w5[3], selector); + c2[0] = hc_byte_perm (w5[3], w5[2], selector); + c1[3] = hc_byte_perm (w5[2], w5[1], selector); + c1[2] = hc_byte_perm (w5[1], w5[0], selector); + c1[1] = hc_byte_perm (w5[0], w4[3], selector); + c1[0] = hc_byte_perm (w4[3], w4[2], selector); + c0[3] = hc_byte_perm (w4[2], w4[1], selector); + c0[2] = hc_byte_perm (w4[1], w4[0], selector); + c0[1] = hc_byte_perm (w4[0], w3[3], selector); + c0[0] = hc_byte_perm (w3[3], w3[2], selector); + w7[3] = hc_byte_perm (w3[2], w3[1], selector); + w7[2] = hc_byte_perm (w3[1], w3[0], selector); + w7[1] = hc_byte_perm (w3[0], w2[3], selector); + w7[0] = hc_byte_perm (w2[3], w2[2], selector); + w6[3] = hc_byte_perm (w2[2], w2[1], selector); + w6[2] = hc_byte_perm (w2[1], w2[0], selector); + w6[1] = hc_byte_perm (w2[0], w1[3], selector); + w6[0] = hc_byte_perm (w1[3], w1[2], selector); + w5[3] = hc_byte_perm (w1[2], w1[1], selector); + w5[2] = hc_byte_perm (w1[1], w1[0], selector); + w5[1] = hc_byte_perm (w1[0], w0[3], selector); + w5[0] = hc_byte_perm (w0[3], w0[2], selector); + w4[3] = hc_byte_perm (w0[2], w0[1], selector); + w4[2] = hc_byte_perm (w0[1], w0[0], selector); + w4[1] = hc_byte_perm (w0[0], 0, selector); w4[0] = 0; w3[3] = 0; w3[2] = 0; @@ -10787,39 +10787,39 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be (u32x *w0, u32x *w1, u32x *w2 break; case 18: - c4[2] = __byte_perm ( 0, w7[3], selector); - c4[1] = __byte_perm (w7[3], w7[2], selector); - c4[0] = __byte_perm (w7[2], w7[1], selector); - c3[3] = __byte_perm (w7[1], w7[0], selector); - c3[2] = __byte_perm (w7[0], w6[3], selector); - c3[1] = __byte_perm (w6[3], w6[2], selector); - c3[0] = __byte_perm (w6[2], w6[1], selector); - c2[3] = __byte_perm (w6[1], w6[0], selector); - c2[2] = __byte_perm (w6[0], w5[3], selector); - c2[1] = __byte_perm (w5[3], w5[2], selector); - c2[0] = __byte_perm (w5[2], w5[1], selector); - c1[3] = __byte_perm (w5[1], w5[0], selector); - c1[2] = __byte_perm (w5[0], w4[3], selector); - c1[1] = __byte_perm (w4[3], w4[2], selector); - c1[0] = __byte_perm (w4[2], w4[1], selector); - c0[3] = __byte_perm (w4[1], w4[0], selector); - c0[2] = __byte_perm (w4[0], w3[3], selector); - c0[1] = __byte_perm (w3[3], w3[2], selector); - c0[0] = __byte_perm (w3[2], w3[1], selector); - w7[3] = __byte_perm (w3[1], w3[0], selector); - w7[2] = __byte_perm (w3[0], w2[3], selector); - w7[1] = __byte_perm (w2[3], w2[2], selector); - w7[0] = __byte_perm (w2[2], w2[1], selector); - w6[3] = __byte_perm (w2[1], w2[0], selector); - w6[2] = __byte_perm (w2[0], w1[3], selector); - w6[1] = __byte_perm (w1[3], w1[2], selector); - w6[0] = __byte_perm (w1[2], w1[1], selector); - w5[3] = __byte_perm (w1[1], w1[0], selector); - w5[2] = __byte_perm (w1[0], w0[3], selector); - w5[1] = __byte_perm (w0[3], w0[2], selector); - w5[0] = __byte_perm (w0[2], w0[1], selector); - w4[3] = __byte_perm (w0[1], w0[0], selector); - w4[2] = __byte_perm (w0[0], 0, selector); + c4[2] = hc_byte_perm ( 0, w7[3], selector); + c4[1] = hc_byte_perm (w7[3], w7[2], selector); + c4[0] = hc_byte_perm (w7[2], w7[1], selector); + c3[3] = hc_byte_perm (w7[1], w7[0], selector); + c3[2] = hc_byte_perm (w7[0], w6[3], selector); + c3[1] = hc_byte_perm (w6[3], w6[2], selector); + c3[0] = hc_byte_perm (w6[2], w6[1], selector); + c2[3] = hc_byte_perm (w6[1], w6[0], selector); + c2[2] = hc_byte_perm (w6[0], w5[3], selector); + c2[1] = hc_byte_perm (w5[3], w5[2], selector); + c2[0] = hc_byte_perm (w5[2], w5[1], selector); + c1[3] = hc_byte_perm (w5[1], w5[0], selector); + c1[2] = hc_byte_perm (w5[0], w4[3], selector); + c1[1] = hc_byte_perm (w4[3], w4[2], selector); + c1[0] = hc_byte_perm (w4[2], w4[1], selector); + c0[3] = hc_byte_perm (w4[1], w4[0], selector); + c0[2] = hc_byte_perm (w4[0], w3[3], selector); + c0[1] = hc_byte_perm (w3[3], w3[2], selector); + c0[0] = hc_byte_perm (w3[2], w3[1], selector); + w7[3] = hc_byte_perm (w3[1], w3[0], selector); + w7[2] = hc_byte_perm (w3[0], w2[3], selector); + w7[1] = hc_byte_perm (w2[3], w2[2], selector); + w7[0] = hc_byte_perm (w2[2], w2[1], selector); + w6[3] = hc_byte_perm (w2[1], w2[0], selector); + w6[2] = hc_byte_perm (w2[0], w1[3], selector); + w6[1] = hc_byte_perm (w1[3], w1[2], selector); + w6[0] = hc_byte_perm (w1[2], w1[1], selector); + w5[3] = hc_byte_perm (w1[1], w1[0], selector); + w5[2] = hc_byte_perm (w1[0], w0[3], selector); + w5[1] = hc_byte_perm (w0[3], w0[2], selector); + w5[0] = hc_byte_perm (w0[2], w0[1], selector); + w4[3] = hc_byte_perm (w0[1], w0[0], selector); + w4[2] = hc_byte_perm (w0[0], 0, selector); w4[1] = 0; w4[0] = 0; w3[3] = 0; @@ -10842,39 +10842,39 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be (u32x *w0, u32x *w1, u32x *w2 break; case 19: - c4[3] = __byte_perm ( 0, w7[3], selector); - c4[2] = __byte_perm (w7[3], w7[2], selector); - c4[1] = __byte_perm (w7[2], w7[1], selector); - c4[0] = __byte_perm (w7[1], w7[0], selector); - c3[3] = __byte_perm (w7[0], w6[3], selector); - c3[2] = __byte_perm (w6[3], w6[2], selector); - c3[1] = __byte_perm (w6[2], w6[1], selector); - c3[0] = __byte_perm (w6[1], w6[0], selector); - c2[3] = __byte_perm (w6[0], w5[3], selector); - c2[2] = __byte_perm (w5[3], w5[2], selector); - c2[1] = __byte_perm (w5[2], w5[1], selector); - c2[0] = __byte_perm (w5[1], w5[0], selector); - c1[3] = __byte_perm (w5[0], w4[3], selector); - c1[2] = __byte_perm (w4[3], w4[2], selector); - c1[1] = __byte_perm (w4[2], w4[1], selector); - c1[0] = __byte_perm (w4[1], w4[0], selector); - c0[3] = __byte_perm (w4[0], w3[3], selector); - c0[2] = __byte_perm (w3[3], w3[2], selector); - c0[1] = __byte_perm (w3[2], w3[1], selector); - c0[0] = __byte_perm (w3[1], w3[0], selector); - w7[3] = __byte_perm (w3[0], w2[3], selector); - w7[2] = __byte_perm (w2[3], w2[2], selector); - w7[1] = __byte_perm (w2[2], w2[1], selector); - w7[0] = __byte_perm (w2[1], w2[0], selector); - w6[3] = __byte_perm (w2[0], w1[3], selector); - w6[2] = __byte_perm (w1[3], w1[2], selector); - w6[1] = __byte_perm (w1[2], w1[1], selector); - w6[0] = __byte_perm (w1[1], w1[0], selector); - w5[3] = __byte_perm (w1[0], w0[3], selector); - w5[2] = __byte_perm (w0[3], w0[2], selector); - w5[1] = __byte_perm (w0[2], w0[1], selector); - w5[0] = __byte_perm (w0[1], w0[0], selector); - w4[3] = __byte_perm (w0[0], 0, selector); + c4[3] = hc_byte_perm ( 0, w7[3], selector); + c4[2] = hc_byte_perm (w7[3], w7[2], selector); + c4[1] = hc_byte_perm (w7[2], w7[1], selector); + c4[0] = hc_byte_perm (w7[1], w7[0], selector); + c3[3] = hc_byte_perm (w7[0], w6[3], selector); + c3[2] = hc_byte_perm (w6[3], w6[2], selector); + c3[1] = hc_byte_perm (w6[2], w6[1], selector); + c3[0] = hc_byte_perm (w6[1], w6[0], selector); + c2[3] = hc_byte_perm (w6[0], w5[3], selector); + c2[2] = hc_byte_perm (w5[3], w5[2], selector); + c2[1] = hc_byte_perm (w5[2], w5[1], selector); + c2[0] = hc_byte_perm (w5[1], w5[0], selector); + c1[3] = hc_byte_perm (w5[0], w4[3], selector); + c1[2] = hc_byte_perm (w4[3], w4[2], selector); + c1[1] = hc_byte_perm (w4[2], w4[1], selector); + c1[0] = hc_byte_perm (w4[1], w4[0], selector); + c0[3] = hc_byte_perm (w4[0], w3[3], selector); + c0[2] = hc_byte_perm (w3[3], w3[2], selector); + c0[1] = hc_byte_perm (w3[2], w3[1], selector); + c0[0] = hc_byte_perm (w3[1], w3[0], selector); + w7[3] = hc_byte_perm (w3[0], w2[3], selector); + w7[2] = hc_byte_perm (w2[3], w2[2], selector); + w7[1] = hc_byte_perm (w2[2], w2[1], selector); + w7[0] = hc_byte_perm (w2[1], w2[0], selector); + w6[3] = hc_byte_perm (w2[0], w1[3], selector); + w6[2] = hc_byte_perm (w1[3], w1[2], selector); + w6[1] = hc_byte_perm (w1[2], w1[1], selector); + w6[0] = hc_byte_perm (w1[1], w1[0], selector); + w5[3] = hc_byte_perm (w1[0], w0[3], selector); + w5[2] = hc_byte_perm (w0[3], w0[2], selector); + w5[1] = hc_byte_perm (w0[2], w0[1], selector); + w5[0] = hc_byte_perm (w0[1], w0[0], selector); + w4[3] = hc_byte_perm (w0[0], 0, selector); w4[2] = 0; w4[1] = 0; w4[0] = 0; @@ -10898,39 +10898,39 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be (u32x *w0, u32x *w1, u32x *w2 break; case 20: - c5[0] = __byte_perm ( 0, w7[3], selector); - c4[3] = __byte_perm (w7[3], w7[2], selector); - c4[2] = __byte_perm (w7[2], w7[1], selector); - c4[1] = __byte_perm (w7[1], w7[0], selector); - c4[0] = __byte_perm (w7[0], w6[3], selector); - c3[3] = __byte_perm (w6[3], w6[2], selector); - c3[2] = __byte_perm (w6[2], w6[1], selector); - c3[1] = __byte_perm (w6[1], w6[0], selector); - c3[0] = __byte_perm (w6[0], w5[3], selector); - c2[3] = __byte_perm (w5[3], w5[2], selector); - c2[2] = __byte_perm (w5[2], w5[1], selector); - c2[1] = __byte_perm (w5[1], w5[0], selector); - c2[0] = __byte_perm (w5[0], w4[3], selector); - c1[3] = __byte_perm (w4[3], w4[2], selector); - c1[2] = __byte_perm (w4[2], w4[1], selector); - c1[1] = __byte_perm (w4[1], w4[0], selector); - c1[0] = __byte_perm (w4[0], w3[3], selector); - c0[3] = __byte_perm (w3[3], w3[2], selector); - c0[2] = __byte_perm (w3[2], w3[1], selector); - c0[1] = __byte_perm (w3[1], w3[0], selector); - c0[0] = __byte_perm (w3[0], w2[3], selector); - w7[3] = __byte_perm (w2[3], w2[2], selector); - w7[2] = __byte_perm (w2[2], w2[1], selector); - w7[1] = __byte_perm (w2[1], w2[0], selector); - w7[0] = __byte_perm (w2[0], w1[3], selector); - w6[3] = __byte_perm (w1[3], w1[2], selector); - w6[2] = __byte_perm (w1[2], w1[1], selector); - w6[1] = __byte_perm (w1[1], w1[0], selector); - w6[0] = __byte_perm (w1[0], w0[3], selector); - w5[3] = __byte_perm (w0[3], w0[2], selector); - w5[2] = __byte_perm (w0[2], w0[1], selector); - w5[1] = __byte_perm (w0[1], w0[0], selector); - w5[0] = __byte_perm (w0[0], 0, selector); + c5[0] = hc_byte_perm ( 0, w7[3], selector); + c4[3] = hc_byte_perm (w7[3], w7[2], selector); + c4[2] = hc_byte_perm (w7[2], w7[1], selector); + c4[1] = hc_byte_perm (w7[1], w7[0], selector); + c4[0] = hc_byte_perm (w7[0], w6[3], selector); + c3[3] = hc_byte_perm (w6[3], w6[2], selector); + c3[2] = hc_byte_perm (w6[2], w6[1], selector); + c3[1] = hc_byte_perm (w6[1], w6[0], selector); + c3[0] = hc_byte_perm (w6[0], w5[3], selector); + c2[3] = hc_byte_perm (w5[3], w5[2], selector); + c2[2] = hc_byte_perm (w5[2], w5[1], selector); + c2[1] = hc_byte_perm (w5[1], w5[0], selector); + c2[0] = hc_byte_perm (w5[0], w4[3], selector); + c1[3] = hc_byte_perm (w4[3], w4[2], selector); + c1[2] = hc_byte_perm (w4[2], w4[1], selector); + c1[1] = hc_byte_perm (w4[1], w4[0], selector); + c1[0] = hc_byte_perm (w4[0], w3[3], selector); + c0[3] = hc_byte_perm (w3[3], w3[2], selector); + c0[2] = hc_byte_perm (w3[2], w3[1], selector); + c0[1] = hc_byte_perm (w3[1], w3[0], selector); + c0[0] = hc_byte_perm (w3[0], w2[3], selector); + w7[3] = hc_byte_perm (w2[3], w2[2], selector); + w7[2] = hc_byte_perm (w2[2], w2[1], selector); + w7[1] = hc_byte_perm (w2[1], w2[0], selector); + w7[0] = hc_byte_perm (w2[0], w1[3], selector); + w6[3] = hc_byte_perm (w1[3], w1[2], selector); + w6[2] = hc_byte_perm (w1[2], w1[1], selector); + w6[1] = hc_byte_perm (w1[1], w1[0], selector); + w6[0] = hc_byte_perm (w1[0], w0[3], selector); + w5[3] = hc_byte_perm (w0[3], w0[2], selector); + w5[2] = hc_byte_perm (w0[2], w0[1], selector); + w5[1] = hc_byte_perm (w0[1], w0[0], selector); + w5[0] = hc_byte_perm (w0[0], 0, selector); w4[3] = 0; w4[2] = 0; w4[1] = 0; @@ -10955,39 +10955,39 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be (u32x *w0, u32x *w1, u32x *w2 break; case 21: - c5[1] = __byte_perm ( 0, w7[3], selector); - c5[0] = __byte_perm (w7[3], w7[2], selector); - c4[3] = __byte_perm (w7[2], w7[1], selector); - c4[2] = __byte_perm (w7[1], w7[0], selector); - c4[1] = __byte_perm (w7[0], w6[3], selector); - c4[0] = __byte_perm (w6[3], w6[2], selector); - c3[3] = __byte_perm (w6[2], w6[1], selector); - c3[2] = __byte_perm (w6[1], w6[0], selector); - c3[1] = __byte_perm (w6[0], w5[3], selector); - c3[0] = __byte_perm (w5[3], w5[2], selector); - c2[3] = __byte_perm (w5[2], w5[1], selector); - c2[2] = __byte_perm (w5[1], w5[0], selector); - c2[1] = __byte_perm (w5[0], w4[3], selector); - c2[0] = __byte_perm (w4[3], w4[2], selector); - c1[3] = __byte_perm (w4[2], w4[1], selector); - c1[2] = __byte_perm (w4[1], w4[0], selector); - c1[1] = __byte_perm (w4[0], w3[3], selector); - c1[0] = __byte_perm (w3[3], w3[2], selector); - c0[3] = __byte_perm (w3[2], w3[1], selector); - c0[2] = __byte_perm (w3[1], w3[0], selector); - c0[1] = __byte_perm (w3[0], w2[3], selector); - c0[0] = __byte_perm (w2[3], w2[2], selector); - w7[3] = __byte_perm (w2[2], w2[1], selector); - w7[2] = __byte_perm (w2[1], w2[0], selector); - w7[1] = __byte_perm (w2[0], w1[3], selector); - w7[0] = __byte_perm (w1[3], w1[2], selector); - w6[3] = __byte_perm (w1[2], w1[1], selector); - w6[2] = __byte_perm (w1[1], w1[0], selector); - w6[1] = __byte_perm (w1[0], w0[3], selector); - w6[0] = __byte_perm (w0[3], w0[2], selector); - w5[3] = __byte_perm (w0[2], w0[1], selector); - w5[2] = __byte_perm (w0[1], w0[0], selector); - w5[1] = __byte_perm (w0[0], 0, selector); + c5[1] = hc_byte_perm ( 0, w7[3], selector); + c5[0] = hc_byte_perm (w7[3], w7[2], selector); + c4[3] = hc_byte_perm (w7[2], w7[1], selector); + c4[2] = hc_byte_perm (w7[1], w7[0], selector); + c4[1] = hc_byte_perm (w7[0], w6[3], selector); + c4[0] = hc_byte_perm (w6[3], w6[2], selector); + c3[3] = hc_byte_perm (w6[2], w6[1], selector); + c3[2] = hc_byte_perm (w6[1], w6[0], selector); + c3[1] = hc_byte_perm (w6[0], w5[3], selector); + c3[0] = hc_byte_perm (w5[3], w5[2], selector); + c2[3] = hc_byte_perm (w5[2], w5[1], selector); + c2[2] = hc_byte_perm (w5[1], w5[0], selector); + c2[1] = hc_byte_perm (w5[0], w4[3], selector); + c2[0] = hc_byte_perm (w4[3], w4[2], selector); + c1[3] = hc_byte_perm (w4[2], w4[1], selector); + c1[2] = hc_byte_perm (w4[1], w4[0], selector); + c1[1] = hc_byte_perm (w4[0], w3[3], selector); + c1[0] = hc_byte_perm (w3[3], w3[2], selector); + c0[3] = hc_byte_perm (w3[2], w3[1], selector); + c0[2] = hc_byte_perm (w3[1], w3[0], selector); + c0[1] = hc_byte_perm (w3[0], w2[3], selector); + c0[0] = hc_byte_perm (w2[3], w2[2], selector); + w7[3] = hc_byte_perm (w2[2], w2[1], selector); + w7[2] = hc_byte_perm (w2[1], w2[0], selector); + w7[1] = hc_byte_perm (w2[0], w1[3], selector); + w7[0] = hc_byte_perm (w1[3], w1[2], selector); + w6[3] = hc_byte_perm (w1[2], w1[1], selector); + w6[2] = hc_byte_perm (w1[1], w1[0], selector); + w6[1] = hc_byte_perm (w1[0], w0[3], selector); + w6[0] = hc_byte_perm (w0[3], w0[2], selector); + w5[3] = hc_byte_perm (w0[2], w0[1], selector); + w5[2] = hc_byte_perm (w0[1], w0[0], selector); + w5[1] = hc_byte_perm (w0[0], 0, selector); w5[0] = 0; w4[3] = 0; w4[2] = 0; @@ -11013,39 +11013,39 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be (u32x *w0, u32x *w1, u32x *w2 break; case 22: - c5[2] = __byte_perm ( 0, w7[3], selector); - c5[1] = __byte_perm (w7[3], w7[2], selector); - c5[0] = __byte_perm (w7[2], w7[1], selector); - c4[3] = __byte_perm (w7[1], w7[0], selector); - c4[2] = __byte_perm (w7[0], w6[3], selector); - c4[1] = __byte_perm (w6[3], w6[2], selector); - c4[0] = __byte_perm (w6[2], w6[1], selector); - c3[3] = __byte_perm (w6[1], w6[0], selector); - c3[2] = __byte_perm (w6[0], w5[3], selector); - c3[1] = __byte_perm (w5[3], w5[2], selector); - c3[0] = __byte_perm (w5[2], w5[1], selector); - c2[3] = __byte_perm (w5[1], w5[0], selector); - c2[2] = __byte_perm (w5[0], w4[3], selector); - c2[1] = __byte_perm (w4[3], w4[2], selector); - c2[0] = __byte_perm (w4[2], w4[1], selector); - c1[3] = __byte_perm (w4[1], w4[0], selector); - c1[2] = __byte_perm (w4[0], w3[3], selector); - c1[1] = __byte_perm (w3[3], w3[2], selector); - c1[0] = __byte_perm (w3[2], w3[1], selector); - c0[3] = __byte_perm (w3[1], w3[0], selector); - c0[2] = __byte_perm (w3[0], w2[3], selector); - c0[1] = __byte_perm (w2[3], w2[2], selector); - c0[0] = __byte_perm (w2[2], w2[1], selector); - w7[3] = __byte_perm (w2[1], w2[0], selector); - w7[2] = __byte_perm (w2[0], w1[3], selector); - w7[1] = __byte_perm (w1[3], w1[2], selector); - w7[0] = __byte_perm (w1[2], w1[1], selector); - w6[3] = __byte_perm (w1[1], w1[0], selector); - w6[2] = __byte_perm (w1[0], w0[3], selector); - w6[1] = __byte_perm (w0[3], w0[2], selector); - w6[0] = __byte_perm (w0[2], w0[1], selector); - w5[3] = __byte_perm (w0[1], w0[0], selector); - w5[2] = __byte_perm (w0[0], 0, selector); + c5[2] = hc_byte_perm ( 0, w7[3], selector); + c5[1] = hc_byte_perm (w7[3], w7[2], selector); + c5[0] = hc_byte_perm (w7[2], w7[1], selector); + c4[3] = hc_byte_perm (w7[1], w7[0], selector); + c4[2] = hc_byte_perm (w7[0], w6[3], selector); + c4[1] = hc_byte_perm (w6[3], w6[2], selector); + c4[0] = hc_byte_perm (w6[2], w6[1], selector); + c3[3] = hc_byte_perm (w6[1], w6[0], selector); + c3[2] = hc_byte_perm (w6[0], w5[3], selector); + c3[1] = hc_byte_perm (w5[3], w5[2], selector); + c3[0] = hc_byte_perm (w5[2], w5[1], selector); + c2[3] = hc_byte_perm (w5[1], w5[0], selector); + c2[2] = hc_byte_perm (w5[0], w4[3], selector); + c2[1] = hc_byte_perm (w4[3], w4[2], selector); + c2[0] = hc_byte_perm (w4[2], w4[1], selector); + c1[3] = hc_byte_perm (w4[1], w4[0], selector); + c1[2] = hc_byte_perm (w4[0], w3[3], selector); + c1[1] = hc_byte_perm (w3[3], w3[2], selector); + c1[0] = hc_byte_perm (w3[2], w3[1], selector); + c0[3] = hc_byte_perm (w3[1], w3[0], selector); + c0[2] = hc_byte_perm (w3[0], w2[3], selector); + c0[1] = hc_byte_perm (w2[3], w2[2], selector); + c0[0] = hc_byte_perm (w2[2], w2[1], selector); + w7[3] = hc_byte_perm (w2[1], w2[0], selector); + w7[2] = hc_byte_perm (w2[0], w1[3], selector); + w7[1] = hc_byte_perm (w1[3], w1[2], selector); + w7[0] = hc_byte_perm (w1[2], w1[1], selector); + w6[3] = hc_byte_perm (w1[1], w1[0], selector); + w6[2] = hc_byte_perm (w1[0], w0[3], selector); + w6[1] = hc_byte_perm (w0[3], w0[2], selector); + w6[0] = hc_byte_perm (w0[2], w0[1], selector); + w5[3] = hc_byte_perm (w0[1], w0[0], selector); + w5[2] = hc_byte_perm (w0[0], 0, selector); w5[1] = 0; w5[0] = 0; w4[3] = 0; @@ -11072,39 +11072,39 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be (u32x *w0, u32x *w1, u32x *w2 break; case 23: - c5[3] = __byte_perm ( 0, w7[3], selector); - c5[2] = __byte_perm (w7[3], w7[2], selector); - c5[1] = __byte_perm (w7[2], w7[1], selector); - c5[0] = __byte_perm (w7[1], w7[0], selector); - c4[3] = __byte_perm (w7[0], w6[3], selector); - c4[2] = __byte_perm (w6[3], w6[2], selector); - c4[1] = __byte_perm (w6[2], w6[1], selector); - c4[0] = __byte_perm (w6[1], w6[0], selector); - c3[3] = __byte_perm (w6[0], w5[3], selector); - c3[2] = __byte_perm (w5[3], w5[2], selector); - c3[1] = __byte_perm (w5[2], w5[1], selector); - c3[0] = __byte_perm (w5[1], w5[0], selector); - c2[3] = __byte_perm (w5[0], w4[3], selector); - c2[2] = __byte_perm (w4[3], w4[2], selector); - c2[1] = __byte_perm (w4[2], w4[1], selector); - c2[0] = __byte_perm (w4[1], w4[0], selector); - c1[3] = __byte_perm (w4[0], w3[3], selector); - c1[2] = __byte_perm (w3[3], w3[2], selector); - c1[1] = __byte_perm (w3[2], w3[1], selector); - c1[0] = __byte_perm (w3[1], w3[0], selector); - c0[3] = __byte_perm (w3[0], w2[3], selector); - c0[2] = __byte_perm (w2[3], w2[2], selector); - c0[1] = __byte_perm (w2[2], w2[1], selector); - c0[0] = __byte_perm (w2[1], w2[0], selector); - w7[3] = __byte_perm (w2[0], w1[3], selector); - w7[2] = __byte_perm (w1[3], w1[2], selector); - w7[1] = __byte_perm (w1[2], w1[1], selector); - w7[0] = __byte_perm (w1[1], w1[0], selector); - w6[3] = __byte_perm (w1[0], w0[3], selector); - w6[2] = __byte_perm (w0[3], w0[2], selector); - w6[1] = __byte_perm (w0[2], w0[1], selector); - w6[0] = __byte_perm (w0[1], w0[0], selector); - w5[3] = __byte_perm (w0[0], 0, selector); + c5[3] = hc_byte_perm ( 0, w7[3], selector); + c5[2] = hc_byte_perm (w7[3], w7[2], selector); + c5[1] = hc_byte_perm (w7[2], w7[1], selector); + c5[0] = hc_byte_perm (w7[1], w7[0], selector); + c4[3] = hc_byte_perm (w7[0], w6[3], selector); + c4[2] = hc_byte_perm (w6[3], w6[2], selector); + c4[1] = hc_byte_perm (w6[2], w6[1], selector); + c4[0] = hc_byte_perm (w6[1], w6[0], selector); + c3[3] = hc_byte_perm (w6[0], w5[3], selector); + c3[2] = hc_byte_perm (w5[3], w5[2], selector); + c3[1] = hc_byte_perm (w5[2], w5[1], selector); + c3[0] = hc_byte_perm (w5[1], w5[0], selector); + c2[3] = hc_byte_perm (w5[0], w4[3], selector); + c2[2] = hc_byte_perm (w4[3], w4[2], selector); + c2[1] = hc_byte_perm (w4[2], w4[1], selector); + c2[0] = hc_byte_perm (w4[1], w4[0], selector); + c1[3] = hc_byte_perm (w4[0], w3[3], selector); + c1[2] = hc_byte_perm (w3[3], w3[2], selector); + c1[1] = hc_byte_perm (w3[2], w3[1], selector); + c1[0] = hc_byte_perm (w3[1], w3[0], selector); + c0[3] = hc_byte_perm (w3[0], w2[3], selector); + c0[2] = hc_byte_perm (w2[3], w2[2], selector); + c0[1] = hc_byte_perm (w2[2], w2[1], selector); + c0[0] = hc_byte_perm (w2[1], w2[0], selector); + w7[3] = hc_byte_perm (w2[0], w1[3], selector); + w7[2] = hc_byte_perm (w1[3], w1[2], selector); + w7[1] = hc_byte_perm (w1[2], w1[1], selector); + w7[0] = hc_byte_perm (w1[1], w1[0], selector); + w6[3] = hc_byte_perm (w1[0], w0[3], selector); + w6[2] = hc_byte_perm (w0[3], w0[2], selector); + w6[1] = hc_byte_perm (w0[2], w0[1], selector); + w6[0] = hc_byte_perm (w0[1], w0[0], selector); + w5[3] = hc_byte_perm (w0[0], 0, selector); w5[2] = 0; w5[1] = 0; w5[0] = 0; @@ -11132,39 +11132,39 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be (u32x *w0, u32x *w1, u32x *w2 break; case 24: - c6[0] = __byte_perm ( 0, w7[3], selector); - c5[3] = __byte_perm (w7[3], w7[2], selector); - c5[2] = __byte_perm (w7[2], w7[1], selector); - c5[1] = __byte_perm (w7[1], w7[0], selector); - c5[0] = __byte_perm (w7[0], w6[3], selector); - c4[3] = __byte_perm (w6[3], w6[2], selector); - c4[2] = __byte_perm (w6[2], w6[1], selector); - c4[1] = __byte_perm (w6[1], w6[0], selector); - c4[0] = __byte_perm (w6[0], w5[3], selector); - c3[3] = __byte_perm (w5[3], w5[2], selector); - c3[2] = __byte_perm (w5[2], w5[1], selector); - c3[1] = __byte_perm (w5[1], w5[0], selector); - c3[0] = __byte_perm (w5[0], w4[3], selector); - c2[3] = __byte_perm (w4[3], w4[2], selector); - c2[2] = __byte_perm (w4[2], w4[1], selector); - c2[1] = __byte_perm (w4[1], w4[0], selector); - c2[0] = __byte_perm (w4[0], w3[3], selector); - c1[3] = __byte_perm (w3[3], w3[2], selector); - c1[2] = __byte_perm (w3[2], w3[1], selector); - c1[1] = __byte_perm (w3[1], w3[0], selector); - c1[0] = __byte_perm (w3[0], w2[3], selector); - c0[3] = __byte_perm (w2[3], w2[2], selector); - c0[2] = __byte_perm (w2[2], w2[1], selector); - c0[1] = __byte_perm (w2[1], w2[0], selector); - c0[0] = __byte_perm (w2[0], w1[3], selector); - w7[3] = __byte_perm (w1[3], w1[2], selector); - w7[2] = __byte_perm (w1[2], w1[1], selector); - w7[1] = __byte_perm (w1[1], w1[0], selector); - w7[0] = __byte_perm (w1[0], w0[3], selector); - w6[3] = __byte_perm (w0[3], w0[2], selector); - w6[2] = __byte_perm (w0[2], w0[1], selector); - w6[1] = __byte_perm (w0[1], w0[0], selector); - w6[0] = __byte_perm (w0[0], 0, selector); + c6[0] = hc_byte_perm ( 0, w7[3], selector); + c5[3] = hc_byte_perm (w7[3], w7[2], selector); + c5[2] = hc_byte_perm (w7[2], w7[1], selector); + c5[1] = hc_byte_perm (w7[1], w7[0], selector); + c5[0] = hc_byte_perm (w7[0], w6[3], selector); + c4[3] = hc_byte_perm (w6[3], w6[2], selector); + c4[2] = hc_byte_perm (w6[2], w6[1], selector); + c4[1] = hc_byte_perm (w6[1], w6[0], selector); + c4[0] = hc_byte_perm (w6[0], w5[3], selector); + c3[3] = hc_byte_perm (w5[3], w5[2], selector); + c3[2] = hc_byte_perm (w5[2], w5[1], selector); + c3[1] = hc_byte_perm (w5[1], w5[0], selector); + c3[0] = hc_byte_perm (w5[0], w4[3], selector); + c2[3] = hc_byte_perm (w4[3], w4[2], selector); + c2[2] = hc_byte_perm (w4[2], w4[1], selector); + c2[1] = hc_byte_perm (w4[1], w4[0], selector); + c2[0] = hc_byte_perm (w4[0], w3[3], selector); + c1[3] = hc_byte_perm (w3[3], w3[2], selector); + c1[2] = hc_byte_perm (w3[2], w3[1], selector); + c1[1] = hc_byte_perm (w3[1], w3[0], selector); + c1[0] = hc_byte_perm (w3[0], w2[3], selector); + c0[3] = hc_byte_perm (w2[3], w2[2], selector); + c0[2] = hc_byte_perm (w2[2], w2[1], selector); + c0[1] = hc_byte_perm (w2[1], w2[0], selector); + c0[0] = hc_byte_perm (w2[0], w1[3], selector); + w7[3] = hc_byte_perm (w1[3], w1[2], selector); + w7[2] = hc_byte_perm (w1[2], w1[1], selector); + w7[1] = hc_byte_perm (w1[1], w1[0], selector); + w7[0] = hc_byte_perm (w1[0], w0[3], selector); + w6[3] = hc_byte_perm (w0[3], w0[2], selector); + w6[2] = hc_byte_perm (w0[2], w0[1], selector); + w6[1] = hc_byte_perm (w0[1], w0[0], selector); + w6[0] = hc_byte_perm (w0[0], 0, selector); w5[3] = 0; w5[2] = 0; w5[1] = 0; @@ -11193,39 +11193,39 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be (u32x *w0, u32x *w1, u32x *w2 break; case 25: - c6[1] = __byte_perm ( 0, w7[3], selector); - c6[0] = __byte_perm (w7[3], w7[2], selector); - c5[3] = __byte_perm (w7[2], w7[1], selector); - c5[2] = __byte_perm (w7[1], w7[0], selector); - c5[1] = __byte_perm (w7[0], w6[3], selector); - c5[0] = __byte_perm (w6[3], w6[2], selector); - c4[3] = __byte_perm (w6[2], w6[1], selector); - c4[2] = __byte_perm (w6[1], w6[0], selector); - c4[1] = __byte_perm (w6[0], w5[3], selector); - c4[0] = __byte_perm (w5[3], w5[2], selector); - c3[3] = __byte_perm (w5[2], w5[1], selector); - c3[2] = __byte_perm (w5[1], w5[0], selector); - c3[1] = __byte_perm (w5[0], w4[3], selector); - c3[0] = __byte_perm (w4[3], w4[2], selector); - c2[3] = __byte_perm (w4[2], w4[1], selector); - c2[2] = __byte_perm (w4[1], w4[0], selector); - c2[1] = __byte_perm (w4[0], w3[3], selector); - c2[0] = __byte_perm (w3[3], w3[2], selector); - c1[3] = __byte_perm (w3[2], w3[1], selector); - c1[2] = __byte_perm (w3[1], w3[0], selector); - c1[1] = __byte_perm (w3[0], w2[3], selector); - c1[0] = __byte_perm (w2[3], w2[2], selector); - c0[3] = __byte_perm (w2[2], w2[1], selector); - c0[2] = __byte_perm (w2[1], w2[0], selector); - c0[1] = __byte_perm (w2[0], w1[3], selector); - c0[0] = __byte_perm (w1[3], w1[2], selector); - w7[3] = __byte_perm (w1[2], w1[1], selector); - w7[2] = __byte_perm (w1[1], w1[0], selector); - w7[1] = __byte_perm (w1[0], w0[3], selector); - w7[0] = __byte_perm (w0[3], w0[2], selector); - w6[3] = __byte_perm (w0[2], w0[1], selector); - w6[2] = __byte_perm (w0[1], w0[0], selector); - w6[1] = __byte_perm (w0[0], 0, selector); + c6[1] = hc_byte_perm ( 0, w7[3], selector); + c6[0] = hc_byte_perm (w7[3], w7[2], selector); + c5[3] = hc_byte_perm (w7[2], w7[1], selector); + c5[2] = hc_byte_perm (w7[1], w7[0], selector); + c5[1] = hc_byte_perm (w7[0], w6[3], selector); + c5[0] = hc_byte_perm (w6[3], w6[2], selector); + c4[3] = hc_byte_perm (w6[2], w6[1], selector); + c4[2] = hc_byte_perm (w6[1], w6[0], selector); + c4[1] = hc_byte_perm (w6[0], w5[3], selector); + c4[0] = hc_byte_perm (w5[3], w5[2], selector); + c3[3] = hc_byte_perm (w5[2], w5[1], selector); + c3[2] = hc_byte_perm (w5[1], w5[0], selector); + c3[1] = hc_byte_perm (w5[0], w4[3], selector); + c3[0] = hc_byte_perm (w4[3], w4[2], selector); + c2[3] = hc_byte_perm (w4[2], w4[1], selector); + c2[2] = hc_byte_perm (w4[1], w4[0], selector); + c2[1] = hc_byte_perm (w4[0], w3[3], selector); + c2[0] = hc_byte_perm (w3[3], w3[2], selector); + c1[3] = hc_byte_perm (w3[2], w3[1], selector); + c1[2] = hc_byte_perm (w3[1], w3[0], selector); + c1[1] = hc_byte_perm (w3[0], w2[3], selector); + c1[0] = hc_byte_perm (w2[3], w2[2], selector); + c0[3] = hc_byte_perm (w2[2], w2[1], selector); + c0[2] = hc_byte_perm (w2[1], w2[0], selector); + c0[1] = hc_byte_perm (w2[0], w1[3], selector); + c0[0] = hc_byte_perm (w1[3], w1[2], selector); + w7[3] = hc_byte_perm (w1[2], w1[1], selector); + w7[2] = hc_byte_perm (w1[1], w1[0], selector); + w7[1] = hc_byte_perm (w1[0], w0[3], selector); + w7[0] = hc_byte_perm (w0[3], w0[2], selector); + w6[3] = hc_byte_perm (w0[2], w0[1], selector); + w6[2] = hc_byte_perm (w0[1], w0[0], selector); + w6[1] = hc_byte_perm (w0[0], 0, selector); w6[0] = 0; w5[3] = 0; w5[2] = 0; @@ -11255,39 +11255,39 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be (u32x *w0, u32x *w1, u32x *w2 break; case 26: - c6[2] = __byte_perm ( 0, w7[3], selector); - c6[1] = __byte_perm (w7[3], w7[2], selector); - c6[0] = __byte_perm (w7[2], w7[1], selector); - c5[3] = __byte_perm (w7[1], w7[0], selector); - c5[2] = __byte_perm (w7[0], w6[3], selector); - c5[1] = __byte_perm (w6[3], w6[2], selector); - c5[0] = __byte_perm (w6[2], w6[1], selector); - c4[3] = __byte_perm (w6[1], w6[0], selector); - c4[2] = __byte_perm (w6[0], w5[3], selector); - c4[1] = __byte_perm (w5[3], w5[2], selector); - c4[0] = __byte_perm (w5[2], w5[1], selector); - c3[3] = __byte_perm (w5[1], w5[0], selector); - c3[2] = __byte_perm (w5[0], w4[3], selector); - c3[1] = __byte_perm (w4[3], w4[2], selector); - c3[0] = __byte_perm (w4[2], w4[1], selector); - c2[3] = __byte_perm (w4[1], w4[0], selector); - c2[2] = __byte_perm (w4[0], w3[3], selector); - c2[1] = __byte_perm (w3[3], w3[2], selector); - c2[0] = __byte_perm (w3[2], w3[1], selector); - c1[3] = __byte_perm (w3[1], w3[0], selector); - c1[2] = __byte_perm (w3[0], w2[3], selector); - c1[1] = __byte_perm (w2[3], w2[2], selector); - c1[0] = __byte_perm (w2[2], w2[1], selector); - c0[3] = __byte_perm (w2[1], w2[0], selector); - c0[2] = __byte_perm (w2[0], w1[3], selector); - c0[1] = __byte_perm (w1[3], w1[2], selector); - c0[0] = __byte_perm (w1[2], w1[1], selector); - w7[3] = __byte_perm (w1[1], w1[0], selector); - w7[2] = __byte_perm (w1[0], w0[3], selector); - w7[1] = __byte_perm (w0[3], w0[2], selector); - w7[0] = __byte_perm (w0[2], w0[1], selector); - w6[3] = __byte_perm (w0[1], w0[0], selector); - w6[2] = __byte_perm (w0[0], 0, selector); + c6[2] = hc_byte_perm ( 0, w7[3], selector); + c6[1] = hc_byte_perm (w7[3], w7[2], selector); + c6[0] = hc_byte_perm (w7[2], w7[1], selector); + c5[3] = hc_byte_perm (w7[1], w7[0], selector); + c5[2] = hc_byte_perm (w7[0], w6[3], selector); + c5[1] = hc_byte_perm (w6[3], w6[2], selector); + c5[0] = hc_byte_perm (w6[2], w6[1], selector); + c4[3] = hc_byte_perm (w6[1], w6[0], selector); + c4[2] = hc_byte_perm (w6[0], w5[3], selector); + c4[1] = hc_byte_perm (w5[3], w5[2], selector); + c4[0] = hc_byte_perm (w5[2], w5[1], selector); + c3[3] = hc_byte_perm (w5[1], w5[0], selector); + c3[2] = hc_byte_perm (w5[0], w4[3], selector); + c3[1] = hc_byte_perm (w4[3], w4[2], selector); + c3[0] = hc_byte_perm (w4[2], w4[1], selector); + c2[3] = hc_byte_perm (w4[1], w4[0], selector); + c2[2] = hc_byte_perm (w4[0], w3[3], selector); + c2[1] = hc_byte_perm (w3[3], w3[2], selector); + c2[0] = hc_byte_perm (w3[2], w3[1], selector); + c1[3] = hc_byte_perm (w3[1], w3[0], selector); + c1[2] = hc_byte_perm (w3[0], w2[3], selector); + c1[1] = hc_byte_perm (w2[3], w2[2], selector); + c1[0] = hc_byte_perm (w2[2], w2[1], selector); + c0[3] = hc_byte_perm (w2[1], w2[0], selector); + c0[2] = hc_byte_perm (w2[0], w1[3], selector); + c0[1] = hc_byte_perm (w1[3], w1[2], selector); + c0[0] = hc_byte_perm (w1[2], w1[1], selector); + w7[3] = hc_byte_perm (w1[1], w1[0], selector); + w7[2] = hc_byte_perm (w1[0], w0[3], selector); + w7[1] = hc_byte_perm (w0[3], w0[2], selector); + w7[0] = hc_byte_perm (w0[2], w0[1], selector); + w6[3] = hc_byte_perm (w0[1], w0[0], selector); + w6[2] = hc_byte_perm (w0[0], 0, selector); w6[1] = 0; w6[0] = 0; w5[3] = 0; @@ -11318,39 +11318,39 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be (u32x *w0, u32x *w1, u32x *w2 break; case 27: - c6[3] = __byte_perm ( 0, w7[3], selector); - c6[2] = __byte_perm (w7[3], w7[2], selector); - c6[1] = __byte_perm (w7[2], w7[1], selector); - c6[0] = __byte_perm (w7[1], w7[0], selector); - c5[3] = __byte_perm (w7[0], w6[3], selector); - c5[2] = __byte_perm (w6[3], w6[2], selector); - c5[1] = __byte_perm (w6[2], w6[1], selector); - c5[0] = __byte_perm (w6[1], w6[0], selector); - c4[3] = __byte_perm (w6[0], w5[3], selector); - c4[2] = __byte_perm (w5[3], w5[2], selector); - c4[1] = __byte_perm (w5[2], w5[1], selector); - c4[0] = __byte_perm (w5[1], w5[0], selector); - c3[3] = __byte_perm (w5[0], w4[3], selector); - c3[2] = __byte_perm (w4[3], w4[2], selector); - c3[1] = __byte_perm (w4[2], w4[1], selector); - c3[0] = __byte_perm (w4[1], w4[0], selector); - c2[3] = __byte_perm (w4[0], w3[3], selector); - c2[2] = __byte_perm (w3[3], w3[2], selector); - c2[1] = __byte_perm (w3[2], w3[1], selector); - c2[0] = __byte_perm (w3[1], w3[0], selector); - c1[3] = __byte_perm (w3[0], w2[3], selector); - c1[2] = __byte_perm (w2[3], w2[2], selector); - c1[1] = __byte_perm (w2[2], w2[1], selector); - c1[0] = __byte_perm (w2[1], w2[0], selector); - c0[3] = __byte_perm (w2[0], w1[3], selector); - c0[2] = __byte_perm (w1[3], w1[2], selector); - c0[1] = __byte_perm (w1[2], w1[1], selector); - c0[0] = __byte_perm (w1[1], w1[0], selector); - w7[3] = __byte_perm (w1[0], w0[3], selector); - w7[2] = __byte_perm (w0[3], w0[2], selector); - w7[1] = __byte_perm (w0[2], w0[1], selector); - w7[0] = __byte_perm (w0[1], w0[0], selector); - w6[3] = __byte_perm (w0[0], 0, selector); + c6[3] = hc_byte_perm ( 0, w7[3], selector); + c6[2] = hc_byte_perm (w7[3], w7[2], selector); + c6[1] = hc_byte_perm (w7[2], w7[1], selector); + c6[0] = hc_byte_perm (w7[1], w7[0], selector); + c5[3] = hc_byte_perm (w7[0], w6[3], selector); + c5[2] = hc_byte_perm (w6[3], w6[2], selector); + c5[1] = hc_byte_perm (w6[2], w6[1], selector); + c5[0] = hc_byte_perm (w6[1], w6[0], selector); + c4[3] = hc_byte_perm (w6[0], w5[3], selector); + c4[2] = hc_byte_perm (w5[3], w5[2], selector); + c4[1] = hc_byte_perm (w5[2], w5[1], selector); + c4[0] = hc_byte_perm (w5[1], w5[0], selector); + c3[3] = hc_byte_perm (w5[0], w4[3], selector); + c3[2] = hc_byte_perm (w4[3], w4[2], selector); + c3[1] = hc_byte_perm (w4[2], w4[1], selector); + c3[0] = hc_byte_perm (w4[1], w4[0], selector); + c2[3] = hc_byte_perm (w4[0], w3[3], selector); + c2[2] = hc_byte_perm (w3[3], w3[2], selector); + c2[1] = hc_byte_perm (w3[2], w3[1], selector); + c2[0] = hc_byte_perm (w3[1], w3[0], selector); + c1[3] = hc_byte_perm (w3[0], w2[3], selector); + c1[2] = hc_byte_perm (w2[3], w2[2], selector); + c1[1] = hc_byte_perm (w2[2], w2[1], selector); + c1[0] = hc_byte_perm (w2[1], w2[0], selector); + c0[3] = hc_byte_perm (w2[0], w1[3], selector); + c0[2] = hc_byte_perm (w1[3], w1[2], selector); + c0[1] = hc_byte_perm (w1[2], w1[1], selector); + c0[0] = hc_byte_perm (w1[1], w1[0], selector); + w7[3] = hc_byte_perm (w1[0], w0[3], selector); + w7[2] = hc_byte_perm (w0[3], w0[2], selector); + w7[1] = hc_byte_perm (w0[2], w0[1], selector); + w7[0] = hc_byte_perm (w0[1], w0[0], selector); + w6[3] = hc_byte_perm (w0[0], 0, selector); w6[2] = 0; w6[1] = 0; w6[0] = 0; @@ -11382,39 +11382,39 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be (u32x *w0, u32x *w1, u32x *w2 break; case 28: - c7[0] = __byte_perm ( 0, w7[3], selector); - c6[3] = __byte_perm (w7[3], w7[2], selector); - c6[2] = __byte_perm (w7[2], w7[1], selector); - c6[1] = __byte_perm (w7[1], w7[0], selector); - c6[0] = __byte_perm (w7[0], w6[3], selector); - c5[3] = __byte_perm (w6[3], w6[2], selector); - c5[2] = __byte_perm (w6[2], w6[1], selector); - c5[1] = __byte_perm (w6[1], w6[0], selector); - c5[0] = __byte_perm (w6[0], w5[3], selector); - c4[3] = __byte_perm (w5[3], w5[2], selector); - c4[2] = __byte_perm (w5[2], w5[1], selector); - c4[1] = __byte_perm (w5[1], w5[0], selector); - c4[0] = __byte_perm (w5[0], w4[3], selector); - c3[3] = __byte_perm (w4[3], w4[2], selector); - c3[2] = __byte_perm (w4[2], w4[1], selector); - c3[1] = __byte_perm (w4[1], w4[0], selector); - c3[0] = __byte_perm (w4[0], w3[3], selector); - c2[3] = __byte_perm (w3[3], w3[2], selector); - c2[2] = __byte_perm (w3[2], w3[1], selector); - c2[1] = __byte_perm (w3[1], w3[0], selector); - c2[0] = __byte_perm (w3[0], w2[3], selector); - c1[3] = __byte_perm (w2[3], w2[2], selector); - c1[2] = __byte_perm (w2[2], w2[1], selector); - c1[1] = __byte_perm (w2[1], w2[0], selector); - c1[0] = __byte_perm (w2[0], w1[3], selector); - c0[3] = __byte_perm (w1[3], w1[2], selector); - c0[2] = __byte_perm (w1[2], w1[1], selector); - c0[1] = __byte_perm (w1[1], w1[0], selector); - c0[0] = __byte_perm (w1[0], w0[3], selector); - w7[3] = __byte_perm (w0[3], w0[2], selector); - w7[2] = __byte_perm (w0[2], w0[1], selector); - w7[1] = __byte_perm (w0[1], w0[0], selector); - w7[0] = __byte_perm (w0[0], 0, selector); + c7[0] = hc_byte_perm ( 0, w7[3], selector); + c6[3] = hc_byte_perm (w7[3], w7[2], selector); + c6[2] = hc_byte_perm (w7[2], w7[1], selector); + c6[1] = hc_byte_perm (w7[1], w7[0], selector); + c6[0] = hc_byte_perm (w7[0], w6[3], selector); + c5[3] = hc_byte_perm (w6[3], w6[2], selector); + c5[2] = hc_byte_perm (w6[2], w6[1], selector); + c5[1] = hc_byte_perm (w6[1], w6[0], selector); + c5[0] = hc_byte_perm (w6[0], w5[3], selector); + c4[3] = hc_byte_perm (w5[3], w5[2], selector); + c4[2] = hc_byte_perm (w5[2], w5[1], selector); + c4[1] = hc_byte_perm (w5[1], w5[0], selector); + c4[0] = hc_byte_perm (w5[0], w4[3], selector); + c3[3] = hc_byte_perm (w4[3], w4[2], selector); + c3[2] = hc_byte_perm (w4[2], w4[1], selector); + c3[1] = hc_byte_perm (w4[1], w4[0], selector); + c3[0] = hc_byte_perm (w4[0], w3[3], selector); + c2[3] = hc_byte_perm (w3[3], w3[2], selector); + c2[2] = hc_byte_perm (w3[2], w3[1], selector); + c2[1] = hc_byte_perm (w3[1], w3[0], selector); + c2[0] = hc_byte_perm (w3[0], w2[3], selector); + c1[3] = hc_byte_perm (w2[3], w2[2], selector); + c1[2] = hc_byte_perm (w2[2], w2[1], selector); + c1[1] = hc_byte_perm (w2[1], w2[0], selector); + c1[0] = hc_byte_perm (w2[0], w1[3], selector); + c0[3] = hc_byte_perm (w1[3], w1[2], selector); + c0[2] = hc_byte_perm (w1[2], w1[1], selector); + c0[1] = hc_byte_perm (w1[1], w1[0], selector); + c0[0] = hc_byte_perm (w1[0], w0[3], selector); + w7[3] = hc_byte_perm (w0[3], w0[2], selector); + w7[2] = hc_byte_perm (w0[2], w0[1], selector); + w7[1] = hc_byte_perm (w0[1], w0[0], selector); + w7[0] = hc_byte_perm (w0[0], 0, selector); w6[3] = 0; w6[2] = 0; w6[1] = 0; @@ -11447,39 +11447,39 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be (u32x *w0, u32x *w1, u32x *w2 break; case 29: - c7[1] = __byte_perm ( 0, w7[3], selector); - c7[0] = __byte_perm (w7[3], w7[2], selector); - c6[3] = __byte_perm (w7[2], w7[1], selector); - c6[2] = __byte_perm (w7[1], w7[0], selector); - c6[1] = __byte_perm (w7[0], w6[3], selector); - c6[0] = __byte_perm (w6[3], w6[2], selector); - c5[3] = __byte_perm (w6[2], w6[1], selector); - c5[2] = __byte_perm (w6[1], w6[0], selector); - c5[1] = __byte_perm (w6[0], w5[3], selector); - c5[0] = __byte_perm (w5[3], w5[2], selector); - c4[3] = __byte_perm (w5[2], w5[1], selector); - c4[2] = __byte_perm (w5[1], w5[0], selector); - c4[1] = __byte_perm (w5[0], w4[3], selector); - c4[0] = __byte_perm (w4[3], w4[2], selector); - c3[3] = __byte_perm (w4[2], w4[1], selector); - c3[2] = __byte_perm (w4[1], w4[0], selector); - c3[1] = __byte_perm (w4[0], w3[3], selector); - c3[0] = __byte_perm (w3[3], w3[2], selector); - c2[3] = __byte_perm (w3[2], w3[1], selector); - c2[2] = __byte_perm (w3[1], w3[0], selector); - c2[1] = __byte_perm (w3[0], w2[3], selector); - c2[0] = __byte_perm (w2[3], w2[2], selector); - c1[3] = __byte_perm (w2[2], w2[1], selector); - c1[2] = __byte_perm (w2[1], w2[0], selector); - c1[1] = __byte_perm (w2[0], w1[3], selector); - c1[0] = __byte_perm (w1[3], w1[2], selector); - c0[3] = __byte_perm (w1[2], w1[1], selector); - c0[2] = __byte_perm (w1[1], w1[0], selector); - c0[1] = __byte_perm (w1[0], w0[3], selector); - c0[0] = __byte_perm (w0[3], w0[2], selector); - w7[3] = __byte_perm (w0[2], w0[1], selector); - w7[2] = __byte_perm (w0[1], w0[0], selector); - w7[1] = __byte_perm (w0[0], 0, selector); + c7[1] = hc_byte_perm ( 0, w7[3], selector); + c7[0] = hc_byte_perm (w7[3], w7[2], selector); + c6[3] = hc_byte_perm (w7[2], w7[1], selector); + c6[2] = hc_byte_perm (w7[1], w7[0], selector); + c6[1] = hc_byte_perm (w7[0], w6[3], selector); + c6[0] = hc_byte_perm (w6[3], w6[2], selector); + c5[3] = hc_byte_perm (w6[2], w6[1], selector); + c5[2] = hc_byte_perm (w6[1], w6[0], selector); + c5[1] = hc_byte_perm (w6[0], w5[3], selector); + c5[0] = hc_byte_perm (w5[3], w5[2], selector); + c4[3] = hc_byte_perm (w5[2], w5[1], selector); + c4[2] = hc_byte_perm (w5[1], w5[0], selector); + c4[1] = hc_byte_perm (w5[0], w4[3], selector); + c4[0] = hc_byte_perm (w4[3], w4[2], selector); + c3[3] = hc_byte_perm (w4[2], w4[1], selector); + c3[2] = hc_byte_perm (w4[1], w4[0], selector); + c3[1] = hc_byte_perm (w4[0], w3[3], selector); + c3[0] = hc_byte_perm (w3[3], w3[2], selector); + c2[3] = hc_byte_perm (w3[2], w3[1], selector); + c2[2] = hc_byte_perm (w3[1], w3[0], selector); + c2[1] = hc_byte_perm (w3[0], w2[3], selector); + c2[0] = hc_byte_perm (w2[3], w2[2], selector); + c1[3] = hc_byte_perm (w2[2], w2[1], selector); + c1[2] = hc_byte_perm (w2[1], w2[0], selector); + c1[1] = hc_byte_perm (w2[0], w1[3], selector); + c1[0] = hc_byte_perm (w1[3], w1[2], selector); + c0[3] = hc_byte_perm (w1[2], w1[1], selector); + c0[2] = hc_byte_perm (w1[1], w1[0], selector); + c0[1] = hc_byte_perm (w1[0], w0[3], selector); + c0[0] = hc_byte_perm (w0[3], w0[2], selector); + w7[3] = hc_byte_perm (w0[2], w0[1], selector); + w7[2] = hc_byte_perm (w0[1], w0[0], selector); + w7[1] = hc_byte_perm (w0[0], 0, selector); w7[0] = 0; w6[3] = 0; w6[2] = 0; @@ -11513,39 +11513,39 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be (u32x *w0, u32x *w1, u32x *w2 break; case 30: - c7[2] = __byte_perm ( 0, w7[3], selector); - c7[1] = __byte_perm (w7[3], w7[2], selector); - c7[0] = __byte_perm (w7[2], w7[1], selector); - c6[3] = __byte_perm (w7[1], w7[0], selector); - c6[2] = __byte_perm (w7[0], w6[3], selector); - c6[1] = __byte_perm (w6[3], w6[2], selector); - c6[0] = __byte_perm (w6[2], w6[1], selector); - c5[3] = __byte_perm (w6[1], w6[0], selector); - c5[2] = __byte_perm (w6[0], w5[3], selector); - c5[1] = __byte_perm (w5[3], w5[2], selector); - c5[0] = __byte_perm (w5[2], w5[1], selector); - c4[3] = __byte_perm (w5[1], w5[0], selector); - c4[2] = __byte_perm (w5[0], w4[3], selector); - c4[1] = __byte_perm (w4[3], w4[2], selector); - c4[0] = __byte_perm (w4[2], w4[1], selector); - c3[3] = __byte_perm (w4[1], w4[0], selector); - c3[2] = __byte_perm (w4[0], w3[3], selector); - c3[1] = __byte_perm (w3[3], w3[2], selector); - c3[0] = __byte_perm (w3[2], w3[1], selector); - c2[3] = __byte_perm (w3[1], w3[0], selector); - c2[2] = __byte_perm (w3[0], w2[3], selector); - c2[1] = __byte_perm (w2[3], w2[2], selector); - c2[0] = __byte_perm (w2[2], w2[1], selector); - c1[3] = __byte_perm (w2[1], w2[0], selector); - c1[2] = __byte_perm (w2[0], w1[3], selector); - c1[1] = __byte_perm (w1[3], w1[2], selector); - c1[0] = __byte_perm (w1[2], w1[1], selector); - c0[3] = __byte_perm (w1[1], w1[0], selector); - c0[2] = __byte_perm (w1[0], w0[3], selector); - c0[1] = __byte_perm (w0[3], w0[2], selector); - c0[0] = __byte_perm (w0[2], w0[1], selector); - w7[3] = __byte_perm (w0[1], w0[0], selector); - w7[2] = __byte_perm (w0[0], 0, selector); + c7[2] = hc_byte_perm ( 0, w7[3], selector); + c7[1] = hc_byte_perm (w7[3], w7[2], selector); + c7[0] = hc_byte_perm (w7[2], w7[1], selector); + c6[3] = hc_byte_perm (w7[1], w7[0], selector); + c6[2] = hc_byte_perm (w7[0], w6[3], selector); + c6[1] = hc_byte_perm (w6[3], w6[2], selector); + c6[0] = hc_byte_perm (w6[2], w6[1], selector); + c5[3] = hc_byte_perm (w6[1], w6[0], selector); + c5[2] = hc_byte_perm (w6[0], w5[3], selector); + c5[1] = hc_byte_perm (w5[3], w5[2], selector); + c5[0] = hc_byte_perm (w5[2], w5[1], selector); + c4[3] = hc_byte_perm (w5[1], w5[0], selector); + c4[2] = hc_byte_perm (w5[0], w4[3], selector); + c4[1] = hc_byte_perm (w4[3], w4[2], selector); + c4[0] = hc_byte_perm (w4[2], w4[1], selector); + c3[3] = hc_byte_perm (w4[1], w4[0], selector); + c3[2] = hc_byte_perm (w4[0], w3[3], selector); + c3[1] = hc_byte_perm (w3[3], w3[2], selector); + c3[0] = hc_byte_perm (w3[2], w3[1], selector); + c2[3] = hc_byte_perm (w3[1], w3[0], selector); + c2[2] = hc_byte_perm (w3[0], w2[3], selector); + c2[1] = hc_byte_perm (w2[3], w2[2], selector); + c2[0] = hc_byte_perm (w2[2], w2[1], selector); + c1[3] = hc_byte_perm (w2[1], w2[0], selector); + c1[2] = hc_byte_perm (w2[0], w1[3], selector); + c1[1] = hc_byte_perm (w1[3], w1[2], selector); + c1[0] = hc_byte_perm (w1[2], w1[1], selector); + c0[3] = hc_byte_perm (w1[1], w1[0], selector); + c0[2] = hc_byte_perm (w1[0], w0[3], selector); + c0[1] = hc_byte_perm (w0[3], w0[2], selector); + c0[0] = hc_byte_perm (w0[2], w0[1], selector); + w7[3] = hc_byte_perm (w0[1], w0[0], selector); + w7[2] = hc_byte_perm (w0[0], 0, selector); w7[1] = 0; w7[0] = 0; w6[3] = 0; @@ -11580,39 +11580,39 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be (u32x *w0, u32x *w1, u32x *w2 break; case 31: - c7[3] = __byte_perm ( 0, w7[3], selector); - c7[2] = __byte_perm (w7[3], w7[2], selector); - c7[1] = __byte_perm (w7[2], w7[1], selector); - c7[0] = __byte_perm (w7[1], w7[0], selector); - c6[3] = __byte_perm (w7[0], w6[3], selector); - c6[2] = __byte_perm (w6[3], w6[2], selector); - c6[1] = __byte_perm (w6[2], w6[1], selector); - c6[0] = __byte_perm (w6[1], w6[0], selector); - c5[3] = __byte_perm (w6[0], w5[3], selector); - c5[2] = __byte_perm (w5[3], w5[2], selector); - c5[1] = __byte_perm (w5[2], w5[1], selector); - c5[0] = __byte_perm (w5[1], w5[0], selector); - c4[3] = __byte_perm (w5[0], w4[3], selector); - c4[2] = __byte_perm (w4[3], w4[2], selector); - c4[1] = __byte_perm (w4[2], w4[1], selector); - c4[0] = __byte_perm (w4[1], w4[0], selector); - c3[3] = __byte_perm (w4[0], w3[3], selector); - c3[2] = __byte_perm (w3[3], w3[2], selector); - c3[1] = __byte_perm (w3[2], w3[1], selector); - c3[0] = __byte_perm (w3[1], w3[0], selector); - c2[3] = __byte_perm (w3[0], w2[3], selector); - c2[2] = __byte_perm (w2[3], w2[2], selector); - c2[1] = __byte_perm (w2[2], w2[1], selector); - c2[0] = __byte_perm (w2[1], w2[0], selector); - c1[3] = __byte_perm (w2[0], w1[3], selector); - c1[2] = __byte_perm (w1[3], w1[2], selector); - c1[1] = __byte_perm (w1[2], w1[1], selector); - c1[0] = __byte_perm (w1[1], w1[0], selector); - c0[3] = __byte_perm (w1[0], w0[3], selector); - c0[2] = __byte_perm (w0[3], w0[2], selector); - c0[1] = __byte_perm (w0[2], w0[1], selector); - c0[0] = __byte_perm (w0[1], w0[0], selector); - w7[3] = __byte_perm (w0[0], 0, selector); + c7[3] = hc_byte_perm ( 0, w7[3], selector); + c7[2] = hc_byte_perm (w7[3], w7[2], selector); + c7[1] = hc_byte_perm (w7[2], w7[1], selector); + c7[0] = hc_byte_perm (w7[1], w7[0], selector); + c6[3] = hc_byte_perm (w7[0], w6[3], selector); + c6[2] = hc_byte_perm (w6[3], w6[2], selector); + c6[1] = hc_byte_perm (w6[2], w6[1], selector); + c6[0] = hc_byte_perm (w6[1], w6[0], selector); + c5[3] = hc_byte_perm (w6[0], w5[3], selector); + c5[2] = hc_byte_perm (w5[3], w5[2], selector); + c5[1] = hc_byte_perm (w5[2], w5[1], selector); + c5[0] = hc_byte_perm (w5[1], w5[0], selector); + c4[3] = hc_byte_perm (w5[0], w4[3], selector); + c4[2] = hc_byte_perm (w4[3], w4[2], selector); + c4[1] = hc_byte_perm (w4[2], w4[1], selector); + c4[0] = hc_byte_perm (w4[1], w4[0], selector); + c3[3] = hc_byte_perm (w4[0], w3[3], selector); + c3[2] = hc_byte_perm (w3[3], w3[2], selector); + c3[1] = hc_byte_perm (w3[2], w3[1], selector); + c3[0] = hc_byte_perm (w3[1], w3[0], selector); + c2[3] = hc_byte_perm (w3[0], w2[3], selector); + c2[2] = hc_byte_perm (w2[3], w2[2], selector); + c2[1] = hc_byte_perm (w2[2], w2[1], selector); + c2[0] = hc_byte_perm (w2[1], w2[0], selector); + c1[3] = hc_byte_perm (w2[0], w1[3], selector); + c1[2] = hc_byte_perm (w1[3], w1[2], selector); + c1[1] = hc_byte_perm (w1[2], w1[1], selector); + c1[0] = hc_byte_perm (w1[1], w1[0], selector); + c0[3] = hc_byte_perm (w1[0], w0[3], selector); + c0[2] = hc_byte_perm (w0[3], w0[2], selector); + c0[1] = hc_byte_perm (w0[2], w0[1], selector); + c0[0] = hc_byte_perm (w0[1], w0[0], selector); + w7[3] = hc_byte_perm (w0[0], 0, selector); w7[2] = 0; w7[1] = 0; w7[0] = 0; @@ -11666,271 +11666,271 @@ DECLSPEC void switch_buffer_by_offset_1x64_le (u32x *w, const u32 offset) switch (offset_switch) { case 0: - w[63] = amd_bytealign (w[62], w[63], offset); - w[62] = amd_bytealign (w[61], w[62], offset); - w[61] = amd_bytealign (w[60], w[61], offset); - w[60] = amd_bytealign (w[59], w[60], offset); - w[59] = amd_bytealign (w[58], w[59], offset); - w[58] = amd_bytealign (w[57], w[58], offset); - w[57] = amd_bytealign (w[56], w[57], offset); - w[56] = amd_bytealign (w[55], w[56], offset); - w[55] = amd_bytealign (w[54], w[55], offset); - w[54] = amd_bytealign (w[53], w[54], offset); - w[53] = amd_bytealign (w[52], w[53], offset); - w[52] = amd_bytealign (w[51], w[52], offset); - w[51] = amd_bytealign (w[50], w[51], offset); - w[50] = amd_bytealign (w[49], w[50], offset); - w[49] = amd_bytealign (w[48], w[49], offset); - w[48] = amd_bytealign (w[47], w[48], offset); - w[47] = amd_bytealign (w[46], w[47], offset); - w[46] = amd_bytealign (w[45], w[46], offset); - w[45] = amd_bytealign (w[44], w[45], offset); - w[44] = amd_bytealign (w[43], w[44], offset); - w[43] = amd_bytealign (w[42], w[43], offset); - w[42] = amd_bytealign (w[41], w[42], offset); - w[41] = amd_bytealign (w[40], w[41], offset); - w[40] = amd_bytealign (w[39], w[40], offset); - w[39] = amd_bytealign (w[38], w[39], offset); - w[38] = amd_bytealign (w[37], w[38], offset); - w[37] = amd_bytealign (w[36], w[37], offset); - w[36] = amd_bytealign (w[35], w[36], offset); - w[35] = amd_bytealign (w[34], w[35], offset); - w[34] = amd_bytealign (w[33], w[34], offset); - w[33] = amd_bytealign (w[32], w[33], offset); - w[32] = amd_bytealign (w[31], w[32], offset); - w[31] = amd_bytealign (w[30], w[31], offset); - w[30] = amd_bytealign (w[29], w[30], offset); - w[29] = amd_bytealign (w[28], w[29], offset); - w[28] = amd_bytealign (w[27], w[28], offset); - w[27] = amd_bytealign (w[26], w[27], offset); - w[26] = amd_bytealign (w[25], w[26], offset); - w[25] = amd_bytealign (w[24], w[25], offset); - w[24] = amd_bytealign (w[23], w[24], offset); - w[23] = amd_bytealign (w[22], w[23], offset); - w[22] = amd_bytealign (w[21], w[22], offset); - w[21] = amd_bytealign (w[20], w[21], offset); - w[20] = amd_bytealign (w[19], w[20], offset); - w[19] = amd_bytealign (w[18], w[19], offset); - w[18] = amd_bytealign (w[17], w[18], offset); - w[17] = amd_bytealign (w[16], w[17], offset); - w[16] = amd_bytealign (w[15], w[16], offset); - w[15] = amd_bytealign (w[14], w[15], offset); - w[14] = amd_bytealign (w[13], w[14], offset); - w[13] = amd_bytealign (w[12], w[13], offset); - w[12] = amd_bytealign (w[11], w[12], offset); - w[11] = amd_bytealign (w[10], w[11], offset); - w[10] = amd_bytealign (w[ 9], w[10], offset); - w[ 9] = amd_bytealign (w[ 8], w[ 9], offset); - w[ 8] = amd_bytealign (w[ 7], w[ 8], offset); - w[ 7] = amd_bytealign (w[ 6], w[ 7], offset); - w[ 6] = amd_bytealign (w[ 5], w[ 6], offset); - w[ 5] = amd_bytealign (w[ 4], w[ 5], offset); - w[ 4] = amd_bytealign (w[ 3], w[ 4], offset); - w[ 3] = amd_bytealign (w[ 2], w[ 3], offset); - w[ 2] = amd_bytealign (w[ 1], w[ 2], offset); - w[ 1] = amd_bytealign (w[ 0], w[ 1], offset); - w[ 0] = amd_bytealign ( 0, w[ 0], offset); + w[63] = hc_bytealign (w[62], w[63], offset); + w[62] = hc_bytealign (w[61], w[62], offset); + w[61] = hc_bytealign (w[60], w[61], offset); + w[60] = hc_bytealign (w[59], w[60], offset); + w[59] = hc_bytealign (w[58], w[59], offset); + w[58] = hc_bytealign (w[57], w[58], offset); + w[57] = hc_bytealign (w[56], w[57], offset); + w[56] = hc_bytealign (w[55], w[56], offset); + w[55] = hc_bytealign (w[54], w[55], offset); + w[54] = hc_bytealign (w[53], w[54], offset); + w[53] = hc_bytealign (w[52], w[53], offset); + w[52] = hc_bytealign (w[51], w[52], offset); + w[51] = hc_bytealign (w[50], w[51], offset); + w[50] = hc_bytealign (w[49], w[50], offset); + w[49] = hc_bytealign (w[48], w[49], offset); + w[48] = hc_bytealign (w[47], w[48], offset); + w[47] = hc_bytealign (w[46], w[47], offset); + w[46] = hc_bytealign (w[45], w[46], offset); + w[45] = hc_bytealign (w[44], w[45], offset); + w[44] = hc_bytealign (w[43], w[44], offset); + w[43] = hc_bytealign (w[42], w[43], offset); + w[42] = hc_bytealign (w[41], w[42], offset); + w[41] = hc_bytealign (w[40], w[41], offset); + w[40] = hc_bytealign (w[39], w[40], offset); + w[39] = hc_bytealign (w[38], w[39], offset); + w[38] = hc_bytealign (w[37], w[38], offset); + w[37] = hc_bytealign (w[36], w[37], offset); + w[36] = hc_bytealign (w[35], w[36], offset); + w[35] = hc_bytealign (w[34], w[35], offset); + w[34] = hc_bytealign (w[33], w[34], offset); + w[33] = hc_bytealign (w[32], w[33], offset); + w[32] = hc_bytealign (w[31], w[32], offset); + w[31] = hc_bytealign (w[30], w[31], offset); + w[30] = hc_bytealign (w[29], w[30], offset); + w[29] = hc_bytealign (w[28], w[29], offset); + w[28] = hc_bytealign (w[27], w[28], offset); + w[27] = hc_bytealign (w[26], w[27], offset); + w[26] = hc_bytealign (w[25], w[26], offset); + w[25] = hc_bytealign (w[24], w[25], offset); + w[24] = hc_bytealign (w[23], w[24], offset); + w[23] = hc_bytealign (w[22], w[23], offset); + w[22] = hc_bytealign (w[21], w[22], offset); + w[21] = hc_bytealign (w[20], w[21], offset); + w[20] = hc_bytealign (w[19], w[20], offset); + w[19] = hc_bytealign (w[18], w[19], offset); + w[18] = hc_bytealign (w[17], w[18], offset); + w[17] = hc_bytealign (w[16], w[17], offset); + w[16] = hc_bytealign (w[15], w[16], offset); + w[15] = hc_bytealign (w[14], w[15], offset); + w[14] = hc_bytealign (w[13], w[14], offset); + w[13] = hc_bytealign (w[12], w[13], offset); + w[12] = hc_bytealign (w[11], w[12], offset); + w[11] = hc_bytealign (w[10], w[11], offset); + w[10] = hc_bytealign (w[ 9], w[10], offset); + w[ 9] = hc_bytealign (w[ 8], w[ 9], offset); + w[ 8] = hc_bytealign (w[ 7], w[ 8], offset); + w[ 7] = hc_bytealign (w[ 6], w[ 7], offset); + w[ 6] = hc_bytealign (w[ 5], w[ 6], offset); + w[ 5] = hc_bytealign (w[ 4], w[ 5], offset); + w[ 4] = hc_bytealign (w[ 3], w[ 4], offset); + w[ 3] = hc_bytealign (w[ 2], w[ 3], offset); + w[ 2] = hc_bytealign (w[ 1], w[ 2], offset); + w[ 1] = hc_bytealign (w[ 0], w[ 1], offset); + w[ 0] = hc_bytealign ( 0, w[ 0], offset); break; case 1: - w[63] = amd_bytealign (w[61], w[62], offset); - w[62] = amd_bytealign (w[60], w[61], offset); - w[61] = amd_bytealign (w[59], w[60], offset); - w[60] = amd_bytealign (w[58], w[59], offset); - w[59] = amd_bytealign (w[57], w[58], offset); - w[58] = amd_bytealign (w[56], w[57], offset); - w[57] = amd_bytealign (w[55], w[56], offset); - w[56] = amd_bytealign (w[54], w[55], offset); - w[55] = amd_bytealign (w[53], w[54], offset); - w[54] = amd_bytealign (w[52], w[53], offset); - w[53] = amd_bytealign (w[51], w[52], offset); - w[52] = amd_bytealign (w[50], w[51], offset); - w[51] = amd_bytealign (w[49], w[50], offset); - w[50] = amd_bytealign (w[48], w[49], offset); - w[49] = amd_bytealign (w[47], w[48], offset); - w[48] = amd_bytealign (w[46], w[47], offset); - w[47] = amd_bytealign (w[45], w[46], offset); - w[46] = amd_bytealign (w[44], w[45], offset); - w[45] = amd_bytealign (w[43], w[44], offset); - w[44] = amd_bytealign (w[42], w[43], offset); - w[43] = amd_bytealign (w[41], w[42], offset); - w[42] = amd_bytealign (w[40], w[41], offset); - w[41] = amd_bytealign (w[39], w[40], offset); - w[40] = amd_bytealign (w[38], w[39], offset); - w[39] = amd_bytealign (w[37], w[38], offset); - w[38] = amd_bytealign (w[36], w[37], offset); - w[37] = amd_bytealign (w[35], w[36], offset); - w[36] = amd_bytealign (w[34], w[35], offset); - w[35] = amd_bytealign (w[33], w[34], offset); - w[34] = amd_bytealign (w[32], w[33], offset); - w[33] = amd_bytealign (w[31], w[32], offset); - w[32] = amd_bytealign (w[30], w[31], offset); - w[31] = amd_bytealign (w[29], w[30], offset); - w[30] = amd_bytealign (w[28], w[29], offset); - w[29] = amd_bytealign (w[27], w[28], offset); - w[28] = amd_bytealign (w[26], w[27], offset); - w[27] = amd_bytealign (w[25], w[26], offset); - w[26] = amd_bytealign (w[24], w[25], offset); - w[25] = amd_bytealign (w[23], w[24], offset); - w[24] = amd_bytealign (w[22], w[23], offset); - w[23] = amd_bytealign (w[21], w[22], offset); - w[22] = amd_bytealign (w[20], w[21], offset); - w[21] = amd_bytealign (w[19], w[20], offset); - w[20] = amd_bytealign (w[18], w[19], offset); - w[19] = amd_bytealign (w[17], w[18], offset); - w[18] = amd_bytealign (w[16], w[17], offset); - w[17] = amd_bytealign (w[15], w[16], offset); - w[16] = amd_bytealign (w[14], w[15], offset); - w[15] = amd_bytealign (w[13], w[14], offset); - w[14] = amd_bytealign (w[12], w[13], offset); - w[13] = amd_bytealign (w[11], w[12], offset); - w[12] = amd_bytealign (w[10], w[11], offset); - w[11] = amd_bytealign (w[ 9], w[10], offset); - w[10] = amd_bytealign (w[ 8], w[ 9], offset); - w[ 9] = amd_bytealign (w[ 7], w[ 8], offset); - w[ 8] = amd_bytealign (w[ 6], w[ 7], offset); - w[ 7] = amd_bytealign (w[ 5], w[ 6], offset); - w[ 6] = amd_bytealign (w[ 4], w[ 5], offset); - w[ 5] = amd_bytealign (w[ 3], w[ 4], offset); - w[ 4] = amd_bytealign (w[ 2], w[ 3], offset); - w[ 3] = amd_bytealign (w[ 1], w[ 2], offset); - w[ 2] = amd_bytealign (w[ 0], w[ 1], offset); - w[ 1] = amd_bytealign ( 0, w[ 0], offset); + w[63] = hc_bytealign (w[61], w[62], offset); + w[62] = hc_bytealign (w[60], w[61], offset); + w[61] = hc_bytealign (w[59], w[60], offset); + w[60] = hc_bytealign (w[58], w[59], offset); + w[59] = hc_bytealign (w[57], w[58], offset); + w[58] = hc_bytealign (w[56], w[57], offset); + w[57] = hc_bytealign (w[55], w[56], offset); + w[56] = hc_bytealign (w[54], w[55], offset); + w[55] = hc_bytealign (w[53], w[54], offset); + w[54] = hc_bytealign (w[52], w[53], offset); + w[53] = hc_bytealign (w[51], w[52], offset); + w[52] = hc_bytealign (w[50], w[51], offset); + w[51] = hc_bytealign (w[49], w[50], offset); + w[50] = hc_bytealign (w[48], w[49], offset); + w[49] = hc_bytealign (w[47], w[48], offset); + w[48] = hc_bytealign (w[46], w[47], offset); + w[47] = hc_bytealign (w[45], w[46], offset); + w[46] = hc_bytealign (w[44], w[45], offset); + w[45] = hc_bytealign (w[43], w[44], offset); + w[44] = hc_bytealign (w[42], w[43], offset); + w[43] = hc_bytealign (w[41], w[42], offset); + w[42] = hc_bytealign (w[40], w[41], offset); + w[41] = hc_bytealign (w[39], w[40], offset); + w[40] = hc_bytealign (w[38], w[39], offset); + w[39] = hc_bytealign (w[37], w[38], offset); + w[38] = hc_bytealign (w[36], w[37], offset); + w[37] = hc_bytealign (w[35], w[36], offset); + w[36] = hc_bytealign (w[34], w[35], offset); + w[35] = hc_bytealign (w[33], w[34], offset); + w[34] = hc_bytealign (w[32], w[33], offset); + w[33] = hc_bytealign (w[31], w[32], offset); + w[32] = hc_bytealign (w[30], w[31], offset); + w[31] = hc_bytealign (w[29], w[30], offset); + w[30] = hc_bytealign (w[28], w[29], offset); + w[29] = hc_bytealign (w[27], w[28], offset); + w[28] = hc_bytealign (w[26], w[27], offset); + w[27] = hc_bytealign (w[25], w[26], offset); + w[26] = hc_bytealign (w[24], w[25], offset); + w[25] = hc_bytealign (w[23], w[24], offset); + w[24] = hc_bytealign (w[22], w[23], offset); + w[23] = hc_bytealign (w[21], w[22], offset); + w[22] = hc_bytealign (w[20], w[21], offset); + w[21] = hc_bytealign (w[19], w[20], offset); + w[20] = hc_bytealign (w[18], w[19], offset); + w[19] = hc_bytealign (w[17], w[18], offset); + w[18] = hc_bytealign (w[16], w[17], offset); + w[17] = hc_bytealign (w[15], w[16], offset); + w[16] = hc_bytealign (w[14], w[15], offset); + w[15] = hc_bytealign (w[13], w[14], offset); + w[14] = hc_bytealign (w[12], w[13], offset); + w[13] = hc_bytealign (w[11], w[12], offset); + w[12] = hc_bytealign (w[10], w[11], offset); + w[11] = hc_bytealign (w[ 9], w[10], offset); + w[10] = hc_bytealign (w[ 8], w[ 9], offset); + w[ 9] = hc_bytealign (w[ 7], w[ 8], offset); + w[ 8] = hc_bytealign (w[ 6], w[ 7], offset); + w[ 7] = hc_bytealign (w[ 5], w[ 6], offset); + w[ 6] = hc_bytealign (w[ 4], w[ 5], offset); + w[ 5] = hc_bytealign (w[ 3], w[ 4], offset); + w[ 4] = hc_bytealign (w[ 2], w[ 3], offset); + w[ 3] = hc_bytealign (w[ 1], w[ 2], offset); + w[ 2] = hc_bytealign (w[ 0], w[ 1], offset); + w[ 1] = hc_bytealign ( 0, w[ 0], offset); w[ 0] = 0; break; case 2: - w[63] = amd_bytealign (w[60], w[61], offset); - w[62] = amd_bytealign (w[59], w[60], offset); - w[61] = amd_bytealign (w[58], w[59], offset); - w[60] = amd_bytealign (w[57], w[58], offset); - w[59] = amd_bytealign (w[56], w[57], offset); - w[58] = amd_bytealign (w[55], w[56], offset); - w[57] = amd_bytealign (w[54], w[55], offset); - w[56] = amd_bytealign (w[53], w[54], offset); - w[55] = amd_bytealign (w[52], w[53], offset); - w[54] = amd_bytealign (w[51], w[52], offset); - w[53] = amd_bytealign (w[50], w[51], offset); - w[52] = amd_bytealign (w[49], w[50], offset); - w[51] = amd_bytealign (w[48], w[49], offset); - w[50] = amd_bytealign (w[47], w[48], offset); - w[49] = amd_bytealign (w[46], w[47], offset); - w[48] = amd_bytealign (w[45], w[46], offset); - w[47] = amd_bytealign (w[44], w[45], offset); - w[46] = amd_bytealign (w[43], w[44], offset); - w[45] = amd_bytealign (w[42], w[43], offset); - w[44] = amd_bytealign (w[41], w[42], offset); - w[43] = amd_bytealign (w[40], w[41], offset); - w[42] = amd_bytealign (w[39], w[40], offset); - w[41] = amd_bytealign (w[38], w[39], offset); - w[40] = amd_bytealign (w[37], w[38], offset); - w[39] = amd_bytealign (w[36], w[37], offset); - w[38] = amd_bytealign (w[35], w[36], offset); - w[37] = amd_bytealign (w[34], w[35], offset); - w[36] = amd_bytealign (w[33], w[34], offset); - w[35] = amd_bytealign (w[32], w[33], offset); - w[34] = amd_bytealign (w[31], w[32], offset); - w[33] = amd_bytealign (w[30], w[31], offset); - w[32] = amd_bytealign (w[29], w[30], offset); - w[31] = amd_bytealign (w[28], w[29], offset); - w[30] = amd_bytealign (w[27], w[28], offset); - w[29] = amd_bytealign (w[26], w[27], offset); - w[28] = amd_bytealign (w[25], w[26], offset); - w[27] = amd_bytealign (w[24], w[25], offset); - w[26] = amd_bytealign (w[23], w[24], offset); - w[25] = amd_bytealign (w[22], w[23], offset); - w[24] = amd_bytealign (w[21], w[22], offset); - w[23] = amd_bytealign (w[20], w[21], offset); - w[22] = amd_bytealign (w[19], w[20], offset); - w[21] = amd_bytealign (w[18], w[19], offset); - w[20] = amd_bytealign (w[17], w[18], offset); - w[19] = amd_bytealign (w[16], w[17], offset); - w[18] = amd_bytealign (w[15], w[16], offset); - w[17] = amd_bytealign (w[14], w[15], offset); - w[16] = amd_bytealign (w[13], w[14], offset); - w[15] = amd_bytealign (w[12], w[13], offset); - w[14] = amd_bytealign (w[11], w[12], offset); - w[13] = amd_bytealign (w[10], w[11], offset); - w[12] = amd_bytealign (w[ 9], w[10], offset); - w[11] = amd_bytealign (w[ 8], w[ 9], offset); - w[10] = amd_bytealign (w[ 7], w[ 8], offset); - w[ 9] = amd_bytealign (w[ 6], w[ 7], offset); - w[ 8] = amd_bytealign (w[ 5], w[ 6], offset); - w[ 7] = amd_bytealign (w[ 4], w[ 5], offset); - w[ 6] = amd_bytealign (w[ 3], w[ 4], offset); - w[ 5] = amd_bytealign (w[ 2], w[ 3], offset); - w[ 4] = amd_bytealign (w[ 1], w[ 2], offset); - w[ 3] = amd_bytealign (w[ 0], w[ 1], offset); - w[ 2] = amd_bytealign ( 0, w[ 0], offset); + w[63] = hc_bytealign (w[60], w[61], offset); + w[62] = hc_bytealign (w[59], w[60], offset); + w[61] = hc_bytealign (w[58], w[59], offset); + w[60] = hc_bytealign (w[57], w[58], offset); + w[59] = hc_bytealign (w[56], w[57], offset); + w[58] = hc_bytealign (w[55], w[56], offset); + w[57] = hc_bytealign (w[54], w[55], offset); + w[56] = hc_bytealign (w[53], w[54], offset); + w[55] = hc_bytealign (w[52], w[53], offset); + w[54] = hc_bytealign (w[51], w[52], offset); + w[53] = hc_bytealign (w[50], w[51], offset); + w[52] = hc_bytealign (w[49], w[50], offset); + w[51] = hc_bytealign (w[48], w[49], offset); + w[50] = hc_bytealign (w[47], w[48], offset); + w[49] = hc_bytealign (w[46], w[47], offset); + w[48] = hc_bytealign (w[45], w[46], offset); + w[47] = hc_bytealign (w[44], w[45], offset); + w[46] = hc_bytealign (w[43], w[44], offset); + w[45] = hc_bytealign (w[42], w[43], offset); + w[44] = hc_bytealign (w[41], w[42], offset); + w[43] = hc_bytealign (w[40], w[41], offset); + w[42] = hc_bytealign (w[39], w[40], offset); + w[41] = hc_bytealign (w[38], w[39], offset); + w[40] = hc_bytealign (w[37], w[38], offset); + w[39] = hc_bytealign (w[36], w[37], offset); + w[38] = hc_bytealign (w[35], w[36], offset); + w[37] = hc_bytealign (w[34], w[35], offset); + w[36] = hc_bytealign (w[33], w[34], offset); + w[35] = hc_bytealign (w[32], w[33], offset); + w[34] = hc_bytealign (w[31], w[32], offset); + w[33] = hc_bytealign (w[30], w[31], offset); + w[32] = hc_bytealign (w[29], w[30], offset); + w[31] = hc_bytealign (w[28], w[29], offset); + w[30] = hc_bytealign (w[27], w[28], offset); + w[29] = hc_bytealign (w[26], w[27], offset); + w[28] = hc_bytealign (w[25], w[26], offset); + w[27] = hc_bytealign (w[24], w[25], offset); + w[26] = hc_bytealign (w[23], w[24], offset); + w[25] = hc_bytealign (w[22], w[23], offset); + w[24] = hc_bytealign (w[21], w[22], offset); + w[23] = hc_bytealign (w[20], w[21], offset); + w[22] = hc_bytealign (w[19], w[20], offset); + w[21] = hc_bytealign (w[18], w[19], offset); + w[20] = hc_bytealign (w[17], w[18], offset); + w[19] = hc_bytealign (w[16], w[17], offset); + w[18] = hc_bytealign (w[15], w[16], offset); + w[17] = hc_bytealign (w[14], w[15], offset); + w[16] = hc_bytealign (w[13], w[14], offset); + w[15] = hc_bytealign (w[12], w[13], offset); + w[14] = hc_bytealign (w[11], w[12], offset); + w[13] = hc_bytealign (w[10], w[11], offset); + w[12] = hc_bytealign (w[ 9], w[10], offset); + w[11] = hc_bytealign (w[ 8], w[ 9], offset); + w[10] = hc_bytealign (w[ 7], w[ 8], offset); + w[ 9] = hc_bytealign (w[ 6], w[ 7], offset); + w[ 8] = hc_bytealign (w[ 5], w[ 6], offset); + w[ 7] = hc_bytealign (w[ 4], w[ 5], offset); + w[ 6] = hc_bytealign (w[ 3], w[ 4], offset); + w[ 5] = hc_bytealign (w[ 2], w[ 3], offset); + w[ 4] = hc_bytealign (w[ 1], w[ 2], offset); + w[ 3] = hc_bytealign (w[ 0], w[ 1], offset); + w[ 2] = hc_bytealign ( 0, w[ 0], offset); w[ 1] = 0; w[ 0] = 0; break; case 3: - w[63] = amd_bytealign (w[59], w[60], offset); - w[62] = amd_bytealign (w[58], w[59], offset); - w[61] = amd_bytealign (w[57], w[58], offset); - w[60] = amd_bytealign (w[56], w[57], offset); - w[59] = amd_bytealign (w[55], w[56], offset); - w[58] = amd_bytealign (w[54], w[55], offset); - w[57] = amd_bytealign (w[53], w[54], offset); - w[56] = amd_bytealign (w[52], w[53], offset); - w[55] = amd_bytealign (w[51], w[52], offset); - w[54] = amd_bytealign (w[50], w[51], offset); - w[53] = amd_bytealign (w[49], w[50], offset); - w[52] = amd_bytealign (w[48], w[49], offset); - w[51] = amd_bytealign (w[47], w[48], offset); - w[50] = amd_bytealign (w[46], w[47], offset); - w[49] = amd_bytealign (w[45], w[46], offset); - w[48] = amd_bytealign (w[44], w[45], offset); - w[47] = amd_bytealign (w[43], w[44], offset); - w[46] = amd_bytealign (w[42], w[43], offset); - w[45] = amd_bytealign (w[41], w[42], offset); - w[44] = amd_bytealign (w[40], w[41], offset); - w[43] = amd_bytealign (w[39], w[40], offset); - w[42] = amd_bytealign (w[38], w[39], offset); - w[41] = amd_bytealign (w[37], w[38], offset); - w[40] = amd_bytealign (w[36], w[37], offset); - w[39] = amd_bytealign (w[35], w[36], offset); - w[38] = amd_bytealign (w[34], w[35], offset); - w[37] = amd_bytealign (w[33], w[34], offset); - w[36] = amd_bytealign (w[32], w[33], offset); - w[35] = amd_bytealign (w[31], w[32], offset); - w[34] = amd_bytealign (w[30], w[31], offset); - w[33] = amd_bytealign (w[29], w[30], offset); - w[32] = amd_bytealign (w[28], w[29], offset); - w[31] = amd_bytealign (w[27], w[28], offset); - w[30] = amd_bytealign (w[26], w[27], offset); - w[29] = amd_bytealign (w[25], w[26], offset); - w[28] = amd_bytealign (w[24], w[25], offset); - w[27] = amd_bytealign (w[23], w[24], offset); - w[26] = amd_bytealign (w[22], w[23], offset); - w[25] = amd_bytealign (w[21], w[22], offset); - w[24] = amd_bytealign (w[20], w[21], offset); - w[23] = amd_bytealign (w[19], w[20], offset); - w[22] = amd_bytealign (w[18], w[19], offset); - w[21] = amd_bytealign (w[17], w[18], offset); - w[20] = amd_bytealign (w[16], w[17], offset); - w[19] = amd_bytealign (w[15], w[16], offset); - w[18] = amd_bytealign (w[14], w[15], offset); - w[17] = amd_bytealign (w[13], w[14], offset); - w[16] = amd_bytealign (w[12], w[13], offset); - w[15] = amd_bytealign (w[11], w[12], offset); - w[14] = amd_bytealign (w[10], w[11], offset); - w[13] = amd_bytealign (w[ 9], w[10], offset); - w[12] = amd_bytealign (w[ 8], w[ 9], offset); - w[11] = amd_bytealign (w[ 7], w[ 8], offset); - w[10] = amd_bytealign (w[ 6], w[ 7], offset); - w[ 9] = amd_bytealign (w[ 5], w[ 6], offset); - w[ 8] = amd_bytealign (w[ 4], w[ 5], offset); - w[ 7] = amd_bytealign (w[ 3], w[ 4], offset); - w[ 6] = amd_bytealign (w[ 2], w[ 3], offset); - w[ 5] = amd_bytealign (w[ 1], w[ 2], offset); - w[ 4] = amd_bytealign (w[ 0], w[ 1], offset); - w[ 3] = amd_bytealign ( 0, w[ 0], offset); + w[63] = hc_bytealign (w[59], w[60], offset); + w[62] = hc_bytealign (w[58], w[59], offset); + w[61] = hc_bytealign (w[57], w[58], offset); + w[60] = hc_bytealign (w[56], w[57], offset); + w[59] = hc_bytealign (w[55], w[56], offset); + w[58] = hc_bytealign (w[54], w[55], offset); + w[57] = hc_bytealign (w[53], w[54], offset); + w[56] = hc_bytealign (w[52], w[53], offset); + w[55] = hc_bytealign (w[51], w[52], offset); + w[54] = hc_bytealign (w[50], w[51], offset); + w[53] = hc_bytealign (w[49], w[50], offset); + w[52] = hc_bytealign (w[48], w[49], offset); + w[51] = hc_bytealign (w[47], w[48], offset); + w[50] = hc_bytealign (w[46], w[47], offset); + w[49] = hc_bytealign (w[45], w[46], offset); + w[48] = hc_bytealign (w[44], w[45], offset); + w[47] = hc_bytealign (w[43], w[44], offset); + w[46] = hc_bytealign (w[42], w[43], offset); + w[45] = hc_bytealign (w[41], w[42], offset); + w[44] = hc_bytealign (w[40], w[41], offset); + w[43] = hc_bytealign (w[39], w[40], offset); + w[42] = hc_bytealign (w[38], w[39], offset); + w[41] = hc_bytealign (w[37], w[38], offset); + w[40] = hc_bytealign (w[36], w[37], offset); + w[39] = hc_bytealign (w[35], w[36], offset); + w[38] = hc_bytealign (w[34], w[35], offset); + w[37] = hc_bytealign (w[33], w[34], offset); + w[36] = hc_bytealign (w[32], w[33], offset); + w[35] = hc_bytealign (w[31], w[32], offset); + w[34] = hc_bytealign (w[30], w[31], offset); + w[33] = hc_bytealign (w[29], w[30], offset); + w[32] = hc_bytealign (w[28], w[29], offset); + w[31] = hc_bytealign (w[27], w[28], offset); + w[30] = hc_bytealign (w[26], w[27], offset); + w[29] = hc_bytealign (w[25], w[26], offset); + w[28] = hc_bytealign (w[24], w[25], offset); + w[27] = hc_bytealign (w[23], w[24], offset); + w[26] = hc_bytealign (w[22], w[23], offset); + w[25] = hc_bytealign (w[21], w[22], offset); + w[24] = hc_bytealign (w[20], w[21], offset); + w[23] = hc_bytealign (w[19], w[20], offset); + w[22] = hc_bytealign (w[18], w[19], offset); + w[21] = hc_bytealign (w[17], w[18], offset); + w[20] = hc_bytealign (w[16], w[17], offset); + w[19] = hc_bytealign (w[15], w[16], offset); + w[18] = hc_bytealign (w[14], w[15], offset); + w[17] = hc_bytealign (w[13], w[14], offset); + w[16] = hc_bytealign (w[12], w[13], offset); + w[15] = hc_bytealign (w[11], w[12], offset); + w[14] = hc_bytealign (w[10], w[11], offset); + w[13] = hc_bytealign (w[ 9], w[10], offset); + w[12] = hc_bytealign (w[ 8], w[ 9], offset); + w[11] = hc_bytealign (w[ 7], w[ 8], offset); + w[10] = hc_bytealign (w[ 6], w[ 7], offset); + w[ 9] = hc_bytealign (w[ 5], w[ 6], offset); + w[ 8] = hc_bytealign (w[ 4], w[ 5], offset); + w[ 7] = hc_bytealign (w[ 3], w[ 4], offset); + w[ 6] = hc_bytealign (w[ 2], w[ 3], offset); + w[ 5] = hc_bytealign (w[ 1], w[ 2], offset); + w[ 4] = hc_bytealign (w[ 0], w[ 1], offset); + w[ 3] = hc_bytealign ( 0, w[ 0], offset); w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; @@ -11938,66 +11938,66 @@ DECLSPEC void switch_buffer_by_offset_1x64_le (u32x *w, const u32 offset) break; case 4: - w[63] = amd_bytealign (w[58], w[59], offset); - w[62] = amd_bytealign (w[57], w[58], offset); - w[61] = amd_bytealign (w[56], w[57], offset); - w[60] = amd_bytealign (w[55], w[56], offset); - w[59] = amd_bytealign (w[54], w[55], offset); - w[58] = amd_bytealign (w[53], w[54], offset); - w[57] = amd_bytealign (w[52], w[53], offset); - w[56] = amd_bytealign (w[51], w[52], offset); - w[55] = amd_bytealign (w[50], w[51], offset); - w[54] = amd_bytealign (w[49], w[50], offset); - w[53] = amd_bytealign (w[48], w[49], offset); - w[52] = amd_bytealign (w[47], w[48], offset); - w[51] = amd_bytealign (w[46], w[47], offset); - w[50] = amd_bytealign (w[45], w[46], offset); - w[49] = amd_bytealign (w[44], w[45], offset); - w[48] = amd_bytealign (w[43], w[44], offset); - w[47] = amd_bytealign (w[42], w[43], offset); - w[46] = amd_bytealign (w[41], w[42], offset); - w[45] = amd_bytealign (w[40], w[41], offset); - w[44] = amd_bytealign (w[39], w[40], offset); - w[43] = amd_bytealign (w[38], w[39], offset); - w[42] = amd_bytealign (w[37], w[38], offset); - w[41] = amd_bytealign (w[36], w[37], offset); - w[40] = amd_bytealign (w[35], w[36], offset); - w[39] = amd_bytealign (w[34], w[35], offset); - w[38] = amd_bytealign (w[33], w[34], offset); - w[37] = amd_bytealign (w[32], w[33], offset); - w[36] = amd_bytealign (w[31], w[32], offset); - w[35] = amd_bytealign (w[30], w[31], offset); - w[34] = amd_bytealign (w[29], w[30], offset); - w[33] = amd_bytealign (w[28], w[29], offset); - w[32] = amd_bytealign (w[27], w[28], offset); - w[31] = amd_bytealign (w[26], w[27], offset); - w[30] = amd_bytealign (w[25], w[26], offset); - w[29] = amd_bytealign (w[24], w[25], offset); - w[28] = amd_bytealign (w[23], w[24], offset); - w[27] = amd_bytealign (w[22], w[23], offset); - w[26] = amd_bytealign (w[21], w[22], offset); - w[25] = amd_bytealign (w[20], w[21], offset); - w[24] = amd_bytealign (w[19], w[20], offset); - w[23] = amd_bytealign (w[18], w[19], offset); - w[22] = amd_bytealign (w[17], w[18], offset); - w[21] = amd_bytealign (w[16], w[17], offset); - w[20] = amd_bytealign (w[15], w[16], offset); - w[19] = amd_bytealign (w[14], w[15], offset); - w[18] = amd_bytealign (w[13], w[14], offset); - w[17] = amd_bytealign (w[12], w[13], offset); - w[16] = amd_bytealign (w[11], w[12], offset); - w[15] = amd_bytealign (w[10], w[11], offset); - w[14] = amd_bytealign (w[ 9], w[10], offset); - w[13] = amd_bytealign (w[ 8], w[ 9], offset); - w[12] = amd_bytealign (w[ 7], w[ 8], offset); - w[11] = amd_bytealign (w[ 6], w[ 7], offset); - w[10] = amd_bytealign (w[ 5], w[ 6], offset); - w[ 9] = amd_bytealign (w[ 4], w[ 5], offset); - w[ 8] = amd_bytealign (w[ 3], w[ 4], offset); - w[ 7] = amd_bytealign (w[ 2], w[ 3], offset); - w[ 6] = amd_bytealign (w[ 1], w[ 2], offset); - w[ 5] = amd_bytealign (w[ 0], w[ 1], offset); - w[ 4] = amd_bytealign ( 0, w[ 0], offset); + w[63] = hc_bytealign (w[58], w[59], offset); + w[62] = hc_bytealign (w[57], w[58], offset); + w[61] = hc_bytealign (w[56], w[57], offset); + w[60] = hc_bytealign (w[55], w[56], offset); + w[59] = hc_bytealign (w[54], w[55], offset); + w[58] = hc_bytealign (w[53], w[54], offset); + w[57] = hc_bytealign (w[52], w[53], offset); + w[56] = hc_bytealign (w[51], w[52], offset); + w[55] = hc_bytealign (w[50], w[51], offset); + w[54] = hc_bytealign (w[49], w[50], offset); + w[53] = hc_bytealign (w[48], w[49], offset); + w[52] = hc_bytealign (w[47], w[48], offset); + w[51] = hc_bytealign (w[46], w[47], offset); + w[50] = hc_bytealign (w[45], w[46], offset); + w[49] = hc_bytealign (w[44], w[45], offset); + w[48] = hc_bytealign (w[43], w[44], offset); + w[47] = hc_bytealign (w[42], w[43], offset); + w[46] = hc_bytealign (w[41], w[42], offset); + w[45] = hc_bytealign (w[40], w[41], offset); + w[44] = hc_bytealign (w[39], w[40], offset); + w[43] = hc_bytealign (w[38], w[39], offset); + w[42] = hc_bytealign (w[37], w[38], offset); + w[41] = hc_bytealign (w[36], w[37], offset); + w[40] = hc_bytealign (w[35], w[36], offset); + w[39] = hc_bytealign (w[34], w[35], offset); + w[38] = hc_bytealign (w[33], w[34], offset); + w[37] = hc_bytealign (w[32], w[33], offset); + w[36] = hc_bytealign (w[31], w[32], offset); + w[35] = hc_bytealign (w[30], w[31], offset); + w[34] = hc_bytealign (w[29], w[30], offset); + w[33] = hc_bytealign (w[28], w[29], offset); + w[32] = hc_bytealign (w[27], w[28], offset); + w[31] = hc_bytealign (w[26], w[27], offset); + w[30] = hc_bytealign (w[25], w[26], offset); + w[29] = hc_bytealign (w[24], w[25], offset); + w[28] = hc_bytealign (w[23], w[24], offset); + w[27] = hc_bytealign (w[22], w[23], offset); + w[26] = hc_bytealign (w[21], w[22], offset); + w[25] = hc_bytealign (w[20], w[21], offset); + w[24] = hc_bytealign (w[19], w[20], offset); + w[23] = hc_bytealign (w[18], w[19], offset); + w[22] = hc_bytealign (w[17], w[18], offset); + w[21] = hc_bytealign (w[16], w[17], offset); + w[20] = hc_bytealign (w[15], w[16], offset); + w[19] = hc_bytealign (w[14], w[15], offset); + w[18] = hc_bytealign (w[13], w[14], offset); + w[17] = hc_bytealign (w[12], w[13], offset); + w[16] = hc_bytealign (w[11], w[12], offset); + w[15] = hc_bytealign (w[10], w[11], offset); + w[14] = hc_bytealign (w[ 9], w[10], offset); + w[13] = hc_bytealign (w[ 8], w[ 9], offset); + w[12] = hc_bytealign (w[ 7], w[ 8], offset); + w[11] = hc_bytealign (w[ 6], w[ 7], offset); + w[10] = hc_bytealign (w[ 5], w[ 6], offset); + w[ 9] = hc_bytealign (w[ 4], w[ 5], offset); + w[ 8] = hc_bytealign (w[ 3], w[ 4], offset); + w[ 7] = hc_bytealign (w[ 2], w[ 3], offset); + w[ 6] = hc_bytealign (w[ 1], w[ 2], offset); + w[ 5] = hc_bytealign (w[ 0], w[ 1], offset); + w[ 4] = hc_bytealign ( 0, w[ 0], offset); w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; @@ -12006,65 +12006,65 @@ DECLSPEC void switch_buffer_by_offset_1x64_le (u32x *w, const u32 offset) break; case 5: - w[63] = amd_bytealign (w[57], w[58], offset); - w[62] = amd_bytealign (w[56], w[57], offset); - w[61] = amd_bytealign (w[55], w[56], offset); - w[60] = amd_bytealign (w[54], w[55], offset); - w[59] = amd_bytealign (w[53], w[54], offset); - w[58] = amd_bytealign (w[52], w[53], offset); - w[57] = amd_bytealign (w[51], w[52], offset); - w[56] = amd_bytealign (w[50], w[51], offset); - w[55] = amd_bytealign (w[49], w[50], offset); - w[54] = amd_bytealign (w[48], w[49], offset); - w[53] = amd_bytealign (w[47], w[48], offset); - w[52] = amd_bytealign (w[46], w[47], offset); - w[51] = amd_bytealign (w[45], w[46], offset); - w[50] = amd_bytealign (w[44], w[45], offset); - w[49] = amd_bytealign (w[43], w[44], offset); - w[48] = amd_bytealign (w[42], w[43], offset); - w[47] = amd_bytealign (w[41], w[42], offset); - w[46] = amd_bytealign (w[40], w[41], offset); - w[45] = amd_bytealign (w[39], w[40], offset); - w[44] = amd_bytealign (w[38], w[39], offset); - w[43] = amd_bytealign (w[37], w[38], offset); - w[42] = amd_bytealign (w[36], w[37], offset); - w[41] = amd_bytealign (w[35], w[36], offset); - w[40] = amd_bytealign (w[34], w[35], offset); - w[39] = amd_bytealign (w[33], w[34], offset); - w[38] = amd_bytealign (w[32], w[33], offset); - w[37] = amd_bytealign (w[31], w[32], offset); - w[36] = amd_bytealign (w[30], w[31], offset); - w[35] = amd_bytealign (w[29], w[30], offset); - w[34] = amd_bytealign (w[28], w[29], offset); - w[33] = amd_bytealign (w[27], w[28], offset); - w[32] = amd_bytealign (w[26], w[27], offset); - w[31] = amd_bytealign (w[25], w[26], offset); - w[30] = amd_bytealign (w[24], w[25], offset); - w[29] = amd_bytealign (w[23], w[24], offset); - w[28] = amd_bytealign (w[22], w[23], offset); - w[27] = amd_bytealign (w[21], w[22], offset); - w[26] = amd_bytealign (w[20], w[21], offset); - w[25] = amd_bytealign (w[19], w[20], offset); - w[24] = amd_bytealign (w[18], w[19], offset); - w[23] = amd_bytealign (w[17], w[18], offset); - w[22] = amd_bytealign (w[16], w[17], offset); - w[21] = amd_bytealign (w[15], w[16], offset); - w[20] = amd_bytealign (w[14], w[15], offset); - w[19] = amd_bytealign (w[13], w[14], offset); - w[18] = amd_bytealign (w[12], w[13], offset); - w[17] = amd_bytealign (w[11], w[12], offset); - w[16] = amd_bytealign (w[10], w[11], offset); - w[15] = amd_bytealign (w[ 9], w[10], offset); - w[14] = amd_bytealign (w[ 8], w[ 9], offset); - w[13] = amd_bytealign (w[ 7], w[ 8], offset); - w[12] = amd_bytealign (w[ 6], w[ 7], offset); - w[11] = amd_bytealign (w[ 5], w[ 6], offset); - w[10] = amd_bytealign (w[ 4], w[ 5], offset); - w[ 9] = amd_bytealign (w[ 3], w[ 4], offset); - w[ 8] = amd_bytealign (w[ 2], w[ 3], offset); - w[ 7] = amd_bytealign (w[ 1], w[ 2], offset); - w[ 6] = amd_bytealign (w[ 0], w[ 1], offset); - w[ 5] = amd_bytealign ( 0, w[ 0], offset); + w[63] = hc_bytealign (w[57], w[58], offset); + w[62] = hc_bytealign (w[56], w[57], offset); + w[61] = hc_bytealign (w[55], w[56], offset); + w[60] = hc_bytealign (w[54], w[55], offset); + w[59] = hc_bytealign (w[53], w[54], offset); + w[58] = hc_bytealign (w[52], w[53], offset); + w[57] = hc_bytealign (w[51], w[52], offset); + w[56] = hc_bytealign (w[50], w[51], offset); + w[55] = hc_bytealign (w[49], w[50], offset); + w[54] = hc_bytealign (w[48], w[49], offset); + w[53] = hc_bytealign (w[47], w[48], offset); + w[52] = hc_bytealign (w[46], w[47], offset); + w[51] = hc_bytealign (w[45], w[46], offset); + w[50] = hc_bytealign (w[44], w[45], offset); + w[49] = hc_bytealign (w[43], w[44], offset); + w[48] = hc_bytealign (w[42], w[43], offset); + w[47] = hc_bytealign (w[41], w[42], offset); + w[46] = hc_bytealign (w[40], w[41], offset); + w[45] = hc_bytealign (w[39], w[40], offset); + w[44] = hc_bytealign (w[38], w[39], offset); + w[43] = hc_bytealign (w[37], w[38], offset); + w[42] = hc_bytealign (w[36], w[37], offset); + w[41] = hc_bytealign (w[35], w[36], offset); + w[40] = hc_bytealign (w[34], w[35], offset); + w[39] = hc_bytealign (w[33], w[34], offset); + w[38] = hc_bytealign (w[32], w[33], offset); + w[37] = hc_bytealign (w[31], w[32], offset); + w[36] = hc_bytealign (w[30], w[31], offset); + w[35] = hc_bytealign (w[29], w[30], offset); + w[34] = hc_bytealign (w[28], w[29], offset); + w[33] = hc_bytealign (w[27], w[28], offset); + w[32] = hc_bytealign (w[26], w[27], offset); + w[31] = hc_bytealign (w[25], w[26], offset); + w[30] = hc_bytealign (w[24], w[25], offset); + w[29] = hc_bytealign (w[23], w[24], offset); + w[28] = hc_bytealign (w[22], w[23], offset); + w[27] = hc_bytealign (w[21], w[22], offset); + w[26] = hc_bytealign (w[20], w[21], offset); + w[25] = hc_bytealign (w[19], w[20], offset); + w[24] = hc_bytealign (w[18], w[19], offset); + w[23] = hc_bytealign (w[17], w[18], offset); + w[22] = hc_bytealign (w[16], w[17], offset); + w[21] = hc_bytealign (w[15], w[16], offset); + w[20] = hc_bytealign (w[14], w[15], offset); + w[19] = hc_bytealign (w[13], w[14], offset); + w[18] = hc_bytealign (w[12], w[13], offset); + w[17] = hc_bytealign (w[11], w[12], offset); + w[16] = hc_bytealign (w[10], w[11], offset); + w[15] = hc_bytealign (w[ 9], w[10], offset); + w[14] = hc_bytealign (w[ 8], w[ 9], offset); + w[13] = hc_bytealign (w[ 7], w[ 8], offset); + w[12] = hc_bytealign (w[ 6], w[ 7], offset); + w[11] = hc_bytealign (w[ 5], w[ 6], offset); + w[10] = hc_bytealign (w[ 4], w[ 5], offset); + w[ 9] = hc_bytealign (w[ 3], w[ 4], offset); + w[ 8] = hc_bytealign (w[ 2], w[ 3], offset); + w[ 7] = hc_bytealign (w[ 1], w[ 2], offset); + w[ 6] = hc_bytealign (w[ 0], w[ 1], offset); + w[ 5] = hc_bytealign ( 0, w[ 0], offset); w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; @@ -12074,64 +12074,64 @@ DECLSPEC void switch_buffer_by_offset_1x64_le (u32x *w, const u32 offset) break; case 6: - w[63] = amd_bytealign (w[56], w[57], offset); - w[62] = amd_bytealign (w[55], w[56], offset); - w[61] = amd_bytealign (w[54], w[55], offset); - w[60] = amd_bytealign (w[53], w[54], offset); - w[59] = amd_bytealign (w[52], w[53], offset); - w[58] = amd_bytealign (w[51], w[52], offset); - w[57] = amd_bytealign (w[50], w[51], offset); - w[56] = amd_bytealign (w[49], w[50], offset); - w[55] = amd_bytealign (w[48], w[49], offset); - w[54] = amd_bytealign (w[47], w[48], offset); - w[53] = amd_bytealign (w[46], w[47], offset); - w[52] = amd_bytealign (w[45], w[46], offset); - w[51] = amd_bytealign (w[44], w[45], offset); - w[50] = amd_bytealign (w[43], w[44], offset); - w[49] = amd_bytealign (w[42], w[43], offset); - w[48] = amd_bytealign (w[41], w[42], offset); - w[47] = amd_bytealign (w[40], w[41], offset); - w[46] = amd_bytealign (w[39], w[40], offset); - w[45] = amd_bytealign (w[38], w[39], offset); - w[44] = amd_bytealign (w[37], w[38], offset); - w[43] = amd_bytealign (w[36], w[37], offset); - w[42] = amd_bytealign (w[35], w[36], offset); - w[41] = amd_bytealign (w[34], w[35], offset); - w[40] = amd_bytealign (w[33], w[34], offset); - w[39] = amd_bytealign (w[32], w[33], offset); - w[38] = amd_bytealign (w[31], w[32], offset); - w[37] = amd_bytealign (w[30], w[31], offset); - w[36] = amd_bytealign (w[29], w[30], offset); - w[35] = amd_bytealign (w[28], w[29], offset); - w[34] = amd_bytealign (w[27], w[28], offset); - w[33] = amd_bytealign (w[26], w[27], offset); - w[32] = amd_bytealign (w[25], w[26], offset); - w[31] = amd_bytealign (w[24], w[25], offset); - w[30] = amd_bytealign (w[23], w[24], offset); - w[29] = amd_bytealign (w[22], w[23], offset); - w[28] = amd_bytealign (w[21], w[22], offset); - w[27] = amd_bytealign (w[20], w[21], offset); - w[26] = amd_bytealign (w[19], w[20], offset); - w[25] = amd_bytealign (w[18], w[19], offset); - w[24] = amd_bytealign (w[17], w[18], offset); - w[23] = amd_bytealign (w[16], w[17], offset); - w[22] = amd_bytealign (w[15], w[16], offset); - w[21] = amd_bytealign (w[14], w[15], offset); - w[20] = amd_bytealign (w[13], w[14], offset); - w[19] = amd_bytealign (w[12], w[13], offset); - w[18] = amd_bytealign (w[11], w[12], offset); - w[17] = amd_bytealign (w[10], w[11], offset); - w[16] = amd_bytealign (w[ 9], w[10], offset); - w[15] = amd_bytealign (w[ 8], w[ 9], offset); - w[14] = amd_bytealign (w[ 7], w[ 8], offset); - w[13] = amd_bytealign (w[ 6], w[ 7], offset); - w[12] = amd_bytealign (w[ 5], w[ 6], offset); - w[11] = amd_bytealign (w[ 4], w[ 5], offset); - w[10] = amd_bytealign (w[ 3], w[ 4], offset); - w[ 9] = amd_bytealign (w[ 2], w[ 3], offset); - w[ 8] = amd_bytealign (w[ 1], w[ 2], offset); - w[ 7] = amd_bytealign (w[ 0], w[ 1], offset); - w[ 6] = amd_bytealign ( 0, w[ 0], offset); + w[63] = hc_bytealign (w[56], w[57], offset); + w[62] = hc_bytealign (w[55], w[56], offset); + w[61] = hc_bytealign (w[54], w[55], offset); + w[60] = hc_bytealign (w[53], w[54], offset); + w[59] = hc_bytealign (w[52], w[53], offset); + w[58] = hc_bytealign (w[51], w[52], offset); + w[57] = hc_bytealign (w[50], w[51], offset); + w[56] = hc_bytealign (w[49], w[50], offset); + w[55] = hc_bytealign (w[48], w[49], offset); + w[54] = hc_bytealign (w[47], w[48], offset); + w[53] = hc_bytealign (w[46], w[47], offset); + w[52] = hc_bytealign (w[45], w[46], offset); + w[51] = hc_bytealign (w[44], w[45], offset); + w[50] = hc_bytealign (w[43], w[44], offset); + w[49] = hc_bytealign (w[42], w[43], offset); + w[48] = hc_bytealign (w[41], w[42], offset); + w[47] = hc_bytealign (w[40], w[41], offset); + w[46] = hc_bytealign (w[39], w[40], offset); + w[45] = hc_bytealign (w[38], w[39], offset); + w[44] = hc_bytealign (w[37], w[38], offset); + w[43] = hc_bytealign (w[36], w[37], offset); + w[42] = hc_bytealign (w[35], w[36], offset); + w[41] = hc_bytealign (w[34], w[35], offset); + w[40] = hc_bytealign (w[33], w[34], offset); + w[39] = hc_bytealign (w[32], w[33], offset); + w[38] = hc_bytealign (w[31], w[32], offset); + w[37] = hc_bytealign (w[30], w[31], offset); + w[36] = hc_bytealign (w[29], w[30], offset); + w[35] = hc_bytealign (w[28], w[29], offset); + w[34] = hc_bytealign (w[27], w[28], offset); + w[33] = hc_bytealign (w[26], w[27], offset); + w[32] = hc_bytealign (w[25], w[26], offset); + w[31] = hc_bytealign (w[24], w[25], offset); + w[30] = hc_bytealign (w[23], w[24], offset); + w[29] = hc_bytealign (w[22], w[23], offset); + w[28] = hc_bytealign (w[21], w[22], offset); + w[27] = hc_bytealign (w[20], w[21], offset); + w[26] = hc_bytealign (w[19], w[20], offset); + w[25] = hc_bytealign (w[18], w[19], offset); + w[24] = hc_bytealign (w[17], w[18], offset); + w[23] = hc_bytealign (w[16], w[17], offset); + w[22] = hc_bytealign (w[15], w[16], offset); + w[21] = hc_bytealign (w[14], w[15], offset); + w[20] = hc_bytealign (w[13], w[14], offset); + w[19] = hc_bytealign (w[12], w[13], offset); + w[18] = hc_bytealign (w[11], w[12], offset); + w[17] = hc_bytealign (w[10], w[11], offset); + w[16] = hc_bytealign (w[ 9], w[10], offset); + w[15] = hc_bytealign (w[ 8], w[ 9], offset); + w[14] = hc_bytealign (w[ 7], w[ 8], offset); + w[13] = hc_bytealign (w[ 6], w[ 7], offset); + w[12] = hc_bytealign (w[ 5], w[ 6], offset); + w[11] = hc_bytealign (w[ 4], w[ 5], offset); + w[10] = hc_bytealign (w[ 3], w[ 4], offset); + w[ 9] = hc_bytealign (w[ 2], w[ 3], offset); + w[ 8] = hc_bytealign (w[ 1], w[ 2], offset); + w[ 7] = hc_bytealign (w[ 0], w[ 1], offset); + w[ 6] = hc_bytealign ( 0, w[ 0], offset); w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; @@ -12142,63 +12142,63 @@ DECLSPEC void switch_buffer_by_offset_1x64_le (u32x *w, const u32 offset) break; case 7: - w[63] = amd_bytealign (w[55], w[56], offset); - w[62] = amd_bytealign (w[54], w[55], offset); - w[61] = amd_bytealign (w[53], w[54], offset); - w[60] = amd_bytealign (w[52], w[53], offset); - w[59] = amd_bytealign (w[51], w[52], offset); - w[58] = amd_bytealign (w[50], w[51], offset); - w[57] = amd_bytealign (w[49], w[50], offset); - w[56] = amd_bytealign (w[48], w[49], offset); - w[55] = amd_bytealign (w[47], w[48], offset); - w[54] = amd_bytealign (w[46], w[47], offset); - w[53] = amd_bytealign (w[45], w[46], offset); - w[52] = amd_bytealign (w[44], w[45], offset); - w[51] = amd_bytealign (w[43], w[44], offset); - w[50] = amd_bytealign (w[42], w[43], offset); - w[49] = amd_bytealign (w[41], w[42], offset); - w[48] = amd_bytealign (w[40], w[41], offset); - w[47] = amd_bytealign (w[39], w[40], offset); - w[46] = amd_bytealign (w[38], w[39], offset); - w[45] = amd_bytealign (w[37], w[38], offset); - w[44] = amd_bytealign (w[36], w[37], offset); - w[43] = amd_bytealign (w[35], w[36], offset); - w[42] = amd_bytealign (w[34], w[35], offset); - w[41] = amd_bytealign (w[33], w[34], offset); - w[40] = amd_bytealign (w[32], w[33], offset); - w[39] = amd_bytealign (w[31], w[32], offset); - w[38] = amd_bytealign (w[30], w[31], offset); - w[37] = amd_bytealign (w[29], w[30], offset); - w[36] = amd_bytealign (w[28], w[29], offset); - w[35] = amd_bytealign (w[27], w[28], offset); - w[34] = amd_bytealign (w[26], w[27], offset); - w[33] = amd_bytealign (w[25], w[26], offset); - w[32] = amd_bytealign (w[24], w[25], offset); - w[31] = amd_bytealign (w[23], w[24], offset); - w[30] = amd_bytealign (w[22], w[23], offset); - w[29] = amd_bytealign (w[21], w[22], offset); - w[28] = amd_bytealign (w[20], w[21], offset); - w[27] = amd_bytealign (w[19], w[20], offset); - w[26] = amd_bytealign (w[18], w[19], offset); - w[25] = amd_bytealign (w[17], w[18], offset); - w[24] = amd_bytealign (w[16], w[17], offset); - w[23] = amd_bytealign (w[15], w[16], offset); - w[22] = amd_bytealign (w[14], w[15], offset); - w[21] = amd_bytealign (w[13], w[14], offset); - w[20] = amd_bytealign (w[12], w[13], offset); - w[19] = amd_bytealign (w[11], w[12], offset); - w[18] = amd_bytealign (w[10], w[11], offset); - w[17] = amd_bytealign (w[ 9], w[10], offset); - w[16] = amd_bytealign (w[ 8], w[ 9], offset); - w[15] = amd_bytealign (w[ 7], w[ 8], offset); - w[14] = amd_bytealign (w[ 6], w[ 7], offset); - w[13] = amd_bytealign (w[ 5], w[ 6], offset); - w[12] = amd_bytealign (w[ 4], w[ 5], offset); - w[11] = amd_bytealign (w[ 3], w[ 4], offset); - w[10] = amd_bytealign (w[ 2], w[ 3], offset); - w[ 9] = amd_bytealign (w[ 1], w[ 2], offset); - w[ 8] = amd_bytealign (w[ 0], w[ 1], offset); - w[ 7] = amd_bytealign ( 0, w[ 0], offset); + w[63] = hc_bytealign (w[55], w[56], offset); + w[62] = hc_bytealign (w[54], w[55], offset); + w[61] = hc_bytealign (w[53], w[54], offset); + w[60] = hc_bytealign (w[52], w[53], offset); + w[59] = hc_bytealign (w[51], w[52], offset); + w[58] = hc_bytealign (w[50], w[51], offset); + w[57] = hc_bytealign (w[49], w[50], offset); + w[56] = hc_bytealign (w[48], w[49], offset); + w[55] = hc_bytealign (w[47], w[48], offset); + w[54] = hc_bytealign (w[46], w[47], offset); + w[53] = hc_bytealign (w[45], w[46], offset); + w[52] = hc_bytealign (w[44], w[45], offset); + w[51] = hc_bytealign (w[43], w[44], offset); + w[50] = hc_bytealign (w[42], w[43], offset); + w[49] = hc_bytealign (w[41], w[42], offset); + w[48] = hc_bytealign (w[40], w[41], offset); + w[47] = hc_bytealign (w[39], w[40], offset); + w[46] = hc_bytealign (w[38], w[39], offset); + w[45] = hc_bytealign (w[37], w[38], offset); + w[44] = hc_bytealign (w[36], w[37], offset); + w[43] = hc_bytealign (w[35], w[36], offset); + w[42] = hc_bytealign (w[34], w[35], offset); + w[41] = hc_bytealign (w[33], w[34], offset); + w[40] = hc_bytealign (w[32], w[33], offset); + w[39] = hc_bytealign (w[31], w[32], offset); + w[38] = hc_bytealign (w[30], w[31], offset); + w[37] = hc_bytealign (w[29], w[30], offset); + w[36] = hc_bytealign (w[28], w[29], offset); + w[35] = hc_bytealign (w[27], w[28], offset); + w[34] = hc_bytealign (w[26], w[27], offset); + w[33] = hc_bytealign (w[25], w[26], offset); + w[32] = hc_bytealign (w[24], w[25], offset); + w[31] = hc_bytealign (w[23], w[24], offset); + w[30] = hc_bytealign (w[22], w[23], offset); + w[29] = hc_bytealign (w[21], w[22], offset); + w[28] = hc_bytealign (w[20], w[21], offset); + w[27] = hc_bytealign (w[19], w[20], offset); + w[26] = hc_bytealign (w[18], w[19], offset); + w[25] = hc_bytealign (w[17], w[18], offset); + w[24] = hc_bytealign (w[16], w[17], offset); + w[23] = hc_bytealign (w[15], w[16], offset); + w[22] = hc_bytealign (w[14], w[15], offset); + w[21] = hc_bytealign (w[13], w[14], offset); + w[20] = hc_bytealign (w[12], w[13], offset); + w[19] = hc_bytealign (w[11], w[12], offset); + w[18] = hc_bytealign (w[10], w[11], offset); + w[17] = hc_bytealign (w[ 9], w[10], offset); + w[16] = hc_bytealign (w[ 8], w[ 9], offset); + w[15] = hc_bytealign (w[ 7], w[ 8], offset); + w[14] = hc_bytealign (w[ 6], w[ 7], offset); + w[13] = hc_bytealign (w[ 5], w[ 6], offset); + w[12] = hc_bytealign (w[ 4], w[ 5], offset); + w[11] = hc_bytealign (w[ 3], w[ 4], offset); + w[10] = hc_bytealign (w[ 2], w[ 3], offset); + w[ 9] = hc_bytealign (w[ 1], w[ 2], offset); + w[ 8] = hc_bytealign (w[ 0], w[ 1], offset); + w[ 7] = hc_bytealign ( 0, w[ 0], offset); w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; @@ -12210,62 +12210,62 @@ DECLSPEC void switch_buffer_by_offset_1x64_le (u32x *w, const u32 offset) break; case 8: - w[63] = amd_bytealign (w[54], w[55], offset); - w[62] = amd_bytealign (w[53], w[54], offset); - w[61] = amd_bytealign (w[52], w[53], offset); - w[60] = amd_bytealign (w[51], w[52], offset); - w[59] = amd_bytealign (w[50], w[51], offset); - w[58] = amd_bytealign (w[49], w[50], offset); - w[57] = amd_bytealign (w[48], w[49], offset); - w[56] = amd_bytealign (w[47], w[48], offset); - w[55] = amd_bytealign (w[46], w[47], offset); - w[54] = amd_bytealign (w[45], w[46], offset); - w[53] = amd_bytealign (w[44], w[45], offset); - w[52] = amd_bytealign (w[43], w[44], offset); - w[51] = amd_bytealign (w[42], w[43], offset); - w[50] = amd_bytealign (w[41], w[42], offset); - w[49] = amd_bytealign (w[40], w[41], offset); - w[48] = amd_bytealign (w[39], w[40], offset); - w[47] = amd_bytealign (w[38], w[39], offset); - w[46] = amd_bytealign (w[37], w[38], offset); - w[45] = amd_bytealign (w[36], w[37], offset); - w[44] = amd_bytealign (w[35], w[36], offset); - w[43] = amd_bytealign (w[34], w[35], offset); - w[42] = amd_bytealign (w[33], w[34], offset); - w[41] = amd_bytealign (w[32], w[33], offset); - w[40] = amd_bytealign (w[31], w[32], offset); - w[39] = amd_bytealign (w[30], w[31], offset); - w[38] = amd_bytealign (w[29], w[30], offset); - w[37] = amd_bytealign (w[28], w[29], offset); - w[36] = amd_bytealign (w[27], w[28], offset); - w[35] = amd_bytealign (w[26], w[27], offset); - w[34] = amd_bytealign (w[25], w[26], offset); - w[33] = amd_bytealign (w[24], w[25], offset); - w[32] = amd_bytealign (w[23], w[24], offset); - w[31] = amd_bytealign (w[22], w[23], offset); - w[30] = amd_bytealign (w[21], w[22], offset); - w[29] = amd_bytealign (w[20], w[21], offset); - w[28] = amd_bytealign (w[19], w[20], offset); - w[27] = amd_bytealign (w[18], w[19], offset); - w[26] = amd_bytealign (w[17], w[18], offset); - w[25] = amd_bytealign (w[16], w[17], offset); - w[24] = amd_bytealign (w[15], w[16], offset); - w[23] = amd_bytealign (w[14], w[15], offset); - w[22] = amd_bytealign (w[13], w[14], offset); - w[21] = amd_bytealign (w[12], w[13], offset); - w[20] = amd_bytealign (w[11], w[12], offset); - w[19] = amd_bytealign (w[10], w[11], offset); - w[18] = amd_bytealign (w[ 9], w[10], offset); - w[17] = amd_bytealign (w[ 8], w[ 9], offset); - w[16] = amd_bytealign (w[ 7], w[ 8], offset); - w[15] = amd_bytealign (w[ 6], w[ 7], offset); - w[14] = amd_bytealign (w[ 5], w[ 6], offset); - w[13] = amd_bytealign (w[ 4], w[ 5], offset); - w[12] = amd_bytealign (w[ 3], w[ 4], offset); - w[11] = amd_bytealign (w[ 2], w[ 3], offset); - w[10] = amd_bytealign (w[ 1], w[ 2], offset); - w[ 9] = amd_bytealign (w[ 0], w[ 1], offset); - w[ 8] = amd_bytealign ( 0, w[ 0], offset); + w[63] = hc_bytealign (w[54], w[55], offset); + w[62] = hc_bytealign (w[53], w[54], offset); + w[61] = hc_bytealign (w[52], w[53], offset); + w[60] = hc_bytealign (w[51], w[52], offset); + w[59] = hc_bytealign (w[50], w[51], offset); + w[58] = hc_bytealign (w[49], w[50], offset); + w[57] = hc_bytealign (w[48], w[49], offset); + w[56] = hc_bytealign (w[47], w[48], offset); + w[55] = hc_bytealign (w[46], w[47], offset); + w[54] = hc_bytealign (w[45], w[46], offset); + w[53] = hc_bytealign (w[44], w[45], offset); + w[52] = hc_bytealign (w[43], w[44], offset); + w[51] = hc_bytealign (w[42], w[43], offset); + w[50] = hc_bytealign (w[41], w[42], offset); + w[49] = hc_bytealign (w[40], w[41], offset); + w[48] = hc_bytealign (w[39], w[40], offset); + w[47] = hc_bytealign (w[38], w[39], offset); + w[46] = hc_bytealign (w[37], w[38], offset); + w[45] = hc_bytealign (w[36], w[37], offset); + w[44] = hc_bytealign (w[35], w[36], offset); + w[43] = hc_bytealign (w[34], w[35], offset); + w[42] = hc_bytealign (w[33], w[34], offset); + w[41] = hc_bytealign (w[32], w[33], offset); + w[40] = hc_bytealign (w[31], w[32], offset); + w[39] = hc_bytealign (w[30], w[31], offset); + w[38] = hc_bytealign (w[29], w[30], offset); + w[37] = hc_bytealign (w[28], w[29], offset); + w[36] = hc_bytealign (w[27], w[28], offset); + w[35] = hc_bytealign (w[26], w[27], offset); + w[34] = hc_bytealign (w[25], w[26], offset); + w[33] = hc_bytealign (w[24], w[25], offset); + w[32] = hc_bytealign (w[23], w[24], offset); + w[31] = hc_bytealign (w[22], w[23], offset); + w[30] = hc_bytealign (w[21], w[22], offset); + w[29] = hc_bytealign (w[20], w[21], offset); + w[28] = hc_bytealign (w[19], w[20], offset); + w[27] = hc_bytealign (w[18], w[19], offset); + w[26] = hc_bytealign (w[17], w[18], offset); + w[25] = hc_bytealign (w[16], w[17], offset); + w[24] = hc_bytealign (w[15], w[16], offset); + w[23] = hc_bytealign (w[14], w[15], offset); + w[22] = hc_bytealign (w[13], w[14], offset); + w[21] = hc_bytealign (w[12], w[13], offset); + w[20] = hc_bytealign (w[11], w[12], offset); + w[19] = hc_bytealign (w[10], w[11], offset); + w[18] = hc_bytealign (w[ 9], w[10], offset); + w[17] = hc_bytealign (w[ 8], w[ 9], offset); + w[16] = hc_bytealign (w[ 7], w[ 8], offset); + w[15] = hc_bytealign (w[ 6], w[ 7], offset); + w[14] = hc_bytealign (w[ 5], w[ 6], offset); + w[13] = hc_bytealign (w[ 4], w[ 5], offset); + w[12] = hc_bytealign (w[ 3], w[ 4], offset); + w[11] = hc_bytealign (w[ 2], w[ 3], offset); + w[10] = hc_bytealign (w[ 1], w[ 2], offset); + w[ 9] = hc_bytealign (w[ 0], w[ 1], offset); + w[ 8] = hc_bytealign ( 0, w[ 0], offset); w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; @@ -12278,61 +12278,61 @@ DECLSPEC void switch_buffer_by_offset_1x64_le (u32x *w, const u32 offset) break; case 9: - w[63] = amd_bytealign (w[53], w[54], offset); - w[62] = amd_bytealign (w[52], w[53], offset); - w[61] = amd_bytealign (w[51], w[52], offset); - w[60] = amd_bytealign (w[50], w[51], offset); - w[59] = amd_bytealign (w[49], w[50], offset); - w[58] = amd_bytealign (w[48], w[49], offset); - w[57] = amd_bytealign (w[47], w[48], offset); - w[56] = amd_bytealign (w[46], w[47], offset); - w[55] = amd_bytealign (w[45], w[46], offset); - w[54] = amd_bytealign (w[44], w[45], offset); - w[53] = amd_bytealign (w[43], w[44], offset); - w[52] = amd_bytealign (w[42], w[43], offset); - w[51] = amd_bytealign (w[41], w[42], offset); - w[50] = amd_bytealign (w[40], w[41], offset); - w[49] = amd_bytealign (w[39], w[40], offset); - w[48] = amd_bytealign (w[38], w[39], offset); - w[47] = amd_bytealign (w[37], w[38], offset); - w[46] = amd_bytealign (w[36], w[37], offset); - w[45] = amd_bytealign (w[35], w[36], offset); - w[44] = amd_bytealign (w[34], w[35], offset); - w[43] = amd_bytealign (w[33], w[34], offset); - w[42] = amd_bytealign (w[32], w[33], offset); - w[41] = amd_bytealign (w[31], w[32], offset); - w[40] = amd_bytealign (w[30], w[31], offset); - w[39] = amd_bytealign (w[29], w[30], offset); - w[38] = amd_bytealign (w[28], w[29], offset); - w[37] = amd_bytealign (w[27], w[28], offset); - w[36] = amd_bytealign (w[26], w[27], offset); - w[35] = amd_bytealign (w[25], w[26], offset); - w[34] = amd_bytealign (w[24], w[25], offset); - w[33] = amd_bytealign (w[23], w[24], offset); - w[32] = amd_bytealign (w[22], w[23], offset); - w[31] = amd_bytealign (w[21], w[22], offset); - w[30] = amd_bytealign (w[20], w[21], offset); - w[29] = amd_bytealign (w[19], w[20], offset); - w[28] = amd_bytealign (w[18], w[19], offset); - w[27] = amd_bytealign (w[17], w[18], offset); - w[26] = amd_bytealign (w[16], w[17], offset); - w[25] = amd_bytealign (w[15], w[16], offset); - w[24] = amd_bytealign (w[14], w[15], offset); - w[23] = amd_bytealign (w[13], w[14], offset); - w[22] = amd_bytealign (w[12], w[13], offset); - w[21] = amd_bytealign (w[11], w[12], offset); - w[20] = amd_bytealign (w[10], w[11], offset); - w[19] = amd_bytealign (w[ 9], w[10], offset); - w[18] = amd_bytealign (w[ 8], w[ 9], offset); - w[17] = amd_bytealign (w[ 7], w[ 8], offset); - w[16] = amd_bytealign (w[ 6], w[ 7], offset); - w[15] = amd_bytealign (w[ 5], w[ 6], offset); - w[14] = amd_bytealign (w[ 4], w[ 5], offset); - w[13] = amd_bytealign (w[ 3], w[ 4], offset); - w[12] = amd_bytealign (w[ 2], w[ 3], offset); - w[11] = amd_bytealign (w[ 1], w[ 2], offset); - w[10] = amd_bytealign (w[ 0], w[ 1], offset); - w[ 9] = amd_bytealign ( 0, w[ 0], offset); + w[63] = hc_bytealign (w[53], w[54], offset); + w[62] = hc_bytealign (w[52], w[53], offset); + w[61] = hc_bytealign (w[51], w[52], offset); + w[60] = hc_bytealign (w[50], w[51], offset); + w[59] = hc_bytealign (w[49], w[50], offset); + w[58] = hc_bytealign (w[48], w[49], offset); + w[57] = hc_bytealign (w[47], w[48], offset); + w[56] = hc_bytealign (w[46], w[47], offset); + w[55] = hc_bytealign (w[45], w[46], offset); + w[54] = hc_bytealign (w[44], w[45], offset); + w[53] = hc_bytealign (w[43], w[44], offset); + w[52] = hc_bytealign (w[42], w[43], offset); + w[51] = hc_bytealign (w[41], w[42], offset); + w[50] = hc_bytealign (w[40], w[41], offset); + w[49] = hc_bytealign (w[39], w[40], offset); + w[48] = hc_bytealign (w[38], w[39], offset); + w[47] = hc_bytealign (w[37], w[38], offset); + w[46] = hc_bytealign (w[36], w[37], offset); + w[45] = hc_bytealign (w[35], w[36], offset); + w[44] = hc_bytealign (w[34], w[35], offset); + w[43] = hc_bytealign (w[33], w[34], offset); + w[42] = hc_bytealign (w[32], w[33], offset); + w[41] = hc_bytealign (w[31], w[32], offset); + w[40] = hc_bytealign (w[30], w[31], offset); + w[39] = hc_bytealign (w[29], w[30], offset); + w[38] = hc_bytealign (w[28], w[29], offset); + w[37] = hc_bytealign (w[27], w[28], offset); + w[36] = hc_bytealign (w[26], w[27], offset); + w[35] = hc_bytealign (w[25], w[26], offset); + w[34] = hc_bytealign (w[24], w[25], offset); + w[33] = hc_bytealign (w[23], w[24], offset); + w[32] = hc_bytealign (w[22], w[23], offset); + w[31] = hc_bytealign (w[21], w[22], offset); + w[30] = hc_bytealign (w[20], w[21], offset); + w[29] = hc_bytealign (w[19], w[20], offset); + w[28] = hc_bytealign (w[18], w[19], offset); + w[27] = hc_bytealign (w[17], w[18], offset); + w[26] = hc_bytealign (w[16], w[17], offset); + w[25] = hc_bytealign (w[15], w[16], offset); + w[24] = hc_bytealign (w[14], w[15], offset); + w[23] = hc_bytealign (w[13], w[14], offset); + w[22] = hc_bytealign (w[12], w[13], offset); + w[21] = hc_bytealign (w[11], w[12], offset); + w[20] = hc_bytealign (w[10], w[11], offset); + w[19] = hc_bytealign (w[ 9], w[10], offset); + w[18] = hc_bytealign (w[ 8], w[ 9], offset); + w[17] = hc_bytealign (w[ 7], w[ 8], offset); + w[16] = hc_bytealign (w[ 6], w[ 7], offset); + w[15] = hc_bytealign (w[ 5], w[ 6], offset); + w[14] = hc_bytealign (w[ 4], w[ 5], offset); + w[13] = hc_bytealign (w[ 3], w[ 4], offset); + w[12] = hc_bytealign (w[ 2], w[ 3], offset); + w[11] = hc_bytealign (w[ 1], w[ 2], offset); + w[10] = hc_bytealign (w[ 0], w[ 1], offset); + w[ 9] = hc_bytealign ( 0, w[ 0], offset); w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; @@ -12346,60 +12346,60 @@ DECLSPEC void switch_buffer_by_offset_1x64_le (u32x *w, const u32 offset) break; case 10: - w[63] = amd_bytealign (w[52], w[53], offset); - w[62] = amd_bytealign (w[51], w[52], offset); - w[61] = amd_bytealign (w[50], w[51], offset); - w[60] = amd_bytealign (w[49], w[50], offset); - w[59] = amd_bytealign (w[48], w[49], offset); - w[58] = amd_bytealign (w[47], w[48], offset); - w[57] = amd_bytealign (w[46], w[47], offset); - w[56] = amd_bytealign (w[45], w[46], offset); - w[55] = amd_bytealign (w[44], w[45], offset); - w[54] = amd_bytealign (w[43], w[44], offset); - w[53] = amd_bytealign (w[42], w[43], offset); - w[52] = amd_bytealign (w[41], w[42], offset); - w[51] = amd_bytealign (w[40], w[41], offset); - w[50] = amd_bytealign (w[39], w[40], offset); - w[49] = amd_bytealign (w[38], w[39], offset); - w[48] = amd_bytealign (w[37], w[38], offset); - w[47] = amd_bytealign (w[36], w[37], offset); - w[46] = amd_bytealign (w[35], w[36], offset); - w[45] = amd_bytealign (w[34], w[35], offset); - w[44] = amd_bytealign (w[33], w[34], offset); - w[43] = amd_bytealign (w[32], w[33], offset); - w[42] = amd_bytealign (w[31], w[32], offset); - w[41] = amd_bytealign (w[30], w[31], offset); - w[40] = amd_bytealign (w[29], w[30], offset); - w[39] = amd_bytealign (w[28], w[29], offset); - w[38] = amd_bytealign (w[27], w[28], offset); - w[37] = amd_bytealign (w[26], w[27], offset); - w[36] = amd_bytealign (w[25], w[26], offset); - w[35] = amd_bytealign (w[24], w[25], offset); - w[34] = amd_bytealign (w[23], w[24], offset); - w[33] = amd_bytealign (w[22], w[23], offset); - w[32] = amd_bytealign (w[21], w[22], offset); - w[31] = amd_bytealign (w[20], w[21], offset); - w[30] = amd_bytealign (w[19], w[20], offset); - w[29] = amd_bytealign (w[18], w[19], offset); - w[28] = amd_bytealign (w[17], w[18], offset); - w[27] = amd_bytealign (w[16], w[17], offset); - w[26] = amd_bytealign (w[15], w[16], offset); - w[25] = amd_bytealign (w[14], w[15], offset); - w[24] = amd_bytealign (w[13], w[14], offset); - w[23] = amd_bytealign (w[12], w[13], offset); - w[22] = amd_bytealign (w[11], w[12], offset); - w[21] = amd_bytealign (w[10], w[11], offset); - w[20] = amd_bytealign (w[ 9], w[10], offset); - w[19] = amd_bytealign (w[ 8], w[ 9], offset); - w[18] = amd_bytealign (w[ 7], w[ 8], offset); - w[17] = amd_bytealign (w[ 6], w[ 7], offset); - w[16] = amd_bytealign (w[ 5], w[ 6], offset); - w[15] = amd_bytealign (w[ 4], w[ 5], offset); - w[14] = amd_bytealign (w[ 3], w[ 4], offset); - w[13] = amd_bytealign (w[ 2], w[ 3], offset); - w[12] = amd_bytealign (w[ 1], w[ 2], offset); - w[11] = amd_bytealign (w[ 0], w[ 1], offset); - w[10] = amd_bytealign ( 0, w[ 0], offset); + w[63] = hc_bytealign (w[52], w[53], offset); + w[62] = hc_bytealign (w[51], w[52], offset); + w[61] = hc_bytealign (w[50], w[51], offset); + w[60] = hc_bytealign (w[49], w[50], offset); + w[59] = hc_bytealign (w[48], w[49], offset); + w[58] = hc_bytealign (w[47], w[48], offset); + w[57] = hc_bytealign (w[46], w[47], offset); + w[56] = hc_bytealign (w[45], w[46], offset); + w[55] = hc_bytealign (w[44], w[45], offset); + w[54] = hc_bytealign (w[43], w[44], offset); + w[53] = hc_bytealign (w[42], w[43], offset); + w[52] = hc_bytealign (w[41], w[42], offset); + w[51] = hc_bytealign (w[40], w[41], offset); + w[50] = hc_bytealign (w[39], w[40], offset); + w[49] = hc_bytealign (w[38], w[39], offset); + w[48] = hc_bytealign (w[37], w[38], offset); + w[47] = hc_bytealign (w[36], w[37], offset); + w[46] = hc_bytealign (w[35], w[36], offset); + w[45] = hc_bytealign (w[34], w[35], offset); + w[44] = hc_bytealign (w[33], w[34], offset); + w[43] = hc_bytealign (w[32], w[33], offset); + w[42] = hc_bytealign (w[31], w[32], offset); + w[41] = hc_bytealign (w[30], w[31], offset); + w[40] = hc_bytealign (w[29], w[30], offset); + w[39] = hc_bytealign (w[28], w[29], offset); + w[38] = hc_bytealign (w[27], w[28], offset); + w[37] = hc_bytealign (w[26], w[27], offset); + w[36] = hc_bytealign (w[25], w[26], offset); + w[35] = hc_bytealign (w[24], w[25], offset); + w[34] = hc_bytealign (w[23], w[24], offset); + w[33] = hc_bytealign (w[22], w[23], offset); + w[32] = hc_bytealign (w[21], w[22], offset); + w[31] = hc_bytealign (w[20], w[21], offset); + w[30] = hc_bytealign (w[19], w[20], offset); + w[29] = hc_bytealign (w[18], w[19], offset); + w[28] = hc_bytealign (w[17], w[18], offset); + w[27] = hc_bytealign (w[16], w[17], offset); + w[26] = hc_bytealign (w[15], w[16], offset); + w[25] = hc_bytealign (w[14], w[15], offset); + w[24] = hc_bytealign (w[13], w[14], offset); + w[23] = hc_bytealign (w[12], w[13], offset); + w[22] = hc_bytealign (w[11], w[12], offset); + w[21] = hc_bytealign (w[10], w[11], offset); + w[20] = hc_bytealign (w[ 9], w[10], offset); + w[19] = hc_bytealign (w[ 8], w[ 9], offset); + w[18] = hc_bytealign (w[ 7], w[ 8], offset); + w[17] = hc_bytealign (w[ 6], w[ 7], offset); + w[16] = hc_bytealign (w[ 5], w[ 6], offset); + w[15] = hc_bytealign (w[ 4], w[ 5], offset); + w[14] = hc_bytealign (w[ 3], w[ 4], offset); + w[13] = hc_bytealign (w[ 2], w[ 3], offset); + w[12] = hc_bytealign (w[ 1], w[ 2], offset); + w[11] = hc_bytealign (w[ 0], w[ 1], offset); + w[10] = hc_bytealign ( 0, w[ 0], offset); w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; @@ -12414,59 +12414,59 @@ DECLSPEC void switch_buffer_by_offset_1x64_le (u32x *w, const u32 offset) break; case 11: - w[63] = amd_bytealign (w[51], w[52], offset); - w[62] = amd_bytealign (w[50], w[51], offset); - w[61] = amd_bytealign (w[49], w[50], offset); - w[60] = amd_bytealign (w[48], w[49], offset); - w[59] = amd_bytealign (w[47], w[48], offset); - w[58] = amd_bytealign (w[46], w[47], offset); - w[57] = amd_bytealign (w[45], w[46], offset); - w[56] = amd_bytealign (w[44], w[45], offset); - w[55] = amd_bytealign (w[43], w[44], offset); - w[54] = amd_bytealign (w[42], w[43], offset); - w[53] = amd_bytealign (w[41], w[42], offset); - w[52] = amd_bytealign (w[40], w[41], offset); - w[51] = amd_bytealign (w[39], w[40], offset); - w[50] = amd_bytealign (w[38], w[39], offset); - w[49] = amd_bytealign (w[37], w[38], offset); - w[48] = amd_bytealign (w[36], w[37], offset); - w[47] = amd_bytealign (w[35], w[36], offset); - w[46] = amd_bytealign (w[34], w[35], offset); - w[45] = amd_bytealign (w[33], w[34], offset); - w[44] = amd_bytealign (w[32], w[33], offset); - w[43] = amd_bytealign (w[31], w[32], offset); - w[42] = amd_bytealign (w[30], w[31], offset); - w[41] = amd_bytealign (w[29], w[30], offset); - w[40] = amd_bytealign (w[28], w[29], offset); - w[39] = amd_bytealign (w[27], w[28], offset); - w[38] = amd_bytealign (w[26], w[27], offset); - w[37] = amd_bytealign (w[25], w[26], offset); - w[36] = amd_bytealign (w[24], w[25], offset); - w[35] = amd_bytealign (w[23], w[24], offset); - w[34] = amd_bytealign (w[22], w[23], offset); - w[33] = amd_bytealign (w[21], w[22], offset); - w[32] = amd_bytealign (w[20], w[21], offset); - w[31] = amd_bytealign (w[19], w[20], offset); - w[30] = amd_bytealign (w[18], w[19], offset); - w[29] = amd_bytealign (w[17], w[18], offset); - w[28] = amd_bytealign (w[16], w[17], offset); - w[27] = amd_bytealign (w[15], w[16], offset); - w[26] = amd_bytealign (w[14], w[15], offset); - w[25] = amd_bytealign (w[13], w[14], offset); - w[24] = amd_bytealign (w[12], w[13], offset); - w[23] = amd_bytealign (w[11], w[12], offset); - w[22] = amd_bytealign (w[10], w[11], offset); - w[21] = amd_bytealign (w[ 9], w[10], offset); - w[20] = amd_bytealign (w[ 8], w[ 9], offset); - w[19] = amd_bytealign (w[ 7], w[ 8], offset); - w[18] = amd_bytealign (w[ 6], w[ 7], offset); - w[17] = amd_bytealign (w[ 5], w[ 6], offset); - w[16] = amd_bytealign (w[ 4], w[ 5], offset); - w[15] = amd_bytealign (w[ 3], w[ 4], offset); - w[14] = amd_bytealign (w[ 2], w[ 3], offset); - w[13] = amd_bytealign (w[ 1], w[ 2], offset); - w[12] = amd_bytealign (w[ 0], w[ 1], offset); - w[11] = amd_bytealign ( 0, w[ 0], offset); + w[63] = hc_bytealign (w[51], w[52], offset); + w[62] = hc_bytealign (w[50], w[51], offset); + w[61] = hc_bytealign (w[49], w[50], offset); + w[60] = hc_bytealign (w[48], w[49], offset); + w[59] = hc_bytealign (w[47], w[48], offset); + w[58] = hc_bytealign (w[46], w[47], offset); + w[57] = hc_bytealign (w[45], w[46], offset); + w[56] = hc_bytealign (w[44], w[45], offset); + w[55] = hc_bytealign (w[43], w[44], offset); + w[54] = hc_bytealign (w[42], w[43], offset); + w[53] = hc_bytealign (w[41], w[42], offset); + w[52] = hc_bytealign (w[40], w[41], offset); + w[51] = hc_bytealign (w[39], w[40], offset); + w[50] = hc_bytealign (w[38], w[39], offset); + w[49] = hc_bytealign (w[37], w[38], offset); + w[48] = hc_bytealign (w[36], w[37], offset); + w[47] = hc_bytealign (w[35], w[36], offset); + w[46] = hc_bytealign (w[34], w[35], offset); + w[45] = hc_bytealign (w[33], w[34], offset); + w[44] = hc_bytealign (w[32], w[33], offset); + w[43] = hc_bytealign (w[31], w[32], offset); + w[42] = hc_bytealign (w[30], w[31], offset); + w[41] = hc_bytealign (w[29], w[30], offset); + w[40] = hc_bytealign (w[28], w[29], offset); + w[39] = hc_bytealign (w[27], w[28], offset); + w[38] = hc_bytealign (w[26], w[27], offset); + w[37] = hc_bytealign (w[25], w[26], offset); + w[36] = hc_bytealign (w[24], w[25], offset); + w[35] = hc_bytealign (w[23], w[24], offset); + w[34] = hc_bytealign (w[22], w[23], offset); + w[33] = hc_bytealign (w[21], w[22], offset); + w[32] = hc_bytealign (w[20], w[21], offset); + w[31] = hc_bytealign (w[19], w[20], offset); + w[30] = hc_bytealign (w[18], w[19], offset); + w[29] = hc_bytealign (w[17], w[18], offset); + w[28] = hc_bytealign (w[16], w[17], offset); + w[27] = hc_bytealign (w[15], w[16], offset); + w[26] = hc_bytealign (w[14], w[15], offset); + w[25] = hc_bytealign (w[13], w[14], offset); + w[24] = hc_bytealign (w[12], w[13], offset); + w[23] = hc_bytealign (w[11], w[12], offset); + w[22] = hc_bytealign (w[10], w[11], offset); + w[21] = hc_bytealign (w[ 9], w[10], offset); + w[20] = hc_bytealign (w[ 8], w[ 9], offset); + w[19] = hc_bytealign (w[ 7], w[ 8], offset); + w[18] = hc_bytealign (w[ 6], w[ 7], offset); + w[17] = hc_bytealign (w[ 5], w[ 6], offset); + w[16] = hc_bytealign (w[ 4], w[ 5], offset); + w[15] = hc_bytealign (w[ 3], w[ 4], offset); + w[14] = hc_bytealign (w[ 2], w[ 3], offset); + w[13] = hc_bytealign (w[ 1], w[ 2], offset); + w[12] = hc_bytealign (w[ 0], w[ 1], offset); + w[11] = hc_bytealign ( 0, w[ 0], offset); w[10] = 0; w[ 9] = 0; w[ 8] = 0; @@ -12482,58 +12482,58 @@ DECLSPEC void switch_buffer_by_offset_1x64_le (u32x *w, const u32 offset) break; case 12: - w[63] = amd_bytealign (w[50], w[51], offset); - w[62] = amd_bytealign (w[49], w[50], offset); - w[61] = amd_bytealign (w[48], w[49], offset); - w[60] = amd_bytealign (w[47], w[48], offset); - w[59] = amd_bytealign (w[46], w[47], offset); - w[58] = amd_bytealign (w[45], w[46], offset); - w[57] = amd_bytealign (w[44], w[45], offset); - w[56] = amd_bytealign (w[43], w[44], offset); - w[55] = amd_bytealign (w[42], w[43], offset); - w[54] = amd_bytealign (w[41], w[42], offset); - w[53] = amd_bytealign (w[40], w[41], offset); - w[52] = amd_bytealign (w[39], w[40], offset); - w[51] = amd_bytealign (w[38], w[39], offset); - w[50] = amd_bytealign (w[37], w[38], offset); - w[49] = amd_bytealign (w[36], w[37], offset); - w[48] = amd_bytealign (w[35], w[36], offset); - w[47] = amd_bytealign (w[34], w[35], offset); - w[46] = amd_bytealign (w[33], w[34], offset); - w[45] = amd_bytealign (w[32], w[33], offset); - w[44] = amd_bytealign (w[31], w[32], offset); - w[43] = amd_bytealign (w[30], w[31], offset); - w[42] = amd_bytealign (w[29], w[30], offset); - w[41] = amd_bytealign (w[28], w[29], offset); - w[40] = amd_bytealign (w[27], w[28], offset); - w[39] = amd_bytealign (w[26], w[27], offset); - w[38] = amd_bytealign (w[25], w[26], offset); - w[37] = amd_bytealign (w[24], w[25], offset); - w[36] = amd_bytealign (w[23], w[24], offset); - w[35] = amd_bytealign (w[22], w[23], offset); - w[34] = amd_bytealign (w[21], w[22], offset); - w[33] = amd_bytealign (w[20], w[21], offset); - w[32] = amd_bytealign (w[19], w[20], offset); - w[31] = amd_bytealign (w[18], w[19], offset); - w[30] = amd_bytealign (w[17], w[18], offset); - w[29] = amd_bytealign (w[16], w[17], offset); - w[28] = amd_bytealign (w[15], w[16], offset); - w[27] = amd_bytealign (w[14], w[15], offset); - w[26] = amd_bytealign (w[13], w[14], offset); - w[25] = amd_bytealign (w[12], w[13], offset); - w[24] = amd_bytealign (w[11], w[12], offset); - w[23] = amd_bytealign (w[10], w[11], offset); - w[22] = amd_bytealign (w[ 9], w[10], offset); - w[21] = amd_bytealign (w[ 8], w[ 9], offset); - w[20] = amd_bytealign (w[ 7], w[ 8], offset); - w[19] = amd_bytealign (w[ 6], w[ 7], offset); - w[18] = amd_bytealign (w[ 5], w[ 6], offset); - w[17] = amd_bytealign (w[ 4], w[ 5], offset); - w[16] = amd_bytealign (w[ 3], w[ 4], offset); - w[15] = amd_bytealign (w[ 2], w[ 3], offset); - w[14] = amd_bytealign (w[ 1], w[ 2], offset); - w[13] = amd_bytealign (w[ 0], w[ 1], offset); - w[12] = amd_bytealign ( 0, w[ 0], offset); + w[63] = hc_bytealign (w[50], w[51], offset); + w[62] = hc_bytealign (w[49], w[50], offset); + w[61] = hc_bytealign (w[48], w[49], offset); + w[60] = hc_bytealign (w[47], w[48], offset); + w[59] = hc_bytealign (w[46], w[47], offset); + w[58] = hc_bytealign (w[45], w[46], offset); + w[57] = hc_bytealign (w[44], w[45], offset); + w[56] = hc_bytealign (w[43], w[44], offset); + w[55] = hc_bytealign (w[42], w[43], offset); + w[54] = hc_bytealign (w[41], w[42], offset); + w[53] = hc_bytealign (w[40], w[41], offset); + w[52] = hc_bytealign (w[39], w[40], offset); + w[51] = hc_bytealign (w[38], w[39], offset); + w[50] = hc_bytealign (w[37], w[38], offset); + w[49] = hc_bytealign (w[36], w[37], offset); + w[48] = hc_bytealign (w[35], w[36], offset); + w[47] = hc_bytealign (w[34], w[35], offset); + w[46] = hc_bytealign (w[33], w[34], offset); + w[45] = hc_bytealign (w[32], w[33], offset); + w[44] = hc_bytealign (w[31], w[32], offset); + w[43] = hc_bytealign (w[30], w[31], offset); + w[42] = hc_bytealign (w[29], w[30], offset); + w[41] = hc_bytealign (w[28], w[29], offset); + w[40] = hc_bytealign (w[27], w[28], offset); + w[39] = hc_bytealign (w[26], w[27], offset); + w[38] = hc_bytealign (w[25], w[26], offset); + w[37] = hc_bytealign (w[24], w[25], offset); + w[36] = hc_bytealign (w[23], w[24], offset); + w[35] = hc_bytealign (w[22], w[23], offset); + w[34] = hc_bytealign (w[21], w[22], offset); + w[33] = hc_bytealign (w[20], w[21], offset); + w[32] = hc_bytealign (w[19], w[20], offset); + w[31] = hc_bytealign (w[18], w[19], offset); + w[30] = hc_bytealign (w[17], w[18], offset); + w[29] = hc_bytealign (w[16], w[17], offset); + w[28] = hc_bytealign (w[15], w[16], offset); + w[27] = hc_bytealign (w[14], w[15], offset); + w[26] = hc_bytealign (w[13], w[14], offset); + w[25] = hc_bytealign (w[12], w[13], offset); + w[24] = hc_bytealign (w[11], w[12], offset); + w[23] = hc_bytealign (w[10], w[11], offset); + w[22] = hc_bytealign (w[ 9], w[10], offset); + w[21] = hc_bytealign (w[ 8], w[ 9], offset); + w[20] = hc_bytealign (w[ 7], w[ 8], offset); + w[19] = hc_bytealign (w[ 6], w[ 7], offset); + w[18] = hc_bytealign (w[ 5], w[ 6], offset); + w[17] = hc_bytealign (w[ 4], w[ 5], offset); + w[16] = hc_bytealign (w[ 3], w[ 4], offset); + w[15] = hc_bytealign (w[ 2], w[ 3], offset); + w[14] = hc_bytealign (w[ 1], w[ 2], offset); + w[13] = hc_bytealign (w[ 0], w[ 1], offset); + w[12] = hc_bytealign ( 0, w[ 0], offset); w[11] = 0; w[10] = 0; w[ 9] = 0; @@ -12550,57 +12550,57 @@ DECLSPEC void switch_buffer_by_offset_1x64_le (u32x *w, const u32 offset) break; case 13: - w[63] = amd_bytealign (w[49], w[50], offset); - w[62] = amd_bytealign (w[48], w[49], offset); - w[61] = amd_bytealign (w[47], w[48], offset); - w[60] = amd_bytealign (w[46], w[47], offset); - w[59] = amd_bytealign (w[45], w[46], offset); - w[58] = amd_bytealign (w[44], w[45], offset); - w[57] = amd_bytealign (w[43], w[44], offset); - w[56] = amd_bytealign (w[42], w[43], offset); - w[55] = amd_bytealign (w[41], w[42], offset); - w[54] = amd_bytealign (w[40], w[41], offset); - w[53] = amd_bytealign (w[39], w[40], offset); - w[52] = amd_bytealign (w[38], w[39], offset); - w[51] = amd_bytealign (w[37], w[38], offset); - w[50] = amd_bytealign (w[36], w[37], offset); - w[49] = amd_bytealign (w[35], w[36], offset); - w[48] = amd_bytealign (w[34], w[35], offset); - w[47] = amd_bytealign (w[33], w[34], offset); - w[46] = amd_bytealign (w[32], w[33], offset); - w[45] = amd_bytealign (w[31], w[32], offset); - w[44] = amd_bytealign (w[30], w[31], offset); - w[43] = amd_bytealign (w[29], w[30], offset); - w[42] = amd_bytealign (w[28], w[29], offset); - w[41] = amd_bytealign (w[27], w[28], offset); - w[40] = amd_bytealign (w[26], w[27], offset); - w[39] = amd_bytealign (w[25], w[26], offset); - w[38] = amd_bytealign (w[24], w[25], offset); - w[37] = amd_bytealign (w[23], w[24], offset); - w[36] = amd_bytealign (w[22], w[23], offset); - w[35] = amd_bytealign (w[21], w[22], offset); - w[34] = amd_bytealign (w[20], w[21], offset); - w[33] = amd_bytealign (w[19], w[20], offset); - w[32] = amd_bytealign (w[18], w[19], offset); - w[31] = amd_bytealign (w[17], w[18], offset); - w[30] = amd_bytealign (w[16], w[17], offset); - w[29] = amd_bytealign (w[15], w[16], offset); - w[28] = amd_bytealign (w[14], w[15], offset); - w[27] = amd_bytealign (w[13], w[14], offset); - w[26] = amd_bytealign (w[12], w[13], offset); - w[25] = amd_bytealign (w[11], w[12], offset); - w[24] = amd_bytealign (w[10], w[11], offset); - w[23] = amd_bytealign (w[ 9], w[10], offset); - w[22] = amd_bytealign (w[ 8], w[ 9], offset); - w[21] = amd_bytealign (w[ 7], w[ 8], offset); - w[20] = amd_bytealign (w[ 6], w[ 7], offset); - w[19] = amd_bytealign (w[ 5], w[ 6], offset); - w[18] = amd_bytealign (w[ 4], w[ 5], offset); - w[17] = amd_bytealign (w[ 3], w[ 4], offset); - w[16] = amd_bytealign (w[ 2], w[ 3], offset); - w[15] = amd_bytealign (w[ 1], w[ 2], offset); - w[14] = amd_bytealign (w[ 0], w[ 1], offset); - w[13] = amd_bytealign ( 0, w[ 0], offset); + w[63] = hc_bytealign (w[49], w[50], offset); + w[62] = hc_bytealign (w[48], w[49], offset); + w[61] = hc_bytealign (w[47], w[48], offset); + w[60] = hc_bytealign (w[46], w[47], offset); + w[59] = hc_bytealign (w[45], w[46], offset); + w[58] = hc_bytealign (w[44], w[45], offset); + w[57] = hc_bytealign (w[43], w[44], offset); + w[56] = hc_bytealign (w[42], w[43], offset); + w[55] = hc_bytealign (w[41], w[42], offset); + w[54] = hc_bytealign (w[40], w[41], offset); + w[53] = hc_bytealign (w[39], w[40], offset); + w[52] = hc_bytealign (w[38], w[39], offset); + w[51] = hc_bytealign (w[37], w[38], offset); + w[50] = hc_bytealign (w[36], w[37], offset); + w[49] = hc_bytealign (w[35], w[36], offset); + w[48] = hc_bytealign (w[34], w[35], offset); + w[47] = hc_bytealign (w[33], w[34], offset); + w[46] = hc_bytealign (w[32], w[33], offset); + w[45] = hc_bytealign (w[31], w[32], offset); + w[44] = hc_bytealign (w[30], w[31], offset); + w[43] = hc_bytealign (w[29], w[30], offset); + w[42] = hc_bytealign (w[28], w[29], offset); + w[41] = hc_bytealign (w[27], w[28], offset); + w[40] = hc_bytealign (w[26], w[27], offset); + w[39] = hc_bytealign (w[25], w[26], offset); + w[38] = hc_bytealign (w[24], w[25], offset); + w[37] = hc_bytealign (w[23], w[24], offset); + w[36] = hc_bytealign (w[22], w[23], offset); + w[35] = hc_bytealign (w[21], w[22], offset); + w[34] = hc_bytealign (w[20], w[21], offset); + w[33] = hc_bytealign (w[19], w[20], offset); + w[32] = hc_bytealign (w[18], w[19], offset); + w[31] = hc_bytealign (w[17], w[18], offset); + w[30] = hc_bytealign (w[16], w[17], offset); + w[29] = hc_bytealign (w[15], w[16], offset); + w[28] = hc_bytealign (w[14], w[15], offset); + w[27] = hc_bytealign (w[13], w[14], offset); + w[26] = hc_bytealign (w[12], w[13], offset); + w[25] = hc_bytealign (w[11], w[12], offset); + w[24] = hc_bytealign (w[10], w[11], offset); + w[23] = hc_bytealign (w[ 9], w[10], offset); + w[22] = hc_bytealign (w[ 8], w[ 9], offset); + w[21] = hc_bytealign (w[ 7], w[ 8], offset); + w[20] = hc_bytealign (w[ 6], w[ 7], offset); + w[19] = hc_bytealign (w[ 5], w[ 6], offset); + w[18] = hc_bytealign (w[ 4], w[ 5], offset); + w[17] = hc_bytealign (w[ 3], w[ 4], offset); + w[16] = hc_bytealign (w[ 2], w[ 3], offset); + w[15] = hc_bytealign (w[ 1], w[ 2], offset); + w[14] = hc_bytealign (w[ 0], w[ 1], offset); + w[13] = hc_bytealign ( 0, w[ 0], offset); w[12] = 0; w[11] = 0; w[10] = 0; @@ -12618,56 +12618,56 @@ DECLSPEC void switch_buffer_by_offset_1x64_le (u32x *w, const u32 offset) break; case 14: - w[63] = amd_bytealign (w[48], w[49], offset); - w[62] = amd_bytealign (w[47], w[48], offset); - w[61] = amd_bytealign (w[46], w[47], offset); - w[60] = amd_bytealign (w[45], w[46], offset); - w[59] = amd_bytealign (w[44], w[45], offset); - w[58] = amd_bytealign (w[43], w[44], offset); - w[57] = amd_bytealign (w[42], w[43], offset); - w[56] = amd_bytealign (w[41], w[42], offset); - w[55] = amd_bytealign (w[40], w[41], offset); - w[54] = amd_bytealign (w[39], w[40], offset); - w[53] = amd_bytealign (w[38], w[39], offset); - w[52] = amd_bytealign (w[37], w[38], offset); - w[51] = amd_bytealign (w[36], w[37], offset); - w[50] = amd_bytealign (w[35], w[36], offset); - w[49] = amd_bytealign (w[34], w[35], offset); - w[48] = amd_bytealign (w[33], w[34], offset); - w[47] = amd_bytealign (w[32], w[33], offset); - w[46] = amd_bytealign (w[31], w[32], offset); - w[45] = amd_bytealign (w[30], w[31], offset); - w[44] = amd_bytealign (w[29], w[30], offset); - w[43] = amd_bytealign (w[28], w[29], offset); - w[42] = amd_bytealign (w[27], w[28], offset); - w[41] = amd_bytealign (w[26], w[27], offset); - w[40] = amd_bytealign (w[25], w[26], offset); - w[39] = amd_bytealign (w[24], w[25], offset); - w[38] = amd_bytealign (w[23], w[24], offset); - w[37] = amd_bytealign (w[22], w[23], offset); - w[36] = amd_bytealign (w[21], w[22], offset); - w[35] = amd_bytealign (w[20], w[21], offset); - w[34] = amd_bytealign (w[19], w[20], offset); - w[33] = amd_bytealign (w[18], w[19], offset); - w[32] = amd_bytealign (w[17], w[18], offset); - w[31] = amd_bytealign (w[16], w[17], offset); - w[30] = amd_bytealign (w[15], w[16], offset); - w[29] = amd_bytealign (w[14], w[15], offset); - w[28] = amd_bytealign (w[13], w[14], offset); - w[27] = amd_bytealign (w[12], w[13], offset); - w[26] = amd_bytealign (w[11], w[12], offset); - w[25] = amd_bytealign (w[10], w[11], offset); - w[24] = amd_bytealign (w[ 9], w[10], offset); - w[23] = amd_bytealign (w[ 8], w[ 9], offset); - w[22] = amd_bytealign (w[ 7], w[ 8], offset); - w[21] = amd_bytealign (w[ 6], w[ 7], offset); - w[20] = amd_bytealign (w[ 5], w[ 6], offset); - w[19] = amd_bytealign (w[ 4], w[ 5], offset); - w[18] = amd_bytealign (w[ 3], w[ 4], offset); - w[17] = amd_bytealign (w[ 2], w[ 3], offset); - w[16] = amd_bytealign (w[ 1], w[ 2], offset); - w[15] = amd_bytealign (w[ 0], w[ 1], offset); - w[14] = amd_bytealign ( 0, w[ 0], offset); + w[63] = hc_bytealign (w[48], w[49], offset); + w[62] = hc_bytealign (w[47], w[48], offset); + w[61] = hc_bytealign (w[46], w[47], offset); + w[60] = hc_bytealign (w[45], w[46], offset); + w[59] = hc_bytealign (w[44], w[45], offset); + w[58] = hc_bytealign (w[43], w[44], offset); + w[57] = hc_bytealign (w[42], w[43], offset); + w[56] = hc_bytealign (w[41], w[42], offset); + w[55] = hc_bytealign (w[40], w[41], offset); + w[54] = hc_bytealign (w[39], w[40], offset); + w[53] = hc_bytealign (w[38], w[39], offset); + w[52] = hc_bytealign (w[37], w[38], offset); + w[51] = hc_bytealign (w[36], w[37], offset); + w[50] = hc_bytealign (w[35], w[36], offset); + w[49] = hc_bytealign (w[34], w[35], offset); + w[48] = hc_bytealign (w[33], w[34], offset); + w[47] = hc_bytealign (w[32], w[33], offset); + w[46] = hc_bytealign (w[31], w[32], offset); + w[45] = hc_bytealign (w[30], w[31], offset); + w[44] = hc_bytealign (w[29], w[30], offset); + w[43] = hc_bytealign (w[28], w[29], offset); + w[42] = hc_bytealign (w[27], w[28], offset); + w[41] = hc_bytealign (w[26], w[27], offset); + w[40] = hc_bytealign (w[25], w[26], offset); + w[39] = hc_bytealign (w[24], w[25], offset); + w[38] = hc_bytealign (w[23], w[24], offset); + w[37] = hc_bytealign (w[22], w[23], offset); + w[36] = hc_bytealign (w[21], w[22], offset); + w[35] = hc_bytealign (w[20], w[21], offset); + w[34] = hc_bytealign (w[19], w[20], offset); + w[33] = hc_bytealign (w[18], w[19], offset); + w[32] = hc_bytealign (w[17], w[18], offset); + w[31] = hc_bytealign (w[16], w[17], offset); + w[30] = hc_bytealign (w[15], w[16], offset); + w[29] = hc_bytealign (w[14], w[15], offset); + w[28] = hc_bytealign (w[13], w[14], offset); + w[27] = hc_bytealign (w[12], w[13], offset); + w[26] = hc_bytealign (w[11], w[12], offset); + w[25] = hc_bytealign (w[10], w[11], offset); + w[24] = hc_bytealign (w[ 9], w[10], offset); + w[23] = hc_bytealign (w[ 8], w[ 9], offset); + w[22] = hc_bytealign (w[ 7], w[ 8], offset); + w[21] = hc_bytealign (w[ 6], w[ 7], offset); + w[20] = hc_bytealign (w[ 5], w[ 6], offset); + w[19] = hc_bytealign (w[ 4], w[ 5], offset); + w[18] = hc_bytealign (w[ 3], w[ 4], offset); + w[17] = hc_bytealign (w[ 2], w[ 3], offset); + w[16] = hc_bytealign (w[ 1], w[ 2], offset); + w[15] = hc_bytealign (w[ 0], w[ 1], offset); + w[14] = hc_bytealign ( 0, w[ 0], offset); w[13] = 0; w[12] = 0; w[11] = 0; @@ -12686,55 +12686,55 @@ DECLSPEC void switch_buffer_by_offset_1x64_le (u32x *w, const u32 offset) break; case 15: - w[63] = amd_bytealign (w[47], w[48], offset); - w[62] = amd_bytealign (w[46], w[47], offset); - w[61] = amd_bytealign (w[45], w[46], offset); - w[60] = amd_bytealign (w[44], w[45], offset); - w[59] = amd_bytealign (w[43], w[44], offset); - w[58] = amd_bytealign (w[42], w[43], offset); - w[57] = amd_bytealign (w[41], w[42], offset); - w[56] = amd_bytealign (w[40], w[41], offset); - w[55] = amd_bytealign (w[39], w[40], offset); - w[54] = amd_bytealign (w[38], w[39], offset); - w[53] = amd_bytealign (w[37], w[38], offset); - w[52] = amd_bytealign (w[36], w[37], offset); - w[51] = amd_bytealign (w[35], w[36], offset); - w[50] = amd_bytealign (w[34], w[35], offset); - w[49] = amd_bytealign (w[33], w[34], offset); - w[48] = amd_bytealign (w[32], w[33], offset); - w[47] = amd_bytealign (w[31], w[32], offset); - w[46] = amd_bytealign (w[30], w[31], offset); - w[45] = amd_bytealign (w[29], w[30], offset); - w[44] = amd_bytealign (w[28], w[29], offset); - w[43] = amd_bytealign (w[27], w[28], offset); - w[42] = amd_bytealign (w[26], w[27], offset); - w[41] = amd_bytealign (w[25], w[26], offset); - w[40] = amd_bytealign (w[24], w[25], offset); - w[39] = amd_bytealign (w[23], w[24], offset); - w[38] = amd_bytealign (w[22], w[23], offset); - w[37] = amd_bytealign (w[21], w[22], offset); - w[36] = amd_bytealign (w[20], w[21], offset); - w[35] = amd_bytealign (w[19], w[20], offset); - w[34] = amd_bytealign (w[18], w[19], offset); - w[33] = amd_bytealign (w[17], w[18], offset); - w[32] = amd_bytealign (w[16], w[17], offset); - w[31] = amd_bytealign (w[15], w[16], offset); - w[30] = amd_bytealign (w[14], w[15], offset); - w[29] = amd_bytealign (w[13], w[14], offset); - w[28] = amd_bytealign (w[12], w[13], offset); - w[27] = amd_bytealign (w[11], w[12], offset); - w[26] = amd_bytealign (w[10], w[11], offset); - w[25] = amd_bytealign (w[ 9], w[10], offset); - w[24] = amd_bytealign (w[ 8], w[ 9], offset); - w[23] = amd_bytealign (w[ 7], w[ 8], offset); - w[22] = amd_bytealign (w[ 6], w[ 7], offset); - w[21] = amd_bytealign (w[ 5], w[ 6], offset); - w[20] = amd_bytealign (w[ 4], w[ 5], offset); - w[19] = amd_bytealign (w[ 3], w[ 4], offset); - w[18] = amd_bytealign (w[ 2], w[ 3], offset); - w[17] = amd_bytealign (w[ 1], w[ 2], offset); - w[16] = amd_bytealign (w[ 0], w[ 1], offset); - w[15] = amd_bytealign ( 0, w[ 0], offset); + w[63] = hc_bytealign (w[47], w[48], offset); + w[62] = hc_bytealign (w[46], w[47], offset); + w[61] = hc_bytealign (w[45], w[46], offset); + w[60] = hc_bytealign (w[44], w[45], offset); + w[59] = hc_bytealign (w[43], w[44], offset); + w[58] = hc_bytealign (w[42], w[43], offset); + w[57] = hc_bytealign (w[41], w[42], offset); + w[56] = hc_bytealign (w[40], w[41], offset); + w[55] = hc_bytealign (w[39], w[40], offset); + w[54] = hc_bytealign (w[38], w[39], offset); + w[53] = hc_bytealign (w[37], w[38], offset); + w[52] = hc_bytealign (w[36], w[37], offset); + w[51] = hc_bytealign (w[35], w[36], offset); + w[50] = hc_bytealign (w[34], w[35], offset); + w[49] = hc_bytealign (w[33], w[34], offset); + w[48] = hc_bytealign (w[32], w[33], offset); + w[47] = hc_bytealign (w[31], w[32], offset); + w[46] = hc_bytealign (w[30], w[31], offset); + w[45] = hc_bytealign (w[29], w[30], offset); + w[44] = hc_bytealign (w[28], w[29], offset); + w[43] = hc_bytealign (w[27], w[28], offset); + w[42] = hc_bytealign (w[26], w[27], offset); + w[41] = hc_bytealign (w[25], w[26], offset); + w[40] = hc_bytealign (w[24], w[25], offset); + w[39] = hc_bytealign (w[23], w[24], offset); + w[38] = hc_bytealign (w[22], w[23], offset); + w[37] = hc_bytealign (w[21], w[22], offset); + w[36] = hc_bytealign (w[20], w[21], offset); + w[35] = hc_bytealign (w[19], w[20], offset); + w[34] = hc_bytealign (w[18], w[19], offset); + w[33] = hc_bytealign (w[17], w[18], offset); + w[32] = hc_bytealign (w[16], w[17], offset); + w[31] = hc_bytealign (w[15], w[16], offset); + w[30] = hc_bytealign (w[14], w[15], offset); + w[29] = hc_bytealign (w[13], w[14], offset); + w[28] = hc_bytealign (w[12], w[13], offset); + w[27] = hc_bytealign (w[11], w[12], offset); + w[26] = hc_bytealign (w[10], w[11], offset); + w[25] = hc_bytealign (w[ 9], w[10], offset); + w[24] = hc_bytealign (w[ 8], w[ 9], offset); + w[23] = hc_bytealign (w[ 7], w[ 8], offset); + w[22] = hc_bytealign (w[ 6], w[ 7], offset); + w[21] = hc_bytealign (w[ 5], w[ 6], offset); + w[20] = hc_bytealign (w[ 4], w[ 5], offset); + w[19] = hc_bytealign (w[ 3], w[ 4], offset); + w[18] = hc_bytealign (w[ 2], w[ 3], offset); + w[17] = hc_bytealign (w[ 1], w[ 2], offset); + w[16] = hc_bytealign (w[ 0], w[ 1], offset); + w[15] = hc_bytealign ( 0, w[ 0], offset); w[14] = 0; w[13] = 0; w[12] = 0; @@ -12754,54 +12754,54 @@ DECLSPEC void switch_buffer_by_offset_1x64_le (u32x *w, const u32 offset) break; case 16: - w[63] = amd_bytealign (w[46], w[47], offset); - w[62] = amd_bytealign (w[45], w[46], offset); - w[61] = amd_bytealign (w[44], w[45], offset); - w[60] = amd_bytealign (w[43], w[44], offset); - w[59] = amd_bytealign (w[42], w[43], offset); - w[58] = amd_bytealign (w[41], w[42], offset); - w[57] = amd_bytealign (w[40], w[41], offset); - w[56] = amd_bytealign (w[39], w[40], offset); - w[55] = amd_bytealign (w[38], w[39], offset); - w[54] = amd_bytealign (w[37], w[38], offset); - w[53] = amd_bytealign (w[36], w[37], offset); - w[52] = amd_bytealign (w[35], w[36], offset); - w[51] = amd_bytealign (w[34], w[35], offset); - w[50] = amd_bytealign (w[33], w[34], offset); - w[49] = amd_bytealign (w[32], w[33], offset); - w[48] = amd_bytealign (w[31], w[32], offset); - w[47] = amd_bytealign (w[30], w[31], offset); - w[46] = amd_bytealign (w[29], w[30], offset); - w[45] = amd_bytealign (w[28], w[29], offset); - w[44] = amd_bytealign (w[27], w[28], offset); - w[43] = amd_bytealign (w[26], w[27], offset); - w[42] = amd_bytealign (w[25], w[26], offset); - w[41] = amd_bytealign (w[24], w[25], offset); - w[40] = amd_bytealign (w[23], w[24], offset); - w[39] = amd_bytealign (w[22], w[23], offset); - w[38] = amd_bytealign (w[21], w[22], offset); - w[37] = amd_bytealign (w[20], w[21], offset); - w[36] = amd_bytealign (w[19], w[20], offset); - w[35] = amd_bytealign (w[18], w[19], offset); - w[34] = amd_bytealign (w[17], w[18], offset); - w[33] = amd_bytealign (w[16], w[17], offset); - w[32] = amd_bytealign (w[15], w[16], offset); - w[31] = amd_bytealign (w[14], w[15], offset); - w[30] = amd_bytealign (w[13], w[14], offset); - w[29] = amd_bytealign (w[12], w[13], offset); - w[28] = amd_bytealign (w[11], w[12], offset); - w[27] = amd_bytealign (w[10], w[11], offset); - w[26] = amd_bytealign (w[ 9], w[10], offset); - w[25] = amd_bytealign (w[ 8], w[ 9], offset); - w[24] = amd_bytealign (w[ 7], w[ 8], offset); - w[23] = amd_bytealign (w[ 6], w[ 7], offset); - w[22] = amd_bytealign (w[ 5], w[ 6], offset); - w[21] = amd_bytealign (w[ 4], w[ 5], offset); - w[20] = amd_bytealign (w[ 3], w[ 4], offset); - w[19] = amd_bytealign (w[ 2], w[ 3], offset); - w[18] = amd_bytealign (w[ 1], w[ 2], offset); - w[17] = amd_bytealign (w[ 0], w[ 1], offset); - w[16] = amd_bytealign ( 0, w[ 0], offset); + w[63] = hc_bytealign (w[46], w[47], offset); + w[62] = hc_bytealign (w[45], w[46], offset); + w[61] = hc_bytealign (w[44], w[45], offset); + w[60] = hc_bytealign (w[43], w[44], offset); + w[59] = hc_bytealign (w[42], w[43], offset); + w[58] = hc_bytealign (w[41], w[42], offset); + w[57] = hc_bytealign (w[40], w[41], offset); + w[56] = hc_bytealign (w[39], w[40], offset); + w[55] = hc_bytealign (w[38], w[39], offset); + w[54] = hc_bytealign (w[37], w[38], offset); + w[53] = hc_bytealign (w[36], w[37], offset); + w[52] = hc_bytealign (w[35], w[36], offset); + w[51] = hc_bytealign (w[34], w[35], offset); + w[50] = hc_bytealign (w[33], w[34], offset); + w[49] = hc_bytealign (w[32], w[33], offset); + w[48] = hc_bytealign (w[31], w[32], offset); + w[47] = hc_bytealign (w[30], w[31], offset); + w[46] = hc_bytealign (w[29], w[30], offset); + w[45] = hc_bytealign (w[28], w[29], offset); + w[44] = hc_bytealign (w[27], w[28], offset); + w[43] = hc_bytealign (w[26], w[27], offset); + w[42] = hc_bytealign (w[25], w[26], offset); + w[41] = hc_bytealign (w[24], w[25], offset); + w[40] = hc_bytealign (w[23], w[24], offset); + w[39] = hc_bytealign (w[22], w[23], offset); + w[38] = hc_bytealign (w[21], w[22], offset); + w[37] = hc_bytealign (w[20], w[21], offset); + w[36] = hc_bytealign (w[19], w[20], offset); + w[35] = hc_bytealign (w[18], w[19], offset); + w[34] = hc_bytealign (w[17], w[18], offset); + w[33] = hc_bytealign (w[16], w[17], offset); + w[32] = hc_bytealign (w[15], w[16], offset); + w[31] = hc_bytealign (w[14], w[15], offset); + w[30] = hc_bytealign (w[13], w[14], offset); + w[29] = hc_bytealign (w[12], w[13], offset); + w[28] = hc_bytealign (w[11], w[12], offset); + w[27] = hc_bytealign (w[10], w[11], offset); + w[26] = hc_bytealign (w[ 9], w[10], offset); + w[25] = hc_bytealign (w[ 8], w[ 9], offset); + w[24] = hc_bytealign (w[ 7], w[ 8], offset); + w[23] = hc_bytealign (w[ 6], w[ 7], offset); + w[22] = hc_bytealign (w[ 5], w[ 6], offset); + w[21] = hc_bytealign (w[ 4], w[ 5], offset); + w[20] = hc_bytealign (w[ 3], w[ 4], offset); + w[19] = hc_bytealign (w[ 2], w[ 3], offset); + w[18] = hc_bytealign (w[ 1], w[ 2], offset); + w[17] = hc_bytealign (w[ 0], w[ 1], offset); + w[16] = hc_bytealign ( 0, w[ 0], offset); w[15] = 0; w[14] = 0; w[13] = 0; @@ -12822,53 +12822,53 @@ DECLSPEC void switch_buffer_by_offset_1x64_le (u32x *w, const u32 offset) break; case 17: - w[63] = amd_bytealign (w[45], w[46], offset); - w[62] = amd_bytealign (w[44], w[45], offset); - w[61] = amd_bytealign (w[43], w[44], offset); - w[60] = amd_bytealign (w[42], w[43], offset); - w[59] = amd_bytealign (w[41], w[42], offset); - w[58] = amd_bytealign (w[40], w[41], offset); - w[57] = amd_bytealign (w[39], w[40], offset); - w[56] = amd_bytealign (w[38], w[39], offset); - w[55] = amd_bytealign (w[37], w[38], offset); - w[54] = amd_bytealign (w[36], w[37], offset); - w[53] = amd_bytealign (w[35], w[36], offset); - w[52] = amd_bytealign (w[34], w[35], offset); - w[51] = amd_bytealign (w[33], w[34], offset); - w[50] = amd_bytealign (w[32], w[33], offset); - w[49] = amd_bytealign (w[31], w[32], offset); - w[48] = amd_bytealign (w[30], w[31], offset); - w[47] = amd_bytealign (w[29], w[30], offset); - w[46] = amd_bytealign (w[28], w[29], offset); - w[45] = amd_bytealign (w[27], w[28], offset); - w[44] = amd_bytealign (w[26], w[27], offset); - w[43] = amd_bytealign (w[25], w[26], offset); - w[42] = amd_bytealign (w[24], w[25], offset); - w[41] = amd_bytealign (w[23], w[24], offset); - w[40] = amd_bytealign (w[22], w[23], offset); - w[39] = amd_bytealign (w[21], w[22], offset); - w[38] = amd_bytealign (w[20], w[21], offset); - w[37] = amd_bytealign (w[19], w[20], offset); - w[36] = amd_bytealign (w[18], w[19], offset); - w[35] = amd_bytealign (w[17], w[18], offset); - w[34] = amd_bytealign (w[16], w[17], offset); - w[33] = amd_bytealign (w[15], w[16], offset); - w[32] = amd_bytealign (w[14], w[15], offset); - w[31] = amd_bytealign (w[13], w[14], offset); - w[30] = amd_bytealign (w[12], w[13], offset); - w[29] = amd_bytealign (w[11], w[12], offset); - w[28] = amd_bytealign (w[10], w[11], offset); - w[27] = amd_bytealign (w[ 9], w[10], offset); - w[26] = amd_bytealign (w[ 8], w[ 9], offset); - w[25] = amd_bytealign (w[ 7], w[ 8], offset); - w[24] = amd_bytealign (w[ 6], w[ 7], offset); - w[23] = amd_bytealign (w[ 5], w[ 6], offset); - w[22] = amd_bytealign (w[ 4], w[ 5], offset); - w[21] = amd_bytealign (w[ 3], w[ 4], offset); - w[20] = amd_bytealign (w[ 2], w[ 3], offset); - w[19] = amd_bytealign (w[ 1], w[ 2], offset); - w[18] = amd_bytealign (w[ 0], w[ 1], offset); - w[17] = amd_bytealign ( 0, w[ 0], offset); + w[63] = hc_bytealign (w[45], w[46], offset); + w[62] = hc_bytealign (w[44], w[45], offset); + w[61] = hc_bytealign (w[43], w[44], offset); + w[60] = hc_bytealign (w[42], w[43], offset); + w[59] = hc_bytealign (w[41], w[42], offset); + w[58] = hc_bytealign (w[40], w[41], offset); + w[57] = hc_bytealign (w[39], w[40], offset); + w[56] = hc_bytealign (w[38], w[39], offset); + w[55] = hc_bytealign (w[37], w[38], offset); + w[54] = hc_bytealign (w[36], w[37], offset); + w[53] = hc_bytealign (w[35], w[36], offset); + w[52] = hc_bytealign (w[34], w[35], offset); + w[51] = hc_bytealign (w[33], w[34], offset); + w[50] = hc_bytealign (w[32], w[33], offset); + w[49] = hc_bytealign (w[31], w[32], offset); + w[48] = hc_bytealign (w[30], w[31], offset); + w[47] = hc_bytealign (w[29], w[30], offset); + w[46] = hc_bytealign (w[28], w[29], offset); + w[45] = hc_bytealign (w[27], w[28], offset); + w[44] = hc_bytealign (w[26], w[27], offset); + w[43] = hc_bytealign (w[25], w[26], offset); + w[42] = hc_bytealign (w[24], w[25], offset); + w[41] = hc_bytealign (w[23], w[24], offset); + w[40] = hc_bytealign (w[22], w[23], offset); + w[39] = hc_bytealign (w[21], w[22], offset); + w[38] = hc_bytealign (w[20], w[21], offset); + w[37] = hc_bytealign (w[19], w[20], offset); + w[36] = hc_bytealign (w[18], w[19], offset); + w[35] = hc_bytealign (w[17], w[18], offset); + w[34] = hc_bytealign (w[16], w[17], offset); + w[33] = hc_bytealign (w[15], w[16], offset); + w[32] = hc_bytealign (w[14], w[15], offset); + w[31] = hc_bytealign (w[13], w[14], offset); + w[30] = hc_bytealign (w[12], w[13], offset); + w[29] = hc_bytealign (w[11], w[12], offset); + w[28] = hc_bytealign (w[10], w[11], offset); + w[27] = hc_bytealign (w[ 9], w[10], offset); + w[26] = hc_bytealign (w[ 8], w[ 9], offset); + w[25] = hc_bytealign (w[ 7], w[ 8], offset); + w[24] = hc_bytealign (w[ 6], w[ 7], offset); + w[23] = hc_bytealign (w[ 5], w[ 6], offset); + w[22] = hc_bytealign (w[ 4], w[ 5], offset); + w[21] = hc_bytealign (w[ 3], w[ 4], offset); + w[20] = hc_bytealign (w[ 2], w[ 3], offset); + w[19] = hc_bytealign (w[ 1], w[ 2], offset); + w[18] = hc_bytealign (w[ 0], w[ 1], offset); + w[17] = hc_bytealign ( 0, w[ 0], offset); w[16] = 0; w[15] = 0; w[14] = 0; @@ -12890,52 +12890,52 @@ DECLSPEC void switch_buffer_by_offset_1x64_le (u32x *w, const u32 offset) break; case 18: - w[63] = amd_bytealign (w[44], w[45], offset); - w[62] = amd_bytealign (w[43], w[44], offset); - w[61] = amd_bytealign (w[42], w[43], offset); - w[60] = amd_bytealign (w[41], w[42], offset); - w[59] = amd_bytealign (w[40], w[41], offset); - w[58] = amd_bytealign (w[39], w[40], offset); - w[57] = amd_bytealign (w[38], w[39], offset); - w[56] = amd_bytealign (w[37], w[38], offset); - w[55] = amd_bytealign (w[36], w[37], offset); - w[54] = amd_bytealign (w[35], w[36], offset); - w[53] = amd_bytealign (w[34], w[35], offset); - w[52] = amd_bytealign (w[33], w[34], offset); - w[51] = amd_bytealign (w[32], w[33], offset); - w[50] = amd_bytealign (w[31], w[32], offset); - w[49] = amd_bytealign (w[30], w[31], offset); - w[48] = amd_bytealign (w[29], w[30], offset); - w[47] = amd_bytealign (w[28], w[29], offset); - w[46] = amd_bytealign (w[27], w[28], offset); - w[45] = amd_bytealign (w[26], w[27], offset); - w[44] = amd_bytealign (w[25], w[26], offset); - w[43] = amd_bytealign (w[24], w[25], offset); - w[42] = amd_bytealign (w[23], w[24], offset); - w[41] = amd_bytealign (w[22], w[23], offset); - w[40] = amd_bytealign (w[21], w[22], offset); - w[39] = amd_bytealign (w[20], w[21], offset); - w[38] = amd_bytealign (w[19], w[20], offset); - w[37] = amd_bytealign (w[18], w[19], offset); - w[36] = amd_bytealign (w[17], w[18], offset); - w[35] = amd_bytealign (w[16], w[17], offset); - w[34] = amd_bytealign (w[15], w[16], offset); - w[33] = amd_bytealign (w[14], w[15], offset); - w[32] = amd_bytealign (w[13], w[14], offset); - w[31] = amd_bytealign (w[12], w[13], offset); - w[30] = amd_bytealign (w[11], w[12], offset); - w[29] = amd_bytealign (w[10], w[11], offset); - w[28] = amd_bytealign (w[ 9], w[10], offset); - w[27] = amd_bytealign (w[ 8], w[ 9], offset); - w[26] = amd_bytealign (w[ 7], w[ 8], offset); - w[25] = amd_bytealign (w[ 6], w[ 7], offset); - w[24] = amd_bytealign (w[ 5], w[ 6], offset); - w[23] = amd_bytealign (w[ 4], w[ 5], offset); - w[22] = amd_bytealign (w[ 3], w[ 4], offset); - w[21] = amd_bytealign (w[ 2], w[ 3], offset); - w[20] = amd_bytealign (w[ 1], w[ 2], offset); - w[19] = amd_bytealign (w[ 0], w[ 1], offset); - w[18] = amd_bytealign ( 0, w[ 0], offset); + w[63] = hc_bytealign (w[44], w[45], offset); + w[62] = hc_bytealign (w[43], w[44], offset); + w[61] = hc_bytealign (w[42], w[43], offset); + w[60] = hc_bytealign (w[41], w[42], offset); + w[59] = hc_bytealign (w[40], w[41], offset); + w[58] = hc_bytealign (w[39], w[40], offset); + w[57] = hc_bytealign (w[38], w[39], offset); + w[56] = hc_bytealign (w[37], w[38], offset); + w[55] = hc_bytealign (w[36], w[37], offset); + w[54] = hc_bytealign (w[35], w[36], offset); + w[53] = hc_bytealign (w[34], w[35], offset); + w[52] = hc_bytealign (w[33], w[34], offset); + w[51] = hc_bytealign (w[32], w[33], offset); + w[50] = hc_bytealign (w[31], w[32], offset); + w[49] = hc_bytealign (w[30], w[31], offset); + w[48] = hc_bytealign (w[29], w[30], offset); + w[47] = hc_bytealign (w[28], w[29], offset); + w[46] = hc_bytealign (w[27], w[28], offset); + w[45] = hc_bytealign (w[26], w[27], offset); + w[44] = hc_bytealign (w[25], w[26], offset); + w[43] = hc_bytealign (w[24], w[25], offset); + w[42] = hc_bytealign (w[23], w[24], offset); + w[41] = hc_bytealign (w[22], w[23], offset); + w[40] = hc_bytealign (w[21], w[22], offset); + w[39] = hc_bytealign (w[20], w[21], offset); + w[38] = hc_bytealign (w[19], w[20], offset); + w[37] = hc_bytealign (w[18], w[19], offset); + w[36] = hc_bytealign (w[17], w[18], offset); + w[35] = hc_bytealign (w[16], w[17], offset); + w[34] = hc_bytealign (w[15], w[16], offset); + w[33] = hc_bytealign (w[14], w[15], offset); + w[32] = hc_bytealign (w[13], w[14], offset); + w[31] = hc_bytealign (w[12], w[13], offset); + w[30] = hc_bytealign (w[11], w[12], offset); + w[29] = hc_bytealign (w[10], w[11], offset); + w[28] = hc_bytealign (w[ 9], w[10], offset); + w[27] = hc_bytealign (w[ 8], w[ 9], offset); + w[26] = hc_bytealign (w[ 7], w[ 8], offset); + w[25] = hc_bytealign (w[ 6], w[ 7], offset); + w[24] = hc_bytealign (w[ 5], w[ 6], offset); + w[23] = hc_bytealign (w[ 4], w[ 5], offset); + w[22] = hc_bytealign (w[ 3], w[ 4], offset); + w[21] = hc_bytealign (w[ 2], w[ 3], offset); + w[20] = hc_bytealign (w[ 1], w[ 2], offset); + w[19] = hc_bytealign (w[ 0], w[ 1], offset); + w[18] = hc_bytealign ( 0, w[ 0], offset); w[17] = 0; w[16] = 0; w[15] = 0; @@ -12958,51 +12958,51 @@ DECLSPEC void switch_buffer_by_offset_1x64_le (u32x *w, const u32 offset) break; case 19: - w[63] = amd_bytealign (w[43], w[44], offset); - w[62] = amd_bytealign (w[42], w[43], offset); - w[61] = amd_bytealign (w[41], w[42], offset); - w[60] = amd_bytealign (w[40], w[41], offset); - w[59] = amd_bytealign (w[39], w[40], offset); - w[58] = amd_bytealign (w[38], w[39], offset); - w[57] = amd_bytealign (w[37], w[38], offset); - w[56] = amd_bytealign (w[36], w[37], offset); - w[55] = amd_bytealign (w[35], w[36], offset); - w[54] = amd_bytealign (w[34], w[35], offset); - w[53] = amd_bytealign (w[33], w[34], offset); - w[52] = amd_bytealign (w[32], w[33], offset); - w[51] = amd_bytealign (w[31], w[32], offset); - w[50] = amd_bytealign (w[30], w[31], offset); - w[49] = amd_bytealign (w[29], w[30], offset); - w[48] = amd_bytealign (w[28], w[29], offset); - w[47] = amd_bytealign (w[27], w[28], offset); - w[46] = amd_bytealign (w[26], w[27], offset); - w[45] = amd_bytealign (w[25], w[26], offset); - w[44] = amd_bytealign (w[24], w[25], offset); - w[43] = amd_bytealign (w[23], w[24], offset); - w[42] = amd_bytealign (w[22], w[23], offset); - w[41] = amd_bytealign (w[21], w[22], offset); - w[40] = amd_bytealign (w[20], w[21], offset); - w[39] = amd_bytealign (w[19], w[20], offset); - w[38] = amd_bytealign (w[18], w[19], offset); - w[37] = amd_bytealign (w[17], w[18], offset); - w[36] = amd_bytealign (w[16], w[17], offset); - w[35] = amd_bytealign (w[15], w[16], offset); - w[34] = amd_bytealign (w[14], w[15], offset); - w[33] = amd_bytealign (w[13], w[14], offset); - w[32] = amd_bytealign (w[12], w[13], offset); - w[31] = amd_bytealign (w[11], w[12], offset); - w[30] = amd_bytealign (w[10], w[11], offset); - w[29] = amd_bytealign (w[ 9], w[10], offset); - w[28] = amd_bytealign (w[ 8], w[ 9], offset); - w[27] = amd_bytealign (w[ 7], w[ 8], offset); - w[26] = amd_bytealign (w[ 6], w[ 7], offset); - w[25] = amd_bytealign (w[ 5], w[ 6], offset); - w[24] = amd_bytealign (w[ 4], w[ 5], offset); - w[23] = amd_bytealign (w[ 3], w[ 4], offset); - w[22] = amd_bytealign (w[ 2], w[ 3], offset); - w[21] = amd_bytealign (w[ 1], w[ 2], offset); - w[20] = amd_bytealign (w[ 0], w[ 1], offset); - w[19] = amd_bytealign ( 0, w[ 0], offset); + w[63] = hc_bytealign (w[43], w[44], offset); + w[62] = hc_bytealign (w[42], w[43], offset); + w[61] = hc_bytealign (w[41], w[42], offset); + w[60] = hc_bytealign (w[40], w[41], offset); + w[59] = hc_bytealign (w[39], w[40], offset); + w[58] = hc_bytealign (w[38], w[39], offset); + w[57] = hc_bytealign (w[37], w[38], offset); + w[56] = hc_bytealign (w[36], w[37], offset); + w[55] = hc_bytealign (w[35], w[36], offset); + w[54] = hc_bytealign (w[34], w[35], offset); + w[53] = hc_bytealign (w[33], w[34], offset); + w[52] = hc_bytealign (w[32], w[33], offset); + w[51] = hc_bytealign (w[31], w[32], offset); + w[50] = hc_bytealign (w[30], w[31], offset); + w[49] = hc_bytealign (w[29], w[30], offset); + w[48] = hc_bytealign (w[28], w[29], offset); + w[47] = hc_bytealign (w[27], w[28], offset); + w[46] = hc_bytealign (w[26], w[27], offset); + w[45] = hc_bytealign (w[25], w[26], offset); + w[44] = hc_bytealign (w[24], w[25], offset); + w[43] = hc_bytealign (w[23], w[24], offset); + w[42] = hc_bytealign (w[22], w[23], offset); + w[41] = hc_bytealign (w[21], w[22], offset); + w[40] = hc_bytealign (w[20], w[21], offset); + w[39] = hc_bytealign (w[19], w[20], offset); + w[38] = hc_bytealign (w[18], w[19], offset); + w[37] = hc_bytealign (w[17], w[18], offset); + w[36] = hc_bytealign (w[16], w[17], offset); + w[35] = hc_bytealign (w[15], w[16], offset); + w[34] = hc_bytealign (w[14], w[15], offset); + w[33] = hc_bytealign (w[13], w[14], offset); + w[32] = hc_bytealign (w[12], w[13], offset); + w[31] = hc_bytealign (w[11], w[12], offset); + w[30] = hc_bytealign (w[10], w[11], offset); + w[29] = hc_bytealign (w[ 9], w[10], offset); + w[28] = hc_bytealign (w[ 8], w[ 9], offset); + w[27] = hc_bytealign (w[ 7], w[ 8], offset); + w[26] = hc_bytealign (w[ 6], w[ 7], offset); + w[25] = hc_bytealign (w[ 5], w[ 6], offset); + w[24] = hc_bytealign (w[ 4], w[ 5], offset); + w[23] = hc_bytealign (w[ 3], w[ 4], offset); + w[22] = hc_bytealign (w[ 2], w[ 3], offset); + w[21] = hc_bytealign (w[ 1], w[ 2], offset); + w[20] = hc_bytealign (w[ 0], w[ 1], offset); + w[19] = hc_bytealign ( 0, w[ 0], offset); w[18] = 0; w[17] = 0; w[16] = 0; @@ -13026,50 +13026,50 @@ DECLSPEC void switch_buffer_by_offset_1x64_le (u32x *w, const u32 offset) break; case 20: - w[63] = amd_bytealign (w[42], w[43], offset); - w[62] = amd_bytealign (w[41], w[42], offset); - w[61] = amd_bytealign (w[40], w[41], offset); - w[60] = amd_bytealign (w[39], w[40], offset); - w[59] = amd_bytealign (w[38], w[39], offset); - w[58] = amd_bytealign (w[37], w[38], offset); - w[57] = amd_bytealign (w[36], w[37], offset); - w[56] = amd_bytealign (w[35], w[36], offset); - w[55] = amd_bytealign (w[34], w[35], offset); - w[54] = amd_bytealign (w[33], w[34], offset); - w[53] = amd_bytealign (w[32], w[33], offset); - w[52] = amd_bytealign (w[31], w[32], offset); - w[51] = amd_bytealign (w[30], w[31], offset); - w[50] = amd_bytealign (w[29], w[30], offset); - w[49] = amd_bytealign (w[28], w[29], offset); - w[48] = amd_bytealign (w[27], w[28], offset); - w[47] = amd_bytealign (w[26], w[27], offset); - w[46] = amd_bytealign (w[25], w[26], offset); - w[45] = amd_bytealign (w[24], w[25], offset); - w[44] = amd_bytealign (w[23], w[24], offset); - w[43] = amd_bytealign (w[22], w[23], offset); - w[42] = amd_bytealign (w[21], w[22], offset); - w[41] = amd_bytealign (w[20], w[21], offset); - w[40] = amd_bytealign (w[19], w[20], offset); - w[39] = amd_bytealign (w[18], w[19], offset); - w[38] = amd_bytealign (w[17], w[18], offset); - w[37] = amd_bytealign (w[16], w[17], offset); - w[36] = amd_bytealign (w[15], w[16], offset); - w[35] = amd_bytealign (w[14], w[15], offset); - w[34] = amd_bytealign (w[13], w[14], offset); - w[33] = amd_bytealign (w[12], w[13], offset); - w[32] = amd_bytealign (w[11], w[12], offset); - w[31] = amd_bytealign (w[10], w[11], offset); - w[30] = amd_bytealign (w[ 9], w[10], offset); - w[29] = amd_bytealign (w[ 8], w[ 9], offset); - w[28] = amd_bytealign (w[ 7], w[ 8], offset); - w[27] = amd_bytealign (w[ 6], w[ 7], offset); - w[26] = amd_bytealign (w[ 5], w[ 6], offset); - w[25] = amd_bytealign (w[ 4], w[ 5], offset); - w[24] = amd_bytealign (w[ 3], w[ 4], offset); - w[23] = amd_bytealign (w[ 2], w[ 3], offset); - w[22] = amd_bytealign (w[ 1], w[ 2], offset); - w[21] = amd_bytealign (w[ 0], w[ 1], offset); - w[20] = amd_bytealign ( 0, w[ 0], offset); + w[63] = hc_bytealign (w[42], w[43], offset); + w[62] = hc_bytealign (w[41], w[42], offset); + w[61] = hc_bytealign (w[40], w[41], offset); + w[60] = hc_bytealign (w[39], w[40], offset); + w[59] = hc_bytealign (w[38], w[39], offset); + w[58] = hc_bytealign (w[37], w[38], offset); + w[57] = hc_bytealign (w[36], w[37], offset); + w[56] = hc_bytealign (w[35], w[36], offset); + w[55] = hc_bytealign (w[34], w[35], offset); + w[54] = hc_bytealign (w[33], w[34], offset); + w[53] = hc_bytealign (w[32], w[33], offset); + w[52] = hc_bytealign (w[31], w[32], offset); + w[51] = hc_bytealign (w[30], w[31], offset); + w[50] = hc_bytealign (w[29], w[30], offset); + w[49] = hc_bytealign (w[28], w[29], offset); + w[48] = hc_bytealign (w[27], w[28], offset); + w[47] = hc_bytealign (w[26], w[27], offset); + w[46] = hc_bytealign (w[25], w[26], offset); + w[45] = hc_bytealign (w[24], w[25], offset); + w[44] = hc_bytealign (w[23], w[24], offset); + w[43] = hc_bytealign (w[22], w[23], offset); + w[42] = hc_bytealign (w[21], w[22], offset); + w[41] = hc_bytealign (w[20], w[21], offset); + w[40] = hc_bytealign (w[19], w[20], offset); + w[39] = hc_bytealign (w[18], w[19], offset); + w[38] = hc_bytealign (w[17], w[18], offset); + w[37] = hc_bytealign (w[16], w[17], offset); + w[36] = hc_bytealign (w[15], w[16], offset); + w[35] = hc_bytealign (w[14], w[15], offset); + w[34] = hc_bytealign (w[13], w[14], offset); + w[33] = hc_bytealign (w[12], w[13], offset); + w[32] = hc_bytealign (w[11], w[12], offset); + w[31] = hc_bytealign (w[10], w[11], offset); + w[30] = hc_bytealign (w[ 9], w[10], offset); + w[29] = hc_bytealign (w[ 8], w[ 9], offset); + w[28] = hc_bytealign (w[ 7], w[ 8], offset); + w[27] = hc_bytealign (w[ 6], w[ 7], offset); + w[26] = hc_bytealign (w[ 5], w[ 6], offset); + w[25] = hc_bytealign (w[ 4], w[ 5], offset); + w[24] = hc_bytealign (w[ 3], w[ 4], offset); + w[23] = hc_bytealign (w[ 2], w[ 3], offset); + w[22] = hc_bytealign (w[ 1], w[ 2], offset); + w[21] = hc_bytealign (w[ 0], w[ 1], offset); + w[20] = hc_bytealign ( 0, w[ 0], offset); w[19] = 0; w[18] = 0; w[17] = 0; @@ -13094,49 +13094,49 @@ DECLSPEC void switch_buffer_by_offset_1x64_le (u32x *w, const u32 offset) break; case 21: - w[63] = amd_bytealign (w[41], w[42], offset); - w[62] = amd_bytealign (w[40], w[41], offset); - w[61] = amd_bytealign (w[39], w[40], offset); - w[60] = amd_bytealign (w[38], w[39], offset); - w[59] = amd_bytealign (w[37], w[38], offset); - w[58] = amd_bytealign (w[36], w[37], offset); - w[57] = amd_bytealign (w[35], w[36], offset); - w[56] = amd_bytealign (w[34], w[35], offset); - w[55] = amd_bytealign (w[33], w[34], offset); - w[54] = amd_bytealign (w[32], w[33], offset); - w[53] = amd_bytealign (w[31], w[32], offset); - w[52] = amd_bytealign (w[30], w[31], offset); - w[51] = amd_bytealign (w[29], w[30], offset); - w[50] = amd_bytealign (w[28], w[29], offset); - w[49] = amd_bytealign (w[27], w[28], offset); - w[48] = amd_bytealign (w[26], w[27], offset); - w[47] = amd_bytealign (w[25], w[26], offset); - w[46] = amd_bytealign (w[24], w[25], offset); - w[45] = amd_bytealign (w[23], w[24], offset); - w[44] = amd_bytealign (w[22], w[23], offset); - w[43] = amd_bytealign (w[21], w[22], offset); - w[42] = amd_bytealign (w[20], w[21], offset); - w[41] = amd_bytealign (w[19], w[20], offset); - w[40] = amd_bytealign (w[18], w[19], offset); - w[39] = amd_bytealign (w[17], w[18], offset); - w[38] = amd_bytealign (w[16], w[17], offset); - w[37] = amd_bytealign (w[15], w[16], offset); - w[36] = amd_bytealign (w[14], w[15], offset); - w[35] = amd_bytealign (w[13], w[14], offset); - w[34] = amd_bytealign (w[12], w[13], offset); - w[33] = amd_bytealign (w[11], w[12], offset); - w[32] = amd_bytealign (w[10], w[11], offset); - w[31] = amd_bytealign (w[ 9], w[10], offset); - w[30] = amd_bytealign (w[ 8], w[ 9], offset); - w[29] = amd_bytealign (w[ 7], w[ 8], offset); - w[28] = amd_bytealign (w[ 6], w[ 7], offset); - w[27] = amd_bytealign (w[ 5], w[ 6], offset); - w[26] = amd_bytealign (w[ 4], w[ 5], offset); - w[25] = amd_bytealign (w[ 3], w[ 4], offset); - w[24] = amd_bytealign (w[ 2], w[ 3], offset); - w[23] = amd_bytealign (w[ 1], w[ 2], offset); - w[22] = amd_bytealign (w[ 0], w[ 1], offset); - w[21] = amd_bytealign ( 0, w[ 0], offset); + w[63] = hc_bytealign (w[41], w[42], offset); + w[62] = hc_bytealign (w[40], w[41], offset); + w[61] = hc_bytealign (w[39], w[40], offset); + w[60] = hc_bytealign (w[38], w[39], offset); + w[59] = hc_bytealign (w[37], w[38], offset); + w[58] = hc_bytealign (w[36], w[37], offset); + w[57] = hc_bytealign (w[35], w[36], offset); + w[56] = hc_bytealign (w[34], w[35], offset); + w[55] = hc_bytealign (w[33], w[34], offset); + w[54] = hc_bytealign (w[32], w[33], offset); + w[53] = hc_bytealign (w[31], w[32], offset); + w[52] = hc_bytealign (w[30], w[31], offset); + w[51] = hc_bytealign (w[29], w[30], offset); + w[50] = hc_bytealign (w[28], w[29], offset); + w[49] = hc_bytealign (w[27], w[28], offset); + w[48] = hc_bytealign (w[26], w[27], offset); + w[47] = hc_bytealign (w[25], w[26], offset); + w[46] = hc_bytealign (w[24], w[25], offset); + w[45] = hc_bytealign (w[23], w[24], offset); + w[44] = hc_bytealign (w[22], w[23], offset); + w[43] = hc_bytealign (w[21], w[22], offset); + w[42] = hc_bytealign (w[20], w[21], offset); + w[41] = hc_bytealign (w[19], w[20], offset); + w[40] = hc_bytealign (w[18], w[19], offset); + w[39] = hc_bytealign (w[17], w[18], offset); + w[38] = hc_bytealign (w[16], w[17], offset); + w[37] = hc_bytealign (w[15], w[16], offset); + w[36] = hc_bytealign (w[14], w[15], offset); + w[35] = hc_bytealign (w[13], w[14], offset); + w[34] = hc_bytealign (w[12], w[13], offset); + w[33] = hc_bytealign (w[11], w[12], offset); + w[32] = hc_bytealign (w[10], w[11], offset); + w[31] = hc_bytealign (w[ 9], w[10], offset); + w[30] = hc_bytealign (w[ 8], w[ 9], offset); + w[29] = hc_bytealign (w[ 7], w[ 8], offset); + w[28] = hc_bytealign (w[ 6], w[ 7], offset); + w[27] = hc_bytealign (w[ 5], w[ 6], offset); + w[26] = hc_bytealign (w[ 4], w[ 5], offset); + w[25] = hc_bytealign (w[ 3], w[ 4], offset); + w[24] = hc_bytealign (w[ 2], w[ 3], offset); + w[23] = hc_bytealign (w[ 1], w[ 2], offset); + w[22] = hc_bytealign (w[ 0], w[ 1], offset); + w[21] = hc_bytealign ( 0, w[ 0], offset); w[20] = 0; w[19] = 0; w[18] = 0; @@ -13162,48 +13162,48 @@ DECLSPEC void switch_buffer_by_offset_1x64_le (u32x *w, const u32 offset) break; case 22: - w[63] = amd_bytealign (w[40], w[41], offset); - w[62] = amd_bytealign (w[39], w[40], offset); - w[61] = amd_bytealign (w[38], w[39], offset); - w[60] = amd_bytealign (w[37], w[38], offset); - w[59] = amd_bytealign (w[36], w[37], offset); - w[58] = amd_bytealign (w[35], w[36], offset); - w[57] = amd_bytealign (w[34], w[35], offset); - w[56] = amd_bytealign (w[33], w[34], offset); - w[55] = amd_bytealign (w[32], w[33], offset); - w[54] = amd_bytealign (w[31], w[32], offset); - w[53] = amd_bytealign (w[30], w[31], offset); - w[52] = amd_bytealign (w[29], w[30], offset); - w[51] = amd_bytealign (w[28], w[29], offset); - w[50] = amd_bytealign (w[27], w[28], offset); - w[49] = amd_bytealign (w[26], w[27], offset); - w[48] = amd_bytealign (w[25], w[26], offset); - w[47] = amd_bytealign (w[24], w[25], offset); - w[46] = amd_bytealign (w[23], w[24], offset); - w[45] = amd_bytealign (w[22], w[23], offset); - w[44] = amd_bytealign (w[21], w[22], offset); - w[43] = amd_bytealign (w[20], w[21], offset); - w[42] = amd_bytealign (w[19], w[20], offset); - w[41] = amd_bytealign (w[18], w[19], offset); - w[40] = amd_bytealign (w[17], w[18], offset); - w[39] = amd_bytealign (w[16], w[17], offset); - w[38] = amd_bytealign (w[15], w[16], offset); - w[37] = amd_bytealign (w[14], w[15], offset); - w[36] = amd_bytealign (w[13], w[14], offset); - w[35] = amd_bytealign (w[12], w[13], offset); - w[34] = amd_bytealign (w[11], w[12], offset); - w[33] = amd_bytealign (w[10], w[11], offset); - w[32] = amd_bytealign (w[ 9], w[10], offset); - w[31] = amd_bytealign (w[ 8], w[ 9], offset); - w[30] = amd_bytealign (w[ 7], w[ 8], offset); - w[29] = amd_bytealign (w[ 6], w[ 7], offset); - w[28] = amd_bytealign (w[ 5], w[ 6], offset); - w[27] = amd_bytealign (w[ 4], w[ 5], offset); - w[26] = amd_bytealign (w[ 3], w[ 4], offset); - w[25] = amd_bytealign (w[ 2], w[ 3], offset); - w[24] = amd_bytealign (w[ 1], w[ 2], offset); - w[23] = amd_bytealign (w[ 0], w[ 1], offset); - w[22] = amd_bytealign ( 0, w[ 0], offset); + w[63] = hc_bytealign (w[40], w[41], offset); + w[62] = hc_bytealign (w[39], w[40], offset); + w[61] = hc_bytealign (w[38], w[39], offset); + w[60] = hc_bytealign (w[37], w[38], offset); + w[59] = hc_bytealign (w[36], w[37], offset); + w[58] = hc_bytealign (w[35], w[36], offset); + w[57] = hc_bytealign (w[34], w[35], offset); + w[56] = hc_bytealign (w[33], w[34], offset); + w[55] = hc_bytealign (w[32], w[33], offset); + w[54] = hc_bytealign (w[31], w[32], offset); + w[53] = hc_bytealign (w[30], w[31], offset); + w[52] = hc_bytealign (w[29], w[30], offset); + w[51] = hc_bytealign (w[28], w[29], offset); + w[50] = hc_bytealign (w[27], w[28], offset); + w[49] = hc_bytealign (w[26], w[27], offset); + w[48] = hc_bytealign (w[25], w[26], offset); + w[47] = hc_bytealign (w[24], w[25], offset); + w[46] = hc_bytealign (w[23], w[24], offset); + w[45] = hc_bytealign (w[22], w[23], offset); + w[44] = hc_bytealign (w[21], w[22], offset); + w[43] = hc_bytealign (w[20], w[21], offset); + w[42] = hc_bytealign (w[19], w[20], offset); + w[41] = hc_bytealign (w[18], w[19], offset); + w[40] = hc_bytealign (w[17], w[18], offset); + w[39] = hc_bytealign (w[16], w[17], offset); + w[38] = hc_bytealign (w[15], w[16], offset); + w[37] = hc_bytealign (w[14], w[15], offset); + w[36] = hc_bytealign (w[13], w[14], offset); + w[35] = hc_bytealign (w[12], w[13], offset); + w[34] = hc_bytealign (w[11], w[12], offset); + w[33] = hc_bytealign (w[10], w[11], offset); + w[32] = hc_bytealign (w[ 9], w[10], offset); + w[31] = hc_bytealign (w[ 8], w[ 9], offset); + w[30] = hc_bytealign (w[ 7], w[ 8], offset); + w[29] = hc_bytealign (w[ 6], w[ 7], offset); + w[28] = hc_bytealign (w[ 5], w[ 6], offset); + w[27] = hc_bytealign (w[ 4], w[ 5], offset); + w[26] = hc_bytealign (w[ 3], w[ 4], offset); + w[25] = hc_bytealign (w[ 2], w[ 3], offset); + w[24] = hc_bytealign (w[ 1], w[ 2], offset); + w[23] = hc_bytealign (w[ 0], w[ 1], offset); + w[22] = hc_bytealign ( 0, w[ 0], offset); w[21] = 0; w[20] = 0; w[19] = 0; @@ -13230,47 +13230,47 @@ DECLSPEC void switch_buffer_by_offset_1x64_le (u32x *w, const u32 offset) break; case 23: - w[63] = amd_bytealign (w[39], w[40], offset); - w[62] = amd_bytealign (w[38], w[39], offset); - w[61] = amd_bytealign (w[37], w[38], offset); - w[60] = amd_bytealign (w[36], w[37], offset); - w[59] = amd_bytealign (w[35], w[36], offset); - w[58] = amd_bytealign (w[34], w[35], offset); - w[57] = amd_bytealign (w[33], w[34], offset); - w[56] = amd_bytealign (w[32], w[33], offset); - w[55] = amd_bytealign (w[31], w[32], offset); - w[54] = amd_bytealign (w[30], w[31], offset); - w[53] = amd_bytealign (w[29], w[30], offset); - w[52] = amd_bytealign (w[28], w[29], offset); - w[51] = amd_bytealign (w[27], w[28], offset); - w[50] = amd_bytealign (w[26], w[27], offset); - w[49] = amd_bytealign (w[25], w[26], offset); - w[48] = amd_bytealign (w[24], w[25], offset); - w[47] = amd_bytealign (w[23], w[24], offset); - w[46] = amd_bytealign (w[22], w[23], offset); - w[45] = amd_bytealign (w[21], w[22], offset); - w[44] = amd_bytealign (w[20], w[21], offset); - w[43] = amd_bytealign (w[19], w[20], offset); - w[42] = amd_bytealign (w[18], w[19], offset); - w[41] = amd_bytealign (w[17], w[18], offset); - w[40] = amd_bytealign (w[16], w[17], offset); - w[39] = amd_bytealign (w[15], w[16], offset); - w[38] = amd_bytealign (w[14], w[15], offset); - w[37] = amd_bytealign (w[13], w[14], offset); - w[36] = amd_bytealign (w[12], w[13], offset); - w[35] = amd_bytealign (w[11], w[12], offset); - w[34] = amd_bytealign (w[10], w[11], offset); - w[33] = amd_bytealign (w[ 9], w[10], offset); - w[32] = amd_bytealign (w[ 8], w[ 9], offset); - w[31] = amd_bytealign (w[ 7], w[ 8], offset); - w[30] = amd_bytealign (w[ 6], w[ 7], offset); - w[29] = amd_bytealign (w[ 5], w[ 6], offset); - w[28] = amd_bytealign (w[ 4], w[ 5], offset); - w[27] = amd_bytealign (w[ 3], w[ 4], offset); - w[26] = amd_bytealign (w[ 2], w[ 3], offset); - w[25] = amd_bytealign (w[ 1], w[ 2], offset); - w[24] = amd_bytealign (w[ 0], w[ 1], offset); - w[23] = amd_bytealign ( 0, w[ 0], offset); + w[63] = hc_bytealign (w[39], w[40], offset); + w[62] = hc_bytealign (w[38], w[39], offset); + w[61] = hc_bytealign (w[37], w[38], offset); + w[60] = hc_bytealign (w[36], w[37], offset); + w[59] = hc_bytealign (w[35], w[36], offset); + w[58] = hc_bytealign (w[34], w[35], offset); + w[57] = hc_bytealign (w[33], w[34], offset); + w[56] = hc_bytealign (w[32], w[33], offset); + w[55] = hc_bytealign (w[31], w[32], offset); + w[54] = hc_bytealign (w[30], w[31], offset); + w[53] = hc_bytealign (w[29], w[30], offset); + w[52] = hc_bytealign (w[28], w[29], offset); + w[51] = hc_bytealign (w[27], w[28], offset); + w[50] = hc_bytealign (w[26], w[27], offset); + w[49] = hc_bytealign (w[25], w[26], offset); + w[48] = hc_bytealign (w[24], w[25], offset); + w[47] = hc_bytealign (w[23], w[24], offset); + w[46] = hc_bytealign (w[22], w[23], offset); + w[45] = hc_bytealign (w[21], w[22], offset); + w[44] = hc_bytealign (w[20], w[21], offset); + w[43] = hc_bytealign (w[19], w[20], offset); + w[42] = hc_bytealign (w[18], w[19], offset); + w[41] = hc_bytealign (w[17], w[18], offset); + w[40] = hc_bytealign (w[16], w[17], offset); + w[39] = hc_bytealign (w[15], w[16], offset); + w[38] = hc_bytealign (w[14], w[15], offset); + w[37] = hc_bytealign (w[13], w[14], offset); + w[36] = hc_bytealign (w[12], w[13], offset); + w[35] = hc_bytealign (w[11], w[12], offset); + w[34] = hc_bytealign (w[10], w[11], offset); + w[33] = hc_bytealign (w[ 9], w[10], offset); + w[32] = hc_bytealign (w[ 8], w[ 9], offset); + w[31] = hc_bytealign (w[ 7], w[ 8], offset); + w[30] = hc_bytealign (w[ 6], w[ 7], offset); + w[29] = hc_bytealign (w[ 5], w[ 6], offset); + w[28] = hc_bytealign (w[ 4], w[ 5], offset); + w[27] = hc_bytealign (w[ 3], w[ 4], offset); + w[26] = hc_bytealign (w[ 2], w[ 3], offset); + w[25] = hc_bytealign (w[ 1], w[ 2], offset); + w[24] = hc_bytealign (w[ 0], w[ 1], offset); + w[23] = hc_bytealign ( 0, w[ 0], offset); w[22] = 0; w[21] = 0; w[20] = 0; @@ -13298,46 +13298,46 @@ DECLSPEC void switch_buffer_by_offset_1x64_le (u32x *w, const u32 offset) break; case 24: - w[63] = amd_bytealign (w[38], w[39], offset); - w[62] = amd_bytealign (w[37], w[38], offset); - w[61] = amd_bytealign (w[36], w[37], offset); - w[60] = amd_bytealign (w[35], w[36], offset); - w[59] = amd_bytealign (w[34], w[35], offset); - w[58] = amd_bytealign (w[33], w[34], offset); - w[57] = amd_bytealign (w[32], w[33], offset); - w[56] = amd_bytealign (w[31], w[32], offset); - w[55] = amd_bytealign (w[30], w[31], offset); - w[54] = amd_bytealign (w[29], w[30], offset); - w[53] = amd_bytealign (w[28], w[29], offset); - w[52] = amd_bytealign (w[27], w[28], offset); - w[51] = amd_bytealign (w[26], w[27], offset); - w[50] = amd_bytealign (w[25], w[26], offset); - w[49] = amd_bytealign (w[24], w[25], offset); - w[48] = amd_bytealign (w[23], w[24], offset); - w[47] = amd_bytealign (w[22], w[23], offset); - w[46] = amd_bytealign (w[21], w[22], offset); - w[45] = amd_bytealign (w[20], w[21], offset); - w[44] = amd_bytealign (w[19], w[20], offset); - w[43] = amd_bytealign (w[18], w[19], offset); - w[42] = amd_bytealign (w[17], w[18], offset); - w[41] = amd_bytealign (w[16], w[17], offset); - w[40] = amd_bytealign (w[15], w[16], offset); - w[39] = amd_bytealign (w[14], w[15], offset); - w[38] = amd_bytealign (w[13], w[14], offset); - w[37] = amd_bytealign (w[12], w[13], offset); - w[36] = amd_bytealign (w[11], w[12], offset); - w[35] = amd_bytealign (w[10], w[11], offset); - w[34] = amd_bytealign (w[ 9], w[10], offset); - w[33] = amd_bytealign (w[ 8], w[ 9], offset); - w[32] = amd_bytealign (w[ 7], w[ 8], offset); - w[31] = amd_bytealign (w[ 6], w[ 7], offset); - w[30] = amd_bytealign (w[ 5], w[ 6], offset); - w[29] = amd_bytealign (w[ 4], w[ 5], offset); - w[28] = amd_bytealign (w[ 3], w[ 4], offset); - w[27] = amd_bytealign (w[ 2], w[ 3], offset); - w[26] = amd_bytealign (w[ 1], w[ 2], offset); - w[25] = amd_bytealign (w[ 0], w[ 1], offset); - w[24] = amd_bytealign ( 0, w[ 0], offset); + w[63] = hc_bytealign (w[38], w[39], offset); + w[62] = hc_bytealign (w[37], w[38], offset); + w[61] = hc_bytealign (w[36], w[37], offset); + w[60] = hc_bytealign (w[35], w[36], offset); + w[59] = hc_bytealign (w[34], w[35], offset); + w[58] = hc_bytealign (w[33], w[34], offset); + w[57] = hc_bytealign (w[32], w[33], offset); + w[56] = hc_bytealign (w[31], w[32], offset); + w[55] = hc_bytealign (w[30], w[31], offset); + w[54] = hc_bytealign (w[29], w[30], offset); + w[53] = hc_bytealign (w[28], w[29], offset); + w[52] = hc_bytealign (w[27], w[28], offset); + w[51] = hc_bytealign (w[26], w[27], offset); + w[50] = hc_bytealign (w[25], w[26], offset); + w[49] = hc_bytealign (w[24], w[25], offset); + w[48] = hc_bytealign (w[23], w[24], offset); + w[47] = hc_bytealign (w[22], w[23], offset); + w[46] = hc_bytealign (w[21], w[22], offset); + w[45] = hc_bytealign (w[20], w[21], offset); + w[44] = hc_bytealign (w[19], w[20], offset); + w[43] = hc_bytealign (w[18], w[19], offset); + w[42] = hc_bytealign (w[17], w[18], offset); + w[41] = hc_bytealign (w[16], w[17], offset); + w[40] = hc_bytealign (w[15], w[16], offset); + w[39] = hc_bytealign (w[14], w[15], offset); + w[38] = hc_bytealign (w[13], w[14], offset); + w[37] = hc_bytealign (w[12], w[13], offset); + w[36] = hc_bytealign (w[11], w[12], offset); + w[35] = hc_bytealign (w[10], w[11], offset); + w[34] = hc_bytealign (w[ 9], w[10], offset); + w[33] = hc_bytealign (w[ 8], w[ 9], offset); + w[32] = hc_bytealign (w[ 7], w[ 8], offset); + w[31] = hc_bytealign (w[ 6], w[ 7], offset); + w[30] = hc_bytealign (w[ 5], w[ 6], offset); + w[29] = hc_bytealign (w[ 4], w[ 5], offset); + w[28] = hc_bytealign (w[ 3], w[ 4], offset); + w[27] = hc_bytealign (w[ 2], w[ 3], offset); + w[26] = hc_bytealign (w[ 1], w[ 2], offset); + w[25] = hc_bytealign (w[ 0], w[ 1], offset); + w[24] = hc_bytealign ( 0, w[ 0], offset); w[23] = 0; w[22] = 0; w[21] = 0; @@ -13366,45 +13366,45 @@ DECLSPEC void switch_buffer_by_offset_1x64_le (u32x *w, const u32 offset) break; case 25: - w[63] = amd_bytealign (w[37], w[38], offset); - w[62] = amd_bytealign (w[36], w[37], offset); - w[61] = amd_bytealign (w[35], w[36], offset); - w[60] = amd_bytealign (w[34], w[35], offset); - w[59] = amd_bytealign (w[33], w[34], offset); - w[58] = amd_bytealign (w[32], w[33], offset); - w[57] = amd_bytealign (w[31], w[32], offset); - w[56] = amd_bytealign (w[30], w[31], offset); - w[55] = amd_bytealign (w[29], w[30], offset); - w[54] = amd_bytealign (w[28], w[29], offset); - w[53] = amd_bytealign (w[27], w[28], offset); - w[52] = amd_bytealign (w[26], w[27], offset); - w[51] = amd_bytealign (w[25], w[26], offset); - w[50] = amd_bytealign (w[24], w[25], offset); - w[49] = amd_bytealign (w[23], w[24], offset); - w[48] = amd_bytealign (w[22], w[23], offset); - w[47] = amd_bytealign (w[21], w[22], offset); - w[46] = amd_bytealign (w[20], w[21], offset); - w[45] = amd_bytealign (w[19], w[20], offset); - w[44] = amd_bytealign (w[18], w[19], offset); - w[43] = amd_bytealign (w[17], w[18], offset); - w[42] = amd_bytealign (w[16], w[17], offset); - w[41] = amd_bytealign (w[15], w[16], offset); - w[40] = amd_bytealign (w[14], w[15], offset); - w[39] = amd_bytealign (w[13], w[14], offset); - w[38] = amd_bytealign (w[12], w[13], offset); - w[37] = amd_bytealign (w[11], w[12], offset); - w[36] = amd_bytealign (w[10], w[11], offset); - w[35] = amd_bytealign (w[ 9], w[10], offset); - w[34] = amd_bytealign (w[ 8], w[ 9], offset); - w[33] = amd_bytealign (w[ 7], w[ 8], offset); - w[32] = amd_bytealign (w[ 6], w[ 7], offset); - w[31] = amd_bytealign (w[ 5], w[ 6], offset); - w[30] = amd_bytealign (w[ 4], w[ 5], offset); - w[29] = amd_bytealign (w[ 3], w[ 4], offset); - w[28] = amd_bytealign (w[ 2], w[ 3], offset); - w[27] = amd_bytealign (w[ 1], w[ 2], offset); - w[26] = amd_bytealign (w[ 0], w[ 1], offset); - w[25] = amd_bytealign ( 0, w[ 0], offset); + w[63] = hc_bytealign (w[37], w[38], offset); + w[62] = hc_bytealign (w[36], w[37], offset); + w[61] = hc_bytealign (w[35], w[36], offset); + w[60] = hc_bytealign (w[34], w[35], offset); + w[59] = hc_bytealign (w[33], w[34], offset); + w[58] = hc_bytealign (w[32], w[33], offset); + w[57] = hc_bytealign (w[31], w[32], offset); + w[56] = hc_bytealign (w[30], w[31], offset); + w[55] = hc_bytealign (w[29], w[30], offset); + w[54] = hc_bytealign (w[28], w[29], offset); + w[53] = hc_bytealign (w[27], w[28], offset); + w[52] = hc_bytealign (w[26], w[27], offset); + w[51] = hc_bytealign (w[25], w[26], offset); + w[50] = hc_bytealign (w[24], w[25], offset); + w[49] = hc_bytealign (w[23], w[24], offset); + w[48] = hc_bytealign (w[22], w[23], offset); + w[47] = hc_bytealign (w[21], w[22], offset); + w[46] = hc_bytealign (w[20], w[21], offset); + w[45] = hc_bytealign (w[19], w[20], offset); + w[44] = hc_bytealign (w[18], w[19], offset); + w[43] = hc_bytealign (w[17], w[18], offset); + w[42] = hc_bytealign (w[16], w[17], offset); + w[41] = hc_bytealign (w[15], w[16], offset); + w[40] = hc_bytealign (w[14], w[15], offset); + w[39] = hc_bytealign (w[13], w[14], offset); + w[38] = hc_bytealign (w[12], w[13], offset); + w[37] = hc_bytealign (w[11], w[12], offset); + w[36] = hc_bytealign (w[10], w[11], offset); + w[35] = hc_bytealign (w[ 9], w[10], offset); + w[34] = hc_bytealign (w[ 8], w[ 9], offset); + w[33] = hc_bytealign (w[ 7], w[ 8], offset); + w[32] = hc_bytealign (w[ 6], w[ 7], offset); + w[31] = hc_bytealign (w[ 5], w[ 6], offset); + w[30] = hc_bytealign (w[ 4], w[ 5], offset); + w[29] = hc_bytealign (w[ 3], w[ 4], offset); + w[28] = hc_bytealign (w[ 2], w[ 3], offset); + w[27] = hc_bytealign (w[ 1], w[ 2], offset); + w[26] = hc_bytealign (w[ 0], w[ 1], offset); + w[25] = hc_bytealign ( 0, w[ 0], offset); w[24] = 0; w[23] = 0; w[22] = 0; @@ -13434,44 +13434,44 @@ DECLSPEC void switch_buffer_by_offset_1x64_le (u32x *w, const u32 offset) break; case 26: - w[63] = amd_bytealign (w[36], w[37], offset); - w[62] = amd_bytealign (w[35], w[36], offset); - w[61] = amd_bytealign (w[34], w[35], offset); - w[60] = amd_bytealign (w[33], w[34], offset); - w[59] = amd_bytealign (w[32], w[33], offset); - w[58] = amd_bytealign (w[31], w[32], offset); - w[57] = amd_bytealign (w[30], w[31], offset); - w[56] = amd_bytealign (w[29], w[30], offset); - w[55] = amd_bytealign (w[28], w[29], offset); - w[54] = amd_bytealign (w[27], w[28], offset); - w[53] = amd_bytealign (w[26], w[27], offset); - w[52] = amd_bytealign (w[25], w[26], offset); - w[51] = amd_bytealign (w[24], w[25], offset); - w[50] = amd_bytealign (w[23], w[24], offset); - w[49] = amd_bytealign (w[22], w[23], offset); - w[48] = amd_bytealign (w[21], w[22], offset); - w[47] = amd_bytealign (w[20], w[21], offset); - w[46] = amd_bytealign (w[19], w[20], offset); - w[45] = amd_bytealign (w[18], w[19], offset); - w[44] = amd_bytealign (w[17], w[18], offset); - w[43] = amd_bytealign (w[16], w[17], offset); - w[42] = amd_bytealign (w[15], w[16], offset); - w[41] = amd_bytealign (w[14], w[15], offset); - w[40] = amd_bytealign (w[13], w[14], offset); - w[39] = amd_bytealign (w[12], w[13], offset); - w[38] = amd_bytealign (w[11], w[12], offset); - w[37] = amd_bytealign (w[10], w[11], offset); - w[36] = amd_bytealign (w[ 9], w[10], offset); - w[35] = amd_bytealign (w[ 8], w[ 9], offset); - w[34] = amd_bytealign (w[ 7], w[ 8], offset); - w[33] = amd_bytealign (w[ 6], w[ 7], offset); - w[32] = amd_bytealign (w[ 5], w[ 6], offset); - w[31] = amd_bytealign (w[ 4], w[ 5], offset); - w[30] = amd_bytealign (w[ 3], w[ 4], offset); - w[29] = amd_bytealign (w[ 2], w[ 3], offset); - w[28] = amd_bytealign (w[ 1], w[ 2], offset); - w[27] = amd_bytealign (w[ 0], w[ 1], offset); - w[26] = amd_bytealign ( 0, w[ 0], offset); + w[63] = hc_bytealign (w[36], w[37], offset); + w[62] = hc_bytealign (w[35], w[36], offset); + w[61] = hc_bytealign (w[34], w[35], offset); + w[60] = hc_bytealign (w[33], w[34], offset); + w[59] = hc_bytealign (w[32], w[33], offset); + w[58] = hc_bytealign (w[31], w[32], offset); + w[57] = hc_bytealign (w[30], w[31], offset); + w[56] = hc_bytealign (w[29], w[30], offset); + w[55] = hc_bytealign (w[28], w[29], offset); + w[54] = hc_bytealign (w[27], w[28], offset); + w[53] = hc_bytealign (w[26], w[27], offset); + w[52] = hc_bytealign (w[25], w[26], offset); + w[51] = hc_bytealign (w[24], w[25], offset); + w[50] = hc_bytealign (w[23], w[24], offset); + w[49] = hc_bytealign (w[22], w[23], offset); + w[48] = hc_bytealign (w[21], w[22], offset); + w[47] = hc_bytealign (w[20], w[21], offset); + w[46] = hc_bytealign (w[19], w[20], offset); + w[45] = hc_bytealign (w[18], w[19], offset); + w[44] = hc_bytealign (w[17], w[18], offset); + w[43] = hc_bytealign (w[16], w[17], offset); + w[42] = hc_bytealign (w[15], w[16], offset); + w[41] = hc_bytealign (w[14], w[15], offset); + w[40] = hc_bytealign (w[13], w[14], offset); + w[39] = hc_bytealign (w[12], w[13], offset); + w[38] = hc_bytealign (w[11], w[12], offset); + w[37] = hc_bytealign (w[10], w[11], offset); + w[36] = hc_bytealign (w[ 9], w[10], offset); + w[35] = hc_bytealign (w[ 8], w[ 9], offset); + w[34] = hc_bytealign (w[ 7], w[ 8], offset); + w[33] = hc_bytealign (w[ 6], w[ 7], offset); + w[32] = hc_bytealign (w[ 5], w[ 6], offset); + w[31] = hc_bytealign (w[ 4], w[ 5], offset); + w[30] = hc_bytealign (w[ 3], w[ 4], offset); + w[29] = hc_bytealign (w[ 2], w[ 3], offset); + w[28] = hc_bytealign (w[ 1], w[ 2], offset); + w[27] = hc_bytealign (w[ 0], w[ 1], offset); + w[26] = hc_bytealign ( 0, w[ 0], offset); w[25] = 0; w[24] = 0; w[23] = 0; @@ -13502,43 +13502,43 @@ DECLSPEC void switch_buffer_by_offset_1x64_le (u32x *w, const u32 offset) break; case 27: - w[63] = amd_bytealign (w[35], w[36], offset); - w[62] = amd_bytealign (w[34], w[35], offset); - w[61] = amd_bytealign (w[33], w[34], offset); - w[60] = amd_bytealign (w[32], w[33], offset); - w[59] = amd_bytealign (w[31], w[32], offset); - w[58] = amd_bytealign (w[30], w[31], offset); - w[57] = amd_bytealign (w[29], w[30], offset); - w[56] = amd_bytealign (w[28], w[29], offset); - w[55] = amd_bytealign (w[27], w[28], offset); - w[54] = amd_bytealign (w[26], w[27], offset); - w[53] = amd_bytealign (w[25], w[26], offset); - w[52] = amd_bytealign (w[24], w[25], offset); - w[51] = amd_bytealign (w[23], w[24], offset); - w[50] = amd_bytealign (w[22], w[23], offset); - w[49] = amd_bytealign (w[21], w[22], offset); - w[48] = amd_bytealign (w[20], w[21], offset); - w[47] = amd_bytealign (w[19], w[20], offset); - w[46] = amd_bytealign (w[18], w[19], offset); - w[45] = amd_bytealign (w[17], w[18], offset); - w[44] = amd_bytealign (w[16], w[17], offset); - w[43] = amd_bytealign (w[15], w[16], offset); - w[42] = amd_bytealign (w[14], w[15], offset); - w[41] = amd_bytealign (w[13], w[14], offset); - w[40] = amd_bytealign (w[12], w[13], offset); - w[39] = amd_bytealign (w[11], w[12], offset); - w[38] = amd_bytealign (w[10], w[11], offset); - w[37] = amd_bytealign (w[ 9], w[10], offset); - w[36] = amd_bytealign (w[ 8], w[ 9], offset); - w[35] = amd_bytealign (w[ 7], w[ 8], offset); - w[34] = amd_bytealign (w[ 6], w[ 7], offset); - w[33] = amd_bytealign (w[ 5], w[ 6], offset); - w[32] = amd_bytealign (w[ 4], w[ 5], offset); - w[31] = amd_bytealign (w[ 3], w[ 4], offset); - w[30] = amd_bytealign (w[ 2], w[ 3], offset); - w[29] = amd_bytealign (w[ 1], w[ 2], offset); - w[28] = amd_bytealign (w[ 0], w[ 1], offset); - w[27] = amd_bytealign ( 0, w[ 0], offset); + w[63] = hc_bytealign (w[35], w[36], offset); + w[62] = hc_bytealign (w[34], w[35], offset); + w[61] = hc_bytealign (w[33], w[34], offset); + w[60] = hc_bytealign (w[32], w[33], offset); + w[59] = hc_bytealign (w[31], w[32], offset); + w[58] = hc_bytealign (w[30], w[31], offset); + w[57] = hc_bytealign (w[29], w[30], offset); + w[56] = hc_bytealign (w[28], w[29], offset); + w[55] = hc_bytealign (w[27], w[28], offset); + w[54] = hc_bytealign (w[26], w[27], offset); + w[53] = hc_bytealign (w[25], w[26], offset); + w[52] = hc_bytealign (w[24], w[25], offset); + w[51] = hc_bytealign (w[23], w[24], offset); + w[50] = hc_bytealign (w[22], w[23], offset); + w[49] = hc_bytealign (w[21], w[22], offset); + w[48] = hc_bytealign (w[20], w[21], offset); + w[47] = hc_bytealign (w[19], w[20], offset); + w[46] = hc_bytealign (w[18], w[19], offset); + w[45] = hc_bytealign (w[17], w[18], offset); + w[44] = hc_bytealign (w[16], w[17], offset); + w[43] = hc_bytealign (w[15], w[16], offset); + w[42] = hc_bytealign (w[14], w[15], offset); + w[41] = hc_bytealign (w[13], w[14], offset); + w[40] = hc_bytealign (w[12], w[13], offset); + w[39] = hc_bytealign (w[11], w[12], offset); + w[38] = hc_bytealign (w[10], w[11], offset); + w[37] = hc_bytealign (w[ 9], w[10], offset); + w[36] = hc_bytealign (w[ 8], w[ 9], offset); + w[35] = hc_bytealign (w[ 7], w[ 8], offset); + w[34] = hc_bytealign (w[ 6], w[ 7], offset); + w[33] = hc_bytealign (w[ 5], w[ 6], offset); + w[32] = hc_bytealign (w[ 4], w[ 5], offset); + w[31] = hc_bytealign (w[ 3], w[ 4], offset); + w[30] = hc_bytealign (w[ 2], w[ 3], offset); + w[29] = hc_bytealign (w[ 1], w[ 2], offset); + w[28] = hc_bytealign (w[ 0], w[ 1], offset); + w[27] = hc_bytealign ( 0, w[ 0], offset); w[26] = 0; w[25] = 0; w[24] = 0; @@ -13570,42 +13570,42 @@ DECLSPEC void switch_buffer_by_offset_1x64_le (u32x *w, const u32 offset) break; case 28: - w[63] = amd_bytealign (w[34], w[35], offset); - w[62] = amd_bytealign (w[33], w[34], offset); - w[61] = amd_bytealign (w[32], w[33], offset); - w[60] = amd_bytealign (w[31], w[32], offset); - w[59] = amd_bytealign (w[30], w[31], offset); - w[58] = amd_bytealign (w[29], w[30], offset); - w[57] = amd_bytealign (w[28], w[29], offset); - w[56] = amd_bytealign (w[27], w[28], offset); - w[55] = amd_bytealign (w[26], w[27], offset); - w[54] = amd_bytealign (w[25], w[26], offset); - w[53] = amd_bytealign (w[24], w[25], offset); - w[52] = amd_bytealign (w[23], w[24], offset); - w[51] = amd_bytealign (w[22], w[23], offset); - w[50] = amd_bytealign (w[21], w[22], offset); - w[49] = amd_bytealign (w[20], w[21], offset); - w[48] = amd_bytealign (w[19], w[20], offset); - w[47] = amd_bytealign (w[18], w[19], offset); - w[46] = amd_bytealign (w[17], w[18], offset); - w[45] = amd_bytealign (w[16], w[17], offset); - w[44] = amd_bytealign (w[15], w[16], offset); - w[43] = amd_bytealign (w[14], w[15], offset); - w[42] = amd_bytealign (w[13], w[14], offset); - w[41] = amd_bytealign (w[12], w[13], offset); - w[40] = amd_bytealign (w[11], w[12], offset); - w[39] = amd_bytealign (w[10], w[11], offset); - w[38] = amd_bytealign (w[ 9], w[10], offset); - w[37] = amd_bytealign (w[ 8], w[ 9], offset); - w[36] = amd_bytealign (w[ 7], w[ 8], offset); - w[35] = amd_bytealign (w[ 6], w[ 7], offset); - w[34] = amd_bytealign (w[ 5], w[ 6], offset); - w[33] = amd_bytealign (w[ 4], w[ 5], offset); - w[32] = amd_bytealign (w[ 3], w[ 4], offset); - w[31] = amd_bytealign (w[ 2], w[ 3], offset); - w[30] = amd_bytealign (w[ 1], w[ 2], offset); - w[29] = amd_bytealign (w[ 0], w[ 1], offset); - w[28] = amd_bytealign ( 0, w[ 0], offset); + w[63] = hc_bytealign (w[34], w[35], offset); + w[62] = hc_bytealign (w[33], w[34], offset); + w[61] = hc_bytealign (w[32], w[33], offset); + w[60] = hc_bytealign (w[31], w[32], offset); + w[59] = hc_bytealign (w[30], w[31], offset); + w[58] = hc_bytealign (w[29], w[30], offset); + w[57] = hc_bytealign (w[28], w[29], offset); + w[56] = hc_bytealign (w[27], w[28], offset); + w[55] = hc_bytealign (w[26], w[27], offset); + w[54] = hc_bytealign (w[25], w[26], offset); + w[53] = hc_bytealign (w[24], w[25], offset); + w[52] = hc_bytealign (w[23], w[24], offset); + w[51] = hc_bytealign (w[22], w[23], offset); + w[50] = hc_bytealign (w[21], w[22], offset); + w[49] = hc_bytealign (w[20], w[21], offset); + w[48] = hc_bytealign (w[19], w[20], offset); + w[47] = hc_bytealign (w[18], w[19], offset); + w[46] = hc_bytealign (w[17], w[18], offset); + w[45] = hc_bytealign (w[16], w[17], offset); + w[44] = hc_bytealign (w[15], w[16], offset); + w[43] = hc_bytealign (w[14], w[15], offset); + w[42] = hc_bytealign (w[13], w[14], offset); + w[41] = hc_bytealign (w[12], w[13], offset); + w[40] = hc_bytealign (w[11], w[12], offset); + w[39] = hc_bytealign (w[10], w[11], offset); + w[38] = hc_bytealign (w[ 9], w[10], offset); + w[37] = hc_bytealign (w[ 8], w[ 9], offset); + w[36] = hc_bytealign (w[ 7], w[ 8], offset); + w[35] = hc_bytealign (w[ 6], w[ 7], offset); + w[34] = hc_bytealign (w[ 5], w[ 6], offset); + w[33] = hc_bytealign (w[ 4], w[ 5], offset); + w[32] = hc_bytealign (w[ 3], w[ 4], offset); + w[31] = hc_bytealign (w[ 2], w[ 3], offset); + w[30] = hc_bytealign (w[ 1], w[ 2], offset); + w[29] = hc_bytealign (w[ 0], w[ 1], offset); + w[28] = hc_bytealign ( 0, w[ 0], offset); w[27] = 0; w[26] = 0; w[25] = 0; @@ -13638,41 +13638,41 @@ DECLSPEC void switch_buffer_by_offset_1x64_le (u32x *w, const u32 offset) break; case 29: - w[63] = amd_bytealign (w[33], w[34], offset); - w[62] = amd_bytealign (w[32], w[33], offset); - w[61] = amd_bytealign (w[31], w[32], offset); - w[60] = amd_bytealign (w[30], w[31], offset); - w[59] = amd_bytealign (w[29], w[30], offset); - w[58] = amd_bytealign (w[28], w[29], offset); - w[57] = amd_bytealign (w[27], w[28], offset); - w[56] = amd_bytealign (w[26], w[27], offset); - w[55] = amd_bytealign (w[25], w[26], offset); - w[54] = amd_bytealign (w[24], w[25], offset); - w[53] = amd_bytealign (w[23], w[24], offset); - w[52] = amd_bytealign (w[22], w[23], offset); - w[51] = amd_bytealign (w[21], w[22], offset); - w[50] = amd_bytealign (w[20], w[21], offset); - w[49] = amd_bytealign (w[19], w[20], offset); - w[48] = amd_bytealign (w[18], w[19], offset); - w[47] = amd_bytealign (w[17], w[18], offset); - w[46] = amd_bytealign (w[16], w[17], offset); - w[45] = amd_bytealign (w[15], w[16], offset); - w[44] = amd_bytealign (w[14], w[15], offset); - w[43] = amd_bytealign (w[13], w[14], offset); - w[42] = amd_bytealign (w[12], w[13], offset); - w[41] = amd_bytealign (w[11], w[12], offset); - w[40] = amd_bytealign (w[10], w[11], offset); - w[39] = amd_bytealign (w[ 9], w[10], offset); - w[38] = amd_bytealign (w[ 8], w[ 9], offset); - w[37] = amd_bytealign (w[ 7], w[ 8], offset); - w[36] = amd_bytealign (w[ 6], w[ 7], offset); - w[35] = amd_bytealign (w[ 5], w[ 6], offset); - w[34] = amd_bytealign (w[ 4], w[ 5], offset); - w[33] = amd_bytealign (w[ 3], w[ 4], offset); - w[32] = amd_bytealign (w[ 2], w[ 3], offset); - w[31] = amd_bytealign (w[ 1], w[ 2], offset); - w[30] = amd_bytealign (w[ 0], w[ 1], offset); - w[29] = amd_bytealign ( 0, w[ 0], offset); + w[63] = hc_bytealign (w[33], w[34], offset); + w[62] = hc_bytealign (w[32], w[33], offset); + w[61] = hc_bytealign (w[31], w[32], offset); + w[60] = hc_bytealign (w[30], w[31], offset); + w[59] = hc_bytealign (w[29], w[30], offset); + w[58] = hc_bytealign (w[28], w[29], offset); + w[57] = hc_bytealign (w[27], w[28], offset); + w[56] = hc_bytealign (w[26], w[27], offset); + w[55] = hc_bytealign (w[25], w[26], offset); + w[54] = hc_bytealign (w[24], w[25], offset); + w[53] = hc_bytealign (w[23], w[24], offset); + w[52] = hc_bytealign (w[22], w[23], offset); + w[51] = hc_bytealign (w[21], w[22], offset); + w[50] = hc_bytealign (w[20], w[21], offset); + w[49] = hc_bytealign (w[19], w[20], offset); + w[48] = hc_bytealign (w[18], w[19], offset); + w[47] = hc_bytealign (w[17], w[18], offset); + w[46] = hc_bytealign (w[16], w[17], offset); + w[45] = hc_bytealign (w[15], w[16], offset); + w[44] = hc_bytealign (w[14], w[15], offset); + w[43] = hc_bytealign (w[13], w[14], offset); + w[42] = hc_bytealign (w[12], w[13], offset); + w[41] = hc_bytealign (w[11], w[12], offset); + w[40] = hc_bytealign (w[10], w[11], offset); + w[39] = hc_bytealign (w[ 9], w[10], offset); + w[38] = hc_bytealign (w[ 8], w[ 9], offset); + w[37] = hc_bytealign (w[ 7], w[ 8], offset); + w[36] = hc_bytealign (w[ 6], w[ 7], offset); + w[35] = hc_bytealign (w[ 5], w[ 6], offset); + w[34] = hc_bytealign (w[ 4], w[ 5], offset); + w[33] = hc_bytealign (w[ 3], w[ 4], offset); + w[32] = hc_bytealign (w[ 2], w[ 3], offset); + w[31] = hc_bytealign (w[ 1], w[ 2], offset); + w[30] = hc_bytealign (w[ 0], w[ 1], offset); + w[29] = hc_bytealign ( 0, w[ 0], offset); w[28] = 0; w[27] = 0; w[26] = 0; @@ -13706,40 +13706,40 @@ DECLSPEC void switch_buffer_by_offset_1x64_le (u32x *w, const u32 offset) break; case 30: - w[63] = amd_bytealign (w[32], w[33], offset); - w[62] = amd_bytealign (w[31], w[32], offset); - w[61] = amd_bytealign (w[30], w[31], offset); - w[60] = amd_bytealign (w[29], w[30], offset); - w[59] = amd_bytealign (w[28], w[29], offset); - w[58] = amd_bytealign (w[27], w[28], offset); - w[57] = amd_bytealign (w[26], w[27], offset); - w[56] = amd_bytealign (w[25], w[26], offset); - w[55] = amd_bytealign (w[24], w[25], offset); - w[54] = amd_bytealign (w[23], w[24], offset); - w[53] = amd_bytealign (w[22], w[23], offset); - w[52] = amd_bytealign (w[21], w[22], offset); - w[51] = amd_bytealign (w[20], w[21], offset); - w[50] = amd_bytealign (w[19], w[20], offset); - w[49] = amd_bytealign (w[18], w[19], offset); - w[48] = amd_bytealign (w[17], w[18], offset); - w[47] = amd_bytealign (w[16], w[17], offset); - w[46] = amd_bytealign (w[15], w[16], offset); - w[45] = amd_bytealign (w[14], w[15], offset); - w[44] = amd_bytealign (w[13], w[14], offset); - w[43] = amd_bytealign (w[12], w[13], offset); - w[42] = amd_bytealign (w[11], w[12], offset); - w[41] = amd_bytealign (w[10], w[11], offset); - w[40] = amd_bytealign (w[ 9], w[10], offset); - w[39] = amd_bytealign (w[ 8], w[ 9], offset); - w[38] = amd_bytealign (w[ 7], w[ 8], offset); - w[37] = amd_bytealign (w[ 6], w[ 7], offset); - w[36] = amd_bytealign (w[ 5], w[ 6], offset); - w[35] = amd_bytealign (w[ 4], w[ 5], offset); - w[34] = amd_bytealign (w[ 3], w[ 4], offset); - w[33] = amd_bytealign (w[ 2], w[ 3], offset); - w[32] = amd_bytealign (w[ 1], w[ 2], offset); - w[31] = amd_bytealign (w[ 0], w[ 1], offset); - w[30] = amd_bytealign ( 0, w[ 0], offset); + w[63] = hc_bytealign (w[32], w[33], offset); + w[62] = hc_bytealign (w[31], w[32], offset); + w[61] = hc_bytealign (w[30], w[31], offset); + w[60] = hc_bytealign (w[29], w[30], offset); + w[59] = hc_bytealign (w[28], w[29], offset); + w[58] = hc_bytealign (w[27], w[28], offset); + w[57] = hc_bytealign (w[26], w[27], offset); + w[56] = hc_bytealign (w[25], w[26], offset); + w[55] = hc_bytealign (w[24], w[25], offset); + w[54] = hc_bytealign (w[23], w[24], offset); + w[53] = hc_bytealign (w[22], w[23], offset); + w[52] = hc_bytealign (w[21], w[22], offset); + w[51] = hc_bytealign (w[20], w[21], offset); + w[50] = hc_bytealign (w[19], w[20], offset); + w[49] = hc_bytealign (w[18], w[19], offset); + w[48] = hc_bytealign (w[17], w[18], offset); + w[47] = hc_bytealign (w[16], w[17], offset); + w[46] = hc_bytealign (w[15], w[16], offset); + w[45] = hc_bytealign (w[14], w[15], offset); + w[44] = hc_bytealign (w[13], w[14], offset); + w[43] = hc_bytealign (w[12], w[13], offset); + w[42] = hc_bytealign (w[11], w[12], offset); + w[41] = hc_bytealign (w[10], w[11], offset); + w[40] = hc_bytealign (w[ 9], w[10], offset); + w[39] = hc_bytealign (w[ 8], w[ 9], offset); + w[38] = hc_bytealign (w[ 7], w[ 8], offset); + w[37] = hc_bytealign (w[ 6], w[ 7], offset); + w[36] = hc_bytealign (w[ 5], w[ 6], offset); + w[35] = hc_bytealign (w[ 4], w[ 5], offset); + w[34] = hc_bytealign (w[ 3], w[ 4], offset); + w[33] = hc_bytealign (w[ 2], w[ 3], offset); + w[32] = hc_bytealign (w[ 1], w[ 2], offset); + w[31] = hc_bytealign (w[ 0], w[ 1], offset); + w[30] = hc_bytealign ( 0, w[ 0], offset); w[29] = 0; w[28] = 0; w[27] = 0; @@ -13774,39 +13774,39 @@ DECLSPEC void switch_buffer_by_offset_1x64_le (u32x *w, const u32 offset) break; case 31: - w[63] = amd_bytealign (w[31], w[32], offset); - w[62] = amd_bytealign (w[30], w[31], offset); - w[61] = amd_bytealign (w[29], w[30], offset); - w[60] = amd_bytealign (w[28], w[29], offset); - w[59] = amd_bytealign (w[27], w[28], offset); - w[58] = amd_bytealign (w[26], w[27], offset); - w[57] = amd_bytealign (w[25], w[26], offset); - w[56] = amd_bytealign (w[24], w[25], offset); - w[55] = amd_bytealign (w[23], w[24], offset); - w[54] = amd_bytealign (w[22], w[23], offset); - w[53] = amd_bytealign (w[21], w[22], offset); - w[52] = amd_bytealign (w[20], w[21], offset); - w[51] = amd_bytealign (w[19], w[20], offset); - w[50] = amd_bytealign (w[18], w[19], offset); - w[49] = amd_bytealign (w[17], w[18], offset); - w[48] = amd_bytealign (w[16], w[17], offset); - w[47] = amd_bytealign (w[15], w[16], offset); - w[46] = amd_bytealign (w[14], w[15], offset); - w[45] = amd_bytealign (w[13], w[14], offset); - w[44] = amd_bytealign (w[12], w[13], offset); - w[43] = amd_bytealign (w[11], w[12], offset); - w[42] = amd_bytealign (w[10], w[11], offset); - w[41] = amd_bytealign (w[ 9], w[10], offset); - w[40] = amd_bytealign (w[ 8], w[ 9], offset); - w[39] = amd_bytealign (w[ 7], w[ 8], offset); - w[38] = amd_bytealign (w[ 6], w[ 7], offset); - w[37] = amd_bytealign (w[ 5], w[ 6], offset); - w[36] = amd_bytealign (w[ 4], w[ 5], offset); - w[35] = amd_bytealign (w[ 3], w[ 4], offset); - w[34] = amd_bytealign (w[ 2], w[ 3], offset); - w[33] = amd_bytealign (w[ 1], w[ 2], offset); - w[32] = amd_bytealign (w[ 0], w[ 1], offset); - w[31] = amd_bytealign ( 0, w[ 0], offset); + w[63] = hc_bytealign (w[31], w[32], offset); + w[62] = hc_bytealign (w[30], w[31], offset); + w[61] = hc_bytealign (w[29], w[30], offset); + w[60] = hc_bytealign (w[28], w[29], offset); + w[59] = hc_bytealign (w[27], w[28], offset); + w[58] = hc_bytealign (w[26], w[27], offset); + w[57] = hc_bytealign (w[25], w[26], offset); + w[56] = hc_bytealign (w[24], w[25], offset); + w[55] = hc_bytealign (w[23], w[24], offset); + w[54] = hc_bytealign (w[22], w[23], offset); + w[53] = hc_bytealign (w[21], w[22], offset); + w[52] = hc_bytealign (w[20], w[21], offset); + w[51] = hc_bytealign (w[19], w[20], offset); + w[50] = hc_bytealign (w[18], w[19], offset); + w[49] = hc_bytealign (w[17], w[18], offset); + w[48] = hc_bytealign (w[16], w[17], offset); + w[47] = hc_bytealign (w[15], w[16], offset); + w[46] = hc_bytealign (w[14], w[15], offset); + w[45] = hc_bytealign (w[13], w[14], offset); + w[44] = hc_bytealign (w[12], w[13], offset); + w[43] = hc_bytealign (w[11], w[12], offset); + w[42] = hc_bytealign (w[10], w[11], offset); + w[41] = hc_bytealign (w[ 9], w[10], offset); + w[40] = hc_bytealign (w[ 8], w[ 9], offset); + w[39] = hc_bytealign (w[ 7], w[ 8], offset); + w[38] = hc_bytealign (w[ 6], w[ 7], offset); + w[37] = hc_bytealign (w[ 5], w[ 6], offset); + w[36] = hc_bytealign (w[ 4], w[ 5], offset); + w[35] = hc_bytealign (w[ 3], w[ 4], offset); + w[34] = hc_bytealign (w[ 2], w[ 3], offset); + w[33] = hc_bytealign (w[ 1], w[ 2], offset); + w[32] = hc_bytealign (w[ 0], w[ 1], offset); + w[31] = hc_bytealign ( 0, w[ 0], offset); w[30] = 0; w[29] = 0; w[28] = 0; @@ -13842,38 +13842,38 @@ DECLSPEC void switch_buffer_by_offset_1x64_le (u32x *w, const u32 offset) break; case 32: - w[63] = amd_bytealign (w[30], w[31], offset); - w[62] = amd_bytealign (w[29], w[30], offset); - w[61] = amd_bytealign (w[28], w[29], offset); - w[60] = amd_bytealign (w[27], w[28], offset); - w[59] = amd_bytealign (w[26], w[27], offset); - w[58] = amd_bytealign (w[25], w[26], offset); - w[57] = amd_bytealign (w[24], w[25], offset); - w[56] = amd_bytealign (w[23], w[24], offset); - w[55] = amd_bytealign (w[22], w[23], offset); - w[54] = amd_bytealign (w[21], w[22], offset); - w[53] = amd_bytealign (w[20], w[21], offset); - w[52] = amd_bytealign (w[19], w[20], offset); - w[51] = amd_bytealign (w[18], w[19], offset); - w[50] = amd_bytealign (w[17], w[18], offset); - w[49] = amd_bytealign (w[16], w[17], offset); - w[48] = amd_bytealign (w[15], w[16], offset); - w[47] = amd_bytealign (w[14], w[15], offset); - w[46] = amd_bytealign (w[13], w[14], offset); - w[45] = amd_bytealign (w[12], w[13], offset); - w[44] = amd_bytealign (w[11], w[12], offset); - w[43] = amd_bytealign (w[10], w[11], offset); - w[42] = amd_bytealign (w[ 9], w[10], offset); - w[41] = amd_bytealign (w[ 8], w[ 9], offset); - w[40] = amd_bytealign (w[ 7], w[ 8], offset); - w[39] = amd_bytealign (w[ 6], w[ 7], offset); - w[38] = amd_bytealign (w[ 5], w[ 6], offset); - w[37] = amd_bytealign (w[ 4], w[ 5], offset); - w[36] = amd_bytealign (w[ 3], w[ 4], offset); - w[35] = amd_bytealign (w[ 2], w[ 3], offset); - w[34] = amd_bytealign (w[ 1], w[ 2], offset); - w[33] = amd_bytealign (w[ 0], w[ 1], offset); - w[32] = amd_bytealign ( 0, w[ 0], offset); + w[63] = hc_bytealign (w[30], w[31], offset); + w[62] = hc_bytealign (w[29], w[30], offset); + w[61] = hc_bytealign (w[28], w[29], offset); + w[60] = hc_bytealign (w[27], w[28], offset); + w[59] = hc_bytealign (w[26], w[27], offset); + w[58] = hc_bytealign (w[25], w[26], offset); + w[57] = hc_bytealign (w[24], w[25], offset); + w[56] = hc_bytealign (w[23], w[24], offset); + w[55] = hc_bytealign (w[22], w[23], offset); + w[54] = hc_bytealign (w[21], w[22], offset); + w[53] = hc_bytealign (w[20], w[21], offset); + w[52] = hc_bytealign (w[19], w[20], offset); + w[51] = hc_bytealign (w[18], w[19], offset); + w[50] = hc_bytealign (w[17], w[18], offset); + w[49] = hc_bytealign (w[16], w[17], offset); + w[48] = hc_bytealign (w[15], w[16], offset); + w[47] = hc_bytealign (w[14], w[15], offset); + w[46] = hc_bytealign (w[13], w[14], offset); + w[45] = hc_bytealign (w[12], w[13], offset); + w[44] = hc_bytealign (w[11], w[12], offset); + w[43] = hc_bytealign (w[10], w[11], offset); + w[42] = hc_bytealign (w[ 9], w[10], offset); + w[41] = hc_bytealign (w[ 8], w[ 9], offset); + w[40] = hc_bytealign (w[ 7], w[ 8], offset); + w[39] = hc_bytealign (w[ 6], w[ 7], offset); + w[38] = hc_bytealign (w[ 5], w[ 6], offset); + w[37] = hc_bytealign (w[ 4], w[ 5], offset); + w[36] = hc_bytealign (w[ 3], w[ 4], offset); + w[35] = hc_bytealign (w[ 2], w[ 3], offset); + w[34] = hc_bytealign (w[ 1], w[ 2], offset); + w[33] = hc_bytealign (w[ 0], w[ 1], offset); + w[32] = hc_bytealign ( 0, w[ 0], offset); w[31] = 0; w[30] = 0; w[29] = 0; @@ -13910,37 +13910,37 @@ DECLSPEC void switch_buffer_by_offset_1x64_le (u32x *w, const u32 offset) break; case 33: - w[63] = amd_bytealign (w[29], w[30], offset); - w[62] = amd_bytealign (w[28], w[29], offset); - w[61] = amd_bytealign (w[27], w[28], offset); - w[60] = amd_bytealign (w[26], w[27], offset); - w[59] = amd_bytealign (w[25], w[26], offset); - w[58] = amd_bytealign (w[24], w[25], offset); - w[57] = amd_bytealign (w[23], w[24], offset); - w[56] = amd_bytealign (w[22], w[23], offset); - w[55] = amd_bytealign (w[21], w[22], offset); - w[54] = amd_bytealign (w[20], w[21], offset); - w[53] = amd_bytealign (w[19], w[20], offset); - w[52] = amd_bytealign (w[18], w[19], offset); - w[51] = amd_bytealign (w[17], w[18], offset); - w[50] = amd_bytealign (w[16], w[17], offset); - w[49] = amd_bytealign (w[15], w[16], offset); - w[48] = amd_bytealign (w[14], w[15], offset); - w[47] = amd_bytealign (w[13], w[14], offset); - w[46] = amd_bytealign (w[12], w[13], offset); - w[45] = amd_bytealign (w[11], w[12], offset); - w[44] = amd_bytealign (w[10], w[11], offset); - w[43] = amd_bytealign (w[ 9], w[10], offset); - w[42] = amd_bytealign (w[ 8], w[ 9], offset); - w[41] = amd_bytealign (w[ 7], w[ 8], offset); - w[40] = amd_bytealign (w[ 6], w[ 7], offset); - w[39] = amd_bytealign (w[ 5], w[ 6], offset); - w[38] = amd_bytealign (w[ 4], w[ 5], offset); - w[37] = amd_bytealign (w[ 3], w[ 4], offset); - w[36] = amd_bytealign (w[ 2], w[ 3], offset); - w[35] = amd_bytealign (w[ 1], w[ 2], offset); - w[34] = amd_bytealign (w[ 0], w[ 1], offset); - w[33] = amd_bytealign ( 0, w[ 0], offset); + w[63] = hc_bytealign (w[29], w[30], offset); + w[62] = hc_bytealign (w[28], w[29], offset); + w[61] = hc_bytealign (w[27], w[28], offset); + w[60] = hc_bytealign (w[26], w[27], offset); + w[59] = hc_bytealign (w[25], w[26], offset); + w[58] = hc_bytealign (w[24], w[25], offset); + w[57] = hc_bytealign (w[23], w[24], offset); + w[56] = hc_bytealign (w[22], w[23], offset); + w[55] = hc_bytealign (w[21], w[22], offset); + w[54] = hc_bytealign (w[20], w[21], offset); + w[53] = hc_bytealign (w[19], w[20], offset); + w[52] = hc_bytealign (w[18], w[19], offset); + w[51] = hc_bytealign (w[17], w[18], offset); + w[50] = hc_bytealign (w[16], w[17], offset); + w[49] = hc_bytealign (w[15], w[16], offset); + w[48] = hc_bytealign (w[14], w[15], offset); + w[47] = hc_bytealign (w[13], w[14], offset); + w[46] = hc_bytealign (w[12], w[13], offset); + w[45] = hc_bytealign (w[11], w[12], offset); + w[44] = hc_bytealign (w[10], w[11], offset); + w[43] = hc_bytealign (w[ 9], w[10], offset); + w[42] = hc_bytealign (w[ 8], w[ 9], offset); + w[41] = hc_bytealign (w[ 7], w[ 8], offset); + w[40] = hc_bytealign (w[ 6], w[ 7], offset); + w[39] = hc_bytealign (w[ 5], w[ 6], offset); + w[38] = hc_bytealign (w[ 4], w[ 5], offset); + w[37] = hc_bytealign (w[ 3], w[ 4], offset); + w[36] = hc_bytealign (w[ 2], w[ 3], offset); + w[35] = hc_bytealign (w[ 1], w[ 2], offset); + w[34] = hc_bytealign (w[ 0], w[ 1], offset); + w[33] = hc_bytealign ( 0, w[ 0], offset); w[32] = 0; w[31] = 0; w[30] = 0; @@ -13978,36 +13978,36 @@ DECLSPEC void switch_buffer_by_offset_1x64_le (u32x *w, const u32 offset) break; case 34: - w[63] = amd_bytealign (w[28], w[29], offset); - w[62] = amd_bytealign (w[27], w[28], offset); - w[61] = amd_bytealign (w[26], w[27], offset); - w[60] = amd_bytealign (w[25], w[26], offset); - w[59] = amd_bytealign (w[24], w[25], offset); - w[58] = amd_bytealign (w[23], w[24], offset); - w[57] = amd_bytealign (w[22], w[23], offset); - w[56] = amd_bytealign (w[21], w[22], offset); - w[55] = amd_bytealign (w[20], w[21], offset); - w[54] = amd_bytealign (w[19], w[20], offset); - w[53] = amd_bytealign (w[18], w[19], offset); - w[52] = amd_bytealign (w[17], w[18], offset); - w[51] = amd_bytealign (w[16], w[17], offset); - w[50] = amd_bytealign (w[15], w[16], offset); - w[49] = amd_bytealign (w[14], w[15], offset); - w[48] = amd_bytealign (w[13], w[14], offset); - w[47] = amd_bytealign (w[12], w[13], offset); - w[46] = amd_bytealign (w[11], w[12], offset); - w[45] = amd_bytealign (w[10], w[11], offset); - w[44] = amd_bytealign (w[ 9], w[10], offset); - w[43] = amd_bytealign (w[ 8], w[ 9], offset); - w[42] = amd_bytealign (w[ 7], w[ 8], offset); - w[41] = amd_bytealign (w[ 6], w[ 7], offset); - w[40] = amd_bytealign (w[ 5], w[ 6], offset); - w[39] = amd_bytealign (w[ 4], w[ 5], offset); - w[38] = amd_bytealign (w[ 3], w[ 4], offset); - w[37] = amd_bytealign (w[ 2], w[ 3], offset); - w[36] = amd_bytealign (w[ 1], w[ 2], offset); - w[35] = amd_bytealign (w[ 0], w[ 1], offset); - w[34] = amd_bytealign ( 0, w[ 0], offset); + w[63] = hc_bytealign (w[28], w[29], offset); + w[62] = hc_bytealign (w[27], w[28], offset); + w[61] = hc_bytealign (w[26], w[27], offset); + w[60] = hc_bytealign (w[25], w[26], offset); + w[59] = hc_bytealign (w[24], w[25], offset); + w[58] = hc_bytealign (w[23], w[24], offset); + w[57] = hc_bytealign (w[22], w[23], offset); + w[56] = hc_bytealign (w[21], w[22], offset); + w[55] = hc_bytealign (w[20], w[21], offset); + w[54] = hc_bytealign (w[19], w[20], offset); + w[53] = hc_bytealign (w[18], w[19], offset); + w[52] = hc_bytealign (w[17], w[18], offset); + w[51] = hc_bytealign (w[16], w[17], offset); + w[50] = hc_bytealign (w[15], w[16], offset); + w[49] = hc_bytealign (w[14], w[15], offset); + w[48] = hc_bytealign (w[13], w[14], offset); + w[47] = hc_bytealign (w[12], w[13], offset); + w[46] = hc_bytealign (w[11], w[12], offset); + w[45] = hc_bytealign (w[10], w[11], offset); + w[44] = hc_bytealign (w[ 9], w[10], offset); + w[43] = hc_bytealign (w[ 8], w[ 9], offset); + w[42] = hc_bytealign (w[ 7], w[ 8], offset); + w[41] = hc_bytealign (w[ 6], w[ 7], offset); + w[40] = hc_bytealign (w[ 5], w[ 6], offset); + w[39] = hc_bytealign (w[ 4], w[ 5], offset); + w[38] = hc_bytealign (w[ 3], w[ 4], offset); + w[37] = hc_bytealign (w[ 2], w[ 3], offset); + w[36] = hc_bytealign (w[ 1], w[ 2], offset); + w[35] = hc_bytealign (w[ 0], w[ 1], offset); + w[34] = hc_bytealign ( 0, w[ 0], offset); w[33] = 0; w[32] = 0; w[31] = 0; @@ -14046,35 +14046,35 @@ DECLSPEC void switch_buffer_by_offset_1x64_le (u32x *w, const u32 offset) break; case 35: - w[63] = amd_bytealign (w[27], w[28], offset); - w[62] = amd_bytealign (w[26], w[27], offset); - w[61] = amd_bytealign (w[25], w[26], offset); - w[60] = amd_bytealign (w[24], w[25], offset); - w[59] = amd_bytealign (w[23], w[24], offset); - w[58] = amd_bytealign (w[22], w[23], offset); - w[57] = amd_bytealign (w[21], w[22], offset); - w[56] = amd_bytealign (w[20], w[21], offset); - w[55] = amd_bytealign (w[19], w[20], offset); - w[54] = amd_bytealign (w[18], w[19], offset); - w[53] = amd_bytealign (w[17], w[18], offset); - w[52] = amd_bytealign (w[16], w[17], offset); - w[51] = amd_bytealign (w[15], w[16], offset); - w[50] = amd_bytealign (w[14], w[15], offset); - w[49] = amd_bytealign (w[13], w[14], offset); - w[48] = amd_bytealign (w[12], w[13], offset); - w[47] = amd_bytealign (w[11], w[12], offset); - w[46] = amd_bytealign (w[10], w[11], offset); - w[45] = amd_bytealign (w[ 9], w[10], offset); - w[44] = amd_bytealign (w[ 8], w[ 9], offset); - w[43] = amd_bytealign (w[ 7], w[ 8], offset); - w[42] = amd_bytealign (w[ 6], w[ 7], offset); - w[41] = amd_bytealign (w[ 5], w[ 6], offset); - w[40] = amd_bytealign (w[ 4], w[ 5], offset); - w[39] = amd_bytealign (w[ 3], w[ 4], offset); - w[38] = amd_bytealign (w[ 2], w[ 3], offset); - w[37] = amd_bytealign (w[ 1], w[ 2], offset); - w[36] = amd_bytealign (w[ 0], w[ 1], offset); - w[35] = amd_bytealign ( 0, w[ 0], offset); + w[63] = hc_bytealign (w[27], w[28], offset); + w[62] = hc_bytealign (w[26], w[27], offset); + w[61] = hc_bytealign (w[25], w[26], offset); + w[60] = hc_bytealign (w[24], w[25], offset); + w[59] = hc_bytealign (w[23], w[24], offset); + w[58] = hc_bytealign (w[22], w[23], offset); + w[57] = hc_bytealign (w[21], w[22], offset); + w[56] = hc_bytealign (w[20], w[21], offset); + w[55] = hc_bytealign (w[19], w[20], offset); + w[54] = hc_bytealign (w[18], w[19], offset); + w[53] = hc_bytealign (w[17], w[18], offset); + w[52] = hc_bytealign (w[16], w[17], offset); + w[51] = hc_bytealign (w[15], w[16], offset); + w[50] = hc_bytealign (w[14], w[15], offset); + w[49] = hc_bytealign (w[13], w[14], offset); + w[48] = hc_bytealign (w[12], w[13], offset); + w[47] = hc_bytealign (w[11], w[12], offset); + w[46] = hc_bytealign (w[10], w[11], offset); + w[45] = hc_bytealign (w[ 9], w[10], offset); + w[44] = hc_bytealign (w[ 8], w[ 9], offset); + w[43] = hc_bytealign (w[ 7], w[ 8], offset); + w[42] = hc_bytealign (w[ 6], w[ 7], offset); + w[41] = hc_bytealign (w[ 5], w[ 6], offset); + w[40] = hc_bytealign (w[ 4], w[ 5], offset); + w[39] = hc_bytealign (w[ 3], w[ 4], offset); + w[38] = hc_bytealign (w[ 2], w[ 3], offset); + w[37] = hc_bytealign (w[ 1], w[ 2], offset); + w[36] = hc_bytealign (w[ 0], w[ 1], offset); + w[35] = hc_bytealign ( 0, w[ 0], offset); w[34] = 0; w[33] = 0; w[32] = 0; @@ -14114,34 +14114,34 @@ DECLSPEC void switch_buffer_by_offset_1x64_le (u32x *w, const u32 offset) break; case 36: - w[63] = amd_bytealign (w[26], w[27], offset); - w[62] = amd_bytealign (w[25], w[26], offset); - w[61] = amd_bytealign (w[24], w[25], offset); - w[60] = amd_bytealign (w[23], w[24], offset); - w[59] = amd_bytealign (w[22], w[23], offset); - w[58] = amd_bytealign (w[21], w[22], offset); - w[57] = amd_bytealign (w[20], w[21], offset); - w[56] = amd_bytealign (w[19], w[20], offset); - w[55] = amd_bytealign (w[18], w[19], offset); - w[54] = amd_bytealign (w[17], w[18], offset); - w[53] = amd_bytealign (w[16], w[17], offset); - w[52] = amd_bytealign (w[15], w[16], offset); - w[51] = amd_bytealign (w[14], w[15], offset); - w[50] = amd_bytealign (w[13], w[14], offset); - w[49] = amd_bytealign (w[12], w[13], offset); - w[48] = amd_bytealign (w[11], w[12], offset); - w[47] = amd_bytealign (w[10], w[11], offset); - w[46] = amd_bytealign (w[ 9], w[10], offset); - w[45] = amd_bytealign (w[ 8], w[ 9], offset); - w[44] = amd_bytealign (w[ 7], w[ 8], offset); - w[43] = amd_bytealign (w[ 6], w[ 7], offset); - w[42] = amd_bytealign (w[ 5], w[ 6], offset); - w[41] = amd_bytealign (w[ 4], w[ 5], offset); - w[40] = amd_bytealign (w[ 3], w[ 4], offset); - w[39] = amd_bytealign (w[ 2], w[ 3], offset); - w[38] = amd_bytealign (w[ 1], w[ 2], offset); - w[37] = amd_bytealign (w[ 0], w[ 1], offset); - w[36] = amd_bytealign ( 0, w[ 0], offset); + w[63] = hc_bytealign (w[26], w[27], offset); + w[62] = hc_bytealign (w[25], w[26], offset); + w[61] = hc_bytealign (w[24], w[25], offset); + w[60] = hc_bytealign (w[23], w[24], offset); + w[59] = hc_bytealign (w[22], w[23], offset); + w[58] = hc_bytealign (w[21], w[22], offset); + w[57] = hc_bytealign (w[20], w[21], offset); + w[56] = hc_bytealign (w[19], w[20], offset); + w[55] = hc_bytealign (w[18], w[19], offset); + w[54] = hc_bytealign (w[17], w[18], offset); + w[53] = hc_bytealign (w[16], w[17], offset); + w[52] = hc_bytealign (w[15], w[16], offset); + w[51] = hc_bytealign (w[14], w[15], offset); + w[50] = hc_bytealign (w[13], w[14], offset); + w[49] = hc_bytealign (w[12], w[13], offset); + w[48] = hc_bytealign (w[11], w[12], offset); + w[47] = hc_bytealign (w[10], w[11], offset); + w[46] = hc_bytealign (w[ 9], w[10], offset); + w[45] = hc_bytealign (w[ 8], w[ 9], offset); + w[44] = hc_bytealign (w[ 7], w[ 8], offset); + w[43] = hc_bytealign (w[ 6], w[ 7], offset); + w[42] = hc_bytealign (w[ 5], w[ 6], offset); + w[41] = hc_bytealign (w[ 4], w[ 5], offset); + w[40] = hc_bytealign (w[ 3], w[ 4], offset); + w[39] = hc_bytealign (w[ 2], w[ 3], offset); + w[38] = hc_bytealign (w[ 1], w[ 2], offset); + w[37] = hc_bytealign (w[ 0], w[ 1], offset); + w[36] = hc_bytealign ( 0, w[ 0], offset); w[35] = 0; w[34] = 0; w[33] = 0; @@ -14182,33 +14182,33 @@ DECLSPEC void switch_buffer_by_offset_1x64_le (u32x *w, const u32 offset) break; case 37: - w[63] = amd_bytealign (w[25], w[26], offset); - w[62] = amd_bytealign (w[24], w[25], offset); - w[61] = amd_bytealign (w[23], w[24], offset); - w[60] = amd_bytealign (w[22], w[23], offset); - w[59] = amd_bytealign (w[21], w[22], offset); - w[58] = amd_bytealign (w[20], w[21], offset); - w[57] = amd_bytealign (w[19], w[20], offset); - w[56] = amd_bytealign (w[18], w[19], offset); - w[55] = amd_bytealign (w[17], w[18], offset); - w[54] = amd_bytealign (w[16], w[17], offset); - w[53] = amd_bytealign (w[15], w[16], offset); - w[52] = amd_bytealign (w[14], w[15], offset); - w[51] = amd_bytealign (w[13], w[14], offset); - w[50] = amd_bytealign (w[12], w[13], offset); - w[49] = amd_bytealign (w[11], w[12], offset); - w[48] = amd_bytealign (w[10], w[11], offset); - w[47] = amd_bytealign (w[ 9], w[10], offset); - w[46] = amd_bytealign (w[ 8], w[ 9], offset); - w[45] = amd_bytealign (w[ 7], w[ 8], offset); - w[44] = amd_bytealign (w[ 6], w[ 7], offset); - w[43] = amd_bytealign (w[ 5], w[ 6], offset); - w[42] = amd_bytealign (w[ 4], w[ 5], offset); - w[41] = amd_bytealign (w[ 3], w[ 4], offset); - w[40] = amd_bytealign (w[ 2], w[ 3], offset); - w[39] = amd_bytealign (w[ 1], w[ 2], offset); - w[38] = amd_bytealign (w[ 0], w[ 1], offset); - w[37] = amd_bytealign ( 0, w[ 0], offset); + w[63] = hc_bytealign (w[25], w[26], offset); + w[62] = hc_bytealign (w[24], w[25], offset); + w[61] = hc_bytealign (w[23], w[24], offset); + w[60] = hc_bytealign (w[22], w[23], offset); + w[59] = hc_bytealign (w[21], w[22], offset); + w[58] = hc_bytealign (w[20], w[21], offset); + w[57] = hc_bytealign (w[19], w[20], offset); + w[56] = hc_bytealign (w[18], w[19], offset); + w[55] = hc_bytealign (w[17], w[18], offset); + w[54] = hc_bytealign (w[16], w[17], offset); + w[53] = hc_bytealign (w[15], w[16], offset); + w[52] = hc_bytealign (w[14], w[15], offset); + w[51] = hc_bytealign (w[13], w[14], offset); + w[50] = hc_bytealign (w[12], w[13], offset); + w[49] = hc_bytealign (w[11], w[12], offset); + w[48] = hc_bytealign (w[10], w[11], offset); + w[47] = hc_bytealign (w[ 9], w[10], offset); + w[46] = hc_bytealign (w[ 8], w[ 9], offset); + w[45] = hc_bytealign (w[ 7], w[ 8], offset); + w[44] = hc_bytealign (w[ 6], w[ 7], offset); + w[43] = hc_bytealign (w[ 5], w[ 6], offset); + w[42] = hc_bytealign (w[ 4], w[ 5], offset); + w[41] = hc_bytealign (w[ 3], w[ 4], offset); + w[40] = hc_bytealign (w[ 2], w[ 3], offset); + w[39] = hc_bytealign (w[ 1], w[ 2], offset); + w[38] = hc_bytealign (w[ 0], w[ 1], offset); + w[37] = hc_bytealign ( 0, w[ 0], offset); w[36] = 0; w[35] = 0; w[34] = 0; @@ -14250,32 +14250,32 @@ DECLSPEC void switch_buffer_by_offset_1x64_le (u32x *w, const u32 offset) break; case 38: - w[63] = amd_bytealign (w[24], w[25], offset); - w[62] = amd_bytealign (w[23], w[24], offset); - w[61] = amd_bytealign (w[22], w[23], offset); - w[60] = amd_bytealign (w[21], w[22], offset); - w[59] = amd_bytealign (w[20], w[21], offset); - w[58] = amd_bytealign (w[19], w[20], offset); - w[57] = amd_bytealign (w[18], w[19], offset); - w[56] = amd_bytealign (w[17], w[18], offset); - w[55] = amd_bytealign (w[16], w[17], offset); - w[54] = amd_bytealign (w[15], w[16], offset); - w[53] = amd_bytealign (w[14], w[15], offset); - w[52] = amd_bytealign (w[13], w[14], offset); - w[51] = amd_bytealign (w[12], w[13], offset); - w[50] = amd_bytealign (w[11], w[12], offset); - w[49] = amd_bytealign (w[10], w[11], offset); - w[48] = amd_bytealign (w[ 9], w[10], offset); - w[47] = amd_bytealign (w[ 8], w[ 9], offset); - w[46] = amd_bytealign (w[ 7], w[ 8], offset); - w[45] = amd_bytealign (w[ 6], w[ 7], offset); - w[44] = amd_bytealign (w[ 5], w[ 6], offset); - w[43] = amd_bytealign (w[ 4], w[ 5], offset); - w[42] = amd_bytealign (w[ 3], w[ 4], offset); - w[41] = amd_bytealign (w[ 2], w[ 3], offset); - w[40] = amd_bytealign (w[ 1], w[ 2], offset); - w[39] = amd_bytealign (w[ 0], w[ 1], offset); - w[38] = amd_bytealign ( 0, w[ 0], offset); + w[63] = hc_bytealign (w[24], w[25], offset); + w[62] = hc_bytealign (w[23], w[24], offset); + w[61] = hc_bytealign (w[22], w[23], offset); + w[60] = hc_bytealign (w[21], w[22], offset); + w[59] = hc_bytealign (w[20], w[21], offset); + w[58] = hc_bytealign (w[19], w[20], offset); + w[57] = hc_bytealign (w[18], w[19], offset); + w[56] = hc_bytealign (w[17], w[18], offset); + w[55] = hc_bytealign (w[16], w[17], offset); + w[54] = hc_bytealign (w[15], w[16], offset); + w[53] = hc_bytealign (w[14], w[15], offset); + w[52] = hc_bytealign (w[13], w[14], offset); + w[51] = hc_bytealign (w[12], w[13], offset); + w[50] = hc_bytealign (w[11], w[12], offset); + w[49] = hc_bytealign (w[10], w[11], offset); + w[48] = hc_bytealign (w[ 9], w[10], offset); + w[47] = hc_bytealign (w[ 8], w[ 9], offset); + w[46] = hc_bytealign (w[ 7], w[ 8], offset); + w[45] = hc_bytealign (w[ 6], w[ 7], offset); + w[44] = hc_bytealign (w[ 5], w[ 6], offset); + w[43] = hc_bytealign (w[ 4], w[ 5], offset); + w[42] = hc_bytealign (w[ 3], w[ 4], offset); + w[41] = hc_bytealign (w[ 2], w[ 3], offset); + w[40] = hc_bytealign (w[ 1], w[ 2], offset); + w[39] = hc_bytealign (w[ 0], w[ 1], offset); + w[38] = hc_bytealign ( 0, w[ 0], offset); w[37] = 0; w[36] = 0; w[35] = 0; @@ -14318,31 +14318,31 @@ DECLSPEC void switch_buffer_by_offset_1x64_le (u32x *w, const u32 offset) break; case 39: - w[63] = amd_bytealign (w[23], w[24], offset); - w[62] = amd_bytealign (w[22], w[23], offset); - w[61] = amd_bytealign (w[21], w[22], offset); - w[60] = amd_bytealign (w[20], w[21], offset); - w[59] = amd_bytealign (w[19], w[20], offset); - w[58] = amd_bytealign (w[18], w[19], offset); - w[57] = amd_bytealign (w[17], w[18], offset); - w[56] = amd_bytealign (w[16], w[17], offset); - w[55] = amd_bytealign (w[15], w[16], offset); - w[54] = amd_bytealign (w[14], w[15], offset); - w[53] = amd_bytealign (w[13], w[14], offset); - w[52] = amd_bytealign (w[12], w[13], offset); - w[51] = amd_bytealign (w[11], w[12], offset); - w[50] = amd_bytealign (w[10], w[11], offset); - w[49] = amd_bytealign (w[ 9], w[10], offset); - w[48] = amd_bytealign (w[ 8], w[ 9], offset); - w[47] = amd_bytealign (w[ 7], w[ 8], offset); - w[46] = amd_bytealign (w[ 6], w[ 7], offset); - w[45] = amd_bytealign (w[ 5], w[ 6], offset); - w[44] = amd_bytealign (w[ 4], w[ 5], offset); - w[43] = amd_bytealign (w[ 3], w[ 4], offset); - w[42] = amd_bytealign (w[ 2], w[ 3], offset); - w[41] = amd_bytealign (w[ 1], w[ 2], offset); - w[40] = amd_bytealign (w[ 0], w[ 1], offset); - w[39] = amd_bytealign ( 0, w[ 0], offset); + w[63] = hc_bytealign (w[23], w[24], offset); + w[62] = hc_bytealign (w[22], w[23], offset); + w[61] = hc_bytealign (w[21], w[22], offset); + w[60] = hc_bytealign (w[20], w[21], offset); + w[59] = hc_bytealign (w[19], w[20], offset); + w[58] = hc_bytealign (w[18], w[19], offset); + w[57] = hc_bytealign (w[17], w[18], offset); + w[56] = hc_bytealign (w[16], w[17], offset); + w[55] = hc_bytealign (w[15], w[16], offset); + w[54] = hc_bytealign (w[14], w[15], offset); + w[53] = hc_bytealign (w[13], w[14], offset); + w[52] = hc_bytealign (w[12], w[13], offset); + w[51] = hc_bytealign (w[11], w[12], offset); + w[50] = hc_bytealign (w[10], w[11], offset); + w[49] = hc_bytealign (w[ 9], w[10], offset); + w[48] = hc_bytealign (w[ 8], w[ 9], offset); + w[47] = hc_bytealign (w[ 7], w[ 8], offset); + w[46] = hc_bytealign (w[ 6], w[ 7], offset); + w[45] = hc_bytealign (w[ 5], w[ 6], offset); + w[44] = hc_bytealign (w[ 4], w[ 5], offset); + w[43] = hc_bytealign (w[ 3], w[ 4], offset); + w[42] = hc_bytealign (w[ 2], w[ 3], offset); + w[41] = hc_bytealign (w[ 1], w[ 2], offset); + w[40] = hc_bytealign (w[ 0], w[ 1], offset); + w[39] = hc_bytealign ( 0, w[ 0], offset); w[38] = 0; w[37] = 0; w[36] = 0; @@ -14386,30 +14386,30 @@ DECLSPEC void switch_buffer_by_offset_1x64_le (u32x *w, const u32 offset) break; case 40: - w[63] = amd_bytealign (w[22], w[23], offset); - w[62] = amd_bytealign (w[21], w[22], offset); - w[61] = amd_bytealign (w[20], w[21], offset); - w[60] = amd_bytealign (w[19], w[20], offset); - w[59] = amd_bytealign (w[18], w[19], offset); - w[58] = amd_bytealign (w[17], w[18], offset); - w[57] = amd_bytealign (w[16], w[17], offset); - w[56] = amd_bytealign (w[15], w[16], offset); - w[55] = amd_bytealign (w[14], w[15], offset); - w[54] = amd_bytealign (w[13], w[14], offset); - w[53] = amd_bytealign (w[12], w[13], offset); - w[52] = amd_bytealign (w[11], w[12], offset); - w[51] = amd_bytealign (w[10], w[11], offset); - w[50] = amd_bytealign (w[ 9], w[10], offset); - w[49] = amd_bytealign (w[ 8], w[ 9], offset); - w[48] = amd_bytealign (w[ 7], w[ 8], offset); - w[47] = amd_bytealign (w[ 6], w[ 7], offset); - w[46] = amd_bytealign (w[ 5], w[ 6], offset); - w[45] = amd_bytealign (w[ 4], w[ 5], offset); - w[44] = amd_bytealign (w[ 3], w[ 4], offset); - w[43] = amd_bytealign (w[ 2], w[ 3], offset); - w[42] = amd_bytealign (w[ 1], w[ 2], offset); - w[41] = amd_bytealign (w[ 0], w[ 1], offset); - w[40] = amd_bytealign ( 0, w[ 0], offset); + w[63] = hc_bytealign (w[22], w[23], offset); + w[62] = hc_bytealign (w[21], w[22], offset); + w[61] = hc_bytealign (w[20], w[21], offset); + w[60] = hc_bytealign (w[19], w[20], offset); + w[59] = hc_bytealign (w[18], w[19], offset); + w[58] = hc_bytealign (w[17], w[18], offset); + w[57] = hc_bytealign (w[16], w[17], offset); + w[56] = hc_bytealign (w[15], w[16], offset); + w[55] = hc_bytealign (w[14], w[15], offset); + w[54] = hc_bytealign (w[13], w[14], offset); + w[53] = hc_bytealign (w[12], w[13], offset); + w[52] = hc_bytealign (w[11], w[12], offset); + w[51] = hc_bytealign (w[10], w[11], offset); + w[50] = hc_bytealign (w[ 9], w[10], offset); + w[49] = hc_bytealign (w[ 8], w[ 9], offset); + w[48] = hc_bytealign (w[ 7], w[ 8], offset); + w[47] = hc_bytealign (w[ 6], w[ 7], offset); + w[46] = hc_bytealign (w[ 5], w[ 6], offset); + w[45] = hc_bytealign (w[ 4], w[ 5], offset); + w[44] = hc_bytealign (w[ 3], w[ 4], offset); + w[43] = hc_bytealign (w[ 2], w[ 3], offset); + w[42] = hc_bytealign (w[ 1], w[ 2], offset); + w[41] = hc_bytealign (w[ 0], w[ 1], offset); + w[40] = hc_bytealign ( 0, w[ 0], offset); w[39] = 0; w[38] = 0; w[37] = 0; @@ -14454,29 +14454,29 @@ DECLSPEC void switch_buffer_by_offset_1x64_le (u32x *w, const u32 offset) break; case 41: - w[63] = amd_bytealign (w[21], w[22], offset); - w[62] = amd_bytealign (w[20], w[21], offset); - w[61] = amd_bytealign (w[19], w[20], offset); - w[60] = amd_bytealign (w[18], w[19], offset); - w[59] = amd_bytealign (w[17], w[18], offset); - w[58] = amd_bytealign (w[16], w[17], offset); - w[57] = amd_bytealign (w[15], w[16], offset); - w[56] = amd_bytealign (w[14], w[15], offset); - w[55] = amd_bytealign (w[13], w[14], offset); - w[54] = amd_bytealign (w[12], w[13], offset); - w[53] = amd_bytealign (w[11], w[12], offset); - w[52] = amd_bytealign (w[10], w[11], offset); - w[51] = amd_bytealign (w[ 9], w[10], offset); - w[50] = amd_bytealign (w[ 8], w[ 9], offset); - w[49] = amd_bytealign (w[ 7], w[ 8], offset); - w[48] = amd_bytealign (w[ 6], w[ 7], offset); - w[47] = amd_bytealign (w[ 5], w[ 6], offset); - w[46] = amd_bytealign (w[ 4], w[ 5], offset); - w[45] = amd_bytealign (w[ 3], w[ 4], offset); - w[44] = amd_bytealign (w[ 2], w[ 3], offset); - w[43] = amd_bytealign (w[ 1], w[ 2], offset); - w[42] = amd_bytealign (w[ 0], w[ 1], offset); - w[41] = amd_bytealign ( 0, w[ 0], offset); + w[63] = hc_bytealign (w[21], w[22], offset); + w[62] = hc_bytealign (w[20], w[21], offset); + w[61] = hc_bytealign (w[19], w[20], offset); + w[60] = hc_bytealign (w[18], w[19], offset); + w[59] = hc_bytealign (w[17], w[18], offset); + w[58] = hc_bytealign (w[16], w[17], offset); + w[57] = hc_bytealign (w[15], w[16], offset); + w[56] = hc_bytealign (w[14], w[15], offset); + w[55] = hc_bytealign (w[13], w[14], offset); + w[54] = hc_bytealign (w[12], w[13], offset); + w[53] = hc_bytealign (w[11], w[12], offset); + w[52] = hc_bytealign (w[10], w[11], offset); + w[51] = hc_bytealign (w[ 9], w[10], offset); + w[50] = hc_bytealign (w[ 8], w[ 9], offset); + w[49] = hc_bytealign (w[ 7], w[ 8], offset); + w[48] = hc_bytealign (w[ 6], w[ 7], offset); + w[47] = hc_bytealign (w[ 5], w[ 6], offset); + w[46] = hc_bytealign (w[ 4], w[ 5], offset); + w[45] = hc_bytealign (w[ 3], w[ 4], offset); + w[44] = hc_bytealign (w[ 2], w[ 3], offset); + w[43] = hc_bytealign (w[ 1], w[ 2], offset); + w[42] = hc_bytealign (w[ 0], w[ 1], offset); + w[41] = hc_bytealign ( 0, w[ 0], offset); w[40] = 0; w[39] = 0; w[38] = 0; @@ -14522,28 +14522,28 @@ DECLSPEC void switch_buffer_by_offset_1x64_le (u32x *w, const u32 offset) break; case 42: - w[63] = amd_bytealign (w[20], w[21], offset); - w[62] = amd_bytealign (w[19], w[20], offset); - w[61] = amd_bytealign (w[18], w[19], offset); - w[60] = amd_bytealign (w[17], w[18], offset); - w[59] = amd_bytealign (w[16], w[17], offset); - w[58] = amd_bytealign (w[15], w[16], offset); - w[57] = amd_bytealign (w[14], w[15], offset); - w[56] = amd_bytealign (w[13], w[14], offset); - w[55] = amd_bytealign (w[12], w[13], offset); - w[54] = amd_bytealign (w[11], w[12], offset); - w[53] = amd_bytealign (w[10], w[11], offset); - w[52] = amd_bytealign (w[ 9], w[10], offset); - w[51] = amd_bytealign (w[ 8], w[ 9], offset); - w[50] = amd_bytealign (w[ 7], w[ 8], offset); - w[49] = amd_bytealign (w[ 6], w[ 7], offset); - w[48] = amd_bytealign (w[ 5], w[ 6], offset); - w[47] = amd_bytealign (w[ 4], w[ 5], offset); - w[46] = amd_bytealign (w[ 3], w[ 4], offset); - w[45] = amd_bytealign (w[ 2], w[ 3], offset); - w[44] = amd_bytealign (w[ 1], w[ 2], offset); - w[43] = amd_bytealign (w[ 0], w[ 1], offset); - w[42] = amd_bytealign ( 0, w[ 0], offset); + w[63] = hc_bytealign (w[20], w[21], offset); + w[62] = hc_bytealign (w[19], w[20], offset); + w[61] = hc_bytealign (w[18], w[19], offset); + w[60] = hc_bytealign (w[17], w[18], offset); + w[59] = hc_bytealign (w[16], w[17], offset); + w[58] = hc_bytealign (w[15], w[16], offset); + w[57] = hc_bytealign (w[14], w[15], offset); + w[56] = hc_bytealign (w[13], w[14], offset); + w[55] = hc_bytealign (w[12], w[13], offset); + w[54] = hc_bytealign (w[11], w[12], offset); + w[53] = hc_bytealign (w[10], w[11], offset); + w[52] = hc_bytealign (w[ 9], w[10], offset); + w[51] = hc_bytealign (w[ 8], w[ 9], offset); + w[50] = hc_bytealign (w[ 7], w[ 8], offset); + w[49] = hc_bytealign (w[ 6], w[ 7], offset); + w[48] = hc_bytealign (w[ 5], w[ 6], offset); + w[47] = hc_bytealign (w[ 4], w[ 5], offset); + w[46] = hc_bytealign (w[ 3], w[ 4], offset); + w[45] = hc_bytealign (w[ 2], w[ 3], offset); + w[44] = hc_bytealign (w[ 1], w[ 2], offset); + w[43] = hc_bytealign (w[ 0], w[ 1], offset); + w[42] = hc_bytealign ( 0, w[ 0], offset); w[41] = 0; w[40] = 0; w[39] = 0; @@ -14590,27 +14590,27 @@ DECLSPEC void switch_buffer_by_offset_1x64_le (u32x *w, const u32 offset) break; case 43: - w[63] = amd_bytealign (w[19], w[20], offset); - w[62] = amd_bytealign (w[18], w[19], offset); - w[61] = amd_bytealign (w[17], w[18], offset); - w[60] = amd_bytealign (w[16], w[17], offset); - w[59] = amd_bytealign (w[15], w[16], offset); - w[58] = amd_bytealign (w[14], w[15], offset); - w[57] = amd_bytealign (w[13], w[14], offset); - w[56] = amd_bytealign (w[12], w[13], offset); - w[55] = amd_bytealign (w[11], w[12], offset); - w[54] = amd_bytealign (w[10], w[11], offset); - w[53] = amd_bytealign (w[ 9], w[10], offset); - w[52] = amd_bytealign (w[ 8], w[ 9], offset); - w[51] = amd_bytealign (w[ 7], w[ 8], offset); - w[50] = amd_bytealign (w[ 6], w[ 7], offset); - w[49] = amd_bytealign (w[ 5], w[ 6], offset); - w[48] = amd_bytealign (w[ 4], w[ 5], offset); - w[47] = amd_bytealign (w[ 3], w[ 4], offset); - w[46] = amd_bytealign (w[ 2], w[ 3], offset); - w[45] = amd_bytealign (w[ 1], w[ 2], offset); - w[44] = amd_bytealign (w[ 0], w[ 1], offset); - w[43] = amd_bytealign ( 0, w[ 0], offset); + w[63] = hc_bytealign (w[19], w[20], offset); + w[62] = hc_bytealign (w[18], w[19], offset); + w[61] = hc_bytealign (w[17], w[18], offset); + w[60] = hc_bytealign (w[16], w[17], offset); + w[59] = hc_bytealign (w[15], w[16], offset); + w[58] = hc_bytealign (w[14], w[15], offset); + w[57] = hc_bytealign (w[13], w[14], offset); + w[56] = hc_bytealign (w[12], w[13], offset); + w[55] = hc_bytealign (w[11], w[12], offset); + w[54] = hc_bytealign (w[10], w[11], offset); + w[53] = hc_bytealign (w[ 9], w[10], offset); + w[52] = hc_bytealign (w[ 8], w[ 9], offset); + w[51] = hc_bytealign (w[ 7], w[ 8], offset); + w[50] = hc_bytealign (w[ 6], w[ 7], offset); + w[49] = hc_bytealign (w[ 5], w[ 6], offset); + w[48] = hc_bytealign (w[ 4], w[ 5], offset); + w[47] = hc_bytealign (w[ 3], w[ 4], offset); + w[46] = hc_bytealign (w[ 2], w[ 3], offset); + w[45] = hc_bytealign (w[ 1], w[ 2], offset); + w[44] = hc_bytealign (w[ 0], w[ 1], offset); + w[43] = hc_bytealign ( 0, w[ 0], offset); w[42] = 0; w[41] = 0; w[40] = 0; @@ -14658,26 +14658,26 @@ DECLSPEC void switch_buffer_by_offset_1x64_le (u32x *w, const u32 offset) break; case 44: - w[63] = amd_bytealign (w[18], w[19], offset); - w[62] = amd_bytealign (w[17], w[18], offset); - w[61] = amd_bytealign (w[16], w[17], offset); - w[60] = amd_bytealign (w[15], w[16], offset); - w[59] = amd_bytealign (w[14], w[15], offset); - w[58] = amd_bytealign (w[13], w[14], offset); - w[57] = amd_bytealign (w[12], w[13], offset); - w[56] = amd_bytealign (w[11], w[12], offset); - w[55] = amd_bytealign (w[10], w[11], offset); - w[54] = amd_bytealign (w[ 9], w[10], offset); - w[53] = amd_bytealign (w[ 8], w[ 9], offset); - w[52] = amd_bytealign (w[ 7], w[ 8], offset); - w[51] = amd_bytealign (w[ 6], w[ 7], offset); - w[50] = amd_bytealign (w[ 5], w[ 6], offset); - w[49] = amd_bytealign (w[ 4], w[ 5], offset); - w[48] = amd_bytealign (w[ 3], w[ 4], offset); - w[47] = amd_bytealign (w[ 2], w[ 3], offset); - w[46] = amd_bytealign (w[ 1], w[ 2], offset); - w[45] = amd_bytealign (w[ 0], w[ 1], offset); - w[44] = amd_bytealign ( 0, w[ 0], offset); + w[63] = hc_bytealign (w[18], w[19], offset); + w[62] = hc_bytealign (w[17], w[18], offset); + w[61] = hc_bytealign (w[16], w[17], offset); + w[60] = hc_bytealign (w[15], w[16], offset); + w[59] = hc_bytealign (w[14], w[15], offset); + w[58] = hc_bytealign (w[13], w[14], offset); + w[57] = hc_bytealign (w[12], w[13], offset); + w[56] = hc_bytealign (w[11], w[12], offset); + w[55] = hc_bytealign (w[10], w[11], offset); + w[54] = hc_bytealign (w[ 9], w[10], offset); + w[53] = hc_bytealign (w[ 8], w[ 9], offset); + w[52] = hc_bytealign (w[ 7], w[ 8], offset); + w[51] = hc_bytealign (w[ 6], w[ 7], offset); + w[50] = hc_bytealign (w[ 5], w[ 6], offset); + w[49] = hc_bytealign (w[ 4], w[ 5], offset); + w[48] = hc_bytealign (w[ 3], w[ 4], offset); + w[47] = hc_bytealign (w[ 2], w[ 3], offset); + w[46] = hc_bytealign (w[ 1], w[ 2], offset); + w[45] = hc_bytealign (w[ 0], w[ 1], offset); + w[44] = hc_bytealign ( 0, w[ 0], offset); w[43] = 0; w[42] = 0; w[41] = 0; @@ -14726,25 +14726,25 @@ DECLSPEC void switch_buffer_by_offset_1x64_le (u32x *w, const u32 offset) break; case 45: - w[63] = amd_bytealign (w[17], w[18], offset); - w[62] = amd_bytealign (w[16], w[17], offset); - w[61] = amd_bytealign (w[15], w[16], offset); - w[60] = amd_bytealign (w[14], w[15], offset); - w[59] = amd_bytealign (w[13], w[14], offset); - w[58] = amd_bytealign (w[12], w[13], offset); - w[57] = amd_bytealign (w[11], w[12], offset); - w[56] = amd_bytealign (w[10], w[11], offset); - w[55] = amd_bytealign (w[ 9], w[10], offset); - w[54] = amd_bytealign (w[ 8], w[ 9], offset); - w[53] = amd_bytealign (w[ 7], w[ 8], offset); - w[52] = amd_bytealign (w[ 6], w[ 7], offset); - w[51] = amd_bytealign (w[ 5], w[ 6], offset); - w[50] = amd_bytealign (w[ 4], w[ 5], offset); - w[49] = amd_bytealign (w[ 3], w[ 4], offset); - w[48] = amd_bytealign (w[ 2], w[ 3], offset); - w[47] = amd_bytealign (w[ 1], w[ 2], offset); - w[46] = amd_bytealign (w[ 0], w[ 1], offset); - w[45] = amd_bytealign ( 0, w[ 0], offset); + w[63] = hc_bytealign (w[17], w[18], offset); + w[62] = hc_bytealign (w[16], w[17], offset); + w[61] = hc_bytealign (w[15], w[16], offset); + w[60] = hc_bytealign (w[14], w[15], offset); + w[59] = hc_bytealign (w[13], w[14], offset); + w[58] = hc_bytealign (w[12], w[13], offset); + w[57] = hc_bytealign (w[11], w[12], offset); + w[56] = hc_bytealign (w[10], w[11], offset); + w[55] = hc_bytealign (w[ 9], w[10], offset); + w[54] = hc_bytealign (w[ 8], w[ 9], offset); + w[53] = hc_bytealign (w[ 7], w[ 8], offset); + w[52] = hc_bytealign (w[ 6], w[ 7], offset); + w[51] = hc_bytealign (w[ 5], w[ 6], offset); + w[50] = hc_bytealign (w[ 4], w[ 5], offset); + w[49] = hc_bytealign (w[ 3], w[ 4], offset); + w[48] = hc_bytealign (w[ 2], w[ 3], offset); + w[47] = hc_bytealign (w[ 1], w[ 2], offset); + w[46] = hc_bytealign (w[ 0], w[ 1], offset); + w[45] = hc_bytealign ( 0, w[ 0], offset); w[44] = 0; w[43] = 0; w[42] = 0; @@ -14794,24 +14794,24 @@ DECLSPEC void switch_buffer_by_offset_1x64_le (u32x *w, const u32 offset) break; case 46: - w[63] = amd_bytealign (w[16], w[17], offset); - w[62] = amd_bytealign (w[15], w[16], offset); - w[61] = amd_bytealign (w[14], w[15], offset); - w[60] = amd_bytealign (w[13], w[14], offset); - w[59] = amd_bytealign (w[12], w[13], offset); - w[58] = amd_bytealign (w[11], w[12], offset); - w[57] = amd_bytealign (w[10], w[11], offset); - w[56] = amd_bytealign (w[ 9], w[10], offset); - w[55] = amd_bytealign (w[ 8], w[ 9], offset); - w[54] = amd_bytealign (w[ 7], w[ 8], offset); - w[53] = amd_bytealign (w[ 6], w[ 7], offset); - w[52] = amd_bytealign (w[ 5], w[ 6], offset); - w[51] = amd_bytealign (w[ 4], w[ 5], offset); - w[50] = amd_bytealign (w[ 3], w[ 4], offset); - w[49] = amd_bytealign (w[ 2], w[ 3], offset); - w[48] = amd_bytealign (w[ 1], w[ 2], offset); - w[47] = amd_bytealign (w[ 0], w[ 1], offset); - w[46] = amd_bytealign ( 0, w[ 0], offset); + w[63] = hc_bytealign (w[16], w[17], offset); + w[62] = hc_bytealign (w[15], w[16], offset); + w[61] = hc_bytealign (w[14], w[15], offset); + w[60] = hc_bytealign (w[13], w[14], offset); + w[59] = hc_bytealign (w[12], w[13], offset); + w[58] = hc_bytealign (w[11], w[12], offset); + w[57] = hc_bytealign (w[10], w[11], offset); + w[56] = hc_bytealign (w[ 9], w[10], offset); + w[55] = hc_bytealign (w[ 8], w[ 9], offset); + w[54] = hc_bytealign (w[ 7], w[ 8], offset); + w[53] = hc_bytealign (w[ 6], w[ 7], offset); + w[52] = hc_bytealign (w[ 5], w[ 6], offset); + w[51] = hc_bytealign (w[ 4], w[ 5], offset); + w[50] = hc_bytealign (w[ 3], w[ 4], offset); + w[49] = hc_bytealign (w[ 2], w[ 3], offset); + w[48] = hc_bytealign (w[ 1], w[ 2], offset); + w[47] = hc_bytealign (w[ 0], w[ 1], offset); + w[46] = hc_bytealign ( 0, w[ 0], offset); w[45] = 0; w[44] = 0; w[43] = 0; @@ -14862,23 +14862,23 @@ DECLSPEC void switch_buffer_by_offset_1x64_le (u32x *w, const u32 offset) break; case 47: - w[63] = amd_bytealign (w[15], w[16], offset); - w[62] = amd_bytealign (w[14], w[15], offset); - w[61] = amd_bytealign (w[13], w[14], offset); - w[60] = amd_bytealign (w[12], w[13], offset); - w[59] = amd_bytealign (w[11], w[12], offset); - w[58] = amd_bytealign (w[10], w[11], offset); - w[57] = amd_bytealign (w[ 9], w[10], offset); - w[56] = amd_bytealign (w[ 8], w[ 9], offset); - w[55] = amd_bytealign (w[ 7], w[ 8], offset); - w[54] = amd_bytealign (w[ 6], w[ 7], offset); - w[53] = amd_bytealign (w[ 5], w[ 6], offset); - w[52] = amd_bytealign (w[ 4], w[ 5], offset); - w[51] = amd_bytealign (w[ 3], w[ 4], offset); - w[50] = amd_bytealign (w[ 2], w[ 3], offset); - w[49] = amd_bytealign (w[ 1], w[ 2], offset); - w[48] = amd_bytealign (w[ 0], w[ 1], offset); - w[47] = amd_bytealign ( 0, w[ 0], offset); + w[63] = hc_bytealign (w[15], w[16], offset); + w[62] = hc_bytealign (w[14], w[15], offset); + w[61] = hc_bytealign (w[13], w[14], offset); + w[60] = hc_bytealign (w[12], w[13], offset); + w[59] = hc_bytealign (w[11], w[12], offset); + w[58] = hc_bytealign (w[10], w[11], offset); + w[57] = hc_bytealign (w[ 9], w[10], offset); + w[56] = hc_bytealign (w[ 8], w[ 9], offset); + w[55] = hc_bytealign (w[ 7], w[ 8], offset); + w[54] = hc_bytealign (w[ 6], w[ 7], offset); + w[53] = hc_bytealign (w[ 5], w[ 6], offset); + w[52] = hc_bytealign (w[ 4], w[ 5], offset); + w[51] = hc_bytealign (w[ 3], w[ 4], offset); + w[50] = hc_bytealign (w[ 2], w[ 3], offset); + w[49] = hc_bytealign (w[ 1], w[ 2], offset); + w[48] = hc_bytealign (w[ 0], w[ 1], offset); + w[47] = hc_bytealign ( 0, w[ 0], offset); w[46] = 0; w[45] = 0; w[44] = 0; @@ -14930,22 +14930,22 @@ DECLSPEC void switch_buffer_by_offset_1x64_le (u32x *w, const u32 offset) break; case 48: - w[63] = amd_bytealign (w[14], w[15], offset); - w[62] = amd_bytealign (w[13], w[14], offset); - w[61] = amd_bytealign (w[12], w[13], offset); - w[60] = amd_bytealign (w[11], w[12], offset); - w[59] = amd_bytealign (w[10], w[11], offset); - w[58] = amd_bytealign (w[ 9], w[10], offset); - w[57] = amd_bytealign (w[ 8], w[ 9], offset); - w[56] = amd_bytealign (w[ 7], w[ 8], offset); - w[55] = amd_bytealign (w[ 6], w[ 7], offset); - w[54] = amd_bytealign (w[ 5], w[ 6], offset); - w[53] = amd_bytealign (w[ 4], w[ 5], offset); - w[52] = amd_bytealign (w[ 3], w[ 4], offset); - w[51] = amd_bytealign (w[ 2], w[ 3], offset); - w[50] = amd_bytealign (w[ 1], w[ 2], offset); - w[49] = amd_bytealign (w[ 0], w[ 1], offset); - w[48] = amd_bytealign ( 0, w[ 0], offset); + w[63] = hc_bytealign (w[14], w[15], offset); + w[62] = hc_bytealign (w[13], w[14], offset); + w[61] = hc_bytealign (w[12], w[13], offset); + w[60] = hc_bytealign (w[11], w[12], offset); + w[59] = hc_bytealign (w[10], w[11], offset); + w[58] = hc_bytealign (w[ 9], w[10], offset); + w[57] = hc_bytealign (w[ 8], w[ 9], offset); + w[56] = hc_bytealign (w[ 7], w[ 8], offset); + w[55] = hc_bytealign (w[ 6], w[ 7], offset); + w[54] = hc_bytealign (w[ 5], w[ 6], offset); + w[53] = hc_bytealign (w[ 4], w[ 5], offset); + w[52] = hc_bytealign (w[ 3], w[ 4], offset); + w[51] = hc_bytealign (w[ 2], w[ 3], offset); + w[50] = hc_bytealign (w[ 1], w[ 2], offset); + w[49] = hc_bytealign (w[ 0], w[ 1], offset); + w[48] = hc_bytealign ( 0, w[ 0], offset); w[47] = 0; w[46] = 0; w[45] = 0; @@ -14998,21 +14998,21 @@ DECLSPEC void switch_buffer_by_offset_1x64_le (u32x *w, const u32 offset) break; case 49: - w[63] = amd_bytealign (w[13], w[14], offset); - w[62] = amd_bytealign (w[12], w[13], offset); - w[61] = amd_bytealign (w[11], w[12], offset); - w[60] = amd_bytealign (w[10], w[11], offset); - w[59] = amd_bytealign (w[ 9], w[10], offset); - w[58] = amd_bytealign (w[ 8], w[ 9], offset); - w[57] = amd_bytealign (w[ 7], w[ 8], offset); - w[56] = amd_bytealign (w[ 6], w[ 7], offset); - w[55] = amd_bytealign (w[ 5], w[ 6], offset); - w[54] = amd_bytealign (w[ 4], w[ 5], offset); - w[53] = amd_bytealign (w[ 3], w[ 4], offset); - w[52] = amd_bytealign (w[ 2], w[ 3], offset); - w[51] = amd_bytealign (w[ 1], w[ 2], offset); - w[50] = amd_bytealign (w[ 0], w[ 1], offset); - w[49] = amd_bytealign ( 0, w[ 0], offset); + w[63] = hc_bytealign (w[13], w[14], offset); + w[62] = hc_bytealign (w[12], w[13], offset); + w[61] = hc_bytealign (w[11], w[12], offset); + w[60] = hc_bytealign (w[10], w[11], offset); + w[59] = hc_bytealign (w[ 9], w[10], offset); + w[58] = hc_bytealign (w[ 8], w[ 9], offset); + w[57] = hc_bytealign (w[ 7], w[ 8], offset); + w[56] = hc_bytealign (w[ 6], w[ 7], offset); + w[55] = hc_bytealign (w[ 5], w[ 6], offset); + w[54] = hc_bytealign (w[ 4], w[ 5], offset); + w[53] = hc_bytealign (w[ 3], w[ 4], offset); + w[52] = hc_bytealign (w[ 2], w[ 3], offset); + w[51] = hc_bytealign (w[ 1], w[ 2], offset); + w[50] = hc_bytealign (w[ 0], w[ 1], offset); + w[49] = hc_bytealign ( 0, w[ 0], offset); w[48] = 0; w[47] = 0; w[46] = 0; @@ -15066,20 +15066,20 @@ DECLSPEC void switch_buffer_by_offset_1x64_le (u32x *w, const u32 offset) break; case 50: - w[63] = amd_bytealign (w[12], w[13], offset); - w[62] = amd_bytealign (w[11], w[12], offset); - w[61] = amd_bytealign (w[10], w[11], offset); - w[60] = amd_bytealign (w[ 9], w[10], offset); - w[59] = amd_bytealign (w[ 8], w[ 9], offset); - w[58] = amd_bytealign (w[ 7], w[ 8], offset); - w[57] = amd_bytealign (w[ 6], w[ 7], offset); - w[56] = amd_bytealign (w[ 5], w[ 6], offset); - w[55] = amd_bytealign (w[ 4], w[ 5], offset); - w[54] = amd_bytealign (w[ 3], w[ 4], offset); - w[53] = amd_bytealign (w[ 2], w[ 3], offset); - w[52] = amd_bytealign (w[ 1], w[ 2], offset); - w[51] = amd_bytealign (w[ 0], w[ 1], offset); - w[50] = amd_bytealign ( 0, w[ 0], offset); + w[63] = hc_bytealign (w[12], w[13], offset); + w[62] = hc_bytealign (w[11], w[12], offset); + w[61] = hc_bytealign (w[10], w[11], offset); + w[60] = hc_bytealign (w[ 9], w[10], offset); + w[59] = hc_bytealign (w[ 8], w[ 9], offset); + w[58] = hc_bytealign (w[ 7], w[ 8], offset); + w[57] = hc_bytealign (w[ 6], w[ 7], offset); + w[56] = hc_bytealign (w[ 5], w[ 6], offset); + w[55] = hc_bytealign (w[ 4], w[ 5], offset); + w[54] = hc_bytealign (w[ 3], w[ 4], offset); + w[53] = hc_bytealign (w[ 2], w[ 3], offset); + w[52] = hc_bytealign (w[ 1], w[ 2], offset); + w[51] = hc_bytealign (w[ 0], w[ 1], offset); + w[50] = hc_bytealign ( 0, w[ 0], offset); w[49] = 0; w[48] = 0; w[47] = 0; @@ -15134,19 +15134,19 @@ DECLSPEC void switch_buffer_by_offset_1x64_le (u32x *w, const u32 offset) break; case 51: - w[63] = amd_bytealign (w[11], w[12], offset); - w[62] = amd_bytealign (w[10], w[11], offset); - w[61] = amd_bytealign (w[ 9], w[10], offset); - w[60] = amd_bytealign (w[ 8], w[ 9], offset); - w[59] = amd_bytealign (w[ 7], w[ 8], offset); - w[58] = amd_bytealign (w[ 6], w[ 7], offset); - w[57] = amd_bytealign (w[ 5], w[ 6], offset); - w[56] = amd_bytealign (w[ 4], w[ 5], offset); - w[55] = amd_bytealign (w[ 3], w[ 4], offset); - w[54] = amd_bytealign (w[ 2], w[ 3], offset); - w[53] = amd_bytealign (w[ 1], w[ 2], offset); - w[52] = amd_bytealign (w[ 0], w[ 1], offset); - w[51] = amd_bytealign ( 0, w[ 0], offset); + w[63] = hc_bytealign (w[11], w[12], offset); + w[62] = hc_bytealign (w[10], w[11], offset); + w[61] = hc_bytealign (w[ 9], w[10], offset); + w[60] = hc_bytealign (w[ 8], w[ 9], offset); + w[59] = hc_bytealign (w[ 7], w[ 8], offset); + w[58] = hc_bytealign (w[ 6], w[ 7], offset); + w[57] = hc_bytealign (w[ 5], w[ 6], offset); + w[56] = hc_bytealign (w[ 4], w[ 5], offset); + w[55] = hc_bytealign (w[ 3], w[ 4], offset); + w[54] = hc_bytealign (w[ 2], w[ 3], offset); + w[53] = hc_bytealign (w[ 1], w[ 2], offset); + w[52] = hc_bytealign (w[ 0], w[ 1], offset); + w[51] = hc_bytealign ( 0, w[ 0], offset); w[50] = 0; w[49] = 0; w[48] = 0; @@ -15202,18 +15202,18 @@ DECLSPEC void switch_buffer_by_offset_1x64_le (u32x *w, const u32 offset) break; case 52: - w[63] = amd_bytealign (w[10], w[11], offset); - w[62] = amd_bytealign (w[ 9], w[10], offset); - w[61] = amd_bytealign (w[ 8], w[ 9], offset); - w[60] = amd_bytealign (w[ 7], w[ 8], offset); - w[59] = amd_bytealign (w[ 6], w[ 7], offset); - w[58] = amd_bytealign (w[ 5], w[ 6], offset); - w[57] = amd_bytealign (w[ 4], w[ 5], offset); - w[56] = amd_bytealign (w[ 3], w[ 4], offset); - w[55] = amd_bytealign (w[ 2], w[ 3], offset); - w[54] = amd_bytealign (w[ 1], w[ 2], offset); - w[53] = amd_bytealign (w[ 0], w[ 1], offset); - w[52] = amd_bytealign ( 0, w[ 0], offset); + w[63] = hc_bytealign (w[10], w[11], offset); + w[62] = hc_bytealign (w[ 9], w[10], offset); + w[61] = hc_bytealign (w[ 8], w[ 9], offset); + w[60] = hc_bytealign (w[ 7], w[ 8], offset); + w[59] = hc_bytealign (w[ 6], w[ 7], offset); + w[58] = hc_bytealign (w[ 5], w[ 6], offset); + w[57] = hc_bytealign (w[ 4], w[ 5], offset); + w[56] = hc_bytealign (w[ 3], w[ 4], offset); + w[55] = hc_bytealign (w[ 2], w[ 3], offset); + w[54] = hc_bytealign (w[ 1], w[ 2], offset); + w[53] = hc_bytealign (w[ 0], w[ 1], offset); + w[52] = hc_bytealign ( 0, w[ 0], offset); w[51] = 0; w[50] = 0; w[49] = 0; @@ -15270,17 +15270,17 @@ DECLSPEC void switch_buffer_by_offset_1x64_le (u32x *w, const u32 offset) break; case 53: - w[63] = amd_bytealign (w[ 9], w[10], offset); - w[62] = amd_bytealign (w[ 8], w[ 9], offset); - w[61] = amd_bytealign (w[ 7], w[ 8], offset); - w[60] = amd_bytealign (w[ 6], w[ 7], offset); - w[59] = amd_bytealign (w[ 5], w[ 6], offset); - w[58] = amd_bytealign (w[ 4], w[ 5], offset); - w[57] = amd_bytealign (w[ 3], w[ 4], offset); - w[56] = amd_bytealign (w[ 2], w[ 3], offset); - w[55] = amd_bytealign (w[ 1], w[ 2], offset); - w[54] = amd_bytealign (w[ 0], w[ 1], offset); - w[53] = amd_bytealign ( 0, w[ 0], offset); + w[63] = hc_bytealign (w[ 9], w[10], offset); + w[62] = hc_bytealign (w[ 8], w[ 9], offset); + w[61] = hc_bytealign (w[ 7], w[ 8], offset); + w[60] = hc_bytealign (w[ 6], w[ 7], offset); + w[59] = hc_bytealign (w[ 5], w[ 6], offset); + w[58] = hc_bytealign (w[ 4], w[ 5], offset); + w[57] = hc_bytealign (w[ 3], w[ 4], offset); + w[56] = hc_bytealign (w[ 2], w[ 3], offset); + w[55] = hc_bytealign (w[ 1], w[ 2], offset); + w[54] = hc_bytealign (w[ 0], w[ 1], offset); + w[53] = hc_bytealign ( 0, w[ 0], offset); w[52] = 0; w[51] = 0; w[50] = 0; @@ -15338,16 +15338,16 @@ DECLSPEC void switch_buffer_by_offset_1x64_le (u32x *w, const u32 offset) break; case 54: - w[63] = amd_bytealign (w[ 8], w[ 9], offset); - w[62] = amd_bytealign (w[ 7], w[ 8], offset); - w[61] = amd_bytealign (w[ 6], w[ 7], offset); - w[60] = amd_bytealign (w[ 5], w[ 6], offset); - w[59] = amd_bytealign (w[ 4], w[ 5], offset); - w[58] = amd_bytealign (w[ 3], w[ 4], offset); - w[57] = amd_bytealign (w[ 2], w[ 3], offset); - w[56] = amd_bytealign (w[ 1], w[ 2], offset); - w[55] = amd_bytealign (w[ 0], w[ 1], offset); - w[54] = amd_bytealign ( 0, w[ 0], offset); + w[63] = hc_bytealign (w[ 8], w[ 9], offset); + w[62] = hc_bytealign (w[ 7], w[ 8], offset); + w[61] = hc_bytealign (w[ 6], w[ 7], offset); + w[60] = hc_bytealign (w[ 5], w[ 6], offset); + w[59] = hc_bytealign (w[ 4], w[ 5], offset); + w[58] = hc_bytealign (w[ 3], w[ 4], offset); + w[57] = hc_bytealign (w[ 2], w[ 3], offset); + w[56] = hc_bytealign (w[ 1], w[ 2], offset); + w[55] = hc_bytealign (w[ 0], w[ 1], offset); + w[54] = hc_bytealign ( 0, w[ 0], offset); w[53] = 0; w[52] = 0; w[51] = 0; @@ -15406,15 +15406,15 @@ DECLSPEC void switch_buffer_by_offset_1x64_le (u32x *w, const u32 offset) break; case 55: - w[63] = amd_bytealign (w[ 7], w[ 8], offset); - w[62] = amd_bytealign (w[ 6], w[ 7], offset); - w[61] = amd_bytealign (w[ 5], w[ 6], offset); - w[60] = amd_bytealign (w[ 4], w[ 5], offset); - w[59] = amd_bytealign (w[ 3], w[ 4], offset); - w[58] = amd_bytealign (w[ 2], w[ 3], offset); - w[57] = amd_bytealign (w[ 1], w[ 2], offset); - w[56] = amd_bytealign (w[ 0], w[ 1], offset); - w[55] = amd_bytealign ( 0, w[ 0], offset); + w[63] = hc_bytealign (w[ 7], w[ 8], offset); + w[62] = hc_bytealign (w[ 6], w[ 7], offset); + w[61] = hc_bytealign (w[ 5], w[ 6], offset); + w[60] = hc_bytealign (w[ 4], w[ 5], offset); + w[59] = hc_bytealign (w[ 3], w[ 4], offset); + w[58] = hc_bytealign (w[ 2], w[ 3], offset); + w[57] = hc_bytealign (w[ 1], w[ 2], offset); + w[56] = hc_bytealign (w[ 0], w[ 1], offset); + w[55] = hc_bytealign ( 0, w[ 0], offset); w[54] = 0; w[53] = 0; w[52] = 0; @@ -15474,14 +15474,14 @@ DECLSPEC void switch_buffer_by_offset_1x64_le (u32x *w, const u32 offset) break; case 56: - w[63] = amd_bytealign (w[ 6], w[ 7], offset); - w[62] = amd_bytealign (w[ 5], w[ 6], offset); - w[61] = amd_bytealign (w[ 4], w[ 5], offset); - w[60] = amd_bytealign (w[ 3], w[ 4], offset); - w[59] = amd_bytealign (w[ 2], w[ 3], offset); - w[58] = amd_bytealign (w[ 1], w[ 2], offset); - w[57] = amd_bytealign (w[ 0], w[ 1], offset); - w[56] = amd_bytealign ( 0, w[ 0], offset); + w[63] = hc_bytealign (w[ 6], w[ 7], offset); + w[62] = hc_bytealign (w[ 5], w[ 6], offset); + w[61] = hc_bytealign (w[ 4], w[ 5], offset); + w[60] = hc_bytealign (w[ 3], w[ 4], offset); + w[59] = hc_bytealign (w[ 2], w[ 3], offset); + w[58] = hc_bytealign (w[ 1], w[ 2], offset); + w[57] = hc_bytealign (w[ 0], w[ 1], offset); + w[56] = hc_bytealign ( 0, w[ 0], offset); w[55] = 0; w[54] = 0; w[53] = 0; @@ -15542,13 +15542,13 @@ DECLSPEC void switch_buffer_by_offset_1x64_le (u32x *w, const u32 offset) break; case 57: - w[63] = amd_bytealign (w[ 5], w[ 6], offset); - w[62] = amd_bytealign (w[ 4], w[ 5], offset); - w[61] = amd_bytealign (w[ 3], w[ 4], offset); - w[60] = amd_bytealign (w[ 2], w[ 3], offset); - w[59] = amd_bytealign (w[ 1], w[ 2], offset); - w[58] = amd_bytealign (w[ 0], w[ 1], offset); - w[57] = amd_bytealign ( 0, w[ 0], offset); + w[63] = hc_bytealign (w[ 5], w[ 6], offset); + w[62] = hc_bytealign (w[ 4], w[ 5], offset); + w[61] = hc_bytealign (w[ 3], w[ 4], offset); + w[60] = hc_bytealign (w[ 2], w[ 3], offset); + w[59] = hc_bytealign (w[ 1], w[ 2], offset); + w[58] = hc_bytealign (w[ 0], w[ 1], offset); + w[57] = hc_bytealign ( 0, w[ 0], offset); w[56] = 0; w[55] = 0; w[54] = 0; @@ -15610,12 +15610,12 @@ DECLSPEC void switch_buffer_by_offset_1x64_le (u32x *w, const u32 offset) break; case 58: - w[63] = amd_bytealign (w[ 4], w[ 5], offset); - w[62] = amd_bytealign (w[ 3], w[ 4], offset); - w[61] = amd_bytealign (w[ 2], w[ 3], offset); - w[60] = amd_bytealign (w[ 1], w[ 2], offset); - w[59] = amd_bytealign (w[ 0], w[ 1], offset); - w[58] = amd_bytealign ( 0, w[ 0], offset); + w[63] = hc_bytealign (w[ 4], w[ 5], offset); + w[62] = hc_bytealign (w[ 3], w[ 4], offset); + w[61] = hc_bytealign (w[ 2], w[ 3], offset); + w[60] = hc_bytealign (w[ 1], w[ 2], offset); + w[59] = hc_bytealign (w[ 0], w[ 1], offset); + w[58] = hc_bytealign ( 0, w[ 0], offset); w[57] = 0; w[56] = 0; w[55] = 0; @@ -15678,11 +15678,11 @@ DECLSPEC void switch_buffer_by_offset_1x64_le (u32x *w, const u32 offset) break; case 59: - w[63] = amd_bytealign (w[ 3], w[ 4], offset); - w[62] = amd_bytealign (w[ 2], w[ 3], offset); - w[61] = amd_bytealign (w[ 1], w[ 2], offset); - w[60] = amd_bytealign (w[ 0], w[ 1], offset); - w[59] = amd_bytealign ( 0, w[ 0], offset); + w[63] = hc_bytealign (w[ 3], w[ 4], offset); + w[62] = hc_bytealign (w[ 2], w[ 3], offset); + w[61] = hc_bytealign (w[ 1], w[ 2], offset); + w[60] = hc_bytealign (w[ 0], w[ 1], offset); + w[59] = hc_bytealign ( 0, w[ 0], offset); w[58] = 0; w[57] = 0; w[56] = 0; @@ -15746,10 +15746,10 @@ DECLSPEC void switch_buffer_by_offset_1x64_le (u32x *w, const u32 offset) break; case 60: - w[63] = amd_bytealign (w[ 2], w[ 3], offset); - w[62] = amd_bytealign (w[ 1], w[ 2], offset); - w[61] = amd_bytealign (w[ 0], w[ 1], offset); - w[60] = amd_bytealign ( 0, w[ 0], offset); + w[63] = hc_bytealign (w[ 2], w[ 3], offset); + w[62] = hc_bytealign (w[ 1], w[ 2], offset); + w[61] = hc_bytealign (w[ 0], w[ 1], offset); + w[60] = hc_bytealign ( 0, w[ 0], offset); w[59] = 0; w[58] = 0; w[57] = 0; @@ -15814,9 +15814,9 @@ DECLSPEC void switch_buffer_by_offset_1x64_le (u32x *w, const u32 offset) break; case 61: - w[63] = amd_bytealign (w[ 1], w[ 2], offset); - w[62] = amd_bytealign (w[ 0], w[ 1], offset); - w[61] = amd_bytealign ( 0, w[ 0], offset); + w[63] = hc_bytealign (w[ 1], w[ 2], offset); + w[62] = hc_bytealign (w[ 0], w[ 1], offset); + w[61] = hc_bytealign ( 0, w[ 0], offset); w[60] = 0; w[59] = 0; w[58] = 0; @@ -15882,8 +15882,8 @@ DECLSPEC void switch_buffer_by_offset_1x64_le (u32x *w, const u32 offset) break; case 62: - w[63] = amd_bytealign (w[ 0], w[ 1], offset); - w[62] = amd_bytealign ( 0, w[ 0], offset); + w[63] = hc_bytealign (w[ 0], w[ 1], offset); + w[62] = hc_bytealign ( 0, w[ 0], offset); w[61] = 0; w[60] = 0; w[59] = 0; @@ -15950,7 +15950,7 @@ DECLSPEC void switch_buffer_by_offset_1x64_le (u32x *w, const u32 offset) break; case 63: - w[63] = amd_bytealign ( 0, w[ 0], offset); + w[63] = hc_bytealign ( 0, w[ 0], offset); w[62] = 0; w[61] = 0; w[60] = 0; @@ -16036,271 +16036,271 @@ DECLSPEC void switch_buffer_by_offset_1x64_le (u32x *w, const u32 offset) switch (offset_switch) { case 0: - w[63] = __byte_perm (w[62], w[63], selector); - w[62] = __byte_perm (w[61], w[62], selector); - w[61] = __byte_perm (w[60], w[61], selector); - w[60] = __byte_perm (w[59], w[60], selector); - w[59] = __byte_perm (w[58], w[59], selector); - w[58] = __byte_perm (w[57], w[58], selector); - w[57] = __byte_perm (w[56], w[57], selector); - w[56] = __byte_perm (w[55], w[56], selector); - w[55] = __byte_perm (w[54], w[55], selector); - w[54] = __byte_perm (w[53], w[54], selector); - w[53] = __byte_perm (w[52], w[53], selector); - w[52] = __byte_perm (w[51], w[52], selector); - w[51] = __byte_perm (w[50], w[51], selector); - w[50] = __byte_perm (w[49], w[50], selector); - w[49] = __byte_perm (w[48], w[49], selector); - w[48] = __byte_perm (w[47], w[48], selector); - w[47] = __byte_perm (w[46], w[47], selector); - w[46] = __byte_perm (w[45], w[46], selector); - w[45] = __byte_perm (w[44], w[45], selector); - w[44] = __byte_perm (w[43], w[44], selector); - w[43] = __byte_perm (w[42], w[43], selector); - w[42] = __byte_perm (w[41], w[42], selector); - w[41] = __byte_perm (w[40], w[41], selector); - w[40] = __byte_perm (w[39], w[40], selector); - w[39] = __byte_perm (w[38], w[39], selector); - w[38] = __byte_perm (w[37], w[38], selector); - w[37] = __byte_perm (w[36], w[37], selector); - w[36] = __byte_perm (w[35], w[36], selector); - w[35] = __byte_perm (w[34], w[35], selector); - w[34] = __byte_perm (w[33], w[34], selector); - w[33] = __byte_perm (w[32], w[33], selector); - w[32] = __byte_perm (w[31], w[32], selector); - w[31] = __byte_perm (w[30], w[31], selector); - w[30] = __byte_perm (w[29], w[30], selector); - w[29] = __byte_perm (w[28], w[29], selector); - w[28] = __byte_perm (w[27], w[28], selector); - w[27] = __byte_perm (w[26], w[27], selector); - w[26] = __byte_perm (w[25], w[26], selector); - w[25] = __byte_perm (w[24], w[25], selector); - w[24] = __byte_perm (w[23], w[24], selector); - w[23] = __byte_perm (w[22], w[23], selector); - w[22] = __byte_perm (w[21], w[22], selector); - w[21] = __byte_perm (w[20], w[21], selector); - w[20] = __byte_perm (w[19], w[20], selector); - w[19] = __byte_perm (w[18], w[19], selector); - w[18] = __byte_perm (w[17], w[18], selector); - w[17] = __byte_perm (w[16], w[17], selector); - w[16] = __byte_perm (w[15], w[16], selector); - w[15] = __byte_perm (w[14], w[15], selector); - w[14] = __byte_perm (w[13], w[14], selector); - w[13] = __byte_perm (w[12], w[13], selector); - w[12] = __byte_perm (w[11], w[12], selector); - w[11] = __byte_perm (w[10], w[11], selector); - w[10] = __byte_perm (w[ 9], w[10], selector); - w[ 9] = __byte_perm (w[ 8], w[ 9], selector); - w[ 8] = __byte_perm (w[ 7], w[ 8], selector); - w[ 7] = __byte_perm (w[ 6], w[ 7], selector); - w[ 6] = __byte_perm (w[ 5], w[ 6], selector); - w[ 5] = __byte_perm (w[ 4], w[ 5], selector); - w[ 4] = __byte_perm (w[ 3], w[ 4], selector); - w[ 3] = __byte_perm (w[ 2], w[ 3], selector); - w[ 2] = __byte_perm (w[ 1], w[ 2], selector); - w[ 1] = __byte_perm (w[ 0], w[ 1], selector); - w[ 0] = __byte_perm ( 0, w[ 0], selector); + w[63] = hc_byte_perm (w[62], w[63], selector); + w[62] = hc_byte_perm (w[61], w[62], selector); + w[61] = hc_byte_perm (w[60], w[61], selector); + w[60] = hc_byte_perm (w[59], w[60], selector); + w[59] = hc_byte_perm (w[58], w[59], selector); + w[58] = hc_byte_perm (w[57], w[58], selector); + w[57] = hc_byte_perm (w[56], w[57], selector); + w[56] = hc_byte_perm (w[55], w[56], selector); + w[55] = hc_byte_perm (w[54], w[55], selector); + w[54] = hc_byte_perm (w[53], w[54], selector); + w[53] = hc_byte_perm (w[52], w[53], selector); + w[52] = hc_byte_perm (w[51], w[52], selector); + w[51] = hc_byte_perm (w[50], w[51], selector); + w[50] = hc_byte_perm (w[49], w[50], selector); + w[49] = hc_byte_perm (w[48], w[49], selector); + w[48] = hc_byte_perm (w[47], w[48], selector); + w[47] = hc_byte_perm (w[46], w[47], selector); + w[46] = hc_byte_perm (w[45], w[46], selector); + w[45] = hc_byte_perm (w[44], w[45], selector); + w[44] = hc_byte_perm (w[43], w[44], selector); + w[43] = hc_byte_perm (w[42], w[43], selector); + w[42] = hc_byte_perm (w[41], w[42], selector); + w[41] = hc_byte_perm (w[40], w[41], selector); + w[40] = hc_byte_perm (w[39], w[40], selector); + w[39] = hc_byte_perm (w[38], w[39], selector); + w[38] = hc_byte_perm (w[37], w[38], selector); + w[37] = hc_byte_perm (w[36], w[37], selector); + w[36] = hc_byte_perm (w[35], w[36], selector); + w[35] = hc_byte_perm (w[34], w[35], selector); + w[34] = hc_byte_perm (w[33], w[34], selector); + w[33] = hc_byte_perm (w[32], w[33], selector); + w[32] = hc_byte_perm (w[31], w[32], selector); + w[31] = hc_byte_perm (w[30], w[31], selector); + w[30] = hc_byte_perm (w[29], w[30], selector); + w[29] = hc_byte_perm (w[28], w[29], selector); + w[28] = hc_byte_perm (w[27], w[28], selector); + w[27] = hc_byte_perm (w[26], w[27], selector); + w[26] = hc_byte_perm (w[25], w[26], selector); + w[25] = hc_byte_perm (w[24], w[25], selector); + w[24] = hc_byte_perm (w[23], w[24], selector); + w[23] = hc_byte_perm (w[22], w[23], selector); + w[22] = hc_byte_perm (w[21], w[22], selector); + w[21] = hc_byte_perm (w[20], w[21], selector); + w[20] = hc_byte_perm (w[19], w[20], selector); + w[19] = hc_byte_perm (w[18], w[19], selector); + w[18] = hc_byte_perm (w[17], w[18], selector); + w[17] = hc_byte_perm (w[16], w[17], selector); + w[16] = hc_byte_perm (w[15], w[16], selector); + w[15] = hc_byte_perm (w[14], w[15], selector); + w[14] = hc_byte_perm (w[13], w[14], selector); + w[13] = hc_byte_perm (w[12], w[13], selector); + w[12] = hc_byte_perm (w[11], w[12], selector); + w[11] = hc_byte_perm (w[10], w[11], selector); + w[10] = hc_byte_perm (w[ 9], w[10], selector); + w[ 9] = hc_byte_perm (w[ 8], w[ 9], selector); + w[ 8] = hc_byte_perm (w[ 7], w[ 8], selector); + w[ 7] = hc_byte_perm (w[ 6], w[ 7], selector); + w[ 6] = hc_byte_perm (w[ 5], w[ 6], selector); + w[ 5] = hc_byte_perm (w[ 4], w[ 5], selector); + w[ 4] = hc_byte_perm (w[ 3], w[ 4], selector); + w[ 3] = hc_byte_perm (w[ 2], w[ 3], selector); + w[ 2] = hc_byte_perm (w[ 1], w[ 2], selector); + w[ 1] = hc_byte_perm (w[ 0], w[ 1], selector); + w[ 0] = hc_byte_perm ( 0, w[ 0], selector); break; case 1: - w[63] = __byte_perm (w[61], w[62], selector); - w[62] = __byte_perm (w[60], w[61], selector); - w[61] = __byte_perm (w[59], w[60], selector); - w[60] = __byte_perm (w[58], w[59], selector); - w[59] = __byte_perm (w[57], w[58], selector); - w[58] = __byte_perm (w[56], w[57], selector); - w[57] = __byte_perm (w[55], w[56], selector); - w[56] = __byte_perm (w[54], w[55], selector); - w[55] = __byte_perm (w[53], w[54], selector); - w[54] = __byte_perm (w[52], w[53], selector); - w[53] = __byte_perm (w[51], w[52], selector); - w[52] = __byte_perm (w[50], w[51], selector); - w[51] = __byte_perm (w[49], w[50], selector); - w[50] = __byte_perm (w[48], w[49], selector); - w[49] = __byte_perm (w[47], w[48], selector); - w[48] = __byte_perm (w[46], w[47], selector); - w[47] = __byte_perm (w[45], w[46], selector); - w[46] = __byte_perm (w[44], w[45], selector); - w[45] = __byte_perm (w[43], w[44], selector); - w[44] = __byte_perm (w[42], w[43], selector); - w[43] = __byte_perm (w[41], w[42], selector); - w[42] = __byte_perm (w[40], w[41], selector); - w[41] = __byte_perm (w[39], w[40], selector); - w[40] = __byte_perm (w[38], w[39], selector); - w[39] = __byte_perm (w[37], w[38], selector); - w[38] = __byte_perm (w[36], w[37], selector); - w[37] = __byte_perm (w[35], w[36], selector); - w[36] = __byte_perm (w[34], w[35], selector); - w[35] = __byte_perm (w[33], w[34], selector); - w[34] = __byte_perm (w[32], w[33], selector); - w[33] = __byte_perm (w[31], w[32], selector); - w[32] = __byte_perm (w[30], w[31], selector); - w[31] = __byte_perm (w[29], w[30], selector); - w[30] = __byte_perm (w[28], w[29], selector); - w[29] = __byte_perm (w[27], w[28], selector); - w[28] = __byte_perm (w[26], w[27], selector); - w[27] = __byte_perm (w[25], w[26], selector); - w[26] = __byte_perm (w[24], w[25], selector); - w[25] = __byte_perm (w[23], w[24], selector); - w[24] = __byte_perm (w[22], w[23], selector); - w[23] = __byte_perm (w[21], w[22], selector); - w[22] = __byte_perm (w[20], w[21], selector); - w[21] = __byte_perm (w[19], w[20], selector); - w[20] = __byte_perm (w[18], w[19], selector); - w[19] = __byte_perm (w[17], w[18], selector); - w[18] = __byte_perm (w[16], w[17], selector); - w[17] = __byte_perm (w[15], w[16], selector); - w[16] = __byte_perm (w[14], w[15], selector); - w[15] = __byte_perm (w[13], w[14], selector); - w[14] = __byte_perm (w[12], w[13], selector); - w[13] = __byte_perm (w[11], w[12], selector); - w[12] = __byte_perm (w[10], w[11], selector); - w[11] = __byte_perm (w[ 9], w[10], selector); - w[10] = __byte_perm (w[ 8], w[ 9], selector); - w[ 9] = __byte_perm (w[ 7], w[ 8], selector); - w[ 8] = __byte_perm (w[ 6], w[ 7], selector); - w[ 7] = __byte_perm (w[ 5], w[ 6], selector); - w[ 6] = __byte_perm (w[ 4], w[ 5], selector); - w[ 5] = __byte_perm (w[ 3], w[ 4], selector); - w[ 4] = __byte_perm (w[ 2], w[ 3], selector); - w[ 3] = __byte_perm (w[ 1], w[ 2], selector); - w[ 2] = __byte_perm (w[ 0], w[ 1], selector); - w[ 1] = __byte_perm ( 0, w[ 0], selector); + w[63] = hc_byte_perm (w[61], w[62], selector); + w[62] = hc_byte_perm (w[60], w[61], selector); + w[61] = hc_byte_perm (w[59], w[60], selector); + w[60] = hc_byte_perm (w[58], w[59], selector); + w[59] = hc_byte_perm (w[57], w[58], selector); + w[58] = hc_byte_perm (w[56], w[57], selector); + w[57] = hc_byte_perm (w[55], w[56], selector); + w[56] = hc_byte_perm (w[54], w[55], selector); + w[55] = hc_byte_perm (w[53], w[54], selector); + w[54] = hc_byte_perm (w[52], w[53], selector); + w[53] = hc_byte_perm (w[51], w[52], selector); + w[52] = hc_byte_perm (w[50], w[51], selector); + w[51] = hc_byte_perm (w[49], w[50], selector); + w[50] = hc_byte_perm (w[48], w[49], selector); + w[49] = hc_byte_perm (w[47], w[48], selector); + w[48] = hc_byte_perm (w[46], w[47], selector); + w[47] = hc_byte_perm (w[45], w[46], selector); + w[46] = hc_byte_perm (w[44], w[45], selector); + w[45] = hc_byte_perm (w[43], w[44], selector); + w[44] = hc_byte_perm (w[42], w[43], selector); + w[43] = hc_byte_perm (w[41], w[42], selector); + w[42] = hc_byte_perm (w[40], w[41], selector); + w[41] = hc_byte_perm (w[39], w[40], selector); + w[40] = hc_byte_perm (w[38], w[39], selector); + w[39] = hc_byte_perm (w[37], w[38], selector); + w[38] = hc_byte_perm (w[36], w[37], selector); + w[37] = hc_byte_perm (w[35], w[36], selector); + w[36] = hc_byte_perm (w[34], w[35], selector); + w[35] = hc_byte_perm (w[33], w[34], selector); + w[34] = hc_byte_perm (w[32], w[33], selector); + w[33] = hc_byte_perm (w[31], w[32], selector); + w[32] = hc_byte_perm (w[30], w[31], selector); + w[31] = hc_byte_perm (w[29], w[30], selector); + w[30] = hc_byte_perm (w[28], w[29], selector); + w[29] = hc_byte_perm (w[27], w[28], selector); + w[28] = hc_byte_perm (w[26], w[27], selector); + w[27] = hc_byte_perm (w[25], w[26], selector); + w[26] = hc_byte_perm (w[24], w[25], selector); + w[25] = hc_byte_perm (w[23], w[24], selector); + w[24] = hc_byte_perm (w[22], w[23], selector); + w[23] = hc_byte_perm (w[21], w[22], selector); + w[22] = hc_byte_perm (w[20], w[21], selector); + w[21] = hc_byte_perm (w[19], w[20], selector); + w[20] = hc_byte_perm (w[18], w[19], selector); + w[19] = hc_byte_perm (w[17], w[18], selector); + w[18] = hc_byte_perm (w[16], w[17], selector); + w[17] = hc_byte_perm (w[15], w[16], selector); + w[16] = hc_byte_perm (w[14], w[15], selector); + w[15] = hc_byte_perm (w[13], w[14], selector); + w[14] = hc_byte_perm (w[12], w[13], selector); + w[13] = hc_byte_perm (w[11], w[12], selector); + w[12] = hc_byte_perm (w[10], w[11], selector); + w[11] = hc_byte_perm (w[ 9], w[10], selector); + w[10] = hc_byte_perm (w[ 8], w[ 9], selector); + w[ 9] = hc_byte_perm (w[ 7], w[ 8], selector); + w[ 8] = hc_byte_perm (w[ 6], w[ 7], selector); + w[ 7] = hc_byte_perm (w[ 5], w[ 6], selector); + w[ 6] = hc_byte_perm (w[ 4], w[ 5], selector); + w[ 5] = hc_byte_perm (w[ 3], w[ 4], selector); + w[ 4] = hc_byte_perm (w[ 2], w[ 3], selector); + w[ 3] = hc_byte_perm (w[ 1], w[ 2], selector); + w[ 2] = hc_byte_perm (w[ 0], w[ 1], selector); + w[ 1] = hc_byte_perm ( 0, w[ 0], selector); w[ 0] = 0; break; case 2: - w[63] = __byte_perm (w[60], w[61], selector); - w[62] = __byte_perm (w[59], w[60], selector); - w[61] = __byte_perm (w[58], w[59], selector); - w[60] = __byte_perm (w[57], w[58], selector); - w[59] = __byte_perm (w[56], w[57], selector); - w[58] = __byte_perm (w[55], w[56], selector); - w[57] = __byte_perm (w[54], w[55], selector); - w[56] = __byte_perm (w[53], w[54], selector); - w[55] = __byte_perm (w[52], w[53], selector); - w[54] = __byte_perm (w[51], w[52], selector); - w[53] = __byte_perm (w[50], w[51], selector); - w[52] = __byte_perm (w[49], w[50], selector); - w[51] = __byte_perm (w[48], w[49], selector); - w[50] = __byte_perm (w[47], w[48], selector); - w[49] = __byte_perm (w[46], w[47], selector); - w[48] = __byte_perm (w[45], w[46], selector); - w[47] = __byte_perm (w[44], w[45], selector); - w[46] = __byte_perm (w[43], w[44], selector); - w[45] = __byte_perm (w[42], w[43], selector); - w[44] = __byte_perm (w[41], w[42], selector); - w[43] = __byte_perm (w[40], w[41], selector); - w[42] = __byte_perm (w[39], w[40], selector); - w[41] = __byte_perm (w[38], w[39], selector); - w[40] = __byte_perm (w[37], w[38], selector); - w[39] = __byte_perm (w[36], w[37], selector); - w[38] = __byte_perm (w[35], w[36], selector); - w[37] = __byte_perm (w[34], w[35], selector); - w[36] = __byte_perm (w[33], w[34], selector); - w[35] = __byte_perm (w[32], w[33], selector); - w[34] = __byte_perm (w[31], w[32], selector); - w[33] = __byte_perm (w[30], w[31], selector); - w[32] = __byte_perm (w[29], w[30], selector); - w[31] = __byte_perm (w[28], w[29], selector); - w[30] = __byte_perm (w[27], w[28], selector); - w[29] = __byte_perm (w[26], w[27], selector); - w[28] = __byte_perm (w[25], w[26], selector); - w[27] = __byte_perm (w[24], w[25], selector); - w[26] = __byte_perm (w[23], w[24], selector); - w[25] = __byte_perm (w[22], w[23], selector); - w[24] = __byte_perm (w[21], w[22], selector); - w[23] = __byte_perm (w[20], w[21], selector); - w[22] = __byte_perm (w[19], w[20], selector); - w[21] = __byte_perm (w[18], w[19], selector); - w[20] = __byte_perm (w[17], w[18], selector); - w[19] = __byte_perm (w[16], w[17], selector); - w[18] = __byte_perm (w[15], w[16], selector); - w[17] = __byte_perm (w[14], w[15], selector); - w[16] = __byte_perm (w[13], w[14], selector); - w[15] = __byte_perm (w[12], w[13], selector); - w[14] = __byte_perm (w[11], w[12], selector); - w[13] = __byte_perm (w[10], w[11], selector); - w[12] = __byte_perm (w[ 9], w[10], selector); - w[11] = __byte_perm (w[ 8], w[ 9], selector); - w[10] = __byte_perm (w[ 7], w[ 8], selector); - w[ 9] = __byte_perm (w[ 6], w[ 7], selector); - w[ 8] = __byte_perm (w[ 5], w[ 6], selector); - w[ 7] = __byte_perm (w[ 4], w[ 5], selector); - w[ 6] = __byte_perm (w[ 3], w[ 4], selector); - w[ 5] = __byte_perm (w[ 2], w[ 3], selector); - w[ 4] = __byte_perm (w[ 1], w[ 2], selector); - w[ 3] = __byte_perm (w[ 0], w[ 1], selector); - w[ 2] = __byte_perm ( 0, w[ 0], selector); + w[63] = hc_byte_perm (w[60], w[61], selector); + w[62] = hc_byte_perm (w[59], w[60], selector); + w[61] = hc_byte_perm (w[58], w[59], selector); + w[60] = hc_byte_perm (w[57], w[58], selector); + w[59] = hc_byte_perm (w[56], w[57], selector); + w[58] = hc_byte_perm (w[55], w[56], selector); + w[57] = hc_byte_perm (w[54], w[55], selector); + w[56] = hc_byte_perm (w[53], w[54], selector); + w[55] = hc_byte_perm (w[52], w[53], selector); + w[54] = hc_byte_perm (w[51], w[52], selector); + w[53] = hc_byte_perm (w[50], w[51], selector); + w[52] = hc_byte_perm (w[49], w[50], selector); + w[51] = hc_byte_perm (w[48], w[49], selector); + w[50] = hc_byte_perm (w[47], w[48], selector); + w[49] = hc_byte_perm (w[46], w[47], selector); + w[48] = hc_byte_perm (w[45], w[46], selector); + w[47] = hc_byte_perm (w[44], w[45], selector); + w[46] = hc_byte_perm (w[43], w[44], selector); + w[45] = hc_byte_perm (w[42], w[43], selector); + w[44] = hc_byte_perm (w[41], w[42], selector); + w[43] = hc_byte_perm (w[40], w[41], selector); + w[42] = hc_byte_perm (w[39], w[40], selector); + w[41] = hc_byte_perm (w[38], w[39], selector); + w[40] = hc_byte_perm (w[37], w[38], selector); + w[39] = hc_byte_perm (w[36], w[37], selector); + w[38] = hc_byte_perm (w[35], w[36], selector); + w[37] = hc_byte_perm (w[34], w[35], selector); + w[36] = hc_byte_perm (w[33], w[34], selector); + w[35] = hc_byte_perm (w[32], w[33], selector); + w[34] = hc_byte_perm (w[31], w[32], selector); + w[33] = hc_byte_perm (w[30], w[31], selector); + w[32] = hc_byte_perm (w[29], w[30], selector); + w[31] = hc_byte_perm (w[28], w[29], selector); + w[30] = hc_byte_perm (w[27], w[28], selector); + w[29] = hc_byte_perm (w[26], w[27], selector); + w[28] = hc_byte_perm (w[25], w[26], selector); + w[27] = hc_byte_perm (w[24], w[25], selector); + w[26] = hc_byte_perm (w[23], w[24], selector); + w[25] = hc_byte_perm (w[22], w[23], selector); + w[24] = hc_byte_perm (w[21], w[22], selector); + w[23] = hc_byte_perm (w[20], w[21], selector); + w[22] = hc_byte_perm (w[19], w[20], selector); + w[21] = hc_byte_perm (w[18], w[19], selector); + w[20] = hc_byte_perm (w[17], w[18], selector); + w[19] = hc_byte_perm (w[16], w[17], selector); + w[18] = hc_byte_perm (w[15], w[16], selector); + w[17] = hc_byte_perm (w[14], w[15], selector); + w[16] = hc_byte_perm (w[13], w[14], selector); + w[15] = hc_byte_perm (w[12], w[13], selector); + w[14] = hc_byte_perm (w[11], w[12], selector); + w[13] = hc_byte_perm (w[10], w[11], selector); + w[12] = hc_byte_perm (w[ 9], w[10], selector); + w[11] = hc_byte_perm (w[ 8], w[ 9], selector); + w[10] = hc_byte_perm (w[ 7], w[ 8], selector); + w[ 9] = hc_byte_perm (w[ 6], w[ 7], selector); + w[ 8] = hc_byte_perm (w[ 5], w[ 6], selector); + w[ 7] = hc_byte_perm (w[ 4], w[ 5], selector); + w[ 6] = hc_byte_perm (w[ 3], w[ 4], selector); + w[ 5] = hc_byte_perm (w[ 2], w[ 3], selector); + w[ 4] = hc_byte_perm (w[ 1], w[ 2], selector); + w[ 3] = hc_byte_perm (w[ 0], w[ 1], selector); + w[ 2] = hc_byte_perm ( 0, w[ 0], selector); w[ 1] = 0; w[ 0] = 0; break; case 3: - w[63] = __byte_perm (w[59], w[60], selector); - w[62] = __byte_perm (w[58], w[59], selector); - w[61] = __byte_perm (w[57], w[58], selector); - w[60] = __byte_perm (w[56], w[57], selector); - w[59] = __byte_perm (w[55], w[56], selector); - w[58] = __byte_perm (w[54], w[55], selector); - w[57] = __byte_perm (w[53], w[54], selector); - w[56] = __byte_perm (w[52], w[53], selector); - w[55] = __byte_perm (w[51], w[52], selector); - w[54] = __byte_perm (w[50], w[51], selector); - w[53] = __byte_perm (w[49], w[50], selector); - w[52] = __byte_perm (w[48], w[49], selector); - w[51] = __byte_perm (w[47], w[48], selector); - w[50] = __byte_perm (w[46], w[47], selector); - w[49] = __byte_perm (w[45], w[46], selector); - w[48] = __byte_perm (w[44], w[45], selector); - w[47] = __byte_perm (w[43], w[44], selector); - w[46] = __byte_perm (w[42], w[43], selector); - w[45] = __byte_perm (w[41], w[42], selector); - w[44] = __byte_perm (w[40], w[41], selector); - w[43] = __byte_perm (w[39], w[40], selector); - w[42] = __byte_perm (w[38], w[39], selector); - w[41] = __byte_perm (w[37], w[38], selector); - w[40] = __byte_perm (w[36], w[37], selector); - w[39] = __byte_perm (w[35], w[36], selector); - w[38] = __byte_perm (w[34], w[35], selector); - w[37] = __byte_perm (w[33], w[34], selector); - w[36] = __byte_perm (w[32], w[33], selector); - w[35] = __byte_perm (w[31], w[32], selector); - w[34] = __byte_perm (w[30], w[31], selector); - w[33] = __byte_perm (w[29], w[30], selector); - w[32] = __byte_perm (w[28], w[29], selector); - w[31] = __byte_perm (w[27], w[28], selector); - w[30] = __byte_perm (w[26], w[27], selector); - w[29] = __byte_perm (w[25], w[26], selector); - w[28] = __byte_perm (w[24], w[25], selector); - w[27] = __byte_perm (w[23], w[24], selector); - w[26] = __byte_perm (w[22], w[23], selector); - w[25] = __byte_perm (w[21], w[22], selector); - w[24] = __byte_perm (w[20], w[21], selector); - w[23] = __byte_perm (w[19], w[20], selector); - w[22] = __byte_perm (w[18], w[19], selector); - w[21] = __byte_perm (w[17], w[18], selector); - w[20] = __byte_perm (w[16], w[17], selector); - w[19] = __byte_perm (w[15], w[16], selector); - w[18] = __byte_perm (w[14], w[15], selector); - w[17] = __byte_perm (w[13], w[14], selector); - w[16] = __byte_perm (w[12], w[13], selector); - w[15] = __byte_perm (w[11], w[12], selector); - w[14] = __byte_perm (w[10], w[11], selector); - w[13] = __byte_perm (w[ 9], w[10], selector); - w[12] = __byte_perm (w[ 8], w[ 9], selector); - w[11] = __byte_perm (w[ 7], w[ 8], selector); - w[10] = __byte_perm (w[ 6], w[ 7], selector); - w[ 9] = __byte_perm (w[ 5], w[ 6], selector); - w[ 8] = __byte_perm (w[ 4], w[ 5], selector); - w[ 7] = __byte_perm (w[ 3], w[ 4], selector); - w[ 6] = __byte_perm (w[ 2], w[ 3], selector); - w[ 5] = __byte_perm (w[ 1], w[ 2], selector); - w[ 4] = __byte_perm (w[ 0], w[ 1], selector); - w[ 3] = __byte_perm ( 0, w[ 0], selector); + w[63] = hc_byte_perm (w[59], w[60], selector); + w[62] = hc_byte_perm (w[58], w[59], selector); + w[61] = hc_byte_perm (w[57], w[58], selector); + w[60] = hc_byte_perm (w[56], w[57], selector); + w[59] = hc_byte_perm (w[55], w[56], selector); + w[58] = hc_byte_perm (w[54], w[55], selector); + w[57] = hc_byte_perm (w[53], w[54], selector); + w[56] = hc_byte_perm (w[52], w[53], selector); + w[55] = hc_byte_perm (w[51], w[52], selector); + w[54] = hc_byte_perm (w[50], w[51], selector); + w[53] = hc_byte_perm (w[49], w[50], selector); + w[52] = hc_byte_perm (w[48], w[49], selector); + w[51] = hc_byte_perm (w[47], w[48], selector); + w[50] = hc_byte_perm (w[46], w[47], selector); + w[49] = hc_byte_perm (w[45], w[46], selector); + w[48] = hc_byte_perm (w[44], w[45], selector); + w[47] = hc_byte_perm (w[43], w[44], selector); + w[46] = hc_byte_perm (w[42], w[43], selector); + w[45] = hc_byte_perm (w[41], w[42], selector); + w[44] = hc_byte_perm (w[40], w[41], selector); + w[43] = hc_byte_perm (w[39], w[40], selector); + w[42] = hc_byte_perm (w[38], w[39], selector); + w[41] = hc_byte_perm (w[37], w[38], selector); + w[40] = hc_byte_perm (w[36], w[37], selector); + w[39] = hc_byte_perm (w[35], w[36], selector); + w[38] = hc_byte_perm (w[34], w[35], selector); + w[37] = hc_byte_perm (w[33], w[34], selector); + w[36] = hc_byte_perm (w[32], w[33], selector); + w[35] = hc_byte_perm (w[31], w[32], selector); + w[34] = hc_byte_perm (w[30], w[31], selector); + w[33] = hc_byte_perm (w[29], w[30], selector); + w[32] = hc_byte_perm (w[28], w[29], selector); + w[31] = hc_byte_perm (w[27], w[28], selector); + w[30] = hc_byte_perm (w[26], w[27], selector); + w[29] = hc_byte_perm (w[25], w[26], selector); + w[28] = hc_byte_perm (w[24], w[25], selector); + w[27] = hc_byte_perm (w[23], w[24], selector); + w[26] = hc_byte_perm (w[22], w[23], selector); + w[25] = hc_byte_perm (w[21], w[22], selector); + w[24] = hc_byte_perm (w[20], w[21], selector); + w[23] = hc_byte_perm (w[19], w[20], selector); + w[22] = hc_byte_perm (w[18], w[19], selector); + w[21] = hc_byte_perm (w[17], w[18], selector); + w[20] = hc_byte_perm (w[16], w[17], selector); + w[19] = hc_byte_perm (w[15], w[16], selector); + w[18] = hc_byte_perm (w[14], w[15], selector); + w[17] = hc_byte_perm (w[13], w[14], selector); + w[16] = hc_byte_perm (w[12], w[13], selector); + w[15] = hc_byte_perm (w[11], w[12], selector); + w[14] = hc_byte_perm (w[10], w[11], selector); + w[13] = hc_byte_perm (w[ 9], w[10], selector); + w[12] = hc_byte_perm (w[ 8], w[ 9], selector); + w[11] = hc_byte_perm (w[ 7], w[ 8], selector); + w[10] = hc_byte_perm (w[ 6], w[ 7], selector); + w[ 9] = hc_byte_perm (w[ 5], w[ 6], selector); + w[ 8] = hc_byte_perm (w[ 4], w[ 5], selector); + w[ 7] = hc_byte_perm (w[ 3], w[ 4], selector); + w[ 6] = hc_byte_perm (w[ 2], w[ 3], selector); + w[ 5] = hc_byte_perm (w[ 1], w[ 2], selector); + w[ 4] = hc_byte_perm (w[ 0], w[ 1], selector); + w[ 3] = hc_byte_perm ( 0, w[ 0], selector); w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; @@ -16308,66 +16308,66 @@ DECLSPEC void switch_buffer_by_offset_1x64_le (u32x *w, const u32 offset) break; case 4: - w[63] = __byte_perm (w[58], w[59], selector); - w[62] = __byte_perm (w[57], w[58], selector); - w[61] = __byte_perm (w[56], w[57], selector); - w[60] = __byte_perm (w[55], w[56], selector); - w[59] = __byte_perm (w[54], w[55], selector); - w[58] = __byte_perm (w[53], w[54], selector); - w[57] = __byte_perm (w[52], w[53], selector); - w[56] = __byte_perm (w[51], w[52], selector); - w[55] = __byte_perm (w[50], w[51], selector); - w[54] = __byte_perm (w[49], w[50], selector); - w[53] = __byte_perm (w[48], w[49], selector); - w[52] = __byte_perm (w[47], w[48], selector); - w[51] = __byte_perm (w[46], w[47], selector); - w[50] = __byte_perm (w[45], w[46], selector); - w[49] = __byte_perm (w[44], w[45], selector); - w[48] = __byte_perm (w[43], w[44], selector); - w[47] = __byte_perm (w[42], w[43], selector); - w[46] = __byte_perm (w[41], w[42], selector); - w[45] = __byte_perm (w[40], w[41], selector); - w[44] = __byte_perm (w[39], w[40], selector); - w[43] = __byte_perm (w[38], w[39], selector); - w[42] = __byte_perm (w[37], w[38], selector); - w[41] = __byte_perm (w[36], w[37], selector); - w[40] = __byte_perm (w[35], w[36], selector); - w[39] = __byte_perm (w[34], w[35], selector); - w[38] = __byte_perm (w[33], w[34], selector); - w[37] = __byte_perm (w[32], w[33], selector); - w[36] = __byte_perm (w[31], w[32], selector); - w[35] = __byte_perm (w[30], w[31], selector); - w[34] = __byte_perm (w[29], w[30], selector); - w[33] = __byte_perm (w[28], w[29], selector); - w[32] = __byte_perm (w[27], w[28], selector); - w[31] = __byte_perm (w[26], w[27], selector); - w[30] = __byte_perm (w[25], w[26], selector); - w[29] = __byte_perm (w[24], w[25], selector); - w[28] = __byte_perm (w[23], w[24], selector); - w[27] = __byte_perm (w[22], w[23], selector); - w[26] = __byte_perm (w[21], w[22], selector); - w[25] = __byte_perm (w[20], w[21], selector); - w[24] = __byte_perm (w[19], w[20], selector); - w[23] = __byte_perm (w[18], w[19], selector); - w[22] = __byte_perm (w[17], w[18], selector); - w[21] = __byte_perm (w[16], w[17], selector); - w[20] = __byte_perm (w[15], w[16], selector); - w[19] = __byte_perm (w[14], w[15], selector); - w[18] = __byte_perm (w[13], w[14], selector); - w[17] = __byte_perm (w[12], w[13], selector); - w[16] = __byte_perm (w[11], w[12], selector); - w[15] = __byte_perm (w[10], w[11], selector); - w[14] = __byte_perm (w[ 9], w[10], selector); - w[13] = __byte_perm (w[ 8], w[ 9], selector); - w[12] = __byte_perm (w[ 7], w[ 8], selector); - w[11] = __byte_perm (w[ 6], w[ 7], selector); - w[10] = __byte_perm (w[ 5], w[ 6], selector); - w[ 9] = __byte_perm (w[ 4], w[ 5], selector); - w[ 8] = __byte_perm (w[ 3], w[ 4], selector); - w[ 7] = __byte_perm (w[ 2], w[ 3], selector); - w[ 6] = __byte_perm (w[ 1], w[ 2], selector); - w[ 5] = __byte_perm (w[ 0], w[ 1], selector); - w[ 4] = __byte_perm ( 0, w[ 0], selector); + w[63] = hc_byte_perm (w[58], w[59], selector); + w[62] = hc_byte_perm (w[57], w[58], selector); + w[61] = hc_byte_perm (w[56], w[57], selector); + w[60] = hc_byte_perm (w[55], w[56], selector); + w[59] = hc_byte_perm (w[54], w[55], selector); + w[58] = hc_byte_perm (w[53], w[54], selector); + w[57] = hc_byte_perm (w[52], w[53], selector); + w[56] = hc_byte_perm (w[51], w[52], selector); + w[55] = hc_byte_perm (w[50], w[51], selector); + w[54] = hc_byte_perm (w[49], w[50], selector); + w[53] = hc_byte_perm (w[48], w[49], selector); + w[52] = hc_byte_perm (w[47], w[48], selector); + w[51] = hc_byte_perm (w[46], w[47], selector); + w[50] = hc_byte_perm (w[45], w[46], selector); + w[49] = hc_byte_perm (w[44], w[45], selector); + w[48] = hc_byte_perm (w[43], w[44], selector); + w[47] = hc_byte_perm (w[42], w[43], selector); + w[46] = hc_byte_perm (w[41], w[42], selector); + w[45] = hc_byte_perm (w[40], w[41], selector); + w[44] = hc_byte_perm (w[39], w[40], selector); + w[43] = hc_byte_perm (w[38], w[39], selector); + w[42] = hc_byte_perm (w[37], w[38], selector); + w[41] = hc_byte_perm (w[36], w[37], selector); + w[40] = hc_byte_perm (w[35], w[36], selector); + w[39] = hc_byte_perm (w[34], w[35], selector); + w[38] = hc_byte_perm (w[33], w[34], selector); + w[37] = hc_byte_perm (w[32], w[33], selector); + w[36] = hc_byte_perm (w[31], w[32], selector); + w[35] = hc_byte_perm (w[30], w[31], selector); + w[34] = hc_byte_perm (w[29], w[30], selector); + w[33] = hc_byte_perm (w[28], w[29], selector); + w[32] = hc_byte_perm (w[27], w[28], selector); + w[31] = hc_byte_perm (w[26], w[27], selector); + w[30] = hc_byte_perm (w[25], w[26], selector); + w[29] = hc_byte_perm (w[24], w[25], selector); + w[28] = hc_byte_perm (w[23], w[24], selector); + w[27] = hc_byte_perm (w[22], w[23], selector); + w[26] = hc_byte_perm (w[21], w[22], selector); + w[25] = hc_byte_perm (w[20], w[21], selector); + w[24] = hc_byte_perm (w[19], w[20], selector); + w[23] = hc_byte_perm (w[18], w[19], selector); + w[22] = hc_byte_perm (w[17], w[18], selector); + w[21] = hc_byte_perm (w[16], w[17], selector); + w[20] = hc_byte_perm (w[15], w[16], selector); + w[19] = hc_byte_perm (w[14], w[15], selector); + w[18] = hc_byte_perm (w[13], w[14], selector); + w[17] = hc_byte_perm (w[12], w[13], selector); + w[16] = hc_byte_perm (w[11], w[12], selector); + w[15] = hc_byte_perm (w[10], w[11], selector); + w[14] = hc_byte_perm (w[ 9], w[10], selector); + w[13] = hc_byte_perm (w[ 8], w[ 9], selector); + w[12] = hc_byte_perm (w[ 7], w[ 8], selector); + w[11] = hc_byte_perm (w[ 6], w[ 7], selector); + w[10] = hc_byte_perm (w[ 5], w[ 6], selector); + w[ 9] = hc_byte_perm (w[ 4], w[ 5], selector); + w[ 8] = hc_byte_perm (w[ 3], w[ 4], selector); + w[ 7] = hc_byte_perm (w[ 2], w[ 3], selector); + w[ 6] = hc_byte_perm (w[ 1], w[ 2], selector); + w[ 5] = hc_byte_perm (w[ 0], w[ 1], selector); + w[ 4] = hc_byte_perm ( 0, w[ 0], selector); w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; @@ -16376,65 +16376,65 @@ DECLSPEC void switch_buffer_by_offset_1x64_le (u32x *w, const u32 offset) break; case 5: - w[63] = __byte_perm (w[57], w[58], selector); - w[62] = __byte_perm (w[56], w[57], selector); - w[61] = __byte_perm (w[55], w[56], selector); - w[60] = __byte_perm (w[54], w[55], selector); - w[59] = __byte_perm (w[53], w[54], selector); - w[58] = __byte_perm (w[52], w[53], selector); - w[57] = __byte_perm (w[51], w[52], selector); - w[56] = __byte_perm (w[50], w[51], selector); - w[55] = __byte_perm (w[49], w[50], selector); - w[54] = __byte_perm (w[48], w[49], selector); - w[53] = __byte_perm (w[47], w[48], selector); - w[52] = __byte_perm (w[46], w[47], selector); - w[51] = __byte_perm (w[45], w[46], selector); - w[50] = __byte_perm (w[44], w[45], selector); - w[49] = __byte_perm (w[43], w[44], selector); - w[48] = __byte_perm (w[42], w[43], selector); - w[47] = __byte_perm (w[41], w[42], selector); - w[46] = __byte_perm (w[40], w[41], selector); - w[45] = __byte_perm (w[39], w[40], selector); - w[44] = __byte_perm (w[38], w[39], selector); - w[43] = __byte_perm (w[37], w[38], selector); - w[42] = __byte_perm (w[36], w[37], selector); - w[41] = __byte_perm (w[35], w[36], selector); - w[40] = __byte_perm (w[34], w[35], selector); - w[39] = __byte_perm (w[33], w[34], selector); - w[38] = __byte_perm (w[32], w[33], selector); - w[37] = __byte_perm (w[31], w[32], selector); - w[36] = __byte_perm (w[30], w[31], selector); - w[35] = __byte_perm (w[29], w[30], selector); - w[34] = __byte_perm (w[28], w[29], selector); - w[33] = __byte_perm (w[27], w[28], selector); - w[32] = __byte_perm (w[26], w[27], selector); - w[31] = __byte_perm (w[25], w[26], selector); - w[30] = __byte_perm (w[24], w[25], selector); - w[29] = __byte_perm (w[23], w[24], selector); - w[28] = __byte_perm (w[22], w[23], selector); - w[27] = __byte_perm (w[21], w[22], selector); - w[26] = __byte_perm (w[20], w[21], selector); - w[25] = __byte_perm (w[19], w[20], selector); - w[24] = __byte_perm (w[18], w[19], selector); - w[23] = __byte_perm (w[17], w[18], selector); - w[22] = __byte_perm (w[16], w[17], selector); - w[21] = __byte_perm (w[15], w[16], selector); - w[20] = __byte_perm (w[14], w[15], selector); - w[19] = __byte_perm (w[13], w[14], selector); - w[18] = __byte_perm (w[12], w[13], selector); - w[17] = __byte_perm (w[11], w[12], selector); - w[16] = __byte_perm (w[10], w[11], selector); - w[15] = __byte_perm (w[ 9], w[10], selector); - w[14] = __byte_perm (w[ 8], w[ 9], selector); - w[13] = __byte_perm (w[ 7], w[ 8], selector); - w[12] = __byte_perm (w[ 6], w[ 7], selector); - w[11] = __byte_perm (w[ 5], w[ 6], selector); - w[10] = __byte_perm (w[ 4], w[ 5], selector); - w[ 9] = __byte_perm (w[ 3], w[ 4], selector); - w[ 8] = __byte_perm (w[ 2], w[ 3], selector); - w[ 7] = __byte_perm (w[ 1], w[ 2], selector); - w[ 6] = __byte_perm (w[ 0], w[ 1], selector); - w[ 5] = __byte_perm ( 0, w[ 0], selector); + w[63] = hc_byte_perm (w[57], w[58], selector); + w[62] = hc_byte_perm (w[56], w[57], selector); + w[61] = hc_byte_perm (w[55], w[56], selector); + w[60] = hc_byte_perm (w[54], w[55], selector); + w[59] = hc_byte_perm (w[53], w[54], selector); + w[58] = hc_byte_perm (w[52], w[53], selector); + w[57] = hc_byte_perm (w[51], w[52], selector); + w[56] = hc_byte_perm (w[50], w[51], selector); + w[55] = hc_byte_perm (w[49], w[50], selector); + w[54] = hc_byte_perm (w[48], w[49], selector); + w[53] = hc_byte_perm (w[47], w[48], selector); + w[52] = hc_byte_perm (w[46], w[47], selector); + w[51] = hc_byte_perm (w[45], w[46], selector); + w[50] = hc_byte_perm (w[44], w[45], selector); + w[49] = hc_byte_perm (w[43], w[44], selector); + w[48] = hc_byte_perm (w[42], w[43], selector); + w[47] = hc_byte_perm (w[41], w[42], selector); + w[46] = hc_byte_perm (w[40], w[41], selector); + w[45] = hc_byte_perm (w[39], w[40], selector); + w[44] = hc_byte_perm (w[38], w[39], selector); + w[43] = hc_byte_perm (w[37], w[38], selector); + w[42] = hc_byte_perm (w[36], w[37], selector); + w[41] = hc_byte_perm (w[35], w[36], selector); + w[40] = hc_byte_perm (w[34], w[35], selector); + w[39] = hc_byte_perm (w[33], w[34], selector); + w[38] = hc_byte_perm (w[32], w[33], selector); + w[37] = hc_byte_perm (w[31], w[32], selector); + w[36] = hc_byte_perm (w[30], w[31], selector); + w[35] = hc_byte_perm (w[29], w[30], selector); + w[34] = hc_byte_perm (w[28], w[29], selector); + w[33] = hc_byte_perm (w[27], w[28], selector); + w[32] = hc_byte_perm (w[26], w[27], selector); + w[31] = hc_byte_perm (w[25], w[26], selector); + w[30] = hc_byte_perm (w[24], w[25], selector); + w[29] = hc_byte_perm (w[23], w[24], selector); + w[28] = hc_byte_perm (w[22], w[23], selector); + w[27] = hc_byte_perm (w[21], w[22], selector); + w[26] = hc_byte_perm (w[20], w[21], selector); + w[25] = hc_byte_perm (w[19], w[20], selector); + w[24] = hc_byte_perm (w[18], w[19], selector); + w[23] = hc_byte_perm (w[17], w[18], selector); + w[22] = hc_byte_perm (w[16], w[17], selector); + w[21] = hc_byte_perm (w[15], w[16], selector); + w[20] = hc_byte_perm (w[14], w[15], selector); + w[19] = hc_byte_perm (w[13], w[14], selector); + w[18] = hc_byte_perm (w[12], w[13], selector); + w[17] = hc_byte_perm (w[11], w[12], selector); + w[16] = hc_byte_perm (w[10], w[11], selector); + w[15] = hc_byte_perm (w[ 9], w[10], selector); + w[14] = hc_byte_perm (w[ 8], w[ 9], selector); + w[13] = hc_byte_perm (w[ 7], w[ 8], selector); + w[12] = hc_byte_perm (w[ 6], w[ 7], selector); + w[11] = hc_byte_perm (w[ 5], w[ 6], selector); + w[10] = hc_byte_perm (w[ 4], w[ 5], selector); + w[ 9] = hc_byte_perm (w[ 3], w[ 4], selector); + w[ 8] = hc_byte_perm (w[ 2], w[ 3], selector); + w[ 7] = hc_byte_perm (w[ 1], w[ 2], selector); + w[ 6] = hc_byte_perm (w[ 0], w[ 1], selector); + w[ 5] = hc_byte_perm ( 0, w[ 0], selector); w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; @@ -16444,64 +16444,64 @@ DECLSPEC void switch_buffer_by_offset_1x64_le (u32x *w, const u32 offset) break; case 6: - w[63] = __byte_perm (w[56], w[57], selector); - w[62] = __byte_perm (w[55], w[56], selector); - w[61] = __byte_perm (w[54], w[55], selector); - w[60] = __byte_perm (w[53], w[54], selector); - w[59] = __byte_perm (w[52], w[53], selector); - w[58] = __byte_perm (w[51], w[52], selector); - w[57] = __byte_perm (w[50], w[51], selector); - w[56] = __byte_perm (w[49], w[50], selector); - w[55] = __byte_perm (w[48], w[49], selector); - w[54] = __byte_perm (w[47], w[48], selector); - w[53] = __byte_perm (w[46], w[47], selector); - w[52] = __byte_perm (w[45], w[46], selector); - w[51] = __byte_perm (w[44], w[45], selector); - w[50] = __byte_perm (w[43], w[44], selector); - w[49] = __byte_perm (w[42], w[43], selector); - w[48] = __byte_perm (w[41], w[42], selector); - w[47] = __byte_perm (w[40], w[41], selector); - w[46] = __byte_perm (w[39], w[40], selector); - w[45] = __byte_perm (w[38], w[39], selector); - w[44] = __byte_perm (w[37], w[38], selector); - w[43] = __byte_perm (w[36], w[37], selector); - w[42] = __byte_perm (w[35], w[36], selector); - w[41] = __byte_perm (w[34], w[35], selector); - w[40] = __byte_perm (w[33], w[34], selector); - w[39] = __byte_perm (w[32], w[33], selector); - w[38] = __byte_perm (w[31], w[32], selector); - w[37] = __byte_perm (w[30], w[31], selector); - w[36] = __byte_perm (w[29], w[30], selector); - w[35] = __byte_perm (w[28], w[29], selector); - w[34] = __byte_perm (w[27], w[28], selector); - w[33] = __byte_perm (w[26], w[27], selector); - w[32] = __byte_perm (w[25], w[26], selector); - w[31] = __byte_perm (w[24], w[25], selector); - w[30] = __byte_perm (w[23], w[24], selector); - w[29] = __byte_perm (w[22], w[23], selector); - w[28] = __byte_perm (w[21], w[22], selector); - w[27] = __byte_perm (w[20], w[21], selector); - w[26] = __byte_perm (w[19], w[20], selector); - w[25] = __byte_perm (w[18], w[19], selector); - w[24] = __byte_perm (w[17], w[18], selector); - w[23] = __byte_perm (w[16], w[17], selector); - w[22] = __byte_perm (w[15], w[16], selector); - w[21] = __byte_perm (w[14], w[15], selector); - w[20] = __byte_perm (w[13], w[14], selector); - w[19] = __byte_perm (w[12], w[13], selector); - w[18] = __byte_perm (w[11], w[12], selector); - w[17] = __byte_perm (w[10], w[11], selector); - w[16] = __byte_perm (w[ 9], w[10], selector); - w[15] = __byte_perm (w[ 8], w[ 9], selector); - w[14] = __byte_perm (w[ 7], w[ 8], selector); - w[13] = __byte_perm (w[ 6], w[ 7], selector); - w[12] = __byte_perm (w[ 5], w[ 6], selector); - w[11] = __byte_perm (w[ 4], w[ 5], selector); - w[10] = __byte_perm (w[ 3], w[ 4], selector); - w[ 9] = __byte_perm (w[ 2], w[ 3], selector); - w[ 8] = __byte_perm (w[ 1], w[ 2], selector); - w[ 7] = __byte_perm (w[ 0], w[ 1], selector); - w[ 6] = __byte_perm ( 0, w[ 0], selector); + w[63] = hc_byte_perm (w[56], w[57], selector); + w[62] = hc_byte_perm (w[55], w[56], selector); + w[61] = hc_byte_perm (w[54], w[55], selector); + w[60] = hc_byte_perm (w[53], w[54], selector); + w[59] = hc_byte_perm (w[52], w[53], selector); + w[58] = hc_byte_perm (w[51], w[52], selector); + w[57] = hc_byte_perm (w[50], w[51], selector); + w[56] = hc_byte_perm (w[49], w[50], selector); + w[55] = hc_byte_perm (w[48], w[49], selector); + w[54] = hc_byte_perm (w[47], w[48], selector); + w[53] = hc_byte_perm (w[46], w[47], selector); + w[52] = hc_byte_perm (w[45], w[46], selector); + w[51] = hc_byte_perm (w[44], w[45], selector); + w[50] = hc_byte_perm (w[43], w[44], selector); + w[49] = hc_byte_perm (w[42], w[43], selector); + w[48] = hc_byte_perm (w[41], w[42], selector); + w[47] = hc_byte_perm (w[40], w[41], selector); + w[46] = hc_byte_perm (w[39], w[40], selector); + w[45] = hc_byte_perm (w[38], w[39], selector); + w[44] = hc_byte_perm (w[37], w[38], selector); + w[43] = hc_byte_perm (w[36], w[37], selector); + w[42] = hc_byte_perm (w[35], w[36], selector); + w[41] = hc_byte_perm (w[34], w[35], selector); + w[40] = hc_byte_perm (w[33], w[34], selector); + w[39] = hc_byte_perm (w[32], w[33], selector); + w[38] = hc_byte_perm (w[31], w[32], selector); + w[37] = hc_byte_perm (w[30], w[31], selector); + w[36] = hc_byte_perm (w[29], w[30], selector); + w[35] = hc_byte_perm (w[28], w[29], selector); + w[34] = hc_byte_perm (w[27], w[28], selector); + w[33] = hc_byte_perm (w[26], w[27], selector); + w[32] = hc_byte_perm (w[25], w[26], selector); + w[31] = hc_byte_perm (w[24], w[25], selector); + w[30] = hc_byte_perm (w[23], w[24], selector); + w[29] = hc_byte_perm (w[22], w[23], selector); + w[28] = hc_byte_perm (w[21], w[22], selector); + w[27] = hc_byte_perm (w[20], w[21], selector); + w[26] = hc_byte_perm (w[19], w[20], selector); + w[25] = hc_byte_perm (w[18], w[19], selector); + w[24] = hc_byte_perm (w[17], w[18], selector); + w[23] = hc_byte_perm (w[16], w[17], selector); + w[22] = hc_byte_perm (w[15], w[16], selector); + w[21] = hc_byte_perm (w[14], w[15], selector); + w[20] = hc_byte_perm (w[13], w[14], selector); + w[19] = hc_byte_perm (w[12], w[13], selector); + w[18] = hc_byte_perm (w[11], w[12], selector); + w[17] = hc_byte_perm (w[10], w[11], selector); + w[16] = hc_byte_perm (w[ 9], w[10], selector); + w[15] = hc_byte_perm (w[ 8], w[ 9], selector); + w[14] = hc_byte_perm (w[ 7], w[ 8], selector); + w[13] = hc_byte_perm (w[ 6], w[ 7], selector); + w[12] = hc_byte_perm (w[ 5], w[ 6], selector); + w[11] = hc_byte_perm (w[ 4], w[ 5], selector); + w[10] = hc_byte_perm (w[ 3], w[ 4], selector); + w[ 9] = hc_byte_perm (w[ 2], w[ 3], selector); + w[ 8] = hc_byte_perm (w[ 1], w[ 2], selector); + w[ 7] = hc_byte_perm (w[ 0], w[ 1], selector); + w[ 6] = hc_byte_perm ( 0, w[ 0], selector); w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; @@ -16512,63 +16512,63 @@ DECLSPEC void switch_buffer_by_offset_1x64_le (u32x *w, const u32 offset) break; case 7: - w[63] = __byte_perm (w[55], w[56], selector); - w[62] = __byte_perm (w[54], w[55], selector); - w[61] = __byte_perm (w[53], w[54], selector); - w[60] = __byte_perm (w[52], w[53], selector); - w[59] = __byte_perm (w[51], w[52], selector); - w[58] = __byte_perm (w[50], w[51], selector); - w[57] = __byte_perm (w[49], w[50], selector); - w[56] = __byte_perm (w[48], w[49], selector); - w[55] = __byte_perm (w[47], w[48], selector); - w[54] = __byte_perm (w[46], w[47], selector); - w[53] = __byte_perm (w[45], w[46], selector); - w[52] = __byte_perm (w[44], w[45], selector); - w[51] = __byte_perm (w[43], w[44], selector); - w[50] = __byte_perm (w[42], w[43], selector); - w[49] = __byte_perm (w[41], w[42], selector); - w[48] = __byte_perm (w[40], w[41], selector); - w[47] = __byte_perm (w[39], w[40], selector); - w[46] = __byte_perm (w[38], w[39], selector); - w[45] = __byte_perm (w[37], w[38], selector); - w[44] = __byte_perm (w[36], w[37], selector); - w[43] = __byte_perm (w[35], w[36], selector); - w[42] = __byte_perm (w[34], w[35], selector); - w[41] = __byte_perm (w[33], w[34], selector); - w[40] = __byte_perm (w[32], w[33], selector); - w[39] = __byte_perm (w[31], w[32], selector); - w[38] = __byte_perm (w[30], w[31], selector); - w[37] = __byte_perm (w[29], w[30], selector); - w[36] = __byte_perm (w[28], w[29], selector); - w[35] = __byte_perm (w[27], w[28], selector); - w[34] = __byte_perm (w[26], w[27], selector); - w[33] = __byte_perm (w[25], w[26], selector); - w[32] = __byte_perm (w[24], w[25], selector); - w[31] = __byte_perm (w[23], w[24], selector); - w[30] = __byte_perm (w[22], w[23], selector); - w[29] = __byte_perm (w[21], w[22], selector); - w[28] = __byte_perm (w[20], w[21], selector); - w[27] = __byte_perm (w[19], w[20], selector); - w[26] = __byte_perm (w[18], w[19], selector); - w[25] = __byte_perm (w[17], w[18], selector); - w[24] = __byte_perm (w[16], w[17], selector); - w[23] = __byte_perm (w[15], w[16], selector); - w[22] = __byte_perm (w[14], w[15], selector); - w[21] = __byte_perm (w[13], w[14], selector); - w[20] = __byte_perm (w[12], w[13], selector); - w[19] = __byte_perm (w[11], w[12], selector); - w[18] = __byte_perm (w[10], w[11], selector); - w[17] = __byte_perm (w[ 9], w[10], selector); - w[16] = __byte_perm (w[ 8], w[ 9], selector); - w[15] = __byte_perm (w[ 7], w[ 8], selector); - w[14] = __byte_perm (w[ 6], w[ 7], selector); - w[13] = __byte_perm (w[ 5], w[ 6], selector); - w[12] = __byte_perm (w[ 4], w[ 5], selector); - w[11] = __byte_perm (w[ 3], w[ 4], selector); - w[10] = __byte_perm (w[ 2], w[ 3], selector); - w[ 9] = __byte_perm (w[ 1], w[ 2], selector); - w[ 8] = __byte_perm (w[ 0], w[ 1], selector); - w[ 7] = __byte_perm ( 0, w[ 0], selector); + w[63] = hc_byte_perm (w[55], w[56], selector); + w[62] = hc_byte_perm (w[54], w[55], selector); + w[61] = hc_byte_perm (w[53], w[54], selector); + w[60] = hc_byte_perm (w[52], w[53], selector); + w[59] = hc_byte_perm (w[51], w[52], selector); + w[58] = hc_byte_perm (w[50], w[51], selector); + w[57] = hc_byte_perm (w[49], w[50], selector); + w[56] = hc_byte_perm (w[48], w[49], selector); + w[55] = hc_byte_perm (w[47], w[48], selector); + w[54] = hc_byte_perm (w[46], w[47], selector); + w[53] = hc_byte_perm (w[45], w[46], selector); + w[52] = hc_byte_perm (w[44], w[45], selector); + w[51] = hc_byte_perm (w[43], w[44], selector); + w[50] = hc_byte_perm (w[42], w[43], selector); + w[49] = hc_byte_perm (w[41], w[42], selector); + w[48] = hc_byte_perm (w[40], w[41], selector); + w[47] = hc_byte_perm (w[39], w[40], selector); + w[46] = hc_byte_perm (w[38], w[39], selector); + w[45] = hc_byte_perm (w[37], w[38], selector); + w[44] = hc_byte_perm (w[36], w[37], selector); + w[43] = hc_byte_perm (w[35], w[36], selector); + w[42] = hc_byte_perm (w[34], w[35], selector); + w[41] = hc_byte_perm (w[33], w[34], selector); + w[40] = hc_byte_perm (w[32], w[33], selector); + w[39] = hc_byte_perm (w[31], w[32], selector); + w[38] = hc_byte_perm (w[30], w[31], selector); + w[37] = hc_byte_perm (w[29], w[30], selector); + w[36] = hc_byte_perm (w[28], w[29], selector); + w[35] = hc_byte_perm (w[27], w[28], selector); + w[34] = hc_byte_perm (w[26], w[27], selector); + w[33] = hc_byte_perm (w[25], w[26], selector); + w[32] = hc_byte_perm (w[24], w[25], selector); + w[31] = hc_byte_perm (w[23], w[24], selector); + w[30] = hc_byte_perm (w[22], w[23], selector); + w[29] = hc_byte_perm (w[21], w[22], selector); + w[28] = hc_byte_perm (w[20], w[21], selector); + w[27] = hc_byte_perm (w[19], w[20], selector); + w[26] = hc_byte_perm (w[18], w[19], selector); + w[25] = hc_byte_perm (w[17], w[18], selector); + w[24] = hc_byte_perm (w[16], w[17], selector); + w[23] = hc_byte_perm (w[15], w[16], selector); + w[22] = hc_byte_perm (w[14], w[15], selector); + w[21] = hc_byte_perm (w[13], w[14], selector); + w[20] = hc_byte_perm (w[12], w[13], selector); + w[19] = hc_byte_perm (w[11], w[12], selector); + w[18] = hc_byte_perm (w[10], w[11], selector); + w[17] = hc_byte_perm (w[ 9], w[10], selector); + w[16] = hc_byte_perm (w[ 8], w[ 9], selector); + w[15] = hc_byte_perm (w[ 7], w[ 8], selector); + w[14] = hc_byte_perm (w[ 6], w[ 7], selector); + w[13] = hc_byte_perm (w[ 5], w[ 6], selector); + w[12] = hc_byte_perm (w[ 4], w[ 5], selector); + w[11] = hc_byte_perm (w[ 3], w[ 4], selector); + w[10] = hc_byte_perm (w[ 2], w[ 3], selector); + w[ 9] = hc_byte_perm (w[ 1], w[ 2], selector); + w[ 8] = hc_byte_perm (w[ 0], w[ 1], selector); + w[ 7] = hc_byte_perm ( 0, w[ 0], selector); w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; @@ -16580,62 +16580,62 @@ DECLSPEC void switch_buffer_by_offset_1x64_le (u32x *w, const u32 offset) break; case 8: - w[63] = __byte_perm (w[54], w[55], selector); - w[62] = __byte_perm (w[53], w[54], selector); - w[61] = __byte_perm (w[52], w[53], selector); - w[60] = __byte_perm (w[51], w[52], selector); - w[59] = __byte_perm (w[50], w[51], selector); - w[58] = __byte_perm (w[49], w[50], selector); - w[57] = __byte_perm (w[48], w[49], selector); - w[56] = __byte_perm (w[47], w[48], selector); - w[55] = __byte_perm (w[46], w[47], selector); - w[54] = __byte_perm (w[45], w[46], selector); - w[53] = __byte_perm (w[44], w[45], selector); - w[52] = __byte_perm (w[43], w[44], selector); - w[51] = __byte_perm (w[42], w[43], selector); - w[50] = __byte_perm (w[41], w[42], selector); - w[49] = __byte_perm (w[40], w[41], selector); - w[48] = __byte_perm (w[39], w[40], selector); - w[47] = __byte_perm (w[38], w[39], selector); - w[46] = __byte_perm (w[37], w[38], selector); - w[45] = __byte_perm (w[36], w[37], selector); - w[44] = __byte_perm (w[35], w[36], selector); - w[43] = __byte_perm (w[34], w[35], selector); - w[42] = __byte_perm (w[33], w[34], selector); - w[41] = __byte_perm (w[32], w[33], selector); - w[40] = __byte_perm (w[31], w[32], selector); - w[39] = __byte_perm (w[30], w[31], selector); - w[38] = __byte_perm (w[29], w[30], selector); - w[37] = __byte_perm (w[28], w[29], selector); - w[36] = __byte_perm (w[27], w[28], selector); - w[35] = __byte_perm (w[26], w[27], selector); - w[34] = __byte_perm (w[25], w[26], selector); - w[33] = __byte_perm (w[24], w[25], selector); - w[32] = __byte_perm (w[23], w[24], selector); - w[31] = __byte_perm (w[22], w[23], selector); - w[30] = __byte_perm (w[21], w[22], selector); - w[29] = __byte_perm (w[20], w[21], selector); - w[28] = __byte_perm (w[19], w[20], selector); - w[27] = __byte_perm (w[18], w[19], selector); - w[26] = __byte_perm (w[17], w[18], selector); - w[25] = __byte_perm (w[16], w[17], selector); - w[24] = __byte_perm (w[15], w[16], selector); - w[23] = __byte_perm (w[14], w[15], selector); - w[22] = __byte_perm (w[13], w[14], selector); - w[21] = __byte_perm (w[12], w[13], selector); - w[20] = __byte_perm (w[11], w[12], selector); - w[19] = __byte_perm (w[10], w[11], selector); - w[18] = __byte_perm (w[ 9], w[10], selector); - w[17] = __byte_perm (w[ 8], w[ 9], selector); - w[16] = __byte_perm (w[ 7], w[ 8], selector); - w[15] = __byte_perm (w[ 6], w[ 7], selector); - w[14] = __byte_perm (w[ 5], w[ 6], selector); - w[13] = __byte_perm (w[ 4], w[ 5], selector); - w[12] = __byte_perm (w[ 3], w[ 4], selector); - w[11] = __byte_perm (w[ 2], w[ 3], selector); - w[10] = __byte_perm (w[ 1], w[ 2], selector); - w[ 9] = __byte_perm (w[ 0], w[ 1], selector); - w[ 8] = __byte_perm ( 0, w[ 0], selector); + w[63] = hc_byte_perm (w[54], w[55], selector); + w[62] = hc_byte_perm (w[53], w[54], selector); + w[61] = hc_byte_perm (w[52], w[53], selector); + w[60] = hc_byte_perm (w[51], w[52], selector); + w[59] = hc_byte_perm (w[50], w[51], selector); + w[58] = hc_byte_perm (w[49], w[50], selector); + w[57] = hc_byte_perm (w[48], w[49], selector); + w[56] = hc_byte_perm (w[47], w[48], selector); + w[55] = hc_byte_perm (w[46], w[47], selector); + w[54] = hc_byte_perm (w[45], w[46], selector); + w[53] = hc_byte_perm (w[44], w[45], selector); + w[52] = hc_byte_perm (w[43], w[44], selector); + w[51] = hc_byte_perm (w[42], w[43], selector); + w[50] = hc_byte_perm (w[41], w[42], selector); + w[49] = hc_byte_perm (w[40], w[41], selector); + w[48] = hc_byte_perm (w[39], w[40], selector); + w[47] = hc_byte_perm (w[38], w[39], selector); + w[46] = hc_byte_perm (w[37], w[38], selector); + w[45] = hc_byte_perm (w[36], w[37], selector); + w[44] = hc_byte_perm (w[35], w[36], selector); + w[43] = hc_byte_perm (w[34], w[35], selector); + w[42] = hc_byte_perm (w[33], w[34], selector); + w[41] = hc_byte_perm (w[32], w[33], selector); + w[40] = hc_byte_perm (w[31], w[32], selector); + w[39] = hc_byte_perm (w[30], w[31], selector); + w[38] = hc_byte_perm (w[29], w[30], selector); + w[37] = hc_byte_perm (w[28], w[29], selector); + w[36] = hc_byte_perm (w[27], w[28], selector); + w[35] = hc_byte_perm (w[26], w[27], selector); + w[34] = hc_byte_perm (w[25], w[26], selector); + w[33] = hc_byte_perm (w[24], w[25], selector); + w[32] = hc_byte_perm (w[23], w[24], selector); + w[31] = hc_byte_perm (w[22], w[23], selector); + w[30] = hc_byte_perm (w[21], w[22], selector); + w[29] = hc_byte_perm (w[20], w[21], selector); + w[28] = hc_byte_perm (w[19], w[20], selector); + w[27] = hc_byte_perm (w[18], w[19], selector); + w[26] = hc_byte_perm (w[17], w[18], selector); + w[25] = hc_byte_perm (w[16], w[17], selector); + w[24] = hc_byte_perm (w[15], w[16], selector); + w[23] = hc_byte_perm (w[14], w[15], selector); + w[22] = hc_byte_perm (w[13], w[14], selector); + w[21] = hc_byte_perm (w[12], w[13], selector); + w[20] = hc_byte_perm (w[11], w[12], selector); + w[19] = hc_byte_perm (w[10], w[11], selector); + w[18] = hc_byte_perm (w[ 9], w[10], selector); + w[17] = hc_byte_perm (w[ 8], w[ 9], selector); + w[16] = hc_byte_perm (w[ 7], w[ 8], selector); + w[15] = hc_byte_perm (w[ 6], w[ 7], selector); + w[14] = hc_byte_perm (w[ 5], w[ 6], selector); + w[13] = hc_byte_perm (w[ 4], w[ 5], selector); + w[12] = hc_byte_perm (w[ 3], w[ 4], selector); + w[11] = hc_byte_perm (w[ 2], w[ 3], selector); + w[10] = hc_byte_perm (w[ 1], w[ 2], selector); + w[ 9] = hc_byte_perm (w[ 0], w[ 1], selector); + w[ 8] = hc_byte_perm ( 0, w[ 0], selector); w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; @@ -16648,61 +16648,61 @@ DECLSPEC void switch_buffer_by_offset_1x64_le (u32x *w, const u32 offset) break; case 9: - w[63] = __byte_perm (w[53], w[54], selector); - w[62] = __byte_perm (w[52], w[53], selector); - w[61] = __byte_perm (w[51], w[52], selector); - w[60] = __byte_perm (w[50], w[51], selector); - w[59] = __byte_perm (w[49], w[50], selector); - w[58] = __byte_perm (w[48], w[49], selector); - w[57] = __byte_perm (w[47], w[48], selector); - w[56] = __byte_perm (w[46], w[47], selector); - w[55] = __byte_perm (w[45], w[46], selector); - w[54] = __byte_perm (w[44], w[45], selector); - w[53] = __byte_perm (w[43], w[44], selector); - w[52] = __byte_perm (w[42], w[43], selector); - w[51] = __byte_perm (w[41], w[42], selector); - w[50] = __byte_perm (w[40], w[41], selector); - w[49] = __byte_perm (w[39], w[40], selector); - w[48] = __byte_perm (w[38], w[39], selector); - w[47] = __byte_perm (w[37], w[38], selector); - w[46] = __byte_perm (w[36], w[37], selector); - w[45] = __byte_perm (w[35], w[36], selector); - w[44] = __byte_perm (w[34], w[35], selector); - w[43] = __byte_perm (w[33], w[34], selector); - w[42] = __byte_perm (w[32], w[33], selector); - w[41] = __byte_perm (w[31], w[32], selector); - w[40] = __byte_perm (w[30], w[31], selector); - w[39] = __byte_perm (w[29], w[30], selector); - w[38] = __byte_perm (w[28], w[29], selector); - w[37] = __byte_perm (w[27], w[28], selector); - w[36] = __byte_perm (w[26], w[27], selector); - w[35] = __byte_perm (w[25], w[26], selector); - w[34] = __byte_perm (w[24], w[25], selector); - w[33] = __byte_perm (w[23], w[24], selector); - w[32] = __byte_perm (w[22], w[23], selector); - w[31] = __byte_perm (w[21], w[22], selector); - w[30] = __byte_perm (w[20], w[21], selector); - w[29] = __byte_perm (w[19], w[20], selector); - w[28] = __byte_perm (w[18], w[19], selector); - w[27] = __byte_perm (w[17], w[18], selector); - w[26] = __byte_perm (w[16], w[17], selector); - w[25] = __byte_perm (w[15], w[16], selector); - w[24] = __byte_perm (w[14], w[15], selector); - w[23] = __byte_perm (w[13], w[14], selector); - w[22] = __byte_perm (w[12], w[13], selector); - w[21] = __byte_perm (w[11], w[12], selector); - w[20] = __byte_perm (w[10], w[11], selector); - w[19] = __byte_perm (w[ 9], w[10], selector); - w[18] = __byte_perm (w[ 8], w[ 9], selector); - w[17] = __byte_perm (w[ 7], w[ 8], selector); - w[16] = __byte_perm (w[ 6], w[ 7], selector); - w[15] = __byte_perm (w[ 5], w[ 6], selector); - w[14] = __byte_perm (w[ 4], w[ 5], selector); - w[13] = __byte_perm (w[ 3], w[ 4], selector); - w[12] = __byte_perm (w[ 2], w[ 3], selector); - w[11] = __byte_perm (w[ 1], w[ 2], selector); - w[10] = __byte_perm (w[ 0], w[ 1], selector); - w[ 9] = __byte_perm ( 0, w[ 0], selector); + w[63] = hc_byte_perm (w[53], w[54], selector); + w[62] = hc_byte_perm (w[52], w[53], selector); + w[61] = hc_byte_perm (w[51], w[52], selector); + w[60] = hc_byte_perm (w[50], w[51], selector); + w[59] = hc_byte_perm (w[49], w[50], selector); + w[58] = hc_byte_perm (w[48], w[49], selector); + w[57] = hc_byte_perm (w[47], w[48], selector); + w[56] = hc_byte_perm (w[46], w[47], selector); + w[55] = hc_byte_perm (w[45], w[46], selector); + w[54] = hc_byte_perm (w[44], w[45], selector); + w[53] = hc_byte_perm (w[43], w[44], selector); + w[52] = hc_byte_perm (w[42], w[43], selector); + w[51] = hc_byte_perm (w[41], w[42], selector); + w[50] = hc_byte_perm (w[40], w[41], selector); + w[49] = hc_byte_perm (w[39], w[40], selector); + w[48] = hc_byte_perm (w[38], w[39], selector); + w[47] = hc_byte_perm (w[37], w[38], selector); + w[46] = hc_byte_perm (w[36], w[37], selector); + w[45] = hc_byte_perm (w[35], w[36], selector); + w[44] = hc_byte_perm (w[34], w[35], selector); + w[43] = hc_byte_perm (w[33], w[34], selector); + w[42] = hc_byte_perm (w[32], w[33], selector); + w[41] = hc_byte_perm (w[31], w[32], selector); + w[40] = hc_byte_perm (w[30], w[31], selector); + w[39] = hc_byte_perm (w[29], w[30], selector); + w[38] = hc_byte_perm (w[28], w[29], selector); + w[37] = hc_byte_perm (w[27], w[28], selector); + w[36] = hc_byte_perm (w[26], w[27], selector); + w[35] = hc_byte_perm (w[25], w[26], selector); + w[34] = hc_byte_perm (w[24], w[25], selector); + w[33] = hc_byte_perm (w[23], w[24], selector); + w[32] = hc_byte_perm (w[22], w[23], selector); + w[31] = hc_byte_perm (w[21], w[22], selector); + w[30] = hc_byte_perm (w[20], w[21], selector); + w[29] = hc_byte_perm (w[19], w[20], selector); + w[28] = hc_byte_perm (w[18], w[19], selector); + w[27] = hc_byte_perm (w[17], w[18], selector); + w[26] = hc_byte_perm (w[16], w[17], selector); + w[25] = hc_byte_perm (w[15], w[16], selector); + w[24] = hc_byte_perm (w[14], w[15], selector); + w[23] = hc_byte_perm (w[13], w[14], selector); + w[22] = hc_byte_perm (w[12], w[13], selector); + w[21] = hc_byte_perm (w[11], w[12], selector); + w[20] = hc_byte_perm (w[10], w[11], selector); + w[19] = hc_byte_perm (w[ 9], w[10], selector); + w[18] = hc_byte_perm (w[ 8], w[ 9], selector); + w[17] = hc_byte_perm (w[ 7], w[ 8], selector); + w[16] = hc_byte_perm (w[ 6], w[ 7], selector); + w[15] = hc_byte_perm (w[ 5], w[ 6], selector); + w[14] = hc_byte_perm (w[ 4], w[ 5], selector); + w[13] = hc_byte_perm (w[ 3], w[ 4], selector); + w[12] = hc_byte_perm (w[ 2], w[ 3], selector); + w[11] = hc_byte_perm (w[ 1], w[ 2], selector); + w[10] = hc_byte_perm (w[ 0], w[ 1], selector); + w[ 9] = hc_byte_perm ( 0, w[ 0], selector); w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; @@ -16716,60 +16716,60 @@ DECLSPEC void switch_buffer_by_offset_1x64_le (u32x *w, const u32 offset) break; case 10: - w[63] = __byte_perm (w[52], w[53], selector); - w[62] = __byte_perm (w[51], w[52], selector); - w[61] = __byte_perm (w[50], w[51], selector); - w[60] = __byte_perm (w[49], w[50], selector); - w[59] = __byte_perm (w[48], w[49], selector); - w[58] = __byte_perm (w[47], w[48], selector); - w[57] = __byte_perm (w[46], w[47], selector); - w[56] = __byte_perm (w[45], w[46], selector); - w[55] = __byte_perm (w[44], w[45], selector); - w[54] = __byte_perm (w[43], w[44], selector); - w[53] = __byte_perm (w[42], w[43], selector); - w[52] = __byte_perm (w[41], w[42], selector); - w[51] = __byte_perm (w[40], w[41], selector); - w[50] = __byte_perm (w[39], w[40], selector); - w[49] = __byte_perm (w[38], w[39], selector); - w[48] = __byte_perm (w[37], w[38], selector); - w[47] = __byte_perm (w[36], w[37], selector); - w[46] = __byte_perm (w[35], w[36], selector); - w[45] = __byte_perm (w[34], w[35], selector); - w[44] = __byte_perm (w[33], w[34], selector); - w[43] = __byte_perm (w[32], w[33], selector); - w[42] = __byte_perm (w[31], w[32], selector); - w[41] = __byte_perm (w[30], w[31], selector); - w[40] = __byte_perm (w[29], w[30], selector); - w[39] = __byte_perm (w[28], w[29], selector); - w[38] = __byte_perm (w[27], w[28], selector); - w[37] = __byte_perm (w[26], w[27], selector); - w[36] = __byte_perm (w[25], w[26], selector); - w[35] = __byte_perm (w[24], w[25], selector); - w[34] = __byte_perm (w[23], w[24], selector); - w[33] = __byte_perm (w[22], w[23], selector); - w[32] = __byte_perm (w[21], w[22], selector); - w[31] = __byte_perm (w[20], w[21], selector); - w[30] = __byte_perm (w[19], w[20], selector); - w[29] = __byte_perm (w[18], w[19], selector); - w[28] = __byte_perm (w[17], w[18], selector); - w[27] = __byte_perm (w[16], w[17], selector); - w[26] = __byte_perm (w[15], w[16], selector); - w[25] = __byte_perm (w[14], w[15], selector); - w[24] = __byte_perm (w[13], w[14], selector); - w[23] = __byte_perm (w[12], w[13], selector); - w[22] = __byte_perm (w[11], w[12], selector); - w[21] = __byte_perm (w[10], w[11], selector); - w[20] = __byte_perm (w[ 9], w[10], selector); - w[19] = __byte_perm (w[ 8], w[ 9], selector); - w[18] = __byte_perm (w[ 7], w[ 8], selector); - w[17] = __byte_perm (w[ 6], w[ 7], selector); - w[16] = __byte_perm (w[ 5], w[ 6], selector); - w[15] = __byte_perm (w[ 4], w[ 5], selector); - w[14] = __byte_perm (w[ 3], w[ 4], selector); - w[13] = __byte_perm (w[ 2], w[ 3], selector); - w[12] = __byte_perm (w[ 1], w[ 2], selector); - w[11] = __byte_perm (w[ 0], w[ 1], selector); - w[10] = __byte_perm ( 0, w[ 0], selector); + w[63] = hc_byte_perm (w[52], w[53], selector); + w[62] = hc_byte_perm (w[51], w[52], selector); + w[61] = hc_byte_perm (w[50], w[51], selector); + w[60] = hc_byte_perm (w[49], w[50], selector); + w[59] = hc_byte_perm (w[48], w[49], selector); + w[58] = hc_byte_perm (w[47], w[48], selector); + w[57] = hc_byte_perm (w[46], w[47], selector); + w[56] = hc_byte_perm (w[45], w[46], selector); + w[55] = hc_byte_perm (w[44], w[45], selector); + w[54] = hc_byte_perm (w[43], w[44], selector); + w[53] = hc_byte_perm (w[42], w[43], selector); + w[52] = hc_byte_perm (w[41], w[42], selector); + w[51] = hc_byte_perm (w[40], w[41], selector); + w[50] = hc_byte_perm (w[39], w[40], selector); + w[49] = hc_byte_perm (w[38], w[39], selector); + w[48] = hc_byte_perm (w[37], w[38], selector); + w[47] = hc_byte_perm (w[36], w[37], selector); + w[46] = hc_byte_perm (w[35], w[36], selector); + w[45] = hc_byte_perm (w[34], w[35], selector); + w[44] = hc_byte_perm (w[33], w[34], selector); + w[43] = hc_byte_perm (w[32], w[33], selector); + w[42] = hc_byte_perm (w[31], w[32], selector); + w[41] = hc_byte_perm (w[30], w[31], selector); + w[40] = hc_byte_perm (w[29], w[30], selector); + w[39] = hc_byte_perm (w[28], w[29], selector); + w[38] = hc_byte_perm (w[27], w[28], selector); + w[37] = hc_byte_perm (w[26], w[27], selector); + w[36] = hc_byte_perm (w[25], w[26], selector); + w[35] = hc_byte_perm (w[24], w[25], selector); + w[34] = hc_byte_perm (w[23], w[24], selector); + w[33] = hc_byte_perm (w[22], w[23], selector); + w[32] = hc_byte_perm (w[21], w[22], selector); + w[31] = hc_byte_perm (w[20], w[21], selector); + w[30] = hc_byte_perm (w[19], w[20], selector); + w[29] = hc_byte_perm (w[18], w[19], selector); + w[28] = hc_byte_perm (w[17], w[18], selector); + w[27] = hc_byte_perm (w[16], w[17], selector); + w[26] = hc_byte_perm (w[15], w[16], selector); + w[25] = hc_byte_perm (w[14], w[15], selector); + w[24] = hc_byte_perm (w[13], w[14], selector); + w[23] = hc_byte_perm (w[12], w[13], selector); + w[22] = hc_byte_perm (w[11], w[12], selector); + w[21] = hc_byte_perm (w[10], w[11], selector); + w[20] = hc_byte_perm (w[ 9], w[10], selector); + w[19] = hc_byte_perm (w[ 8], w[ 9], selector); + w[18] = hc_byte_perm (w[ 7], w[ 8], selector); + w[17] = hc_byte_perm (w[ 6], w[ 7], selector); + w[16] = hc_byte_perm (w[ 5], w[ 6], selector); + w[15] = hc_byte_perm (w[ 4], w[ 5], selector); + w[14] = hc_byte_perm (w[ 3], w[ 4], selector); + w[13] = hc_byte_perm (w[ 2], w[ 3], selector); + w[12] = hc_byte_perm (w[ 1], w[ 2], selector); + w[11] = hc_byte_perm (w[ 0], w[ 1], selector); + w[10] = hc_byte_perm ( 0, w[ 0], selector); w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; @@ -16784,59 +16784,59 @@ DECLSPEC void switch_buffer_by_offset_1x64_le (u32x *w, const u32 offset) break; case 11: - w[63] = __byte_perm (w[51], w[52], selector); - w[62] = __byte_perm (w[50], w[51], selector); - w[61] = __byte_perm (w[49], w[50], selector); - w[60] = __byte_perm (w[48], w[49], selector); - w[59] = __byte_perm (w[47], w[48], selector); - w[58] = __byte_perm (w[46], w[47], selector); - w[57] = __byte_perm (w[45], w[46], selector); - w[56] = __byte_perm (w[44], w[45], selector); - w[55] = __byte_perm (w[43], w[44], selector); - w[54] = __byte_perm (w[42], w[43], selector); - w[53] = __byte_perm (w[41], w[42], selector); - w[52] = __byte_perm (w[40], w[41], selector); - w[51] = __byte_perm (w[39], w[40], selector); - w[50] = __byte_perm (w[38], w[39], selector); - w[49] = __byte_perm (w[37], w[38], selector); - w[48] = __byte_perm (w[36], w[37], selector); - w[47] = __byte_perm (w[35], w[36], selector); - w[46] = __byte_perm (w[34], w[35], selector); - w[45] = __byte_perm (w[33], w[34], selector); - w[44] = __byte_perm (w[32], w[33], selector); - w[43] = __byte_perm (w[31], w[32], selector); - w[42] = __byte_perm (w[30], w[31], selector); - w[41] = __byte_perm (w[29], w[30], selector); - w[40] = __byte_perm (w[28], w[29], selector); - w[39] = __byte_perm (w[27], w[28], selector); - w[38] = __byte_perm (w[26], w[27], selector); - w[37] = __byte_perm (w[25], w[26], selector); - w[36] = __byte_perm (w[24], w[25], selector); - w[35] = __byte_perm (w[23], w[24], selector); - w[34] = __byte_perm (w[22], w[23], selector); - w[33] = __byte_perm (w[21], w[22], selector); - w[32] = __byte_perm (w[20], w[21], selector); - w[31] = __byte_perm (w[19], w[20], selector); - w[30] = __byte_perm (w[18], w[19], selector); - w[29] = __byte_perm (w[17], w[18], selector); - w[28] = __byte_perm (w[16], w[17], selector); - w[27] = __byte_perm (w[15], w[16], selector); - w[26] = __byte_perm (w[14], w[15], selector); - w[25] = __byte_perm (w[13], w[14], selector); - w[24] = __byte_perm (w[12], w[13], selector); - w[23] = __byte_perm (w[11], w[12], selector); - w[22] = __byte_perm (w[10], w[11], selector); - w[21] = __byte_perm (w[ 9], w[10], selector); - w[20] = __byte_perm (w[ 8], w[ 9], selector); - w[19] = __byte_perm (w[ 7], w[ 8], selector); - w[18] = __byte_perm (w[ 6], w[ 7], selector); - w[17] = __byte_perm (w[ 5], w[ 6], selector); - w[16] = __byte_perm (w[ 4], w[ 5], selector); - w[15] = __byte_perm (w[ 3], w[ 4], selector); - w[14] = __byte_perm (w[ 2], w[ 3], selector); - w[13] = __byte_perm (w[ 1], w[ 2], selector); - w[12] = __byte_perm (w[ 0], w[ 1], selector); - w[11] = __byte_perm ( 0, w[ 0], selector); + w[63] = hc_byte_perm (w[51], w[52], selector); + w[62] = hc_byte_perm (w[50], w[51], selector); + w[61] = hc_byte_perm (w[49], w[50], selector); + w[60] = hc_byte_perm (w[48], w[49], selector); + w[59] = hc_byte_perm (w[47], w[48], selector); + w[58] = hc_byte_perm (w[46], w[47], selector); + w[57] = hc_byte_perm (w[45], w[46], selector); + w[56] = hc_byte_perm (w[44], w[45], selector); + w[55] = hc_byte_perm (w[43], w[44], selector); + w[54] = hc_byte_perm (w[42], w[43], selector); + w[53] = hc_byte_perm (w[41], w[42], selector); + w[52] = hc_byte_perm (w[40], w[41], selector); + w[51] = hc_byte_perm (w[39], w[40], selector); + w[50] = hc_byte_perm (w[38], w[39], selector); + w[49] = hc_byte_perm (w[37], w[38], selector); + w[48] = hc_byte_perm (w[36], w[37], selector); + w[47] = hc_byte_perm (w[35], w[36], selector); + w[46] = hc_byte_perm (w[34], w[35], selector); + w[45] = hc_byte_perm (w[33], w[34], selector); + w[44] = hc_byte_perm (w[32], w[33], selector); + w[43] = hc_byte_perm (w[31], w[32], selector); + w[42] = hc_byte_perm (w[30], w[31], selector); + w[41] = hc_byte_perm (w[29], w[30], selector); + w[40] = hc_byte_perm (w[28], w[29], selector); + w[39] = hc_byte_perm (w[27], w[28], selector); + w[38] = hc_byte_perm (w[26], w[27], selector); + w[37] = hc_byte_perm (w[25], w[26], selector); + w[36] = hc_byte_perm (w[24], w[25], selector); + w[35] = hc_byte_perm (w[23], w[24], selector); + w[34] = hc_byte_perm (w[22], w[23], selector); + w[33] = hc_byte_perm (w[21], w[22], selector); + w[32] = hc_byte_perm (w[20], w[21], selector); + w[31] = hc_byte_perm (w[19], w[20], selector); + w[30] = hc_byte_perm (w[18], w[19], selector); + w[29] = hc_byte_perm (w[17], w[18], selector); + w[28] = hc_byte_perm (w[16], w[17], selector); + w[27] = hc_byte_perm (w[15], w[16], selector); + w[26] = hc_byte_perm (w[14], w[15], selector); + w[25] = hc_byte_perm (w[13], w[14], selector); + w[24] = hc_byte_perm (w[12], w[13], selector); + w[23] = hc_byte_perm (w[11], w[12], selector); + w[22] = hc_byte_perm (w[10], w[11], selector); + w[21] = hc_byte_perm (w[ 9], w[10], selector); + w[20] = hc_byte_perm (w[ 8], w[ 9], selector); + w[19] = hc_byte_perm (w[ 7], w[ 8], selector); + w[18] = hc_byte_perm (w[ 6], w[ 7], selector); + w[17] = hc_byte_perm (w[ 5], w[ 6], selector); + w[16] = hc_byte_perm (w[ 4], w[ 5], selector); + w[15] = hc_byte_perm (w[ 3], w[ 4], selector); + w[14] = hc_byte_perm (w[ 2], w[ 3], selector); + w[13] = hc_byte_perm (w[ 1], w[ 2], selector); + w[12] = hc_byte_perm (w[ 0], w[ 1], selector); + w[11] = hc_byte_perm ( 0, w[ 0], selector); w[10] = 0; w[ 9] = 0; w[ 8] = 0; @@ -16852,58 +16852,58 @@ DECLSPEC void switch_buffer_by_offset_1x64_le (u32x *w, const u32 offset) break; case 12: - w[63] = __byte_perm (w[50], w[51], selector); - w[62] = __byte_perm (w[49], w[50], selector); - w[61] = __byte_perm (w[48], w[49], selector); - w[60] = __byte_perm (w[47], w[48], selector); - w[59] = __byte_perm (w[46], w[47], selector); - w[58] = __byte_perm (w[45], w[46], selector); - w[57] = __byte_perm (w[44], w[45], selector); - w[56] = __byte_perm (w[43], w[44], selector); - w[55] = __byte_perm (w[42], w[43], selector); - w[54] = __byte_perm (w[41], w[42], selector); - w[53] = __byte_perm (w[40], w[41], selector); - w[52] = __byte_perm (w[39], w[40], selector); - w[51] = __byte_perm (w[38], w[39], selector); - w[50] = __byte_perm (w[37], w[38], selector); - w[49] = __byte_perm (w[36], w[37], selector); - w[48] = __byte_perm (w[35], w[36], selector); - w[47] = __byte_perm (w[34], w[35], selector); - w[46] = __byte_perm (w[33], w[34], selector); - w[45] = __byte_perm (w[32], w[33], selector); - w[44] = __byte_perm (w[31], w[32], selector); - w[43] = __byte_perm (w[30], w[31], selector); - w[42] = __byte_perm (w[29], w[30], selector); - w[41] = __byte_perm (w[28], w[29], selector); - w[40] = __byte_perm (w[27], w[28], selector); - w[39] = __byte_perm (w[26], w[27], selector); - w[38] = __byte_perm (w[25], w[26], selector); - w[37] = __byte_perm (w[24], w[25], selector); - w[36] = __byte_perm (w[23], w[24], selector); - w[35] = __byte_perm (w[22], w[23], selector); - w[34] = __byte_perm (w[21], w[22], selector); - w[33] = __byte_perm (w[20], w[21], selector); - w[32] = __byte_perm (w[19], w[20], selector); - w[31] = __byte_perm (w[18], w[19], selector); - w[30] = __byte_perm (w[17], w[18], selector); - w[29] = __byte_perm (w[16], w[17], selector); - w[28] = __byte_perm (w[15], w[16], selector); - w[27] = __byte_perm (w[14], w[15], selector); - w[26] = __byte_perm (w[13], w[14], selector); - w[25] = __byte_perm (w[12], w[13], selector); - w[24] = __byte_perm (w[11], w[12], selector); - w[23] = __byte_perm (w[10], w[11], selector); - w[22] = __byte_perm (w[ 9], w[10], selector); - w[21] = __byte_perm (w[ 8], w[ 9], selector); - w[20] = __byte_perm (w[ 7], w[ 8], selector); - w[19] = __byte_perm (w[ 6], w[ 7], selector); - w[18] = __byte_perm (w[ 5], w[ 6], selector); - w[17] = __byte_perm (w[ 4], w[ 5], selector); - w[16] = __byte_perm (w[ 3], w[ 4], selector); - w[15] = __byte_perm (w[ 2], w[ 3], selector); - w[14] = __byte_perm (w[ 1], w[ 2], selector); - w[13] = __byte_perm (w[ 0], w[ 1], selector); - w[12] = __byte_perm ( 0, w[ 0], selector); + w[63] = hc_byte_perm (w[50], w[51], selector); + w[62] = hc_byte_perm (w[49], w[50], selector); + w[61] = hc_byte_perm (w[48], w[49], selector); + w[60] = hc_byte_perm (w[47], w[48], selector); + w[59] = hc_byte_perm (w[46], w[47], selector); + w[58] = hc_byte_perm (w[45], w[46], selector); + w[57] = hc_byte_perm (w[44], w[45], selector); + w[56] = hc_byte_perm (w[43], w[44], selector); + w[55] = hc_byte_perm (w[42], w[43], selector); + w[54] = hc_byte_perm (w[41], w[42], selector); + w[53] = hc_byte_perm (w[40], w[41], selector); + w[52] = hc_byte_perm (w[39], w[40], selector); + w[51] = hc_byte_perm (w[38], w[39], selector); + w[50] = hc_byte_perm (w[37], w[38], selector); + w[49] = hc_byte_perm (w[36], w[37], selector); + w[48] = hc_byte_perm (w[35], w[36], selector); + w[47] = hc_byte_perm (w[34], w[35], selector); + w[46] = hc_byte_perm (w[33], w[34], selector); + w[45] = hc_byte_perm (w[32], w[33], selector); + w[44] = hc_byte_perm (w[31], w[32], selector); + w[43] = hc_byte_perm (w[30], w[31], selector); + w[42] = hc_byte_perm (w[29], w[30], selector); + w[41] = hc_byte_perm (w[28], w[29], selector); + w[40] = hc_byte_perm (w[27], w[28], selector); + w[39] = hc_byte_perm (w[26], w[27], selector); + w[38] = hc_byte_perm (w[25], w[26], selector); + w[37] = hc_byte_perm (w[24], w[25], selector); + w[36] = hc_byte_perm (w[23], w[24], selector); + w[35] = hc_byte_perm (w[22], w[23], selector); + w[34] = hc_byte_perm (w[21], w[22], selector); + w[33] = hc_byte_perm (w[20], w[21], selector); + w[32] = hc_byte_perm (w[19], w[20], selector); + w[31] = hc_byte_perm (w[18], w[19], selector); + w[30] = hc_byte_perm (w[17], w[18], selector); + w[29] = hc_byte_perm (w[16], w[17], selector); + w[28] = hc_byte_perm (w[15], w[16], selector); + w[27] = hc_byte_perm (w[14], w[15], selector); + w[26] = hc_byte_perm (w[13], w[14], selector); + w[25] = hc_byte_perm (w[12], w[13], selector); + w[24] = hc_byte_perm (w[11], w[12], selector); + w[23] = hc_byte_perm (w[10], w[11], selector); + w[22] = hc_byte_perm (w[ 9], w[10], selector); + w[21] = hc_byte_perm (w[ 8], w[ 9], selector); + w[20] = hc_byte_perm (w[ 7], w[ 8], selector); + w[19] = hc_byte_perm (w[ 6], w[ 7], selector); + w[18] = hc_byte_perm (w[ 5], w[ 6], selector); + w[17] = hc_byte_perm (w[ 4], w[ 5], selector); + w[16] = hc_byte_perm (w[ 3], w[ 4], selector); + w[15] = hc_byte_perm (w[ 2], w[ 3], selector); + w[14] = hc_byte_perm (w[ 1], w[ 2], selector); + w[13] = hc_byte_perm (w[ 0], w[ 1], selector); + w[12] = hc_byte_perm ( 0, w[ 0], selector); w[11] = 0; w[10] = 0; w[ 9] = 0; @@ -16920,57 +16920,57 @@ DECLSPEC void switch_buffer_by_offset_1x64_le (u32x *w, const u32 offset) break; case 13: - w[63] = __byte_perm (w[49], w[50], selector); - w[62] = __byte_perm (w[48], w[49], selector); - w[61] = __byte_perm (w[47], w[48], selector); - w[60] = __byte_perm (w[46], w[47], selector); - w[59] = __byte_perm (w[45], w[46], selector); - w[58] = __byte_perm (w[44], w[45], selector); - w[57] = __byte_perm (w[43], w[44], selector); - w[56] = __byte_perm (w[42], w[43], selector); - w[55] = __byte_perm (w[41], w[42], selector); - w[54] = __byte_perm (w[40], w[41], selector); - w[53] = __byte_perm (w[39], w[40], selector); - w[52] = __byte_perm (w[38], w[39], selector); - w[51] = __byte_perm (w[37], w[38], selector); - w[50] = __byte_perm (w[36], w[37], selector); - w[49] = __byte_perm (w[35], w[36], selector); - w[48] = __byte_perm (w[34], w[35], selector); - w[47] = __byte_perm (w[33], w[34], selector); - w[46] = __byte_perm (w[32], w[33], selector); - w[45] = __byte_perm (w[31], w[32], selector); - w[44] = __byte_perm (w[30], w[31], selector); - w[43] = __byte_perm (w[29], w[30], selector); - w[42] = __byte_perm (w[28], w[29], selector); - w[41] = __byte_perm (w[27], w[28], selector); - w[40] = __byte_perm (w[26], w[27], selector); - w[39] = __byte_perm (w[25], w[26], selector); - w[38] = __byte_perm (w[24], w[25], selector); - w[37] = __byte_perm (w[23], w[24], selector); - w[36] = __byte_perm (w[22], w[23], selector); - w[35] = __byte_perm (w[21], w[22], selector); - w[34] = __byte_perm (w[20], w[21], selector); - w[33] = __byte_perm (w[19], w[20], selector); - w[32] = __byte_perm (w[18], w[19], selector); - w[31] = __byte_perm (w[17], w[18], selector); - w[30] = __byte_perm (w[16], w[17], selector); - w[29] = __byte_perm (w[15], w[16], selector); - w[28] = __byte_perm (w[14], w[15], selector); - w[27] = __byte_perm (w[13], w[14], selector); - w[26] = __byte_perm (w[12], w[13], selector); - w[25] = __byte_perm (w[11], w[12], selector); - w[24] = __byte_perm (w[10], w[11], selector); - w[23] = __byte_perm (w[ 9], w[10], selector); - w[22] = __byte_perm (w[ 8], w[ 9], selector); - w[21] = __byte_perm (w[ 7], w[ 8], selector); - w[20] = __byte_perm (w[ 6], w[ 7], selector); - w[19] = __byte_perm (w[ 5], w[ 6], selector); - w[18] = __byte_perm (w[ 4], w[ 5], selector); - w[17] = __byte_perm (w[ 3], w[ 4], selector); - w[16] = __byte_perm (w[ 2], w[ 3], selector); - w[15] = __byte_perm (w[ 1], w[ 2], selector); - w[14] = __byte_perm (w[ 0], w[ 1], selector); - w[13] = __byte_perm ( 0, w[ 0], selector); + w[63] = hc_byte_perm (w[49], w[50], selector); + w[62] = hc_byte_perm (w[48], w[49], selector); + w[61] = hc_byte_perm (w[47], w[48], selector); + w[60] = hc_byte_perm (w[46], w[47], selector); + w[59] = hc_byte_perm (w[45], w[46], selector); + w[58] = hc_byte_perm (w[44], w[45], selector); + w[57] = hc_byte_perm (w[43], w[44], selector); + w[56] = hc_byte_perm (w[42], w[43], selector); + w[55] = hc_byte_perm (w[41], w[42], selector); + w[54] = hc_byte_perm (w[40], w[41], selector); + w[53] = hc_byte_perm (w[39], w[40], selector); + w[52] = hc_byte_perm (w[38], w[39], selector); + w[51] = hc_byte_perm (w[37], w[38], selector); + w[50] = hc_byte_perm (w[36], w[37], selector); + w[49] = hc_byte_perm (w[35], w[36], selector); + w[48] = hc_byte_perm (w[34], w[35], selector); + w[47] = hc_byte_perm (w[33], w[34], selector); + w[46] = hc_byte_perm (w[32], w[33], selector); + w[45] = hc_byte_perm (w[31], w[32], selector); + w[44] = hc_byte_perm (w[30], w[31], selector); + w[43] = hc_byte_perm (w[29], w[30], selector); + w[42] = hc_byte_perm (w[28], w[29], selector); + w[41] = hc_byte_perm (w[27], w[28], selector); + w[40] = hc_byte_perm (w[26], w[27], selector); + w[39] = hc_byte_perm (w[25], w[26], selector); + w[38] = hc_byte_perm (w[24], w[25], selector); + w[37] = hc_byte_perm (w[23], w[24], selector); + w[36] = hc_byte_perm (w[22], w[23], selector); + w[35] = hc_byte_perm (w[21], w[22], selector); + w[34] = hc_byte_perm (w[20], w[21], selector); + w[33] = hc_byte_perm (w[19], w[20], selector); + w[32] = hc_byte_perm (w[18], w[19], selector); + w[31] = hc_byte_perm (w[17], w[18], selector); + w[30] = hc_byte_perm (w[16], w[17], selector); + w[29] = hc_byte_perm (w[15], w[16], selector); + w[28] = hc_byte_perm (w[14], w[15], selector); + w[27] = hc_byte_perm (w[13], w[14], selector); + w[26] = hc_byte_perm (w[12], w[13], selector); + w[25] = hc_byte_perm (w[11], w[12], selector); + w[24] = hc_byte_perm (w[10], w[11], selector); + w[23] = hc_byte_perm (w[ 9], w[10], selector); + w[22] = hc_byte_perm (w[ 8], w[ 9], selector); + w[21] = hc_byte_perm (w[ 7], w[ 8], selector); + w[20] = hc_byte_perm (w[ 6], w[ 7], selector); + w[19] = hc_byte_perm (w[ 5], w[ 6], selector); + w[18] = hc_byte_perm (w[ 4], w[ 5], selector); + w[17] = hc_byte_perm (w[ 3], w[ 4], selector); + w[16] = hc_byte_perm (w[ 2], w[ 3], selector); + w[15] = hc_byte_perm (w[ 1], w[ 2], selector); + w[14] = hc_byte_perm (w[ 0], w[ 1], selector); + w[13] = hc_byte_perm ( 0, w[ 0], selector); w[12] = 0; w[11] = 0; w[10] = 0; @@ -16988,56 +16988,56 @@ DECLSPEC void switch_buffer_by_offset_1x64_le (u32x *w, const u32 offset) break; case 14: - w[63] = __byte_perm (w[48], w[49], selector); - w[62] = __byte_perm (w[47], w[48], selector); - w[61] = __byte_perm (w[46], w[47], selector); - w[60] = __byte_perm (w[45], w[46], selector); - w[59] = __byte_perm (w[44], w[45], selector); - w[58] = __byte_perm (w[43], w[44], selector); - w[57] = __byte_perm (w[42], w[43], selector); - w[56] = __byte_perm (w[41], w[42], selector); - w[55] = __byte_perm (w[40], w[41], selector); - w[54] = __byte_perm (w[39], w[40], selector); - w[53] = __byte_perm (w[38], w[39], selector); - w[52] = __byte_perm (w[37], w[38], selector); - w[51] = __byte_perm (w[36], w[37], selector); - w[50] = __byte_perm (w[35], w[36], selector); - w[49] = __byte_perm (w[34], w[35], selector); - w[48] = __byte_perm (w[33], w[34], selector); - w[47] = __byte_perm (w[32], w[33], selector); - w[46] = __byte_perm (w[31], w[32], selector); - w[45] = __byte_perm (w[30], w[31], selector); - w[44] = __byte_perm (w[29], w[30], selector); - w[43] = __byte_perm (w[28], w[29], selector); - w[42] = __byte_perm (w[27], w[28], selector); - w[41] = __byte_perm (w[26], w[27], selector); - w[40] = __byte_perm (w[25], w[26], selector); - w[39] = __byte_perm (w[24], w[25], selector); - w[38] = __byte_perm (w[23], w[24], selector); - w[37] = __byte_perm (w[22], w[23], selector); - w[36] = __byte_perm (w[21], w[22], selector); - w[35] = __byte_perm (w[20], w[21], selector); - w[34] = __byte_perm (w[19], w[20], selector); - w[33] = __byte_perm (w[18], w[19], selector); - w[32] = __byte_perm (w[17], w[18], selector); - w[31] = __byte_perm (w[16], w[17], selector); - w[30] = __byte_perm (w[15], w[16], selector); - w[29] = __byte_perm (w[14], w[15], selector); - w[28] = __byte_perm (w[13], w[14], selector); - w[27] = __byte_perm (w[12], w[13], selector); - w[26] = __byte_perm (w[11], w[12], selector); - w[25] = __byte_perm (w[10], w[11], selector); - w[24] = __byte_perm (w[ 9], w[10], selector); - w[23] = __byte_perm (w[ 8], w[ 9], selector); - w[22] = __byte_perm (w[ 7], w[ 8], selector); - w[21] = __byte_perm (w[ 6], w[ 7], selector); - w[20] = __byte_perm (w[ 5], w[ 6], selector); - w[19] = __byte_perm (w[ 4], w[ 5], selector); - w[18] = __byte_perm (w[ 3], w[ 4], selector); - w[17] = __byte_perm (w[ 2], w[ 3], selector); - w[16] = __byte_perm (w[ 1], w[ 2], selector); - w[15] = __byte_perm (w[ 0], w[ 1], selector); - w[14] = __byte_perm ( 0, w[ 0], selector); + w[63] = hc_byte_perm (w[48], w[49], selector); + w[62] = hc_byte_perm (w[47], w[48], selector); + w[61] = hc_byte_perm (w[46], w[47], selector); + w[60] = hc_byte_perm (w[45], w[46], selector); + w[59] = hc_byte_perm (w[44], w[45], selector); + w[58] = hc_byte_perm (w[43], w[44], selector); + w[57] = hc_byte_perm (w[42], w[43], selector); + w[56] = hc_byte_perm (w[41], w[42], selector); + w[55] = hc_byte_perm (w[40], w[41], selector); + w[54] = hc_byte_perm (w[39], w[40], selector); + w[53] = hc_byte_perm (w[38], w[39], selector); + w[52] = hc_byte_perm (w[37], w[38], selector); + w[51] = hc_byte_perm (w[36], w[37], selector); + w[50] = hc_byte_perm (w[35], w[36], selector); + w[49] = hc_byte_perm (w[34], w[35], selector); + w[48] = hc_byte_perm (w[33], w[34], selector); + w[47] = hc_byte_perm (w[32], w[33], selector); + w[46] = hc_byte_perm (w[31], w[32], selector); + w[45] = hc_byte_perm (w[30], w[31], selector); + w[44] = hc_byte_perm (w[29], w[30], selector); + w[43] = hc_byte_perm (w[28], w[29], selector); + w[42] = hc_byte_perm (w[27], w[28], selector); + w[41] = hc_byte_perm (w[26], w[27], selector); + w[40] = hc_byte_perm (w[25], w[26], selector); + w[39] = hc_byte_perm (w[24], w[25], selector); + w[38] = hc_byte_perm (w[23], w[24], selector); + w[37] = hc_byte_perm (w[22], w[23], selector); + w[36] = hc_byte_perm (w[21], w[22], selector); + w[35] = hc_byte_perm (w[20], w[21], selector); + w[34] = hc_byte_perm (w[19], w[20], selector); + w[33] = hc_byte_perm (w[18], w[19], selector); + w[32] = hc_byte_perm (w[17], w[18], selector); + w[31] = hc_byte_perm (w[16], w[17], selector); + w[30] = hc_byte_perm (w[15], w[16], selector); + w[29] = hc_byte_perm (w[14], w[15], selector); + w[28] = hc_byte_perm (w[13], w[14], selector); + w[27] = hc_byte_perm (w[12], w[13], selector); + w[26] = hc_byte_perm (w[11], w[12], selector); + w[25] = hc_byte_perm (w[10], w[11], selector); + w[24] = hc_byte_perm (w[ 9], w[10], selector); + w[23] = hc_byte_perm (w[ 8], w[ 9], selector); + w[22] = hc_byte_perm (w[ 7], w[ 8], selector); + w[21] = hc_byte_perm (w[ 6], w[ 7], selector); + w[20] = hc_byte_perm (w[ 5], w[ 6], selector); + w[19] = hc_byte_perm (w[ 4], w[ 5], selector); + w[18] = hc_byte_perm (w[ 3], w[ 4], selector); + w[17] = hc_byte_perm (w[ 2], w[ 3], selector); + w[16] = hc_byte_perm (w[ 1], w[ 2], selector); + w[15] = hc_byte_perm (w[ 0], w[ 1], selector); + w[14] = hc_byte_perm ( 0, w[ 0], selector); w[13] = 0; w[12] = 0; w[11] = 0; @@ -17056,55 +17056,55 @@ DECLSPEC void switch_buffer_by_offset_1x64_le (u32x *w, const u32 offset) break; case 15: - w[63] = __byte_perm (w[47], w[48], selector); - w[62] = __byte_perm (w[46], w[47], selector); - w[61] = __byte_perm (w[45], w[46], selector); - w[60] = __byte_perm (w[44], w[45], selector); - w[59] = __byte_perm (w[43], w[44], selector); - w[58] = __byte_perm (w[42], w[43], selector); - w[57] = __byte_perm (w[41], w[42], selector); - w[56] = __byte_perm (w[40], w[41], selector); - w[55] = __byte_perm (w[39], w[40], selector); - w[54] = __byte_perm (w[38], w[39], selector); - w[53] = __byte_perm (w[37], w[38], selector); - w[52] = __byte_perm (w[36], w[37], selector); - w[51] = __byte_perm (w[35], w[36], selector); - w[50] = __byte_perm (w[34], w[35], selector); - w[49] = __byte_perm (w[33], w[34], selector); - w[48] = __byte_perm (w[32], w[33], selector); - w[47] = __byte_perm (w[31], w[32], selector); - w[46] = __byte_perm (w[30], w[31], selector); - w[45] = __byte_perm (w[29], w[30], selector); - w[44] = __byte_perm (w[28], w[29], selector); - w[43] = __byte_perm (w[27], w[28], selector); - w[42] = __byte_perm (w[26], w[27], selector); - w[41] = __byte_perm (w[25], w[26], selector); - w[40] = __byte_perm (w[24], w[25], selector); - w[39] = __byte_perm (w[23], w[24], selector); - w[38] = __byte_perm (w[22], w[23], selector); - w[37] = __byte_perm (w[21], w[22], selector); - w[36] = __byte_perm (w[20], w[21], selector); - w[35] = __byte_perm (w[19], w[20], selector); - w[34] = __byte_perm (w[18], w[19], selector); - w[33] = __byte_perm (w[17], w[18], selector); - w[32] = __byte_perm (w[16], w[17], selector); - w[31] = __byte_perm (w[15], w[16], selector); - w[30] = __byte_perm (w[14], w[15], selector); - w[29] = __byte_perm (w[13], w[14], selector); - w[28] = __byte_perm (w[12], w[13], selector); - w[27] = __byte_perm (w[11], w[12], selector); - w[26] = __byte_perm (w[10], w[11], selector); - w[25] = __byte_perm (w[ 9], w[10], selector); - w[24] = __byte_perm (w[ 8], w[ 9], selector); - w[23] = __byte_perm (w[ 7], w[ 8], selector); - w[22] = __byte_perm (w[ 6], w[ 7], selector); - w[21] = __byte_perm (w[ 5], w[ 6], selector); - w[20] = __byte_perm (w[ 4], w[ 5], selector); - w[19] = __byte_perm (w[ 3], w[ 4], selector); - w[18] = __byte_perm (w[ 2], w[ 3], selector); - w[17] = __byte_perm (w[ 1], w[ 2], selector); - w[16] = __byte_perm (w[ 0], w[ 1], selector); - w[15] = __byte_perm ( 0, w[ 0], selector); + w[63] = hc_byte_perm (w[47], w[48], selector); + w[62] = hc_byte_perm (w[46], w[47], selector); + w[61] = hc_byte_perm (w[45], w[46], selector); + w[60] = hc_byte_perm (w[44], w[45], selector); + w[59] = hc_byte_perm (w[43], w[44], selector); + w[58] = hc_byte_perm (w[42], w[43], selector); + w[57] = hc_byte_perm (w[41], w[42], selector); + w[56] = hc_byte_perm (w[40], w[41], selector); + w[55] = hc_byte_perm (w[39], w[40], selector); + w[54] = hc_byte_perm (w[38], w[39], selector); + w[53] = hc_byte_perm (w[37], w[38], selector); + w[52] = hc_byte_perm (w[36], w[37], selector); + w[51] = hc_byte_perm (w[35], w[36], selector); + w[50] = hc_byte_perm (w[34], w[35], selector); + w[49] = hc_byte_perm (w[33], w[34], selector); + w[48] = hc_byte_perm (w[32], w[33], selector); + w[47] = hc_byte_perm (w[31], w[32], selector); + w[46] = hc_byte_perm (w[30], w[31], selector); + w[45] = hc_byte_perm (w[29], w[30], selector); + w[44] = hc_byte_perm (w[28], w[29], selector); + w[43] = hc_byte_perm (w[27], w[28], selector); + w[42] = hc_byte_perm (w[26], w[27], selector); + w[41] = hc_byte_perm (w[25], w[26], selector); + w[40] = hc_byte_perm (w[24], w[25], selector); + w[39] = hc_byte_perm (w[23], w[24], selector); + w[38] = hc_byte_perm (w[22], w[23], selector); + w[37] = hc_byte_perm (w[21], w[22], selector); + w[36] = hc_byte_perm (w[20], w[21], selector); + w[35] = hc_byte_perm (w[19], w[20], selector); + w[34] = hc_byte_perm (w[18], w[19], selector); + w[33] = hc_byte_perm (w[17], w[18], selector); + w[32] = hc_byte_perm (w[16], w[17], selector); + w[31] = hc_byte_perm (w[15], w[16], selector); + w[30] = hc_byte_perm (w[14], w[15], selector); + w[29] = hc_byte_perm (w[13], w[14], selector); + w[28] = hc_byte_perm (w[12], w[13], selector); + w[27] = hc_byte_perm (w[11], w[12], selector); + w[26] = hc_byte_perm (w[10], w[11], selector); + w[25] = hc_byte_perm (w[ 9], w[10], selector); + w[24] = hc_byte_perm (w[ 8], w[ 9], selector); + w[23] = hc_byte_perm (w[ 7], w[ 8], selector); + w[22] = hc_byte_perm (w[ 6], w[ 7], selector); + w[21] = hc_byte_perm (w[ 5], w[ 6], selector); + w[20] = hc_byte_perm (w[ 4], w[ 5], selector); + w[19] = hc_byte_perm (w[ 3], w[ 4], selector); + w[18] = hc_byte_perm (w[ 2], w[ 3], selector); + w[17] = hc_byte_perm (w[ 1], w[ 2], selector); + w[16] = hc_byte_perm (w[ 0], w[ 1], selector); + w[15] = hc_byte_perm ( 0, w[ 0], selector); w[14] = 0; w[13] = 0; w[12] = 0; @@ -17124,54 +17124,54 @@ DECLSPEC void switch_buffer_by_offset_1x64_le (u32x *w, const u32 offset) break; case 16: - w[63] = __byte_perm (w[46], w[47], selector); - w[62] = __byte_perm (w[45], w[46], selector); - w[61] = __byte_perm (w[44], w[45], selector); - w[60] = __byte_perm (w[43], w[44], selector); - w[59] = __byte_perm (w[42], w[43], selector); - w[58] = __byte_perm (w[41], w[42], selector); - w[57] = __byte_perm (w[40], w[41], selector); - w[56] = __byte_perm (w[39], w[40], selector); - w[55] = __byte_perm (w[38], w[39], selector); - w[54] = __byte_perm (w[37], w[38], selector); - w[53] = __byte_perm (w[36], w[37], selector); - w[52] = __byte_perm (w[35], w[36], selector); - w[51] = __byte_perm (w[34], w[35], selector); - w[50] = __byte_perm (w[33], w[34], selector); - w[49] = __byte_perm (w[32], w[33], selector); - w[48] = __byte_perm (w[31], w[32], selector); - w[47] = __byte_perm (w[30], w[31], selector); - w[46] = __byte_perm (w[29], w[30], selector); - w[45] = __byte_perm (w[28], w[29], selector); - w[44] = __byte_perm (w[27], w[28], selector); - w[43] = __byte_perm (w[26], w[27], selector); - w[42] = __byte_perm (w[25], w[26], selector); - w[41] = __byte_perm (w[24], w[25], selector); - w[40] = __byte_perm (w[23], w[24], selector); - w[39] = __byte_perm (w[22], w[23], selector); - w[38] = __byte_perm (w[21], w[22], selector); - w[37] = __byte_perm (w[20], w[21], selector); - w[36] = __byte_perm (w[19], w[20], selector); - w[35] = __byte_perm (w[18], w[19], selector); - w[34] = __byte_perm (w[17], w[18], selector); - w[33] = __byte_perm (w[16], w[17], selector); - w[32] = __byte_perm (w[15], w[16], selector); - w[31] = __byte_perm (w[14], w[15], selector); - w[30] = __byte_perm (w[13], w[14], selector); - w[29] = __byte_perm (w[12], w[13], selector); - w[28] = __byte_perm (w[11], w[12], selector); - w[27] = __byte_perm (w[10], w[11], selector); - w[26] = __byte_perm (w[ 9], w[10], selector); - w[25] = __byte_perm (w[ 8], w[ 9], selector); - w[24] = __byte_perm (w[ 7], w[ 8], selector); - w[23] = __byte_perm (w[ 6], w[ 7], selector); - w[22] = __byte_perm (w[ 5], w[ 6], selector); - w[21] = __byte_perm (w[ 4], w[ 5], selector); - w[20] = __byte_perm (w[ 3], w[ 4], selector); - w[19] = __byte_perm (w[ 2], w[ 3], selector); - w[18] = __byte_perm (w[ 1], w[ 2], selector); - w[17] = __byte_perm (w[ 0], w[ 1], selector); - w[16] = __byte_perm ( 0, w[ 0], selector); + w[63] = hc_byte_perm (w[46], w[47], selector); + w[62] = hc_byte_perm (w[45], w[46], selector); + w[61] = hc_byte_perm (w[44], w[45], selector); + w[60] = hc_byte_perm (w[43], w[44], selector); + w[59] = hc_byte_perm (w[42], w[43], selector); + w[58] = hc_byte_perm (w[41], w[42], selector); + w[57] = hc_byte_perm (w[40], w[41], selector); + w[56] = hc_byte_perm (w[39], w[40], selector); + w[55] = hc_byte_perm (w[38], w[39], selector); + w[54] = hc_byte_perm (w[37], w[38], selector); + w[53] = hc_byte_perm (w[36], w[37], selector); + w[52] = hc_byte_perm (w[35], w[36], selector); + w[51] = hc_byte_perm (w[34], w[35], selector); + w[50] = hc_byte_perm (w[33], w[34], selector); + w[49] = hc_byte_perm (w[32], w[33], selector); + w[48] = hc_byte_perm (w[31], w[32], selector); + w[47] = hc_byte_perm (w[30], w[31], selector); + w[46] = hc_byte_perm (w[29], w[30], selector); + w[45] = hc_byte_perm (w[28], w[29], selector); + w[44] = hc_byte_perm (w[27], w[28], selector); + w[43] = hc_byte_perm (w[26], w[27], selector); + w[42] = hc_byte_perm (w[25], w[26], selector); + w[41] = hc_byte_perm (w[24], w[25], selector); + w[40] = hc_byte_perm (w[23], w[24], selector); + w[39] = hc_byte_perm (w[22], w[23], selector); + w[38] = hc_byte_perm (w[21], w[22], selector); + w[37] = hc_byte_perm (w[20], w[21], selector); + w[36] = hc_byte_perm (w[19], w[20], selector); + w[35] = hc_byte_perm (w[18], w[19], selector); + w[34] = hc_byte_perm (w[17], w[18], selector); + w[33] = hc_byte_perm (w[16], w[17], selector); + w[32] = hc_byte_perm (w[15], w[16], selector); + w[31] = hc_byte_perm (w[14], w[15], selector); + w[30] = hc_byte_perm (w[13], w[14], selector); + w[29] = hc_byte_perm (w[12], w[13], selector); + w[28] = hc_byte_perm (w[11], w[12], selector); + w[27] = hc_byte_perm (w[10], w[11], selector); + w[26] = hc_byte_perm (w[ 9], w[10], selector); + w[25] = hc_byte_perm (w[ 8], w[ 9], selector); + w[24] = hc_byte_perm (w[ 7], w[ 8], selector); + w[23] = hc_byte_perm (w[ 6], w[ 7], selector); + w[22] = hc_byte_perm (w[ 5], w[ 6], selector); + w[21] = hc_byte_perm (w[ 4], w[ 5], selector); + w[20] = hc_byte_perm (w[ 3], w[ 4], selector); + w[19] = hc_byte_perm (w[ 2], w[ 3], selector); + w[18] = hc_byte_perm (w[ 1], w[ 2], selector); + w[17] = hc_byte_perm (w[ 0], w[ 1], selector); + w[16] = hc_byte_perm ( 0, w[ 0], selector); w[15] = 0; w[14] = 0; w[13] = 0; @@ -17192,53 +17192,53 @@ DECLSPEC void switch_buffer_by_offset_1x64_le (u32x *w, const u32 offset) break; case 17: - w[63] = __byte_perm (w[45], w[46], selector); - w[62] = __byte_perm (w[44], w[45], selector); - w[61] = __byte_perm (w[43], w[44], selector); - w[60] = __byte_perm (w[42], w[43], selector); - w[59] = __byte_perm (w[41], w[42], selector); - w[58] = __byte_perm (w[40], w[41], selector); - w[57] = __byte_perm (w[39], w[40], selector); - w[56] = __byte_perm (w[38], w[39], selector); - w[55] = __byte_perm (w[37], w[38], selector); - w[54] = __byte_perm (w[36], w[37], selector); - w[53] = __byte_perm (w[35], w[36], selector); - w[52] = __byte_perm (w[34], w[35], selector); - w[51] = __byte_perm (w[33], w[34], selector); - w[50] = __byte_perm (w[32], w[33], selector); - w[49] = __byte_perm (w[31], w[32], selector); - w[48] = __byte_perm (w[30], w[31], selector); - w[47] = __byte_perm (w[29], w[30], selector); - w[46] = __byte_perm (w[28], w[29], selector); - w[45] = __byte_perm (w[27], w[28], selector); - w[44] = __byte_perm (w[26], w[27], selector); - w[43] = __byte_perm (w[25], w[26], selector); - w[42] = __byte_perm (w[24], w[25], selector); - w[41] = __byte_perm (w[23], w[24], selector); - w[40] = __byte_perm (w[22], w[23], selector); - w[39] = __byte_perm (w[21], w[22], selector); - w[38] = __byte_perm (w[20], w[21], selector); - w[37] = __byte_perm (w[19], w[20], selector); - w[36] = __byte_perm (w[18], w[19], selector); - w[35] = __byte_perm (w[17], w[18], selector); - w[34] = __byte_perm (w[16], w[17], selector); - w[33] = __byte_perm (w[15], w[16], selector); - w[32] = __byte_perm (w[14], w[15], selector); - w[31] = __byte_perm (w[13], w[14], selector); - w[30] = __byte_perm (w[12], w[13], selector); - w[29] = __byte_perm (w[11], w[12], selector); - w[28] = __byte_perm (w[10], w[11], selector); - w[27] = __byte_perm (w[ 9], w[10], selector); - w[26] = __byte_perm (w[ 8], w[ 9], selector); - w[25] = __byte_perm (w[ 7], w[ 8], selector); - w[24] = __byte_perm (w[ 6], w[ 7], selector); - w[23] = __byte_perm (w[ 5], w[ 6], selector); - w[22] = __byte_perm (w[ 4], w[ 5], selector); - w[21] = __byte_perm (w[ 3], w[ 4], selector); - w[20] = __byte_perm (w[ 2], w[ 3], selector); - w[19] = __byte_perm (w[ 1], w[ 2], selector); - w[18] = __byte_perm (w[ 0], w[ 1], selector); - w[17] = __byte_perm ( 0, w[ 0], selector); + w[63] = hc_byte_perm (w[45], w[46], selector); + w[62] = hc_byte_perm (w[44], w[45], selector); + w[61] = hc_byte_perm (w[43], w[44], selector); + w[60] = hc_byte_perm (w[42], w[43], selector); + w[59] = hc_byte_perm (w[41], w[42], selector); + w[58] = hc_byte_perm (w[40], w[41], selector); + w[57] = hc_byte_perm (w[39], w[40], selector); + w[56] = hc_byte_perm (w[38], w[39], selector); + w[55] = hc_byte_perm (w[37], w[38], selector); + w[54] = hc_byte_perm (w[36], w[37], selector); + w[53] = hc_byte_perm (w[35], w[36], selector); + w[52] = hc_byte_perm (w[34], w[35], selector); + w[51] = hc_byte_perm (w[33], w[34], selector); + w[50] = hc_byte_perm (w[32], w[33], selector); + w[49] = hc_byte_perm (w[31], w[32], selector); + w[48] = hc_byte_perm (w[30], w[31], selector); + w[47] = hc_byte_perm (w[29], w[30], selector); + w[46] = hc_byte_perm (w[28], w[29], selector); + w[45] = hc_byte_perm (w[27], w[28], selector); + w[44] = hc_byte_perm (w[26], w[27], selector); + w[43] = hc_byte_perm (w[25], w[26], selector); + w[42] = hc_byte_perm (w[24], w[25], selector); + w[41] = hc_byte_perm (w[23], w[24], selector); + w[40] = hc_byte_perm (w[22], w[23], selector); + w[39] = hc_byte_perm (w[21], w[22], selector); + w[38] = hc_byte_perm (w[20], w[21], selector); + w[37] = hc_byte_perm (w[19], w[20], selector); + w[36] = hc_byte_perm (w[18], w[19], selector); + w[35] = hc_byte_perm (w[17], w[18], selector); + w[34] = hc_byte_perm (w[16], w[17], selector); + w[33] = hc_byte_perm (w[15], w[16], selector); + w[32] = hc_byte_perm (w[14], w[15], selector); + w[31] = hc_byte_perm (w[13], w[14], selector); + w[30] = hc_byte_perm (w[12], w[13], selector); + w[29] = hc_byte_perm (w[11], w[12], selector); + w[28] = hc_byte_perm (w[10], w[11], selector); + w[27] = hc_byte_perm (w[ 9], w[10], selector); + w[26] = hc_byte_perm (w[ 8], w[ 9], selector); + w[25] = hc_byte_perm (w[ 7], w[ 8], selector); + w[24] = hc_byte_perm (w[ 6], w[ 7], selector); + w[23] = hc_byte_perm (w[ 5], w[ 6], selector); + w[22] = hc_byte_perm (w[ 4], w[ 5], selector); + w[21] = hc_byte_perm (w[ 3], w[ 4], selector); + w[20] = hc_byte_perm (w[ 2], w[ 3], selector); + w[19] = hc_byte_perm (w[ 1], w[ 2], selector); + w[18] = hc_byte_perm (w[ 0], w[ 1], selector); + w[17] = hc_byte_perm ( 0, w[ 0], selector); w[16] = 0; w[15] = 0; w[14] = 0; @@ -17260,52 +17260,52 @@ DECLSPEC void switch_buffer_by_offset_1x64_le (u32x *w, const u32 offset) break; case 18: - w[63] = __byte_perm (w[44], w[45], selector); - w[62] = __byte_perm (w[43], w[44], selector); - w[61] = __byte_perm (w[42], w[43], selector); - w[60] = __byte_perm (w[41], w[42], selector); - w[59] = __byte_perm (w[40], w[41], selector); - w[58] = __byte_perm (w[39], w[40], selector); - w[57] = __byte_perm (w[38], w[39], selector); - w[56] = __byte_perm (w[37], w[38], selector); - w[55] = __byte_perm (w[36], w[37], selector); - w[54] = __byte_perm (w[35], w[36], selector); - w[53] = __byte_perm (w[34], w[35], selector); - w[52] = __byte_perm (w[33], w[34], selector); - w[51] = __byte_perm (w[32], w[33], selector); - w[50] = __byte_perm (w[31], w[32], selector); - w[49] = __byte_perm (w[30], w[31], selector); - w[48] = __byte_perm (w[29], w[30], selector); - w[47] = __byte_perm (w[28], w[29], selector); - w[46] = __byte_perm (w[27], w[28], selector); - w[45] = __byte_perm (w[26], w[27], selector); - w[44] = __byte_perm (w[25], w[26], selector); - w[43] = __byte_perm (w[24], w[25], selector); - w[42] = __byte_perm (w[23], w[24], selector); - w[41] = __byte_perm (w[22], w[23], selector); - w[40] = __byte_perm (w[21], w[22], selector); - w[39] = __byte_perm (w[20], w[21], selector); - w[38] = __byte_perm (w[19], w[20], selector); - w[37] = __byte_perm (w[18], w[19], selector); - w[36] = __byte_perm (w[17], w[18], selector); - w[35] = __byte_perm (w[16], w[17], selector); - w[34] = __byte_perm (w[15], w[16], selector); - w[33] = __byte_perm (w[14], w[15], selector); - w[32] = __byte_perm (w[13], w[14], selector); - w[31] = __byte_perm (w[12], w[13], selector); - w[30] = __byte_perm (w[11], w[12], selector); - w[29] = __byte_perm (w[10], w[11], selector); - w[28] = __byte_perm (w[ 9], w[10], selector); - w[27] = __byte_perm (w[ 8], w[ 9], selector); - w[26] = __byte_perm (w[ 7], w[ 8], selector); - w[25] = __byte_perm (w[ 6], w[ 7], selector); - w[24] = __byte_perm (w[ 5], w[ 6], selector); - w[23] = __byte_perm (w[ 4], w[ 5], selector); - w[22] = __byte_perm (w[ 3], w[ 4], selector); - w[21] = __byte_perm (w[ 2], w[ 3], selector); - w[20] = __byte_perm (w[ 1], w[ 2], selector); - w[19] = __byte_perm (w[ 0], w[ 1], selector); - w[18] = __byte_perm ( 0, w[ 0], selector); + w[63] = hc_byte_perm (w[44], w[45], selector); + w[62] = hc_byte_perm (w[43], w[44], selector); + w[61] = hc_byte_perm (w[42], w[43], selector); + w[60] = hc_byte_perm (w[41], w[42], selector); + w[59] = hc_byte_perm (w[40], w[41], selector); + w[58] = hc_byte_perm (w[39], w[40], selector); + w[57] = hc_byte_perm (w[38], w[39], selector); + w[56] = hc_byte_perm (w[37], w[38], selector); + w[55] = hc_byte_perm (w[36], w[37], selector); + w[54] = hc_byte_perm (w[35], w[36], selector); + w[53] = hc_byte_perm (w[34], w[35], selector); + w[52] = hc_byte_perm (w[33], w[34], selector); + w[51] = hc_byte_perm (w[32], w[33], selector); + w[50] = hc_byte_perm (w[31], w[32], selector); + w[49] = hc_byte_perm (w[30], w[31], selector); + w[48] = hc_byte_perm (w[29], w[30], selector); + w[47] = hc_byte_perm (w[28], w[29], selector); + w[46] = hc_byte_perm (w[27], w[28], selector); + w[45] = hc_byte_perm (w[26], w[27], selector); + w[44] = hc_byte_perm (w[25], w[26], selector); + w[43] = hc_byte_perm (w[24], w[25], selector); + w[42] = hc_byte_perm (w[23], w[24], selector); + w[41] = hc_byte_perm (w[22], w[23], selector); + w[40] = hc_byte_perm (w[21], w[22], selector); + w[39] = hc_byte_perm (w[20], w[21], selector); + w[38] = hc_byte_perm (w[19], w[20], selector); + w[37] = hc_byte_perm (w[18], w[19], selector); + w[36] = hc_byte_perm (w[17], w[18], selector); + w[35] = hc_byte_perm (w[16], w[17], selector); + w[34] = hc_byte_perm (w[15], w[16], selector); + w[33] = hc_byte_perm (w[14], w[15], selector); + w[32] = hc_byte_perm (w[13], w[14], selector); + w[31] = hc_byte_perm (w[12], w[13], selector); + w[30] = hc_byte_perm (w[11], w[12], selector); + w[29] = hc_byte_perm (w[10], w[11], selector); + w[28] = hc_byte_perm (w[ 9], w[10], selector); + w[27] = hc_byte_perm (w[ 8], w[ 9], selector); + w[26] = hc_byte_perm (w[ 7], w[ 8], selector); + w[25] = hc_byte_perm (w[ 6], w[ 7], selector); + w[24] = hc_byte_perm (w[ 5], w[ 6], selector); + w[23] = hc_byte_perm (w[ 4], w[ 5], selector); + w[22] = hc_byte_perm (w[ 3], w[ 4], selector); + w[21] = hc_byte_perm (w[ 2], w[ 3], selector); + w[20] = hc_byte_perm (w[ 1], w[ 2], selector); + w[19] = hc_byte_perm (w[ 0], w[ 1], selector); + w[18] = hc_byte_perm ( 0, w[ 0], selector); w[17] = 0; w[16] = 0; w[15] = 0; @@ -17328,51 +17328,51 @@ DECLSPEC void switch_buffer_by_offset_1x64_le (u32x *w, const u32 offset) break; case 19: - w[63] = __byte_perm (w[43], w[44], selector); - w[62] = __byte_perm (w[42], w[43], selector); - w[61] = __byte_perm (w[41], w[42], selector); - w[60] = __byte_perm (w[40], w[41], selector); - w[59] = __byte_perm (w[39], w[40], selector); - w[58] = __byte_perm (w[38], w[39], selector); - w[57] = __byte_perm (w[37], w[38], selector); - w[56] = __byte_perm (w[36], w[37], selector); - w[55] = __byte_perm (w[35], w[36], selector); - w[54] = __byte_perm (w[34], w[35], selector); - w[53] = __byte_perm (w[33], w[34], selector); - w[52] = __byte_perm (w[32], w[33], selector); - w[51] = __byte_perm (w[31], w[32], selector); - w[50] = __byte_perm (w[30], w[31], selector); - w[49] = __byte_perm (w[29], w[30], selector); - w[48] = __byte_perm (w[28], w[29], selector); - w[47] = __byte_perm (w[27], w[28], selector); - w[46] = __byte_perm (w[26], w[27], selector); - w[45] = __byte_perm (w[25], w[26], selector); - w[44] = __byte_perm (w[24], w[25], selector); - w[43] = __byte_perm (w[23], w[24], selector); - w[42] = __byte_perm (w[22], w[23], selector); - w[41] = __byte_perm (w[21], w[22], selector); - w[40] = __byte_perm (w[20], w[21], selector); - w[39] = __byte_perm (w[19], w[20], selector); - w[38] = __byte_perm (w[18], w[19], selector); - w[37] = __byte_perm (w[17], w[18], selector); - w[36] = __byte_perm (w[16], w[17], selector); - w[35] = __byte_perm (w[15], w[16], selector); - w[34] = __byte_perm (w[14], w[15], selector); - w[33] = __byte_perm (w[13], w[14], selector); - w[32] = __byte_perm (w[12], w[13], selector); - w[31] = __byte_perm (w[11], w[12], selector); - w[30] = __byte_perm (w[10], w[11], selector); - w[29] = __byte_perm (w[ 9], w[10], selector); - w[28] = __byte_perm (w[ 8], w[ 9], selector); - w[27] = __byte_perm (w[ 7], w[ 8], selector); - w[26] = __byte_perm (w[ 6], w[ 7], selector); - w[25] = __byte_perm (w[ 5], w[ 6], selector); - w[24] = __byte_perm (w[ 4], w[ 5], selector); - w[23] = __byte_perm (w[ 3], w[ 4], selector); - w[22] = __byte_perm (w[ 2], w[ 3], selector); - w[21] = __byte_perm (w[ 1], w[ 2], selector); - w[20] = __byte_perm (w[ 0], w[ 1], selector); - w[19] = __byte_perm ( 0, w[ 0], selector); + w[63] = hc_byte_perm (w[43], w[44], selector); + w[62] = hc_byte_perm (w[42], w[43], selector); + w[61] = hc_byte_perm (w[41], w[42], selector); + w[60] = hc_byte_perm (w[40], w[41], selector); + w[59] = hc_byte_perm (w[39], w[40], selector); + w[58] = hc_byte_perm (w[38], w[39], selector); + w[57] = hc_byte_perm (w[37], w[38], selector); + w[56] = hc_byte_perm (w[36], w[37], selector); + w[55] = hc_byte_perm (w[35], w[36], selector); + w[54] = hc_byte_perm (w[34], w[35], selector); + w[53] = hc_byte_perm (w[33], w[34], selector); + w[52] = hc_byte_perm (w[32], w[33], selector); + w[51] = hc_byte_perm (w[31], w[32], selector); + w[50] = hc_byte_perm (w[30], w[31], selector); + w[49] = hc_byte_perm (w[29], w[30], selector); + w[48] = hc_byte_perm (w[28], w[29], selector); + w[47] = hc_byte_perm (w[27], w[28], selector); + w[46] = hc_byte_perm (w[26], w[27], selector); + w[45] = hc_byte_perm (w[25], w[26], selector); + w[44] = hc_byte_perm (w[24], w[25], selector); + w[43] = hc_byte_perm (w[23], w[24], selector); + w[42] = hc_byte_perm (w[22], w[23], selector); + w[41] = hc_byte_perm (w[21], w[22], selector); + w[40] = hc_byte_perm (w[20], w[21], selector); + w[39] = hc_byte_perm (w[19], w[20], selector); + w[38] = hc_byte_perm (w[18], w[19], selector); + w[37] = hc_byte_perm (w[17], w[18], selector); + w[36] = hc_byte_perm (w[16], w[17], selector); + w[35] = hc_byte_perm (w[15], w[16], selector); + w[34] = hc_byte_perm (w[14], w[15], selector); + w[33] = hc_byte_perm (w[13], w[14], selector); + w[32] = hc_byte_perm (w[12], w[13], selector); + w[31] = hc_byte_perm (w[11], w[12], selector); + w[30] = hc_byte_perm (w[10], w[11], selector); + w[29] = hc_byte_perm (w[ 9], w[10], selector); + w[28] = hc_byte_perm (w[ 8], w[ 9], selector); + w[27] = hc_byte_perm (w[ 7], w[ 8], selector); + w[26] = hc_byte_perm (w[ 6], w[ 7], selector); + w[25] = hc_byte_perm (w[ 5], w[ 6], selector); + w[24] = hc_byte_perm (w[ 4], w[ 5], selector); + w[23] = hc_byte_perm (w[ 3], w[ 4], selector); + w[22] = hc_byte_perm (w[ 2], w[ 3], selector); + w[21] = hc_byte_perm (w[ 1], w[ 2], selector); + w[20] = hc_byte_perm (w[ 0], w[ 1], selector); + w[19] = hc_byte_perm ( 0, w[ 0], selector); w[18] = 0; w[17] = 0; w[16] = 0; @@ -17396,50 +17396,50 @@ DECLSPEC void switch_buffer_by_offset_1x64_le (u32x *w, const u32 offset) break; case 20: - w[63] = __byte_perm (w[42], w[43], selector); - w[62] = __byte_perm (w[41], w[42], selector); - w[61] = __byte_perm (w[40], w[41], selector); - w[60] = __byte_perm (w[39], w[40], selector); - w[59] = __byte_perm (w[38], w[39], selector); - w[58] = __byte_perm (w[37], w[38], selector); - w[57] = __byte_perm (w[36], w[37], selector); - w[56] = __byte_perm (w[35], w[36], selector); - w[55] = __byte_perm (w[34], w[35], selector); - w[54] = __byte_perm (w[33], w[34], selector); - w[53] = __byte_perm (w[32], w[33], selector); - w[52] = __byte_perm (w[31], w[32], selector); - w[51] = __byte_perm (w[30], w[31], selector); - w[50] = __byte_perm (w[29], w[30], selector); - w[49] = __byte_perm (w[28], w[29], selector); - w[48] = __byte_perm (w[27], w[28], selector); - w[47] = __byte_perm (w[26], w[27], selector); - w[46] = __byte_perm (w[25], w[26], selector); - w[45] = __byte_perm (w[24], w[25], selector); - w[44] = __byte_perm (w[23], w[24], selector); - w[43] = __byte_perm (w[22], w[23], selector); - w[42] = __byte_perm (w[21], w[22], selector); - w[41] = __byte_perm (w[20], w[21], selector); - w[40] = __byte_perm (w[19], w[20], selector); - w[39] = __byte_perm (w[18], w[19], selector); - w[38] = __byte_perm (w[17], w[18], selector); - w[37] = __byte_perm (w[16], w[17], selector); - w[36] = __byte_perm (w[15], w[16], selector); - w[35] = __byte_perm (w[14], w[15], selector); - w[34] = __byte_perm (w[13], w[14], selector); - w[33] = __byte_perm (w[12], w[13], selector); - w[32] = __byte_perm (w[11], w[12], selector); - w[31] = __byte_perm (w[10], w[11], selector); - w[30] = __byte_perm (w[ 9], w[10], selector); - w[29] = __byte_perm (w[ 8], w[ 9], selector); - w[28] = __byte_perm (w[ 7], w[ 8], selector); - w[27] = __byte_perm (w[ 6], w[ 7], selector); - w[26] = __byte_perm (w[ 5], w[ 6], selector); - w[25] = __byte_perm (w[ 4], w[ 5], selector); - w[24] = __byte_perm (w[ 3], w[ 4], selector); - w[23] = __byte_perm (w[ 2], w[ 3], selector); - w[22] = __byte_perm (w[ 1], w[ 2], selector); - w[21] = __byte_perm (w[ 0], w[ 1], selector); - w[20] = __byte_perm ( 0, w[ 0], selector); + w[63] = hc_byte_perm (w[42], w[43], selector); + w[62] = hc_byte_perm (w[41], w[42], selector); + w[61] = hc_byte_perm (w[40], w[41], selector); + w[60] = hc_byte_perm (w[39], w[40], selector); + w[59] = hc_byte_perm (w[38], w[39], selector); + w[58] = hc_byte_perm (w[37], w[38], selector); + w[57] = hc_byte_perm (w[36], w[37], selector); + w[56] = hc_byte_perm (w[35], w[36], selector); + w[55] = hc_byte_perm (w[34], w[35], selector); + w[54] = hc_byte_perm (w[33], w[34], selector); + w[53] = hc_byte_perm (w[32], w[33], selector); + w[52] = hc_byte_perm (w[31], w[32], selector); + w[51] = hc_byte_perm (w[30], w[31], selector); + w[50] = hc_byte_perm (w[29], w[30], selector); + w[49] = hc_byte_perm (w[28], w[29], selector); + w[48] = hc_byte_perm (w[27], w[28], selector); + w[47] = hc_byte_perm (w[26], w[27], selector); + w[46] = hc_byte_perm (w[25], w[26], selector); + w[45] = hc_byte_perm (w[24], w[25], selector); + w[44] = hc_byte_perm (w[23], w[24], selector); + w[43] = hc_byte_perm (w[22], w[23], selector); + w[42] = hc_byte_perm (w[21], w[22], selector); + w[41] = hc_byte_perm (w[20], w[21], selector); + w[40] = hc_byte_perm (w[19], w[20], selector); + w[39] = hc_byte_perm (w[18], w[19], selector); + w[38] = hc_byte_perm (w[17], w[18], selector); + w[37] = hc_byte_perm (w[16], w[17], selector); + w[36] = hc_byte_perm (w[15], w[16], selector); + w[35] = hc_byte_perm (w[14], w[15], selector); + w[34] = hc_byte_perm (w[13], w[14], selector); + w[33] = hc_byte_perm (w[12], w[13], selector); + w[32] = hc_byte_perm (w[11], w[12], selector); + w[31] = hc_byte_perm (w[10], w[11], selector); + w[30] = hc_byte_perm (w[ 9], w[10], selector); + w[29] = hc_byte_perm (w[ 8], w[ 9], selector); + w[28] = hc_byte_perm (w[ 7], w[ 8], selector); + w[27] = hc_byte_perm (w[ 6], w[ 7], selector); + w[26] = hc_byte_perm (w[ 5], w[ 6], selector); + w[25] = hc_byte_perm (w[ 4], w[ 5], selector); + w[24] = hc_byte_perm (w[ 3], w[ 4], selector); + w[23] = hc_byte_perm (w[ 2], w[ 3], selector); + w[22] = hc_byte_perm (w[ 1], w[ 2], selector); + w[21] = hc_byte_perm (w[ 0], w[ 1], selector); + w[20] = hc_byte_perm ( 0, w[ 0], selector); w[19] = 0; w[18] = 0; w[17] = 0; @@ -17464,49 +17464,49 @@ DECLSPEC void switch_buffer_by_offset_1x64_le (u32x *w, const u32 offset) break; case 21: - w[63] = __byte_perm (w[41], w[42], selector); - w[62] = __byte_perm (w[40], w[41], selector); - w[61] = __byte_perm (w[39], w[40], selector); - w[60] = __byte_perm (w[38], w[39], selector); - w[59] = __byte_perm (w[37], w[38], selector); - w[58] = __byte_perm (w[36], w[37], selector); - w[57] = __byte_perm (w[35], w[36], selector); - w[56] = __byte_perm (w[34], w[35], selector); - w[55] = __byte_perm (w[33], w[34], selector); - w[54] = __byte_perm (w[32], w[33], selector); - w[53] = __byte_perm (w[31], w[32], selector); - w[52] = __byte_perm (w[30], w[31], selector); - w[51] = __byte_perm (w[29], w[30], selector); - w[50] = __byte_perm (w[28], w[29], selector); - w[49] = __byte_perm (w[27], w[28], selector); - w[48] = __byte_perm (w[26], w[27], selector); - w[47] = __byte_perm (w[25], w[26], selector); - w[46] = __byte_perm (w[24], w[25], selector); - w[45] = __byte_perm (w[23], w[24], selector); - w[44] = __byte_perm (w[22], w[23], selector); - w[43] = __byte_perm (w[21], w[22], selector); - w[42] = __byte_perm (w[20], w[21], selector); - w[41] = __byte_perm (w[19], w[20], selector); - w[40] = __byte_perm (w[18], w[19], selector); - w[39] = __byte_perm (w[17], w[18], selector); - w[38] = __byte_perm (w[16], w[17], selector); - w[37] = __byte_perm (w[15], w[16], selector); - w[36] = __byte_perm (w[14], w[15], selector); - w[35] = __byte_perm (w[13], w[14], selector); - w[34] = __byte_perm (w[12], w[13], selector); - w[33] = __byte_perm (w[11], w[12], selector); - w[32] = __byte_perm (w[10], w[11], selector); - w[31] = __byte_perm (w[ 9], w[10], selector); - w[30] = __byte_perm (w[ 8], w[ 9], selector); - w[29] = __byte_perm (w[ 7], w[ 8], selector); - w[28] = __byte_perm (w[ 6], w[ 7], selector); - w[27] = __byte_perm (w[ 5], w[ 6], selector); - w[26] = __byte_perm (w[ 4], w[ 5], selector); - w[25] = __byte_perm (w[ 3], w[ 4], selector); - w[24] = __byte_perm (w[ 2], w[ 3], selector); - w[23] = __byte_perm (w[ 1], w[ 2], selector); - w[22] = __byte_perm (w[ 0], w[ 1], selector); - w[21] = __byte_perm ( 0, w[ 0], selector); + w[63] = hc_byte_perm (w[41], w[42], selector); + w[62] = hc_byte_perm (w[40], w[41], selector); + w[61] = hc_byte_perm (w[39], w[40], selector); + w[60] = hc_byte_perm (w[38], w[39], selector); + w[59] = hc_byte_perm (w[37], w[38], selector); + w[58] = hc_byte_perm (w[36], w[37], selector); + w[57] = hc_byte_perm (w[35], w[36], selector); + w[56] = hc_byte_perm (w[34], w[35], selector); + w[55] = hc_byte_perm (w[33], w[34], selector); + w[54] = hc_byte_perm (w[32], w[33], selector); + w[53] = hc_byte_perm (w[31], w[32], selector); + w[52] = hc_byte_perm (w[30], w[31], selector); + w[51] = hc_byte_perm (w[29], w[30], selector); + w[50] = hc_byte_perm (w[28], w[29], selector); + w[49] = hc_byte_perm (w[27], w[28], selector); + w[48] = hc_byte_perm (w[26], w[27], selector); + w[47] = hc_byte_perm (w[25], w[26], selector); + w[46] = hc_byte_perm (w[24], w[25], selector); + w[45] = hc_byte_perm (w[23], w[24], selector); + w[44] = hc_byte_perm (w[22], w[23], selector); + w[43] = hc_byte_perm (w[21], w[22], selector); + w[42] = hc_byte_perm (w[20], w[21], selector); + w[41] = hc_byte_perm (w[19], w[20], selector); + w[40] = hc_byte_perm (w[18], w[19], selector); + w[39] = hc_byte_perm (w[17], w[18], selector); + w[38] = hc_byte_perm (w[16], w[17], selector); + w[37] = hc_byte_perm (w[15], w[16], selector); + w[36] = hc_byte_perm (w[14], w[15], selector); + w[35] = hc_byte_perm (w[13], w[14], selector); + w[34] = hc_byte_perm (w[12], w[13], selector); + w[33] = hc_byte_perm (w[11], w[12], selector); + w[32] = hc_byte_perm (w[10], w[11], selector); + w[31] = hc_byte_perm (w[ 9], w[10], selector); + w[30] = hc_byte_perm (w[ 8], w[ 9], selector); + w[29] = hc_byte_perm (w[ 7], w[ 8], selector); + w[28] = hc_byte_perm (w[ 6], w[ 7], selector); + w[27] = hc_byte_perm (w[ 5], w[ 6], selector); + w[26] = hc_byte_perm (w[ 4], w[ 5], selector); + w[25] = hc_byte_perm (w[ 3], w[ 4], selector); + w[24] = hc_byte_perm (w[ 2], w[ 3], selector); + w[23] = hc_byte_perm (w[ 1], w[ 2], selector); + w[22] = hc_byte_perm (w[ 0], w[ 1], selector); + w[21] = hc_byte_perm ( 0, w[ 0], selector); w[20] = 0; w[19] = 0; w[18] = 0; @@ -17532,48 +17532,48 @@ DECLSPEC void switch_buffer_by_offset_1x64_le (u32x *w, const u32 offset) break; case 22: - w[63] = __byte_perm (w[40], w[41], selector); - w[62] = __byte_perm (w[39], w[40], selector); - w[61] = __byte_perm (w[38], w[39], selector); - w[60] = __byte_perm (w[37], w[38], selector); - w[59] = __byte_perm (w[36], w[37], selector); - w[58] = __byte_perm (w[35], w[36], selector); - w[57] = __byte_perm (w[34], w[35], selector); - w[56] = __byte_perm (w[33], w[34], selector); - w[55] = __byte_perm (w[32], w[33], selector); - w[54] = __byte_perm (w[31], w[32], selector); - w[53] = __byte_perm (w[30], w[31], selector); - w[52] = __byte_perm (w[29], w[30], selector); - w[51] = __byte_perm (w[28], w[29], selector); - w[50] = __byte_perm (w[27], w[28], selector); - w[49] = __byte_perm (w[26], w[27], selector); - w[48] = __byte_perm (w[25], w[26], selector); - w[47] = __byte_perm (w[24], w[25], selector); - w[46] = __byte_perm (w[23], w[24], selector); - w[45] = __byte_perm (w[22], w[23], selector); - w[44] = __byte_perm (w[21], w[22], selector); - w[43] = __byte_perm (w[20], w[21], selector); - w[42] = __byte_perm (w[19], w[20], selector); - w[41] = __byte_perm (w[18], w[19], selector); - w[40] = __byte_perm (w[17], w[18], selector); - w[39] = __byte_perm (w[16], w[17], selector); - w[38] = __byte_perm (w[15], w[16], selector); - w[37] = __byte_perm (w[14], w[15], selector); - w[36] = __byte_perm (w[13], w[14], selector); - w[35] = __byte_perm (w[12], w[13], selector); - w[34] = __byte_perm (w[11], w[12], selector); - w[33] = __byte_perm (w[10], w[11], selector); - w[32] = __byte_perm (w[ 9], w[10], selector); - w[31] = __byte_perm (w[ 8], w[ 9], selector); - w[30] = __byte_perm (w[ 7], w[ 8], selector); - w[29] = __byte_perm (w[ 6], w[ 7], selector); - w[28] = __byte_perm (w[ 5], w[ 6], selector); - w[27] = __byte_perm (w[ 4], w[ 5], selector); - w[26] = __byte_perm (w[ 3], w[ 4], selector); - w[25] = __byte_perm (w[ 2], w[ 3], selector); - w[24] = __byte_perm (w[ 1], w[ 2], selector); - w[23] = __byte_perm (w[ 0], w[ 1], selector); - w[22] = __byte_perm ( 0, w[ 0], selector); + w[63] = hc_byte_perm (w[40], w[41], selector); + w[62] = hc_byte_perm (w[39], w[40], selector); + w[61] = hc_byte_perm (w[38], w[39], selector); + w[60] = hc_byte_perm (w[37], w[38], selector); + w[59] = hc_byte_perm (w[36], w[37], selector); + w[58] = hc_byte_perm (w[35], w[36], selector); + w[57] = hc_byte_perm (w[34], w[35], selector); + w[56] = hc_byte_perm (w[33], w[34], selector); + w[55] = hc_byte_perm (w[32], w[33], selector); + w[54] = hc_byte_perm (w[31], w[32], selector); + w[53] = hc_byte_perm (w[30], w[31], selector); + w[52] = hc_byte_perm (w[29], w[30], selector); + w[51] = hc_byte_perm (w[28], w[29], selector); + w[50] = hc_byte_perm (w[27], w[28], selector); + w[49] = hc_byte_perm (w[26], w[27], selector); + w[48] = hc_byte_perm (w[25], w[26], selector); + w[47] = hc_byte_perm (w[24], w[25], selector); + w[46] = hc_byte_perm (w[23], w[24], selector); + w[45] = hc_byte_perm (w[22], w[23], selector); + w[44] = hc_byte_perm (w[21], w[22], selector); + w[43] = hc_byte_perm (w[20], w[21], selector); + w[42] = hc_byte_perm (w[19], w[20], selector); + w[41] = hc_byte_perm (w[18], w[19], selector); + w[40] = hc_byte_perm (w[17], w[18], selector); + w[39] = hc_byte_perm (w[16], w[17], selector); + w[38] = hc_byte_perm (w[15], w[16], selector); + w[37] = hc_byte_perm (w[14], w[15], selector); + w[36] = hc_byte_perm (w[13], w[14], selector); + w[35] = hc_byte_perm (w[12], w[13], selector); + w[34] = hc_byte_perm (w[11], w[12], selector); + w[33] = hc_byte_perm (w[10], w[11], selector); + w[32] = hc_byte_perm (w[ 9], w[10], selector); + w[31] = hc_byte_perm (w[ 8], w[ 9], selector); + w[30] = hc_byte_perm (w[ 7], w[ 8], selector); + w[29] = hc_byte_perm (w[ 6], w[ 7], selector); + w[28] = hc_byte_perm (w[ 5], w[ 6], selector); + w[27] = hc_byte_perm (w[ 4], w[ 5], selector); + w[26] = hc_byte_perm (w[ 3], w[ 4], selector); + w[25] = hc_byte_perm (w[ 2], w[ 3], selector); + w[24] = hc_byte_perm (w[ 1], w[ 2], selector); + w[23] = hc_byte_perm (w[ 0], w[ 1], selector); + w[22] = hc_byte_perm ( 0, w[ 0], selector); w[21] = 0; w[20] = 0; w[19] = 0; @@ -17600,47 +17600,47 @@ DECLSPEC void switch_buffer_by_offset_1x64_le (u32x *w, const u32 offset) break; case 23: - w[63] = __byte_perm (w[39], w[40], selector); - w[62] = __byte_perm (w[38], w[39], selector); - w[61] = __byte_perm (w[37], w[38], selector); - w[60] = __byte_perm (w[36], w[37], selector); - w[59] = __byte_perm (w[35], w[36], selector); - w[58] = __byte_perm (w[34], w[35], selector); - w[57] = __byte_perm (w[33], w[34], selector); - w[56] = __byte_perm (w[32], w[33], selector); - w[55] = __byte_perm (w[31], w[32], selector); - w[54] = __byte_perm (w[30], w[31], selector); - w[53] = __byte_perm (w[29], w[30], selector); - w[52] = __byte_perm (w[28], w[29], selector); - w[51] = __byte_perm (w[27], w[28], selector); - w[50] = __byte_perm (w[26], w[27], selector); - w[49] = __byte_perm (w[25], w[26], selector); - w[48] = __byte_perm (w[24], w[25], selector); - w[47] = __byte_perm (w[23], w[24], selector); - w[46] = __byte_perm (w[22], w[23], selector); - w[45] = __byte_perm (w[21], w[22], selector); - w[44] = __byte_perm (w[20], w[21], selector); - w[43] = __byte_perm (w[19], w[20], selector); - w[42] = __byte_perm (w[18], w[19], selector); - w[41] = __byte_perm (w[17], w[18], selector); - w[40] = __byte_perm (w[16], w[17], selector); - w[39] = __byte_perm (w[15], w[16], selector); - w[38] = __byte_perm (w[14], w[15], selector); - w[37] = __byte_perm (w[13], w[14], selector); - w[36] = __byte_perm (w[12], w[13], selector); - w[35] = __byte_perm (w[11], w[12], selector); - w[34] = __byte_perm (w[10], w[11], selector); - w[33] = __byte_perm (w[ 9], w[10], selector); - w[32] = __byte_perm (w[ 8], w[ 9], selector); - w[31] = __byte_perm (w[ 7], w[ 8], selector); - w[30] = __byte_perm (w[ 6], w[ 7], selector); - w[29] = __byte_perm (w[ 5], w[ 6], selector); - w[28] = __byte_perm (w[ 4], w[ 5], selector); - w[27] = __byte_perm (w[ 3], w[ 4], selector); - w[26] = __byte_perm (w[ 2], w[ 3], selector); - w[25] = __byte_perm (w[ 1], w[ 2], selector); - w[24] = __byte_perm (w[ 0], w[ 1], selector); - w[23] = __byte_perm ( 0, w[ 0], selector); + w[63] = hc_byte_perm (w[39], w[40], selector); + w[62] = hc_byte_perm (w[38], w[39], selector); + w[61] = hc_byte_perm (w[37], w[38], selector); + w[60] = hc_byte_perm (w[36], w[37], selector); + w[59] = hc_byte_perm (w[35], w[36], selector); + w[58] = hc_byte_perm (w[34], w[35], selector); + w[57] = hc_byte_perm (w[33], w[34], selector); + w[56] = hc_byte_perm (w[32], w[33], selector); + w[55] = hc_byte_perm (w[31], w[32], selector); + w[54] = hc_byte_perm (w[30], w[31], selector); + w[53] = hc_byte_perm (w[29], w[30], selector); + w[52] = hc_byte_perm (w[28], w[29], selector); + w[51] = hc_byte_perm (w[27], w[28], selector); + w[50] = hc_byte_perm (w[26], w[27], selector); + w[49] = hc_byte_perm (w[25], w[26], selector); + w[48] = hc_byte_perm (w[24], w[25], selector); + w[47] = hc_byte_perm (w[23], w[24], selector); + w[46] = hc_byte_perm (w[22], w[23], selector); + w[45] = hc_byte_perm (w[21], w[22], selector); + w[44] = hc_byte_perm (w[20], w[21], selector); + w[43] = hc_byte_perm (w[19], w[20], selector); + w[42] = hc_byte_perm (w[18], w[19], selector); + w[41] = hc_byte_perm (w[17], w[18], selector); + w[40] = hc_byte_perm (w[16], w[17], selector); + w[39] = hc_byte_perm (w[15], w[16], selector); + w[38] = hc_byte_perm (w[14], w[15], selector); + w[37] = hc_byte_perm (w[13], w[14], selector); + w[36] = hc_byte_perm (w[12], w[13], selector); + w[35] = hc_byte_perm (w[11], w[12], selector); + w[34] = hc_byte_perm (w[10], w[11], selector); + w[33] = hc_byte_perm (w[ 9], w[10], selector); + w[32] = hc_byte_perm (w[ 8], w[ 9], selector); + w[31] = hc_byte_perm (w[ 7], w[ 8], selector); + w[30] = hc_byte_perm (w[ 6], w[ 7], selector); + w[29] = hc_byte_perm (w[ 5], w[ 6], selector); + w[28] = hc_byte_perm (w[ 4], w[ 5], selector); + w[27] = hc_byte_perm (w[ 3], w[ 4], selector); + w[26] = hc_byte_perm (w[ 2], w[ 3], selector); + w[25] = hc_byte_perm (w[ 1], w[ 2], selector); + w[24] = hc_byte_perm (w[ 0], w[ 1], selector); + w[23] = hc_byte_perm ( 0, w[ 0], selector); w[22] = 0; w[21] = 0; w[20] = 0; @@ -17668,46 +17668,46 @@ DECLSPEC void switch_buffer_by_offset_1x64_le (u32x *w, const u32 offset) break; case 24: - w[63] = __byte_perm (w[38], w[39], selector); - w[62] = __byte_perm (w[37], w[38], selector); - w[61] = __byte_perm (w[36], w[37], selector); - w[60] = __byte_perm (w[35], w[36], selector); - w[59] = __byte_perm (w[34], w[35], selector); - w[58] = __byte_perm (w[33], w[34], selector); - w[57] = __byte_perm (w[32], w[33], selector); - w[56] = __byte_perm (w[31], w[32], selector); - w[55] = __byte_perm (w[30], w[31], selector); - w[54] = __byte_perm (w[29], w[30], selector); - w[53] = __byte_perm (w[28], w[29], selector); - w[52] = __byte_perm (w[27], w[28], selector); - w[51] = __byte_perm (w[26], w[27], selector); - w[50] = __byte_perm (w[25], w[26], selector); - w[49] = __byte_perm (w[24], w[25], selector); - w[48] = __byte_perm (w[23], w[24], selector); - w[47] = __byte_perm (w[22], w[23], selector); - w[46] = __byte_perm (w[21], w[22], selector); - w[45] = __byte_perm (w[20], w[21], selector); - w[44] = __byte_perm (w[19], w[20], selector); - w[43] = __byte_perm (w[18], w[19], selector); - w[42] = __byte_perm (w[17], w[18], selector); - w[41] = __byte_perm (w[16], w[17], selector); - w[40] = __byte_perm (w[15], w[16], selector); - w[39] = __byte_perm (w[14], w[15], selector); - w[38] = __byte_perm (w[13], w[14], selector); - w[37] = __byte_perm (w[12], w[13], selector); - w[36] = __byte_perm (w[11], w[12], selector); - w[35] = __byte_perm (w[10], w[11], selector); - w[34] = __byte_perm (w[ 9], w[10], selector); - w[33] = __byte_perm (w[ 8], w[ 9], selector); - w[32] = __byte_perm (w[ 7], w[ 8], selector); - w[31] = __byte_perm (w[ 6], w[ 7], selector); - w[30] = __byte_perm (w[ 5], w[ 6], selector); - w[29] = __byte_perm (w[ 4], w[ 5], selector); - w[28] = __byte_perm (w[ 3], w[ 4], selector); - w[27] = __byte_perm (w[ 2], w[ 3], selector); - w[26] = __byte_perm (w[ 1], w[ 2], selector); - w[25] = __byte_perm (w[ 0], w[ 1], selector); - w[24] = __byte_perm ( 0, w[ 0], selector); + w[63] = hc_byte_perm (w[38], w[39], selector); + w[62] = hc_byte_perm (w[37], w[38], selector); + w[61] = hc_byte_perm (w[36], w[37], selector); + w[60] = hc_byte_perm (w[35], w[36], selector); + w[59] = hc_byte_perm (w[34], w[35], selector); + w[58] = hc_byte_perm (w[33], w[34], selector); + w[57] = hc_byte_perm (w[32], w[33], selector); + w[56] = hc_byte_perm (w[31], w[32], selector); + w[55] = hc_byte_perm (w[30], w[31], selector); + w[54] = hc_byte_perm (w[29], w[30], selector); + w[53] = hc_byte_perm (w[28], w[29], selector); + w[52] = hc_byte_perm (w[27], w[28], selector); + w[51] = hc_byte_perm (w[26], w[27], selector); + w[50] = hc_byte_perm (w[25], w[26], selector); + w[49] = hc_byte_perm (w[24], w[25], selector); + w[48] = hc_byte_perm (w[23], w[24], selector); + w[47] = hc_byte_perm (w[22], w[23], selector); + w[46] = hc_byte_perm (w[21], w[22], selector); + w[45] = hc_byte_perm (w[20], w[21], selector); + w[44] = hc_byte_perm (w[19], w[20], selector); + w[43] = hc_byte_perm (w[18], w[19], selector); + w[42] = hc_byte_perm (w[17], w[18], selector); + w[41] = hc_byte_perm (w[16], w[17], selector); + w[40] = hc_byte_perm (w[15], w[16], selector); + w[39] = hc_byte_perm (w[14], w[15], selector); + w[38] = hc_byte_perm (w[13], w[14], selector); + w[37] = hc_byte_perm (w[12], w[13], selector); + w[36] = hc_byte_perm (w[11], w[12], selector); + w[35] = hc_byte_perm (w[10], w[11], selector); + w[34] = hc_byte_perm (w[ 9], w[10], selector); + w[33] = hc_byte_perm (w[ 8], w[ 9], selector); + w[32] = hc_byte_perm (w[ 7], w[ 8], selector); + w[31] = hc_byte_perm (w[ 6], w[ 7], selector); + w[30] = hc_byte_perm (w[ 5], w[ 6], selector); + w[29] = hc_byte_perm (w[ 4], w[ 5], selector); + w[28] = hc_byte_perm (w[ 3], w[ 4], selector); + w[27] = hc_byte_perm (w[ 2], w[ 3], selector); + w[26] = hc_byte_perm (w[ 1], w[ 2], selector); + w[25] = hc_byte_perm (w[ 0], w[ 1], selector); + w[24] = hc_byte_perm ( 0, w[ 0], selector); w[23] = 0; w[22] = 0; w[21] = 0; @@ -17736,45 +17736,45 @@ DECLSPEC void switch_buffer_by_offset_1x64_le (u32x *w, const u32 offset) break; case 25: - w[63] = __byte_perm (w[37], w[38], selector); - w[62] = __byte_perm (w[36], w[37], selector); - w[61] = __byte_perm (w[35], w[36], selector); - w[60] = __byte_perm (w[34], w[35], selector); - w[59] = __byte_perm (w[33], w[34], selector); - w[58] = __byte_perm (w[32], w[33], selector); - w[57] = __byte_perm (w[31], w[32], selector); - w[56] = __byte_perm (w[30], w[31], selector); - w[55] = __byte_perm (w[29], w[30], selector); - w[54] = __byte_perm (w[28], w[29], selector); - w[53] = __byte_perm (w[27], w[28], selector); - w[52] = __byte_perm (w[26], w[27], selector); - w[51] = __byte_perm (w[25], w[26], selector); - w[50] = __byte_perm (w[24], w[25], selector); - w[49] = __byte_perm (w[23], w[24], selector); - w[48] = __byte_perm (w[22], w[23], selector); - w[47] = __byte_perm (w[21], w[22], selector); - w[46] = __byte_perm (w[20], w[21], selector); - w[45] = __byte_perm (w[19], w[20], selector); - w[44] = __byte_perm (w[18], w[19], selector); - w[43] = __byte_perm (w[17], w[18], selector); - w[42] = __byte_perm (w[16], w[17], selector); - w[41] = __byte_perm (w[15], w[16], selector); - w[40] = __byte_perm (w[14], w[15], selector); - w[39] = __byte_perm (w[13], w[14], selector); - w[38] = __byte_perm (w[12], w[13], selector); - w[37] = __byte_perm (w[11], w[12], selector); - w[36] = __byte_perm (w[10], w[11], selector); - w[35] = __byte_perm (w[ 9], w[10], selector); - w[34] = __byte_perm (w[ 8], w[ 9], selector); - w[33] = __byte_perm (w[ 7], w[ 8], selector); - w[32] = __byte_perm (w[ 6], w[ 7], selector); - w[31] = __byte_perm (w[ 5], w[ 6], selector); - w[30] = __byte_perm (w[ 4], w[ 5], selector); - w[29] = __byte_perm (w[ 3], w[ 4], selector); - w[28] = __byte_perm (w[ 2], w[ 3], selector); - w[27] = __byte_perm (w[ 1], w[ 2], selector); - w[26] = __byte_perm (w[ 0], w[ 1], selector); - w[25] = __byte_perm ( 0, w[ 0], selector); + w[63] = hc_byte_perm (w[37], w[38], selector); + w[62] = hc_byte_perm (w[36], w[37], selector); + w[61] = hc_byte_perm (w[35], w[36], selector); + w[60] = hc_byte_perm (w[34], w[35], selector); + w[59] = hc_byte_perm (w[33], w[34], selector); + w[58] = hc_byte_perm (w[32], w[33], selector); + w[57] = hc_byte_perm (w[31], w[32], selector); + w[56] = hc_byte_perm (w[30], w[31], selector); + w[55] = hc_byte_perm (w[29], w[30], selector); + w[54] = hc_byte_perm (w[28], w[29], selector); + w[53] = hc_byte_perm (w[27], w[28], selector); + w[52] = hc_byte_perm (w[26], w[27], selector); + w[51] = hc_byte_perm (w[25], w[26], selector); + w[50] = hc_byte_perm (w[24], w[25], selector); + w[49] = hc_byte_perm (w[23], w[24], selector); + w[48] = hc_byte_perm (w[22], w[23], selector); + w[47] = hc_byte_perm (w[21], w[22], selector); + w[46] = hc_byte_perm (w[20], w[21], selector); + w[45] = hc_byte_perm (w[19], w[20], selector); + w[44] = hc_byte_perm (w[18], w[19], selector); + w[43] = hc_byte_perm (w[17], w[18], selector); + w[42] = hc_byte_perm (w[16], w[17], selector); + w[41] = hc_byte_perm (w[15], w[16], selector); + w[40] = hc_byte_perm (w[14], w[15], selector); + w[39] = hc_byte_perm (w[13], w[14], selector); + w[38] = hc_byte_perm (w[12], w[13], selector); + w[37] = hc_byte_perm (w[11], w[12], selector); + w[36] = hc_byte_perm (w[10], w[11], selector); + w[35] = hc_byte_perm (w[ 9], w[10], selector); + w[34] = hc_byte_perm (w[ 8], w[ 9], selector); + w[33] = hc_byte_perm (w[ 7], w[ 8], selector); + w[32] = hc_byte_perm (w[ 6], w[ 7], selector); + w[31] = hc_byte_perm (w[ 5], w[ 6], selector); + w[30] = hc_byte_perm (w[ 4], w[ 5], selector); + w[29] = hc_byte_perm (w[ 3], w[ 4], selector); + w[28] = hc_byte_perm (w[ 2], w[ 3], selector); + w[27] = hc_byte_perm (w[ 1], w[ 2], selector); + w[26] = hc_byte_perm (w[ 0], w[ 1], selector); + w[25] = hc_byte_perm ( 0, w[ 0], selector); w[24] = 0; w[23] = 0; w[22] = 0; @@ -17804,44 +17804,44 @@ DECLSPEC void switch_buffer_by_offset_1x64_le (u32x *w, const u32 offset) break; case 26: - w[63] = __byte_perm (w[36], w[37], selector); - w[62] = __byte_perm (w[35], w[36], selector); - w[61] = __byte_perm (w[34], w[35], selector); - w[60] = __byte_perm (w[33], w[34], selector); - w[59] = __byte_perm (w[32], w[33], selector); - w[58] = __byte_perm (w[31], w[32], selector); - w[57] = __byte_perm (w[30], w[31], selector); - w[56] = __byte_perm (w[29], w[30], selector); - w[55] = __byte_perm (w[28], w[29], selector); - w[54] = __byte_perm (w[27], w[28], selector); - w[53] = __byte_perm (w[26], w[27], selector); - w[52] = __byte_perm (w[25], w[26], selector); - w[51] = __byte_perm (w[24], w[25], selector); - w[50] = __byte_perm (w[23], w[24], selector); - w[49] = __byte_perm (w[22], w[23], selector); - w[48] = __byte_perm (w[21], w[22], selector); - w[47] = __byte_perm (w[20], w[21], selector); - w[46] = __byte_perm (w[19], w[20], selector); - w[45] = __byte_perm (w[18], w[19], selector); - w[44] = __byte_perm (w[17], w[18], selector); - w[43] = __byte_perm (w[16], w[17], selector); - w[42] = __byte_perm (w[15], w[16], selector); - w[41] = __byte_perm (w[14], w[15], selector); - w[40] = __byte_perm (w[13], w[14], selector); - w[39] = __byte_perm (w[12], w[13], selector); - w[38] = __byte_perm (w[11], w[12], selector); - w[37] = __byte_perm (w[10], w[11], selector); - w[36] = __byte_perm (w[ 9], w[10], selector); - w[35] = __byte_perm (w[ 8], w[ 9], selector); - w[34] = __byte_perm (w[ 7], w[ 8], selector); - w[33] = __byte_perm (w[ 6], w[ 7], selector); - w[32] = __byte_perm (w[ 5], w[ 6], selector); - w[31] = __byte_perm (w[ 4], w[ 5], selector); - w[30] = __byte_perm (w[ 3], w[ 4], selector); - w[29] = __byte_perm (w[ 2], w[ 3], selector); - w[28] = __byte_perm (w[ 1], w[ 2], selector); - w[27] = __byte_perm (w[ 0], w[ 1], selector); - w[26] = __byte_perm ( 0, w[ 0], selector); + w[63] = hc_byte_perm (w[36], w[37], selector); + w[62] = hc_byte_perm (w[35], w[36], selector); + w[61] = hc_byte_perm (w[34], w[35], selector); + w[60] = hc_byte_perm (w[33], w[34], selector); + w[59] = hc_byte_perm (w[32], w[33], selector); + w[58] = hc_byte_perm (w[31], w[32], selector); + w[57] = hc_byte_perm (w[30], w[31], selector); + w[56] = hc_byte_perm (w[29], w[30], selector); + w[55] = hc_byte_perm (w[28], w[29], selector); + w[54] = hc_byte_perm (w[27], w[28], selector); + w[53] = hc_byte_perm (w[26], w[27], selector); + w[52] = hc_byte_perm (w[25], w[26], selector); + w[51] = hc_byte_perm (w[24], w[25], selector); + w[50] = hc_byte_perm (w[23], w[24], selector); + w[49] = hc_byte_perm (w[22], w[23], selector); + w[48] = hc_byte_perm (w[21], w[22], selector); + w[47] = hc_byte_perm (w[20], w[21], selector); + w[46] = hc_byte_perm (w[19], w[20], selector); + w[45] = hc_byte_perm (w[18], w[19], selector); + w[44] = hc_byte_perm (w[17], w[18], selector); + w[43] = hc_byte_perm (w[16], w[17], selector); + w[42] = hc_byte_perm (w[15], w[16], selector); + w[41] = hc_byte_perm (w[14], w[15], selector); + w[40] = hc_byte_perm (w[13], w[14], selector); + w[39] = hc_byte_perm (w[12], w[13], selector); + w[38] = hc_byte_perm (w[11], w[12], selector); + w[37] = hc_byte_perm (w[10], w[11], selector); + w[36] = hc_byte_perm (w[ 9], w[10], selector); + w[35] = hc_byte_perm (w[ 8], w[ 9], selector); + w[34] = hc_byte_perm (w[ 7], w[ 8], selector); + w[33] = hc_byte_perm (w[ 6], w[ 7], selector); + w[32] = hc_byte_perm (w[ 5], w[ 6], selector); + w[31] = hc_byte_perm (w[ 4], w[ 5], selector); + w[30] = hc_byte_perm (w[ 3], w[ 4], selector); + w[29] = hc_byte_perm (w[ 2], w[ 3], selector); + w[28] = hc_byte_perm (w[ 1], w[ 2], selector); + w[27] = hc_byte_perm (w[ 0], w[ 1], selector); + w[26] = hc_byte_perm ( 0, w[ 0], selector); w[25] = 0; w[24] = 0; w[23] = 0; @@ -17872,43 +17872,43 @@ DECLSPEC void switch_buffer_by_offset_1x64_le (u32x *w, const u32 offset) break; case 27: - w[63] = __byte_perm (w[35], w[36], selector); - w[62] = __byte_perm (w[34], w[35], selector); - w[61] = __byte_perm (w[33], w[34], selector); - w[60] = __byte_perm (w[32], w[33], selector); - w[59] = __byte_perm (w[31], w[32], selector); - w[58] = __byte_perm (w[30], w[31], selector); - w[57] = __byte_perm (w[29], w[30], selector); - w[56] = __byte_perm (w[28], w[29], selector); - w[55] = __byte_perm (w[27], w[28], selector); - w[54] = __byte_perm (w[26], w[27], selector); - w[53] = __byte_perm (w[25], w[26], selector); - w[52] = __byte_perm (w[24], w[25], selector); - w[51] = __byte_perm (w[23], w[24], selector); - w[50] = __byte_perm (w[22], w[23], selector); - w[49] = __byte_perm (w[21], w[22], selector); - w[48] = __byte_perm (w[20], w[21], selector); - w[47] = __byte_perm (w[19], w[20], selector); - w[46] = __byte_perm (w[18], w[19], selector); - w[45] = __byte_perm (w[17], w[18], selector); - w[44] = __byte_perm (w[16], w[17], selector); - w[43] = __byte_perm (w[15], w[16], selector); - w[42] = __byte_perm (w[14], w[15], selector); - w[41] = __byte_perm (w[13], w[14], selector); - w[40] = __byte_perm (w[12], w[13], selector); - w[39] = __byte_perm (w[11], w[12], selector); - w[38] = __byte_perm (w[10], w[11], selector); - w[37] = __byte_perm (w[ 9], w[10], selector); - w[36] = __byte_perm (w[ 8], w[ 9], selector); - w[35] = __byte_perm (w[ 7], w[ 8], selector); - w[34] = __byte_perm (w[ 6], w[ 7], selector); - w[33] = __byte_perm (w[ 5], w[ 6], selector); - w[32] = __byte_perm (w[ 4], w[ 5], selector); - w[31] = __byte_perm (w[ 3], w[ 4], selector); - w[30] = __byte_perm (w[ 2], w[ 3], selector); - w[29] = __byte_perm (w[ 1], w[ 2], selector); - w[28] = __byte_perm (w[ 0], w[ 1], selector); - w[27] = __byte_perm ( 0, w[ 0], selector); + w[63] = hc_byte_perm (w[35], w[36], selector); + w[62] = hc_byte_perm (w[34], w[35], selector); + w[61] = hc_byte_perm (w[33], w[34], selector); + w[60] = hc_byte_perm (w[32], w[33], selector); + w[59] = hc_byte_perm (w[31], w[32], selector); + w[58] = hc_byte_perm (w[30], w[31], selector); + w[57] = hc_byte_perm (w[29], w[30], selector); + w[56] = hc_byte_perm (w[28], w[29], selector); + w[55] = hc_byte_perm (w[27], w[28], selector); + w[54] = hc_byte_perm (w[26], w[27], selector); + w[53] = hc_byte_perm (w[25], w[26], selector); + w[52] = hc_byte_perm (w[24], w[25], selector); + w[51] = hc_byte_perm (w[23], w[24], selector); + w[50] = hc_byte_perm (w[22], w[23], selector); + w[49] = hc_byte_perm (w[21], w[22], selector); + w[48] = hc_byte_perm (w[20], w[21], selector); + w[47] = hc_byte_perm (w[19], w[20], selector); + w[46] = hc_byte_perm (w[18], w[19], selector); + w[45] = hc_byte_perm (w[17], w[18], selector); + w[44] = hc_byte_perm (w[16], w[17], selector); + w[43] = hc_byte_perm (w[15], w[16], selector); + w[42] = hc_byte_perm (w[14], w[15], selector); + w[41] = hc_byte_perm (w[13], w[14], selector); + w[40] = hc_byte_perm (w[12], w[13], selector); + w[39] = hc_byte_perm (w[11], w[12], selector); + w[38] = hc_byte_perm (w[10], w[11], selector); + w[37] = hc_byte_perm (w[ 9], w[10], selector); + w[36] = hc_byte_perm (w[ 8], w[ 9], selector); + w[35] = hc_byte_perm (w[ 7], w[ 8], selector); + w[34] = hc_byte_perm (w[ 6], w[ 7], selector); + w[33] = hc_byte_perm (w[ 5], w[ 6], selector); + w[32] = hc_byte_perm (w[ 4], w[ 5], selector); + w[31] = hc_byte_perm (w[ 3], w[ 4], selector); + w[30] = hc_byte_perm (w[ 2], w[ 3], selector); + w[29] = hc_byte_perm (w[ 1], w[ 2], selector); + w[28] = hc_byte_perm (w[ 0], w[ 1], selector); + w[27] = hc_byte_perm ( 0, w[ 0], selector); w[26] = 0; w[25] = 0; w[24] = 0; @@ -17940,42 +17940,42 @@ DECLSPEC void switch_buffer_by_offset_1x64_le (u32x *w, const u32 offset) break; case 28: - w[63] = __byte_perm (w[34], w[35], selector); - w[62] = __byte_perm (w[33], w[34], selector); - w[61] = __byte_perm (w[32], w[33], selector); - w[60] = __byte_perm (w[31], w[32], selector); - w[59] = __byte_perm (w[30], w[31], selector); - w[58] = __byte_perm (w[29], w[30], selector); - w[57] = __byte_perm (w[28], w[29], selector); - w[56] = __byte_perm (w[27], w[28], selector); - w[55] = __byte_perm (w[26], w[27], selector); - w[54] = __byte_perm (w[25], w[26], selector); - w[53] = __byte_perm (w[24], w[25], selector); - w[52] = __byte_perm (w[23], w[24], selector); - w[51] = __byte_perm (w[22], w[23], selector); - w[50] = __byte_perm (w[21], w[22], selector); - w[49] = __byte_perm (w[20], w[21], selector); - w[48] = __byte_perm (w[19], w[20], selector); - w[47] = __byte_perm (w[18], w[19], selector); - w[46] = __byte_perm (w[17], w[18], selector); - w[45] = __byte_perm (w[16], w[17], selector); - w[44] = __byte_perm (w[15], w[16], selector); - w[43] = __byte_perm (w[14], w[15], selector); - w[42] = __byte_perm (w[13], w[14], selector); - w[41] = __byte_perm (w[12], w[13], selector); - w[40] = __byte_perm (w[11], w[12], selector); - w[39] = __byte_perm (w[10], w[11], selector); - w[38] = __byte_perm (w[ 9], w[10], selector); - w[37] = __byte_perm (w[ 8], w[ 9], selector); - w[36] = __byte_perm (w[ 7], w[ 8], selector); - w[35] = __byte_perm (w[ 6], w[ 7], selector); - w[34] = __byte_perm (w[ 5], w[ 6], selector); - w[33] = __byte_perm (w[ 4], w[ 5], selector); - w[32] = __byte_perm (w[ 3], w[ 4], selector); - w[31] = __byte_perm (w[ 2], w[ 3], selector); - w[30] = __byte_perm (w[ 1], w[ 2], selector); - w[29] = __byte_perm (w[ 0], w[ 1], selector); - w[28] = __byte_perm ( 0, w[ 0], selector); + w[63] = hc_byte_perm (w[34], w[35], selector); + w[62] = hc_byte_perm (w[33], w[34], selector); + w[61] = hc_byte_perm (w[32], w[33], selector); + w[60] = hc_byte_perm (w[31], w[32], selector); + w[59] = hc_byte_perm (w[30], w[31], selector); + w[58] = hc_byte_perm (w[29], w[30], selector); + w[57] = hc_byte_perm (w[28], w[29], selector); + w[56] = hc_byte_perm (w[27], w[28], selector); + w[55] = hc_byte_perm (w[26], w[27], selector); + w[54] = hc_byte_perm (w[25], w[26], selector); + w[53] = hc_byte_perm (w[24], w[25], selector); + w[52] = hc_byte_perm (w[23], w[24], selector); + w[51] = hc_byte_perm (w[22], w[23], selector); + w[50] = hc_byte_perm (w[21], w[22], selector); + w[49] = hc_byte_perm (w[20], w[21], selector); + w[48] = hc_byte_perm (w[19], w[20], selector); + w[47] = hc_byte_perm (w[18], w[19], selector); + w[46] = hc_byte_perm (w[17], w[18], selector); + w[45] = hc_byte_perm (w[16], w[17], selector); + w[44] = hc_byte_perm (w[15], w[16], selector); + w[43] = hc_byte_perm (w[14], w[15], selector); + w[42] = hc_byte_perm (w[13], w[14], selector); + w[41] = hc_byte_perm (w[12], w[13], selector); + w[40] = hc_byte_perm (w[11], w[12], selector); + w[39] = hc_byte_perm (w[10], w[11], selector); + w[38] = hc_byte_perm (w[ 9], w[10], selector); + w[37] = hc_byte_perm (w[ 8], w[ 9], selector); + w[36] = hc_byte_perm (w[ 7], w[ 8], selector); + w[35] = hc_byte_perm (w[ 6], w[ 7], selector); + w[34] = hc_byte_perm (w[ 5], w[ 6], selector); + w[33] = hc_byte_perm (w[ 4], w[ 5], selector); + w[32] = hc_byte_perm (w[ 3], w[ 4], selector); + w[31] = hc_byte_perm (w[ 2], w[ 3], selector); + w[30] = hc_byte_perm (w[ 1], w[ 2], selector); + w[29] = hc_byte_perm (w[ 0], w[ 1], selector); + w[28] = hc_byte_perm ( 0, w[ 0], selector); w[27] = 0; w[26] = 0; w[25] = 0; @@ -18008,41 +18008,41 @@ DECLSPEC void switch_buffer_by_offset_1x64_le (u32x *w, const u32 offset) break; case 29: - w[63] = __byte_perm (w[33], w[34], selector); - w[62] = __byte_perm (w[32], w[33], selector); - w[61] = __byte_perm (w[31], w[32], selector); - w[60] = __byte_perm (w[30], w[31], selector); - w[59] = __byte_perm (w[29], w[30], selector); - w[58] = __byte_perm (w[28], w[29], selector); - w[57] = __byte_perm (w[27], w[28], selector); - w[56] = __byte_perm (w[26], w[27], selector); - w[55] = __byte_perm (w[25], w[26], selector); - w[54] = __byte_perm (w[24], w[25], selector); - w[53] = __byte_perm (w[23], w[24], selector); - w[52] = __byte_perm (w[22], w[23], selector); - w[51] = __byte_perm (w[21], w[22], selector); - w[50] = __byte_perm (w[20], w[21], selector); - w[49] = __byte_perm (w[19], w[20], selector); - w[48] = __byte_perm (w[18], w[19], selector); - w[47] = __byte_perm (w[17], w[18], selector); - w[46] = __byte_perm (w[16], w[17], selector); - w[45] = __byte_perm (w[15], w[16], selector); - w[44] = __byte_perm (w[14], w[15], selector); - w[43] = __byte_perm (w[13], w[14], selector); - w[42] = __byte_perm (w[12], w[13], selector); - w[41] = __byte_perm (w[11], w[12], selector); - w[40] = __byte_perm (w[10], w[11], selector); - w[39] = __byte_perm (w[ 9], w[10], selector); - w[38] = __byte_perm (w[ 8], w[ 9], selector); - w[37] = __byte_perm (w[ 7], w[ 8], selector); - w[36] = __byte_perm (w[ 6], w[ 7], selector); - w[35] = __byte_perm (w[ 5], w[ 6], selector); - w[34] = __byte_perm (w[ 4], w[ 5], selector); - w[33] = __byte_perm (w[ 3], w[ 4], selector); - w[32] = __byte_perm (w[ 2], w[ 3], selector); - w[31] = __byte_perm (w[ 1], w[ 2], selector); - w[30] = __byte_perm (w[ 0], w[ 1], selector); - w[29] = __byte_perm ( 0, w[ 0], selector); + w[63] = hc_byte_perm (w[33], w[34], selector); + w[62] = hc_byte_perm (w[32], w[33], selector); + w[61] = hc_byte_perm (w[31], w[32], selector); + w[60] = hc_byte_perm (w[30], w[31], selector); + w[59] = hc_byte_perm (w[29], w[30], selector); + w[58] = hc_byte_perm (w[28], w[29], selector); + w[57] = hc_byte_perm (w[27], w[28], selector); + w[56] = hc_byte_perm (w[26], w[27], selector); + w[55] = hc_byte_perm (w[25], w[26], selector); + w[54] = hc_byte_perm (w[24], w[25], selector); + w[53] = hc_byte_perm (w[23], w[24], selector); + w[52] = hc_byte_perm (w[22], w[23], selector); + w[51] = hc_byte_perm (w[21], w[22], selector); + w[50] = hc_byte_perm (w[20], w[21], selector); + w[49] = hc_byte_perm (w[19], w[20], selector); + w[48] = hc_byte_perm (w[18], w[19], selector); + w[47] = hc_byte_perm (w[17], w[18], selector); + w[46] = hc_byte_perm (w[16], w[17], selector); + w[45] = hc_byte_perm (w[15], w[16], selector); + w[44] = hc_byte_perm (w[14], w[15], selector); + w[43] = hc_byte_perm (w[13], w[14], selector); + w[42] = hc_byte_perm (w[12], w[13], selector); + w[41] = hc_byte_perm (w[11], w[12], selector); + w[40] = hc_byte_perm (w[10], w[11], selector); + w[39] = hc_byte_perm (w[ 9], w[10], selector); + w[38] = hc_byte_perm (w[ 8], w[ 9], selector); + w[37] = hc_byte_perm (w[ 7], w[ 8], selector); + w[36] = hc_byte_perm (w[ 6], w[ 7], selector); + w[35] = hc_byte_perm (w[ 5], w[ 6], selector); + w[34] = hc_byte_perm (w[ 4], w[ 5], selector); + w[33] = hc_byte_perm (w[ 3], w[ 4], selector); + w[32] = hc_byte_perm (w[ 2], w[ 3], selector); + w[31] = hc_byte_perm (w[ 1], w[ 2], selector); + w[30] = hc_byte_perm (w[ 0], w[ 1], selector); + w[29] = hc_byte_perm ( 0, w[ 0], selector); w[28] = 0; w[27] = 0; w[26] = 0; @@ -18076,40 +18076,40 @@ DECLSPEC void switch_buffer_by_offset_1x64_le (u32x *w, const u32 offset) break; case 30: - w[63] = __byte_perm (w[32], w[33], selector); - w[62] = __byte_perm (w[31], w[32], selector); - w[61] = __byte_perm (w[30], w[31], selector); - w[60] = __byte_perm (w[29], w[30], selector); - w[59] = __byte_perm (w[28], w[29], selector); - w[58] = __byte_perm (w[27], w[28], selector); - w[57] = __byte_perm (w[26], w[27], selector); - w[56] = __byte_perm (w[25], w[26], selector); - w[55] = __byte_perm (w[24], w[25], selector); - w[54] = __byte_perm (w[23], w[24], selector); - w[53] = __byte_perm (w[22], w[23], selector); - w[52] = __byte_perm (w[21], w[22], selector); - w[51] = __byte_perm (w[20], w[21], selector); - w[50] = __byte_perm (w[19], w[20], selector); - w[49] = __byte_perm (w[18], w[19], selector); - w[48] = __byte_perm (w[17], w[18], selector); - w[47] = __byte_perm (w[16], w[17], selector); - w[46] = __byte_perm (w[15], w[16], selector); - w[45] = __byte_perm (w[14], w[15], selector); - w[44] = __byte_perm (w[13], w[14], selector); - w[43] = __byte_perm (w[12], w[13], selector); - w[42] = __byte_perm (w[11], w[12], selector); - w[41] = __byte_perm (w[10], w[11], selector); - w[40] = __byte_perm (w[ 9], w[10], selector); - w[39] = __byte_perm (w[ 8], w[ 9], selector); - w[38] = __byte_perm (w[ 7], w[ 8], selector); - w[37] = __byte_perm (w[ 6], w[ 7], selector); - w[36] = __byte_perm (w[ 5], w[ 6], selector); - w[35] = __byte_perm (w[ 4], w[ 5], selector); - w[34] = __byte_perm (w[ 3], w[ 4], selector); - w[33] = __byte_perm (w[ 2], w[ 3], selector); - w[32] = __byte_perm (w[ 1], w[ 2], selector); - w[31] = __byte_perm (w[ 0], w[ 1], selector); - w[30] = __byte_perm ( 0, w[ 0], selector); + w[63] = hc_byte_perm (w[32], w[33], selector); + w[62] = hc_byte_perm (w[31], w[32], selector); + w[61] = hc_byte_perm (w[30], w[31], selector); + w[60] = hc_byte_perm (w[29], w[30], selector); + w[59] = hc_byte_perm (w[28], w[29], selector); + w[58] = hc_byte_perm (w[27], w[28], selector); + w[57] = hc_byte_perm (w[26], w[27], selector); + w[56] = hc_byte_perm (w[25], w[26], selector); + w[55] = hc_byte_perm (w[24], w[25], selector); + w[54] = hc_byte_perm (w[23], w[24], selector); + w[53] = hc_byte_perm (w[22], w[23], selector); + w[52] = hc_byte_perm (w[21], w[22], selector); + w[51] = hc_byte_perm (w[20], w[21], selector); + w[50] = hc_byte_perm (w[19], w[20], selector); + w[49] = hc_byte_perm (w[18], w[19], selector); + w[48] = hc_byte_perm (w[17], w[18], selector); + w[47] = hc_byte_perm (w[16], w[17], selector); + w[46] = hc_byte_perm (w[15], w[16], selector); + w[45] = hc_byte_perm (w[14], w[15], selector); + w[44] = hc_byte_perm (w[13], w[14], selector); + w[43] = hc_byte_perm (w[12], w[13], selector); + w[42] = hc_byte_perm (w[11], w[12], selector); + w[41] = hc_byte_perm (w[10], w[11], selector); + w[40] = hc_byte_perm (w[ 9], w[10], selector); + w[39] = hc_byte_perm (w[ 8], w[ 9], selector); + w[38] = hc_byte_perm (w[ 7], w[ 8], selector); + w[37] = hc_byte_perm (w[ 6], w[ 7], selector); + w[36] = hc_byte_perm (w[ 5], w[ 6], selector); + w[35] = hc_byte_perm (w[ 4], w[ 5], selector); + w[34] = hc_byte_perm (w[ 3], w[ 4], selector); + w[33] = hc_byte_perm (w[ 2], w[ 3], selector); + w[32] = hc_byte_perm (w[ 1], w[ 2], selector); + w[31] = hc_byte_perm (w[ 0], w[ 1], selector); + w[30] = hc_byte_perm ( 0, w[ 0], selector); w[29] = 0; w[28] = 0; w[27] = 0; @@ -18144,39 +18144,39 @@ DECLSPEC void switch_buffer_by_offset_1x64_le (u32x *w, const u32 offset) break; case 31: - w[63] = __byte_perm (w[31], w[32], selector); - w[62] = __byte_perm (w[30], w[31], selector); - w[61] = __byte_perm (w[29], w[30], selector); - w[60] = __byte_perm (w[28], w[29], selector); - w[59] = __byte_perm (w[27], w[28], selector); - w[58] = __byte_perm (w[26], w[27], selector); - w[57] = __byte_perm (w[25], w[26], selector); - w[56] = __byte_perm (w[24], w[25], selector); - w[55] = __byte_perm (w[23], w[24], selector); - w[54] = __byte_perm (w[22], w[23], selector); - w[53] = __byte_perm (w[21], w[22], selector); - w[52] = __byte_perm (w[20], w[21], selector); - w[51] = __byte_perm (w[19], w[20], selector); - w[50] = __byte_perm (w[18], w[19], selector); - w[49] = __byte_perm (w[17], w[18], selector); - w[48] = __byte_perm (w[16], w[17], selector); - w[47] = __byte_perm (w[15], w[16], selector); - w[46] = __byte_perm (w[14], w[15], selector); - w[45] = __byte_perm (w[13], w[14], selector); - w[44] = __byte_perm (w[12], w[13], selector); - w[43] = __byte_perm (w[11], w[12], selector); - w[42] = __byte_perm (w[10], w[11], selector); - w[41] = __byte_perm (w[ 9], w[10], selector); - w[40] = __byte_perm (w[ 8], w[ 9], selector); - w[39] = __byte_perm (w[ 7], w[ 8], selector); - w[38] = __byte_perm (w[ 6], w[ 7], selector); - w[37] = __byte_perm (w[ 5], w[ 6], selector); - w[36] = __byte_perm (w[ 4], w[ 5], selector); - w[35] = __byte_perm (w[ 3], w[ 4], selector); - w[34] = __byte_perm (w[ 2], w[ 3], selector); - w[33] = __byte_perm (w[ 1], w[ 2], selector); - w[32] = __byte_perm (w[ 0], w[ 1], selector); - w[31] = __byte_perm ( 0, w[ 0], selector); + w[63] = hc_byte_perm (w[31], w[32], selector); + w[62] = hc_byte_perm (w[30], w[31], selector); + w[61] = hc_byte_perm (w[29], w[30], selector); + w[60] = hc_byte_perm (w[28], w[29], selector); + w[59] = hc_byte_perm (w[27], w[28], selector); + w[58] = hc_byte_perm (w[26], w[27], selector); + w[57] = hc_byte_perm (w[25], w[26], selector); + w[56] = hc_byte_perm (w[24], w[25], selector); + w[55] = hc_byte_perm (w[23], w[24], selector); + w[54] = hc_byte_perm (w[22], w[23], selector); + w[53] = hc_byte_perm (w[21], w[22], selector); + w[52] = hc_byte_perm (w[20], w[21], selector); + w[51] = hc_byte_perm (w[19], w[20], selector); + w[50] = hc_byte_perm (w[18], w[19], selector); + w[49] = hc_byte_perm (w[17], w[18], selector); + w[48] = hc_byte_perm (w[16], w[17], selector); + w[47] = hc_byte_perm (w[15], w[16], selector); + w[46] = hc_byte_perm (w[14], w[15], selector); + w[45] = hc_byte_perm (w[13], w[14], selector); + w[44] = hc_byte_perm (w[12], w[13], selector); + w[43] = hc_byte_perm (w[11], w[12], selector); + w[42] = hc_byte_perm (w[10], w[11], selector); + w[41] = hc_byte_perm (w[ 9], w[10], selector); + w[40] = hc_byte_perm (w[ 8], w[ 9], selector); + w[39] = hc_byte_perm (w[ 7], w[ 8], selector); + w[38] = hc_byte_perm (w[ 6], w[ 7], selector); + w[37] = hc_byte_perm (w[ 5], w[ 6], selector); + w[36] = hc_byte_perm (w[ 4], w[ 5], selector); + w[35] = hc_byte_perm (w[ 3], w[ 4], selector); + w[34] = hc_byte_perm (w[ 2], w[ 3], selector); + w[33] = hc_byte_perm (w[ 1], w[ 2], selector); + w[32] = hc_byte_perm (w[ 0], w[ 1], selector); + w[31] = hc_byte_perm ( 0, w[ 0], selector); w[30] = 0; w[29] = 0; w[28] = 0; @@ -18212,38 +18212,38 @@ DECLSPEC void switch_buffer_by_offset_1x64_le (u32x *w, const u32 offset) break; case 32: - w[63] = __byte_perm (w[30], w[31], selector); - w[62] = __byte_perm (w[29], w[30], selector); - w[61] = __byte_perm (w[28], w[29], selector); - w[60] = __byte_perm (w[27], w[28], selector); - w[59] = __byte_perm (w[26], w[27], selector); - w[58] = __byte_perm (w[25], w[26], selector); - w[57] = __byte_perm (w[24], w[25], selector); - w[56] = __byte_perm (w[23], w[24], selector); - w[55] = __byte_perm (w[22], w[23], selector); - w[54] = __byte_perm (w[21], w[22], selector); - w[53] = __byte_perm (w[20], w[21], selector); - w[52] = __byte_perm (w[19], w[20], selector); - w[51] = __byte_perm (w[18], w[19], selector); - w[50] = __byte_perm (w[17], w[18], selector); - w[49] = __byte_perm (w[16], w[17], selector); - w[48] = __byte_perm (w[15], w[16], selector); - w[47] = __byte_perm (w[14], w[15], selector); - w[46] = __byte_perm (w[13], w[14], selector); - w[45] = __byte_perm (w[12], w[13], selector); - w[44] = __byte_perm (w[11], w[12], selector); - w[43] = __byte_perm (w[10], w[11], selector); - w[42] = __byte_perm (w[ 9], w[10], selector); - w[41] = __byte_perm (w[ 8], w[ 9], selector); - w[40] = __byte_perm (w[ 7], w[ 8], selector); - w[39] = __byte_perm (w[ 6], w[ 7], selector); - w[38] = __byte_perm (w[ 5], w[ 6], selector); - w[37] = __byte_perm (w[ 4], w[ 5], selector); - w[36] = __byte_perm (w[ 3], w[ 4], selector); - w[35] = __byte_perm (w[ 2], w[ 3], selector); - w[34] = __byte_perm (w[ 1], w[ 2], selector); - w[33] = __byte_perm (w[ 0], w[ 1], selector); - w[32] = __byte_perm ( 0, w[ 0], selector); + w[63] = hc_byte_perm (w[30], w[31], selector); + w[62] = hc_byte_perm (w[29], w[30], selector); + w[61] = hc_byte_perm (w[28], w[29], selector); + w[60] = hc_byte_perm (w[27], w[28], selector); + w[59] = hc_byte_perm (w[26], w[27], selector); + w[58] = hc_byte_perm (w[25], w[26], selector); + w[57] = hc_byte_perm (w[24], w[25], selector); + w[56] = hc_byte_perm (w[23], w[24], selector); + w[55] = hc_byte_perm (w[22], w[23], selector); + w[54] = hc_byte_perm (w[21], w[22], selector); + w[53] = hc_byte_perm (w[20], w[21], selector); + w[52] = hc_byte_perm (w[19], w[20], selector); + w[51] = hc_byte_perm (w[18], w[19], selector); + w[50] = hc_byte_perm (w[17], w[18], selector); + w[49] = hc_byte_perm (w[16], w[17], selector); + w[48] = hc_byte_perm (w[15], w[16], selector); + w[47] = hc_byte_perm (w[14], w[15], selector); + w[46] = hc_byte_perm (w[13], w[14], selector); + w[45] = hc_byte_perm (w[12], w[13], selector); + w[44] = hc_byte_perm (w[11], w[12], selector); + w[43] = hc_byte_perm (w[10], w[11], selector); + w[42] = hc_byte_perm (w[ 9], w[10], selector); + w[41] = hc_byte_perm (w[ 8], w[ 9], selector); + w[40] = hc_byte_perm (w[ 7], w[ 8], selector); + w[39] = hc_byte_perm (w[ 6], w[ 7], selector); + w[38] = hc_byte_perm (w[ 5], w[ 6], selector); + w[37] = hc_byte_perm (w[ 4], w[ 5], selector); + w[36] = hc_byte_perm (w[ 3], w[ 4], selector); + w[35] = hc_byte_perm (w[ 2], w[ 3], selector); + w[34] = hc_byte_perm (w[ 1], w[ 2], selector); + w[33] = hc_byte_perm (w[ 0], w[ 1], selector); + w[32] = hc_byte_perm ( 0, w[ 0], selector); w[31] = 0; w[30] = 0; w[29] = 0; @@ -18280,37 +18280,37 @@ DECLSPEC void switch_buffer_by_offset_1x64_le (u32x *w, const u32 offset) break; case 33: - w[63] = __byte_perm (w[29], w[30], selector); - w[62] = __byte_perm (w[28], w[29], selector); - w[61] = __byte_perm (w[27], w[28], selector); - w[60] = __byte_perm (w[26], w[27], selector); - w[59] = __byte_perm (w[25], w[26], selector); - w[58] = __byte_perm (w[24], w[25], selector); - w[57] = __byte_perm (w[23], w[24], selector); - w[56] = __byte_perm (w[22], w[23], selector); - w[55] = __byte_perm (w[21], w[22], selector); - w[54] = __byte_perm (w[20], w[21], selector); - w[53] = __byte_perm (w[19], w[20], selector); - w[52] = __byte_perm (w[18], w[19], selector); - w[51] = __byte_perm (w[17], w[18], selector); - w[50] = __byte_perm (w[16], w[17], selector); - w[49] = __byte_perm (w[15], w[16], selector); - w[48] = __byte_perm (w[14], w[15], selector); - w[47] = __byte_perm (w[13], w[14], selector); - w[46] = __byte_perm (w[12], w[13], selector); - w[45] = __byte_perm (w[11], w[12], selector); - w[44] = __byte_perm (w[10], w[11], selector); - w[43] = __byte_perm (w[ 9], w[10], selector); - w[42] = __byte_perm (w[ 8], w[ 9], selector); - w[41] = __byte_perm (w[ 7], w[ 8], selector); - w[40] = __byte_perm (w[ 6], w[ 7], selector); - w[39] = __byte_perm (w[ 5], w[ 6], selector); - w[38] = __byte_perm (w[ 4], w[ 5], selector); - w[37] = __byte_perm (w[ 3], w[ 4], selector); - w[36] = __byte_perm (w[ 2], w[ 3], selector); - w[35] = __byte_perm (w[ 1], w[ 2], selector); - w[34] = __byte_perm (w[ 0], w[ 1], selector); - w[33] = __byte_perm ( 0, w[ 0], selector); + w[63] = hc_byte_perm (w[29], w[30], selector); + w[62] = hc_byte_perm (w[28], w[29], selector); + w[61] = hc_byte_perm (w[27], w[28], selector); + w[60] = hc_byte_perm (w[26], w[27], selector); + w[59] = hc_byte_perm (w[25], w[26], selector); + w[58] = hc_byte_perm (w[24], w[25], selector); + w[57] = hc_byte_perm (w[23], w[24], selector); + w[56] = hc_byte_perm (w[22], w[23], selector); + w[55] = hc_byte_perm (w[21], w[22], selector); + w[54] = hc_byte_perm (w[20], w[21], selector); + w[53] = hc_byte_perm (w[19], w[20], selector); + w[52] = hc_byte_perm (w[18], w[19], selector); + w[51] = hc_byte_perm (w[17], w[18], selector); + w[50] = hc_byte_perm (w[16], w[17], selector); + w[49] = hc_byte_perm (w[15], w[16], selector); + w[48] = hc_byte_perm (w[14], w[15], selector); + w[47] = hc_byte_perm (w[13], w[14], selector); + w[46] = hc_byte_perm (w[12], w[13], selector); + w[45] = hc_byte_perm (w[11], w[12], selector); + w[44] = hc_byte_perm (w[10], w[11], selector); + w[43] = hc_byte_perm (w[ 9], w[10], selector); + w[42] = hc_byte_perm (w[ 8], w[ 9], selector); + w[41] = hc_byte_perm (w[ 7], w[ 8], selector); + w[40] = hc_byte_perm (w[ 6], w[ 7], selector); + w[39] = hc_byte_perm (w[ 5], w[ 6], selector); + w[38] = hc_byte_perm (w[ 4], w[ 5], selector); + w[37] = hc_byte_perm (w[ 3], w[ 4], selector); + w[36] = hc_byte_perm (w[ 2], w[ 3], selector); + w[35] = hc_byte_perm (w[ 1], w[ 2], selector); + w[34] = hc_byte_perm (w[ 0], w[ 1], selector); + w[33] = hc_byte_perm ( 0, w[ 0], selector); w[32] = 0; w[31] = 0; w[30] = 0; @@ -18348,36 +18348,36 @@ DECLSPEC void switch_buffer_by_offset_1x64_le (u32x *w, const u32 offset) break; case 34: - w[63] = __byte_perm (w[28], w[29], selector); - w[62] = __byte_perm (w[27], w[28], selector); - w[61] = __byte_perm (w[26], w[27], selector); - w[60] = __byte_perm (w[25], w[26], selector); - w[59] = __byte_perm (w[24], w[25], selector); - w[58] = __byte_perm (w[23], w[24], selector); - w[57] = __byte_perm (w[22], w[23], selector); - w[56] = __byte_perm (w[21], w[22], selector); - w[55] = __byte_perm (w[20], w[21], selector); - w[54] = __byte_perm (w[19], w[20], selector); - w[53] = __byte_perm (w[18], w[19], selector); - w[52] = __byte_perm (w[17], w[18], selector); - w[51] = __byte_perm (w[16], w[17], selector); - w[50] = __byte_perm (w[15], w[16], selector); - w[49] = __byte_perm (w[14], w[15], selector); - w[48] = __byte_perm (w[13], w[14], selector); - w[47] = __byte_perm (w[12], w[13], selector); - w[46] = __byte_perm (w[11], w[12], selector); - w[45] = __byte_perm (w[10], w[11], selector); - w[44] = __byte_perm (w[ 9], w[10], selector); - w[43] = __byte_perm (w[ 8], w[ 9], selector); - w[42] = __byte_perm (w[ 7], w[ 8], selector); - w[41] = __byte_perm (w[ 6], w[ 7], selector); - w[40] = __byte_perm (w[ 5], w[ 6], selector); - w[39] = __byte_perm (w[ 4], w[ 5], selector); - w[38] = __byte_perm (w[ 3], w[ 4], selector); - w[37] = __byte_perm (w[ 2], w[ 3], selector); - w[36] = __byte_perm (w[ 1], w[ 2], selector); - w[35] = __byte_perm (w[ 0], w[ 1], selector); - w[34] = __byte_perm ( 0, w[ 0], selector); + w[63] = hc_byte_perm (w[28], w[29], selector); + w[62] = hc_byte_perm (w[27], w[28], selector); + w[61] = hc_byte_perm (w[26], w[27], selector); + w[60] = hc_byte_perm (w[25], w[26], selector); + w[59] = hc_byte_perm (w[24], w[25], selector); + w[58] = hc_byte_perm (w[23], w[24], selector); + w[57] = hc_byte_perm (w[22], w[23], selector); + w[56] = hc_byte_perm (w[21], w[22], selector); + w[55] = hc_byte_perm (w[20], w[21], selector); + w[54] = hc_byte_perm (w[19], w[20], selector); + w[53] = hc_byte_perm (w[18], w[19], selector); + w[52] = hc_byte_perm (w[17], w[18], selector); + w[51] = hc_byte_perm (w[16], w[17], selector); + w[50] = hc_byte_perm (w[15], w[16], selector); + w[49] = hc_byte_perm (w[14], w[15], selector); + w[48] = hc_byte_perm (w[13], w[14], selector); + w[47] = hc_byte_perm (w[12], w[13], selector); + w[46] = hc_byte_perm (w[11], w[12], selector); + w[45] = hc_byte_perm (w[10], w[11], selector); + w[44] = hc_byte_perm (w[ 9], w[10], selector); + w[43] = hc_byte_perm (w[ 8], w[ 9], selector); + w[42] = hc_byte_perm (w[ 7], w[ 8], selector); + w[41] = hc_byte_perm (w[ 6], w[ 7], selector); + w[40] = hc_byte_perm (w[ 5], w[ 6], selector); + w[39] = hc_byte_perm (w[ 4], w[ 5], selector); + w[38] = hc_byte_perm (w[ 3], w[ 4], selector); + w[37] = hc_byte_perm (w[ 2], w[ 3], selector); + w[36] = hc_byte_perm (w[ 1], w[ 2], selector); + w[35] = hc_byte_perm (w[ 0], w[ 1], selector); + w[34] = hc_byte_perm ( 0, w[ 0], selector); w[33] = 0; w[32] = 0; w[31] = 0; @@ -18416,35 +18416,35 @@ DECLSPEC void switch_buffer_by_offset_1x64_le (u32x *w, const u32 offset) break; case 35: - w[63] = __byte_perm (w[27], w[28], selector); - w[62] = __byte_perm (w[26], w[27], selector); - w[61] = __byte_perm (w[25], w[26], selector); - w[60] = __byte_perm (w[24], w[25], selector); - w[59] = __byte_perm (w[23], w[24], selector); - w[58] = __byte_perm (w[22], w[23], selector); - w[57] = __byte_perm (w[21], w[22], selector); - w[56] = __byte_perm (w[20], w[21], selector); - w[55] = __byte_perm (w[19], w[20], selector); - w[54] = __byte_perm (w[18], w[19], selector); - w[53] = __byte_perm (w[17], w[18], selector); - w[52] = __byte_perm (w[16], w[17], selector); - w[51] = __byte_perm (w[15], w[16], selector); - w[50] = __byte_perm (w[14], w[15], selector); - w[49] = __byte_perm (w[13], w[14], selector); - w[48] = __byte_perm (w[12], w[13], selector); - w[47] = __byte_perm (w[11], w[12], selector); - w[46] = __byte_perm (w[10], w[11], selector); - w[45] = __byte_perm (w[ 9], w[10], selector); - w[44] = __byte_perm (w[ 8], w[ 9], selector); - w[43] = __byte_perm (w[ 7], w[ 8], selector); - w[42] = __byte_perm (w[ 6], w[ 7], selector); - w[41] = __byte_perm (w[ 5], w[ 6], selector); - w[40] = __byte_perm (w[ 4], w[ 5], selector); - w[39] = __byte_perm (w[ 3], w[ 4], selector); - w[38] = __byte_perm (w[ 2], w[ 3], selector); - w[37] = __byte_perm (w[ 1], w[ 2], selector); - w[36] = __byte_perm (w[ 0], w[ 1], selector); - w[35] = __byte_perm ( 0, w[ 0], selector); + w[63] = hc_byte_perm (w[27], w[28], selector); + w[62] = hc_byte_perm (w[26], w[27], selector); + w[61] = hc_byte_perm (w[25], w[26], selector); + w[60] = hc_byte_perm (w[24], w[25], selector); + w[59] = hc_byte_perm (w[23], w[24], selector); + w[58] = hc_byte_perm (w[22], w[23], selector); + w[57] = hc_byte_perm (w[21], w[22], selector); + w[56] = hc_byte_perm (w[20], w[21], selector); + w[55] = hc_byte_perm (w[19], w[20], selector); + w[54] = hc_byte_perm (w[18], w[19], selector); + w[53] = hc_byte_perm (w[17], w[18], selector); + w[52] = hc_byte_perm (w[16], w[17], selector); + w[51] = hc_byte_perm (w[15], w[16], selector); + w[50] = hc_byte_perm (w[14], w[15], selector); + w[49] = hc_byte_perm (w[13], w[14], selector); + w[48] = hc_byte_perm (w[12], w[13], selector); + w[47] = hc_byte_perm (w[11], w[12], selector); + w[46] = hc_byte_perm (w[10], w[11], selector); + w[45] = hc_byte_perm (w[ 9], w[10], selector); + w[44] = hc_byte_perm (w[ 8], w[ 9], selector); + w[43] = hc_byte_perm (w[ 7], w[ 8], selector); + w[42] = hc_byte_perm (w[ 6], w[ 7], selector); + w[41] = hc_byte_perm (w[ 5], w[ 6], selector); + w[40] = hc_byte_perm (w[ 4], w[ 5], selector); + w[39] = hc_byte_perm (w[ 3], w[ 4], selector); + w[38] = hc_byte_perm (w[ 2], w[ 3], selector); + w[37] = hc_byte_perm (w[ 1], w[ 2], selector); + w[36] = hc_byte_perm (w[ 0], w[ 1], selector); + w[35] = hc_byte_perm ( 0, w[ 0], selector); w[34] = 0; w[33] = 0; w[32] = 0; @@ -18484,34 +18484,34 @@ DECLSPEC void switch_buffer_by_offset_1x64_le (u32x *w, const u32 offset) break; case 36: - w[63] = __byte_perm (w[26], w[27], selector); - w[62] = __byte_perm (w[25], w[26], selector); - w[61] = __byte_perm (w[24], w[25], selector); - w[60] = __byte_perm (w[23], w[24], selector); - w[59] = __byte_perm (w[22], w[23], selector); - w[58] = __byte_perm (w[21], w[22], selector); - w[57] = __byte_perm (w[20], w[21], selector); - w[56] = __byte_perm (w[19], w[20], selector); - w[55] = __byte_perm (w[18], w[19], selector); - w[54] = __byte_perm (w[17], w[18], selector); - w[53] = __byte_perm (w[16], w[17], selector); - w[52] = __byte_perm (w[15], w[16], selector); - w[51] = __byte_perm (w[14], w[15], selector); - w[50] = __byte_perm (w[13], w[14], selector); - w[49] = __byte_perm (w[12], w[13], selector); - w[48] = __byte_perm (w[11], w[12], selector); - w[47] = __byte_perm (w[10], w[11], selector); - w[46] = __byte_perm (w[ 9], w[10], selector); - w[45] = __byte_perm (w[ 8], w[ 9], selector); - w[44] = __byte_perm (w[ 7], w[ 8], selector); - w[43] = __byte_perm (w[ 6], w[ 7], selector); - w[42] = __byte_perm (w[ 5], w[ 6], selector); - w[41] = __byte_perm (w[ 4], w[ 5], selector); - w[40] = __byte_perm (w[ 3], w[ 4], selector); - w[39] = __byte_perm (w[ 2], w[ 3], selector); - w[38] = __byte_perm (w[ 1], w[ 2], selector); - w[37] = __byte_perm (w[ 0], w[ 1], selector); - w[36] = __byte_perm ( 0, w[ 0], selector); + w[63] = hc_byte_perm (w[26], w[27], selector); + w[62] = hc_byte_perm (w[25], w[26], selector); + w[61] = hc_byte_perm (w[24], w[25], selector); + w[60] = hc_byte_perm (w[23], w[24], selector); + w[59] = hc_byte_perm (w[22], w[23], selector); + w[58] = hc_byte_perm (w[21], w[22], selector); + w[57] = hc_byte_perm (w[20], w[21], selector); + w[56] = hc_byte_perm (w[19], w[20], selector); + w[55] = hc_byte_perm (w[18], w[19], selector); + w[54] = hc_byte_perm (w[17], w[18], selector); + w[53] = hc_byte_perm (w[16], w[17], selector); + w[52] = hc_byte_perm (w[15], w[16], selector); + w[51] = hc_byte_perm (w[14], w[15], selector); + w[50] = hc_byte_perm (w[13], w[14], selector); + w[49] = hc_byte_perm (w[12], w[13], selector); + w[48] = hc_byte_perm (w[11], w[12], selector); + w[47] = hc_byte_perm (w[10], w[11], selector); + w[46] = hc_byte_perm (w[ 9], w[10], selector); + w[45] = hc_byte_perm (w[ 8], w[ 9], selector); + w[44] = hc_byte_perm (w[ 7], w[ 8], selector); + w[43] = hc_byte_perm (w[ 6], w[ 7], selector); + w[42] = hc_byte_perm (w[ 5], w[ 6], selector); + w[41] = hc_byte_perm (w[ 4], w[ 5], selector); + w[40] = hc_byte_perm (w[ 3], w[ 4], selector); + w[39] = hc_byte_perm (w[ 2], w[ 3], selector); + w[38] = hc_byte_perm (w[ 1], w[ 2], selector); + w[37] = hc_byte_perm (w[ 0], w[ 1], selector); + w[36] = hc_byte_perm ( 0, w[ 0], selector); w[35] = 0; w[34] = 0; w[33] = 0; @@ -18552,33 +18552,33 @@ DECLSPEC void switch_buffer_by_offset_1x64_le (u32x *w, const u32 offset) break; case 37: - w[63] = __byte_perm (w[25], w[26], selector); - w[62] = __byte_perm (w[24], w[25], selector); - w[61] = __byte_perm (w[23], w[24], selector); - w[60] = __byte_perm (w[22], w[23], selector); - w[59] = __byte_perm (w[21], w[22], selector); - w[58] = __byte_perm (w[20], w[21], selector); - w[57] = __byte_perm (w[19], w[20], selector); - w[56] = __byte_perm (w[18], w[19], selector); - w[55] = __byte_perm (w[17], w[18], selector); - w[54] = __byte_perm (w[16], w[17], selector); - w[53] = __byte_perm (w[15], w[16], selector); - w[52] = __byte_perm (w[14], w[15], selector); - w[51] = __byte_perm (w[13], w[14], selector); - w[50] = __byte_perm (w[12], w[13], selector); - w[49] = __byte_perm (w[11], w[12], selector); - w[48] = __byte_perm (w[10], w[11], selector); - w[47] = __byte_perm (w[ 9], w[10], selector); - w[46] = __byte_perm (w[ 8], w[ 9], selector); - w[45] = __byte_perm (w[ 7], w[ 8], selector); - w[44] = __byte_perm (w[ 6], w[ 7], selector); - w[43] = __byte_perm (w[ 5], w[ 6], selector); - w[42] = __byte_perm (w[ 4], w[ 5], selector); - w[41] = __byte_perm (w[ 3], w[ 4], selector); - w[40] = __byte_perm (w[ 2], w[ 3], selector); - w[39] = __byte_perm (w[ 1], w[ 2], selector); - w[38] = __byte_perm (w[ 0], w[ 1], selector); - w[37] = __byte_perm ( 0, w[ 0], selector); + w[63] = hc_byte_perm (w[25], w[26], selector); + w[62] = hc_byte_perm (w[24], w[25], selector); + w[61] = hc_byte_perm (w[23], w[24], selector); + w[60] = hc_byte_perm (w[22], w[23], selector); + w[59] = hc_byte_perm (w[21], w[22], selector); + w[58] = hc_byte_perm (w[20], w[21], selector); + w[57] = hc_byte_perm (w[19], w[20], selector); + w[56] = hc_byte_perm (w[18], w[19], selector); + w[55] = hc_byte_perm (w[17], w[18], selector); + w[54] = hc_byte_perm (w[16], w[17], selector); + w[53] = hc_byte_perm (w[15], w[16], selector); + w[52] = hc_byte_perm (w[14], w[15], selector); + w[51] = hc_byte_perm (w[13], w[14], selector); + w[50] = hc_byte_perm (w[12], w[13], selector); + w[49] = hc_byte_perm (w[11], w[12], selector); + w[48] = hc_byte_perm (w[10], w[11], selector); + w[47] = hc_byte_perm (w[ 9], w[10], selector); + w[46] = hc_byte_perm (w[ 8], w[ 9], selector); + w[45] = hc_byte_perm (w[ 7], w[ 8], selector); + w[44] = hc_byte_perm (w[ 6], w[ 7], selector); + w[43] = hc_byte_perm (w[ 5], w[ 6], selector); + w[42] = hc_byte_perm (w[ 4], w[ 5], selector); + w[41] = hc_byte_perm (w[ 3], w[ 4], selector); + w[40] = hc_byte_perm (w[ 2], w[ 3], selector); + w[39] = hc_byte_perm (w[ 1], w[ 2], selector); + w[38] = hc_byte_perm (w[ 0], w[ 1], selector); + w[37] = hc_byte_perm ( 0, w[ 0], selector); w[36] = 0; w[35] = 0; w[34] = 0; @@ -18620,32 +18620,32 @@ DECLSPEC void switch_buffer_by_offset_1x64_le (u32x *w, const u32 offset) break; case 38: - w[63] = __byte_perm (w[24], w[25], selector); - w[62] = __byte_perm (w[23], w[24], selector); - w[61] = __byte_perm (w[22], w[23], selector); - w[60] = __byte_perm (w[21], w[22], selector); - w[59] = __byte_perm (w[20], w[21], selector); - w[58] = __byte_perm (w[19], w[20], selector); - w[57] = __byte_perm (w[18], w[19], selector); - w[56] = __byte_perm (w[17], w[18], selector); - w[55] = __byte_perm (w[16], w[17], selector); - w[54] = __byte_perm (w[15], w[16], selector); - w[53] = __byte_perm (w[14], w[15], selector); - w[52] = __byte_perm (w[13], w[14], selector); - w[51] = __byte_perm (w[12], w[13], selector); - w[50] = __byte_perm (w[11], w[12], selector); - w[49] = __byte_perm (w[10], w[11], selector); - w[48] = __byte_perm (w[ 9], w[10], selector); - w[47] = __byte_perm (w[ 8], w[ 9], selector); - w[46] = __byte_perm (w[ 7], w[ 8], selector); - w[45] = __byte_perm (w[ 6], w[ 7], selector); - w[44] = __byte_perm (w[ 5], w[ 6], selector); - w[43] = __byte_perm (w[ 4], w[ 5], selector); - w[42] = __byte_perm (w[ 3], w[ 4], selector); - w[41] = __byte_perm (w[ 2], w[ 3], selector); - w[40] = __byte_perm (w[ 1], w[ 2], selector); - w[39] = __byte_perm (w[ 0], w[ 1], selector); - w[38] = __byte_perm ( 0, w[ 0], selector); + w[63] = hc_byte_perm (w[24], w[25], selector); + w[62] = hc_byte_perm (w[23], w[24], selector); + w[61] = hc_byte_perm (w[22], w[23], selector); + w[60] = hc_byte_perm (w[21], w[22], selector); + w[59] = hc_byte_perm (w[20], w[21], selector); + w[58] = hc_byte_perm (w[19], w[20], selector); + w[57] = hc_byte_perm (w[18], w[19], selector); + w[56] = hc_byte_perm (w[17], w[18], selector); + w[55] = hc_byte_perm (w[16], w[17], selector); + w[54] = hc_byte_perm (w[15], w[16], selector); + w[53] = hc_byte_perm (w[14], w[15], selector); + w[52] = hc_byte_perm (w[13], w[14], selector); + w[51] = hc_byte_perm (w[12], w[13], selector); + w[50] = hc_byte_perm (w[11], w[12], selector); + w[49] = hc_byte_perm (w[10], w[11], selector); + w[48] = hc_byte_perm (w[ 9], w[10], selector); + w[47] = hc_byte_perm (w[ 8], w[ 9], selector); + w[46] = hc_byte_perm (w[ 7], w[ 8], selector); + w[45] = hc_byte_perm (w[ 6], w[ 7], selector); + w[44] = hc_byte_perm (w[ 5], w[ 6], selector); + w[43] = hc_byte_perm (w[ 4], w[ 5], selector); + w[42] = hc_byte_perm (w[ 3], w[ 4], selector); + w[41] = hc_byte_perm (w[ 2], w[ 3], selector); + w[40] = hc_byte_perm (w[ 1], w[ 2], selector); + w[39] = hc_byte_perm (w[ 0], w[ 1], selector); + w[38] = hc_byte_perm ( 0, w[ 0], selector); w[37] = 0; w[36] = 0; w[35] = 0; @@ -18688,31 +18688,31 @@ DECLSPEC void switch_buffer_by_offset_1x64_le (u32x *w, const u32 offset) break; case 39: - w[63] = __byte_perm (w[23], w[24], selector); - w[62] = __byte_perm (w[22], w[23], selector); - w[61] = __byte_perm (w[21], w[22], selector); - w[60] = __byte_perm (w[20], w[21], selector); - w[59] = __byte_perm (w[19], w[20], selector); - w[58] = __byte_perm (w[18], w[19], selector); - w[57] = __byte_perm (w[17], w[18], selector); - w[56] = __byte_perm (w[16], w[17], selector); - w[55] = __byte_perm (w[15], w[16], selector); - w[54] = __byte_perm (w[14], w[15], selector); - w[53] = __byte_perm (w[13], w[14], selector); - w[52] = __byte_perm (w[12], w[13], selector); - w[51] = __byte_perm (w[11], w[12], selector); - w[50] = __byte_perm (w[10], w[11], selector); - w[49] = __byte_perm (w[ 9], w[10], selector); - w[48] = __byte_perm (w[ 8], w[ 9], selector); - w[47] = __byte_perm (w[ 7], w[ 8], selector); - w[46] = __byte_perm (w[ 6], w[ 7], selector); - w[45] = __byte_perm (w[ 5], w[ 6], selector); - w[44] = __byte_perm (w[ 4], w[ 5], selector); - w[43] = __byte_perm (w[ 3], w[ 4], selector); - w[42] = __byte_perm (w[ 2], w[ 3], selector); - w[41] = __byte_perm (w[ 1], w[ 2], selector); - w[40] = __byte_perm (w[ 0], w[ 1], selector); - w[39] = __byte_perm ( 0, w[ 0], selector); + w[63] = hc_byte_perm (w[23], w[24], selector); + w[62] = hc_byte_perm (w[22], w[23], selector); + w[61] = hc_byte_perm (w[21], w[22], selector); + w[60] = hc_byte_perm (w[20], w[21], selector); + w[59] = hc_byte_perm (w[19], w[20], selector); + w[58] = hc_byte_perm (w[18], w[19], selector); + w[57] = hc_byte_perm (w[17], w[18], selector); + w[56] = hc_byte_perm (w[16], w[17], selector); + w[55] = hc_byte_perm (w[15], w[16], selector); + w[54] = hc_byte_perm (w[14], w[15], selector); + w[53] = hc_byte_perm (w[13], w[14], selector); + w[52] = hc_byte_perm (w[12], w[13], selector); + w[51] = hc_byte_perm (w[11], w[12], selector); + w[50] = hc_byte_perm (w[10], w[11], selector); + w[49] = hc_byte_perm (w[ 9], w[10], selector); + w[48] = hc_byte_perm (w[ 8], w[ 9], selector); + w[47] = hc_byte_perm (w[ 7], w[ 8], selector); + w[46] = hc_byte_perm (w[ 6], w[ 7], selector); + w[45] = hc_byte_perm (w[ 5], w[ 6], selector); + w[44] = hc_byte_perm (w[ 4], w[ 5], selector); + w[43] = hc_byte_perm (w[ 3], w[ 4], selector); + w[42] = hc_byte_perm (w[ 2], w[ 3], selector); + w[41] = hc_byte_perm (w[ 1], w[ 2], selector); + w[40] = hc_byte_perm (w[ 0], w[ 1], selector); + w[39] = hc_byte_perm ( 0, w[ 0], selector); w[38] = 0; w[37] = 0; w[36] = 0; @@ -18756,30 +18756,30 @@ DECLSPEC void switch_buffer_by_offset_1x64_le (u32x *w, const u32 offset) break; case 40: - w[63] = __byte_perm (w[22], w[23], selector); - w[62] = __byte_perm (w[21], w[22], selector); - w[61] = __byte_perm (w[20], w[21], selector); - w[60] = __byte_perm (w[19], w[20], selector); - w[59] = __byte_perm (w[18], w[19], selector); - w[58] = __byte_perm (w[17], w[18], selector); - w[57] = __byte_perm (w[16], w[17], selector); - w[56] = __byte_perm (w[15], w[16], selector); - w[55] = __byte_perm (w[14], w[15], selector); - w[54] = __byte_perm (w[13], w[14], selector); - w[53] = __byte_perm (w[12], w[13], selector); - w[52] = __byte_perm (w[11], w[12], selector); - w[51] = __byte_perm (w[10], w[11], selector); - w[50] = __byte_perm (w[ 9], w[10], selector); - w[49] = __byte_perm (w[ 8], w[ 9], selector); - w[48] = __byte_perm (w[ 7], w[ 8], selector); - w[47] = __byte_perm (w[ 6], w[ 7], selector); - w[46] = __byte_perm (w[ 5], w[ 6], selector); - w[45] = __byte_perm (w[ 4], w[ 5], selector); - w[44] = __byte_perm (w[ 3], w[ 4], selector); - w[43] = __byte_perm (w[ 2], w[ 3], selector); - w[42] = __byte_perm (w[ 1], w[ 2], selector); - w[41] = __byte_perm (w[ 0], w[ 1], selector); - w[40] = __byte_perm ( 0, w[ 0], selector); + w[63] = hc_byte_perm (w[22], w[23], selector); + w[62] = hc_byte_perm (w[21], w[22], selector); + w[61] = hc_byte_perm (w[20], w[21], selector); + w[60] = hc_byte_perm (w[19], w[20], selector); + w[59] = hc_byte_perm (w[18], w[19], selector); + w[58] = hc_byte_perm (w[17], w[18], selector); + w[57] = hc_byte_perm (w[16], w[17], selector); + w[56] = hc_byte_perm (w[15], w[16], selector); + w[55] = hc_byte_perm (w[14], w[15], selector); + w[54] = hc_byte_perm (w[13], w[14], selector); + w[53] = hc_byte_perm (w[12], w[13], selector); + w[52] = hc_byte_perm (w[11], w[12], selector); + w[51] = hc_byte_perm (w[10], w[11], selector); + w[50] = hc_byte_perm (w[ 9], w[10], selector); + w[49] = hc_byte_perm (w[ 8], w[ 9], selector); + w[48] = hc_byte_perm (w[ 7], w[ 8], selector); + w[47] = hc_byte_perm (w[ 6], w[ 7], selector); + w[46] = hc_byte_perm (w[ 5], w[ 6], selector); + w[45] = hc_byte_perm (w[ 4], w[ 5], selector); + w[44] = hc_byte_perm (w[ 3], w[ 4], selector); + w[43] = hc_byte_perm (w[ 2], w[ 3], selector); + w[42] = hc_byte_perm (w[ 1], w[ 2], selector); + w[41] = hc_byte_perm (w[ 0], w[ 1], selector); + w[40] = hc_byte_perm ( 0, w[ 0], selector); w[39] = 0; w[38] = 0; w[37] = 0; @@ -18824,29 +18824,29 @@ DECLSPEC void switch_buffer_by_offset_1x64_le (u32x *w, const u32 offset) break; case 41: - w[63] = __byte_perm (w[21], w[22], selector); - w[62] = __byte_perm (w[20], w[21], selector); - w[61] = __byte_perm (w[19], w[20], selector); - w[60] = __byte_perm (w[18], w[19], selector); - w[59] = __byte_perm (w[17], w[18], selector); - w[58] = __byte_perm (w[16], w[17], selector); - w[57] = __byte_perm (w[15], w[16], selector); - w[56] = __byte_perm (w[14], w[15], selector); - w[55] = __byte_perm (w[13], w[14], selector); - w[54] = __byte_perm (w[12], w[13], selector); - w[53] = __byte_perm (w[11], w[12], selector); - w[52] = __byte_perm (w[10], w[11], selector); - w[51] = __byte_perm (w[ 9], w[10], selector); - w[50] = __byte_perm (w[ 8], w[ 9], selector); - w[49] = __byte_perm (w[ 7], w[ 8], selector); - w[48] = __byte_perm (w[ 6], w[ 7], selector); - w[47] = __byte_perm (w[ 5], w[ 6], selector); - w[46] = __byte_perm (w[ 4], w[ 5], selector); - w[45] = __byte_perm (w[ 3], w[ 4], selector); - w[44] = __byte_perm (w[ 2], w[ 3], selector); - w[43] = __byte_perm (w[ 1], w[ 2], selector); - w[42] = __byte_perm (w[ 0], w[ 1], selector); - w[41] = __byte_perm ( 0, w[ 0], selector); + w[63] = hc_byte_perm (w[21], w[22], selector); + w[62] = hc_byte_perm (w[20], w[21], selector); + w[61] = hc_byte_perm (w[19], w[20], selector); + w[60] = hc_byte_perm (w[18], w[19], selector); + w[59] = hc_byte_perm (w[17], w[18], selector); + w[58] = hc_byte_perm (w[16], w[17], selector); + w[57] = hc_byte_perm (w[15], w[16], selector); + w[56] = hc_byte_perm (w[14], w[15], selector); + w[55] = hc_byte_perm (w[13], w[14], selector); + w[54] = hc_byte_perm (w[12], w[13], selector); + w[53] = hc_byte_perm (w[11], w[12], selector); + w[52] = hc_byte_perm (w[10], w[11], selector); + w[51] = hc_byte_perm (w[ 9], w[10], selector); + w[50] = hc_byte_perm (w[ 8], w[ 9], selector); + w[49] = hc_byte_perm (w[ 7], w[ 8], selector); + w[48] = hc_byte_perm (w[ 6], w[ 7], selector); + w[47] = hc_byte_perm (w[ 5], w[ 6], selector); + w[46] = hc_byte_perm (w[ 4], w[ 5], selector); + w[45] = hc_byte_perm (w[ 3], w[ 4], selector); + w[44] = hc_byte_perm (w[ 2], w[ 3], selector); + w[43] = hc_byte_perm (w[ 1], w[ 2], selector); + w[42] = hc_byte_perm (w[ 0], w[ 1], selector); + w[41] = hc_byte_perm ( 0, w[ 0], selector); w[40] = 0; w[39] = 0; w[38] = 0; @@ -18892,28 +18892,28 @@ DECLSPEC void switch_buffer_by_offset_1x64_le (u32x *w, const u32 offset) break; case 42: - w[63] = __byte_perm (w[20], w[21], selector); - w[62] = __byte_perm (w[19], w[20], selector); - w[61] = __byte_perm (w[18], w[19], selector); - w[60] = __byte_perm (w[17], w[18], selector); - w[59] = __byte_perm (w[16], w[17], selector); - w[58] = __byte_perm (w[15], w[16], selector); - w[57] = __byte_perm (w[14], w[15], selector); - w[56] = __byte_perm (w[13], w[14], selector); - w[55] = __byte_perm (w[12], w[13], selector); - w[54] = __byte_perm (w[11], w[12], selector); - w[53] = __byte_perm (w[10], w[11], selector); - w[52] = __byte_perm (w[ 9], w[10], selector); - w[51] = __byte_perm (w[ 8], w[ 9], selector); - w[50] = __byte_perm (w[ 7], w[ 8], selector); - w[49] = __byte_perm (w[ 6], w[ 7], selector); - w[48] = __byte_perm (w[ 5], w[ 6], selector); - w[47] = __byte_perm (w[ 4], w[ 5], selector); - w[46] = __byte_perm (w[ 3], w[ 4], selector); - w[45] = __byte_perm (w[ 2], w[ 3], selector); - w[44] = __byte_perm (w[ 1], w[ 2], selector); - w[43] = __byte_perm (w[ 0], w[ 1], selector); - w[42] = __byte_perm ( 0, w[ 0], selector); + w[63] = hc_byte_perm (w[20], w[21], selector); + w[62] = hc_byte_perm (w[19], w[20], selector); + w[61] = hc_byte_perm (w[18], w[19], selector); + w[60] = hc_byte_perm (w[17], w[18], selector); + w[59] = hc_byte_perm (w[16], w[17], selector); + w[58] = hc_byte_perm (w[15], w[16], selector); + w[57] = hc_byte_perm (w[14], w[15], selector); + w[56] = hc_byte_perm (w[13], w[14], selector); + w[55] = hc_byte_perm (w[12], w[13], selector); + w[54] = hc_byte_perm (w[11], w[12], selector); + w[53] = hc_byte_perm (w[10], w[11], selector); + w[52] = hc_byte_perm (w[ 9], w[10], selector); + w[51] = hc_byte_perm (w[ 8], w[ 9], selector); + w[50] = hc_byte_perm (w[ 7], w[ 8], selector); + w[49] = hc_byte_perm (w[ 6], w[ 7], selector); + w[48] = hc_byte_perm (w[ 5], w[ 6], selector); + w[47] = hc_byte_perm (w[ 4], w[ 5], selector); + w[46] = hc_byte_perm (w[ 3], w[ 4], selector); + w[45] = hc_byte_perm (w[ 2], w[ 3], selector); + w[44] = hc_byte_perm (w[ 1], w[ 2], selector); + w[43] = hc_byte_perm (w[ 0], w[ 1], selector); + w[42] = hc_byte_perm ( 0, w[ 0], selector); w[41] = 0; w[40] = 0; w[39] = 0; @@ -18960,27 +18960,27 @@ DECLSPEC void switch_buffer_by_offset_1x64_le (u32x *w, const u32 offset) break; case 43: - w[63] = __byte_perm (w[19], w[20], selector); - w[62] = __byte_perm (w[18], w[19], selector); - w[61] = __byte_perm (w[17], w[18], selector); - w[60] = __byte_perm (w[16], w[17], selector); - w[59] = __byte_perm (w[15], w[16], selector); - w[58] = __byte_perm (w[14], w[15], selector); - w[57] = __byte_perm (w[13], w[14], selector); - w[56] = __byte_perm (w[12], w[13], selector); - w[55] = __byte_perm (w[11], w[12], selector); - w[54] = __byte_perm (w[10], w[11], selector); - w[53] = __byte_perm (w[ 9], w[10], selector); - w[52] = __byte_perm (w[ 8], w[ 9], selector); - w[51] = __byte_perm (w[ 7], w[ 8], selector); - w[50] = __byte_perm (w[ 6], w[ 7], selector); - w[49] = __byte_perm (w[ 5], w[ 6], selector); - w[48] = __byte_perm (w[ 4], w[ 5], selector); - w[47] = __byte_perm (w[ 3], w[ 4], selector); - w[46] = __byte_perm (w[ 2], w[ 3], selector); - w[45] = __byte_perm (w[ 1], w[ 2], selector); - w[44] = __byte_perm (w[ 0], w[ 1], selector); - w[43] = __byte_perm ( 0, w[ 0], selector); + w[63] = hc_byte_perm (w[19], w[20], selector); + w[62] = hc_byte_perm (w[18], w[19], selector); + w[61] = hc_byte_perm (w[17], w[18], selector); + w[60] = hc_byte_perm (w[16], w[17], selector); + w[59] = hc_byte_perm (w[15], w[16], selector); + w[58] = hc_byte_perm (w[14], w[15], selector); + w[57] = hc_byte_perm (w[13], w[14], selector); + w[56] = hc_byte_perm (w[12], w[13], selector); + w[55] = hc_byte_perm (w[11], w[12], selector); + w[54] = hc_byte_perm (w[10], w[11], selector); + w[53] = hc_byte_perm (w[ 9], w[10], selector); + w[52] = hc_byte_perm (w[ 8], w[ 9], selector); + w[51] = hc_byte_perm (w[ 7], w[ 8], selector); + w[50] = hc_byte_perm (w[ 6], w[ 7], selector); + w[49] = hc_byte_perm (w[ 5], w[ 6], selector); + w[48] = hc_byte_perm (w[ 4], w[ 5], selector); + w[47] = hc_byte_perm (w[ 3], w[ 4], selector); + w[46] = hc_byte_perm (w[ 2], w[ 3], selector); + w[45] = hc_byte_perm (w[ 1], w[ 2], selector); + w[44] = hc_byte_perm (w[ 0], w[ 1], selector); + w[43] = hc_byte_perm ( 0, w[ 0], selector); w[42] = 0; w[41] = 0; w[40] = 0; @@ -19028,26 +19028,26 @@ DECLSPEC void switch_buffer_by_offset_1x64_le (u32x *w, const u32 offset) break; case 44: - w[63] = __byte_perm (w[18], w[19], selector); - w[62] = __byte_perm (w[17], w[18], selector); - w[61] = __byte_perm (w[16], w[17], selector); - w[60] = __byte_perm (w[15], w[16], selector); - w[59] = __byte_perm (w[14], w[15], selector); - w[58] = __byte_perm (w[13], w[14], selector); - w[57] = __byte_perm (w[12], w[13], selector); - w[56] = __byte_perm (w[11], w[12], selector); - w[55] = __byte_perm (w[10], w[11], selector); - w[54] = __byte_perm (w[ 9], w[10], selector); - w[53] = __byte_perm (w[ 8], w[ 9], selector); - w[52] = __byte_perm (w[ 7], w[ 8], selector); - w[51] = __byte_perm (w[ 6], w[ 7], selector); - w[50] = __byte_perm (w[ 5], w[ 6], selector); - w[49] = __byte_perm (w[ 4], w[ 5], selector); - w[48] = __byte_perm (w[ 3], w[ 4], selector); - w[47] = __byte_perm (w[ 2], w[ 3], selector); - w[46] = __byte_perm (w[ 1], w[ 2], selector); - w[45] = __byte_perm (w[ 0], w[ 1], selector); - w[44] = __byte_perm ( 0, w[ 0], selector); + w[63] = hc_byte_perm (w[18], w[19], selector); + w[62] = hc_byte_perm (w[17], w[18], selector); + w[61] = hc_byte_perm (w[16], w[17], selector); + w[60] = hc_byte_perm (w[15], w[16], selector); + w[59] = hc_byte_perm (w[14], w[15], selector); + w[58] = hc_byte_perm (w[13], w[14], selector); + w[57] = hc_byte_perm (w[12], w[13], selector); + w[56] = hc_byte_perm (w[11], w[12], selector); + w[55] = hc_byte_perm (w[10], w[11], selector); + w[54] = hc_byte_perm (w[ 9], w[10], selector); + w[53] = hc_byte_perm (w[ 8], w[ 9], selector); + w[52] = hc_byte_perm (w[ 7], w[ 8], selector); + w[51] = hc_byte_perm (w[ 6], w[ 7], selector); + w[50] = hc_byte_perm (w[ 5], w[ 6], selector); + w[49] = hc_byte_perm (w[ 4], w[ 5], selector); + w[48] = hc_byte_perm (w[ 3], w[ 4], selector); + w[47] = hc_byte_perm (w[ 2], w[ 3], selector); + w[46] = hc_byte_perm (w[ 1], w[ 2], selector); + w[45] = hc_byte_perm (w[ 0], w[ 1], selector); + w[44] = hc_byte_perm ( 0, w[ 0], selector); w[43] = 0; w[42] = 0; w[41] = 0; @@ -19096,25 +19096,25 @@ DECLSPEC void switch_buffer_by_offset_1x64_le (u32x *w, const u32 offset) break; case 45: - w[63] = __byte_perm (w[17], w[18], selector); - w[62] = __byte_perm (w[16], w[17], selector); - w[61] = __byte_perm (w[15], w[16], selector); - w[60] = __byte_perm (w[14], w[15], selector); - w[59] = __byte_perm (w[13], w[14], selector); - w[58] = __byte_perm (w[12], w[13], selector); - w[57] = __byte_perm (w[11], w[12], selector); - w[56] = __byte_perm (w[10], w[11], selector); - w[55] = __byte_perm (w[ 9], w[10], selector); - w[54] = __byte_perm (w[ 8], w[ 9], selector); - w[53] = __byte_perm (w[ 7], w[ 8], selector); - w[52] = __byte_perm (w[ 6], w[ 7], selector); - w[51] = __byte_perm (w[ 5], w[ 6], selector); - w[50] = __byte_perm (w[ 4], w[ 5], selector); - w[49] = __byte_perm (w[ 3], w[ 4], selector); - w[48] = __byte_perm (w[ 2], w[ 3], selector); - w[47] = __byte_perm (w[ 1], w[ 2], selector); - w[46] = __byte_perm (w[ 0], w[ 1], selector); - w[45] = __byte_perm ( 0, w[ 0], selector); + w[63] = hc_byte_perm (w[17], w[18], selector); + w[62] = hc_byte_perm (w[16], w[17], selector); + w[61] = hc_byte_perm (w[15], w[16], selector); + w[60] = hc_byte_perm (w[14], w[15], selector); + w[59] = hc_byte_perm (w[13], w[14], selector); + w[58] = hc_byte_perm (w[12], w[13], selector); + w[57] = hc_byte_perm (w[11], w[12], selector); + w[56] = hc_byte_perm (w[10], w[11], selector); + w[55] = hc_byte_perm (w[ 9], w[10], selector); + w[54] = hc_byte_perm (w[ 8], w[ 9], selector); + w[53] = hc_byte_perm (w[ 7], w[ 8], selector); + w[52] = hc_byte_perm (w[ 6], w[ 7], selector); + w[51] = hc_byte_perm (w[ 5], w[ 6], selector); + w[50] = hc_byte_perm (w[ 4], w[ 5], selector); + w[49] = hc_byte_perm (w[ 3], w[ 4], selector); + w[48] = hc_byte_perm (w[ 2], w[ 3], selector); + w[47] = hc_byte_perm (w[ 1], w[ 2], selector); + w[46] = hc_byte_perm (w[ 0], w[ 1], selector); + w[45] = hc_byte_perm ( 0, w[ 0], selector); w[44] = 0; w[43] = 0; w[42] = 0; @@ -19164,24 +19164,24 @@ DECLSPEC void switch_buffer_by_offset_1x64_le (u32x *w, const u32 offset) break; case 46: - w[63] = __byte_perm (w[16], w[17], selector); - w[62] = __byte_perm (w[15], w[16], selector); - w[61] = __byte_perm (w[14], w[15], selector); - w[60] = __byte_perm (w[13], w[14], selector); - w[59] = __byte_perm (w[12], w[13], selector); - w[58] = __byte_perm (w[11], w[12], selector); - w[57] = __byte_perm (w[10], w[11], selector); - w[56] = __byte_perm (w[ 9], w[10], selector); - w[55] = __byte_perm (w[ 8], w[ 9], selector); - w[54] = __byte_perm (w[ 7], w[ 8], selector); - w[53] = __byte_perm (w[ 6], w[ 7], selector); - w[52] = __byte_perm (w[ 5], w[ 6], selector); - w[51] = __byte_perm (w[ 4], w[ 5], selector); - w[50] = __byte_perm (w[ 3], w[ 4], selector); - w[49] = __byte_perm (w[ 2], w[ 3], selector); - w[48] = __byte_perm (w[ 1], w[ 2], selector); - w[47] = __byte_perm (w[ 0], w[ 1], selector); - w[46] = __byte_perm ( 0, w[ 0], selector); + w[63] = hc_byte_perm (w[16], w[17], selector); + w[62] = hc_byte_perm (w[15], w[16], selector); + w[61] = hc_byte_perm (w[14], w[15], selector); + w[60] = hc_byte_perm (w[13], w[14], selector); + w[59] = hc_byte_perm (w[12], w[13], selector); + w[58] = hc_byte_perm (w[11], w[12], selector); + w[57] = hc_byte_perm (w[10], w[11], selector); + w[56] = hc_byte_perm (w[ 9], w[10], selector); + w[55] = hc_byte_perm (w[ 8], w[ 9], selector); + w[54] = hc_byte_perm (w[ 7], w[ 8], selector); + w[53] = hc_byte_perm (w[ 6], w[ 7], selector); + w[52] = hc_byte_perm (w[ 5], w[ 6], selector); + w[51] = hc_byte_perm (w[ 4], w[ 5], selector); + w[50] = hc_byte_perm (w[ 3], w[ 4], selector); + w[49] = hc_byte_perm (w[ 2], w[ 3], selector); + w[48] = hc_byte_perm (w[ 1], w[ 2], selector); + w[47] = hc_byte_perm (w[ 0], w[ 1], selector); + w[46] = hc_byte_perm ( 0, w[ 0], selector); w[45] = 0; w[44] = 0; w[43] = 0; @@ -19232,23 +19232,23 @@ DECLSPEC void switch_buffer_by_offset_1x64_le (u32x *w, const u32 offset) break; case 47: - w[63] = __byte_perm (w[15], w[16], selector); - w[62] = __byte_perm (w[14], w[15], selector); - w[61] = __byte_perm (w[13], w[14], selector); - w[60] = __byte_perm (w[12], w[13], selector); - w[59] = __byte_perm (w[11], w[12], selector); - w[58] = __byte_perm (w[10], w[11], selector); - w[57] = __byte_perm (w[ 9], w[10], selector); - w[56] = __byte_perm (w[ 8], w[ 9], selector); - w[55] = __byte_perm (w[ 7], w[ 8], selector); - w[54] = __byte_perm (w[ 6], w[ 7], selector); - w[53] = __byte_perm (w[ 5], w[ 6], selector); - w[52] = __byte_perm (w[ 4], w[ 5], selector); - w[51] = __byte_perm (w[ 3], w[ 4], selector); - w[50] = __byte_perm (w[ 2], w[ 3], selector); - w[49] = __byte_perm (w[ 1], w[ 2], selector); - w[48] = __byte_perm (w[ 0], w[ 1], selector); - w[47] = __byte_perm ( 0, w[ 0], selector); + w[63] = hc_byte_perm (w[15], w[16], selector); + w[62] = hc_byte_perm (w[14], w[15], selector); + w[61] = hc_byte_perm (w[13], w[14], selector); + w[60] = hc_byte_perm (w[12], w[13], selector); + w[59] = hc_byte_perm (w[11], w[12], selector); + w[58] = hc_byte_perm (w[10], w[11], selector); + w[57] = hc_byte_perm (w[ 9], w[10], selector); + w[56] = hc_byte_perm (w[ 8], w[ 9], selector); + w[55] = hc_byte_perm (w[ 7], w[ 8], selector); + w[54] = hc_byte_perm (w[ 6], w[ 7], selector); + w[53] = hc_byte_perm (w[ 5], w[ 6], selector); + w[52] = hc_byte_perm (w[ 4], w[ 5], selector); + w[51] = hc_byte_perm (w[ 3], w[ 4], selector); + w[50] = hc_byte_perm (w[ 2], w[ 3], selector); + w[49] = hc_byte_perm (w[ 1], w[ 2], selector); + w[48] = hc_byte_perm (w[ 0], w[ 1], selector); + w[47] = hc_byte_perm ( 0, w[ 0], selector); w[46] = 0; w[45] = 0; w[44] = 0; @@ -19300,22 +19300,22 @@ DECLSPEC void switch_buffer_by_offset_1x64_le (u32x *w, const u32 offset) break; case 48: - w[63] = __byte_perm (w[14], w[15], selector); - w[62] = __byte_perm (w[13], w[14], selector); - w[61] = __byte_perm (w[12], w[13], selector); - w[60] = __byte_perm (w[11], w[12], selector); - w[59] = __byte_perm (w[10], w[11], selector); - w[58] = __byte_perm (w[ 9], w[10], selector); - w[57] = __byte_perm (w[ 8], w[ 9], selector); - w[56] = __byte_perm (w[ 7], w[ 8], selector); - w[55] = __byte_perm (w[ 6], w[ 7], selector); - w[54] = __byte_perm (w[ 5], w[ 6], selector); - w[53] = __byte_perm (w[ 4], w[ 5], selector); - w[52] = __byte_perm (w[ 3], w[ 4], selector); - w[51] = __byte_perm (w[ 2], w[ 3], selector); - w[50] = __byte_perm (w[ 1], w[ 2], selector); - w[49] = __byte_perm (w[ 0], w[ 1], selector); - w[48] = __byte_perm ( 0, w[ 0], selector); + w[63] = hc_byte_perm (w[14], w[15], selector); + w[62] = hc_byte_perm (w[13], w[14], selector); + w[61] = hc_byte_perm (w[12], w[13], selector); + w[60] = hc_byte_perm (w[11], w[12], selector); + w[59] = hc_byte_perm (w[10], w[11], selector); + w[58] = hc_byte_perm (w[ 9], w[10], selector); + w[57] = hc_byte_perm (w[ 8], w[ 9], selector); + w[56] = hc_byte_perm (w[ 7], w[ 8], selector); + w[55] = hc_byte_perm (w[ 6], w[ 7], selector); + w[54] = hc_byte_perm (w[ 5], w[ 6], selector); + w[53] = hc_byte_perm (w[ 4], w[ 5], selector); + w[52] = hc_byte_perm (w[ 3], w[ 4], selector); + w[51] = hc_byte_perm (w[ 2], w[ 3], selector); + w[50] = hc_byte_perm (w[ 1], w[ 2], selector); + w[49] = hc_byte_perm (w[ 0], w[ 1], selector); + w[48] = hc_byte_perm ( 0, w[ 0], selector); w[47] = 0; w[46] = 0; w[45] = 0; @@ -19368,21 +19368,21 @@ DECLSPEC void switch_buffer_by_offset_1x64_le (u32x *w, const u32 offset) break; case 49: - w[63] = __byte_perm (w[13], w[14], selector); - w[62] = __byte_perm (w[12], w[13], selector); - w[61] = __byte_perm (w[11], w[12], selector); - w[60] = __byte_perm (w[10], w[11], selector); - w[59] = __byte_perm (w[ 9], w[10], selector); - w[58] = __byte_perm (w[ 8], w[ 9], selector); - w[57] = __byte_perm (w[ 7], w[ 8], selector); - w[56] = __byte_perm (w[ 6], w[ 7], selector); - w[55] = __byte_perm (w[ 5], w[ 6], selector); - w[54] = __byte_perm (w[ 4], w[ 5], selector); - w[53] = __byte_perm (w[ 3], w[ 4], selector); - w[52] = __byte_perm (w[ 2], w[ 3], selector); - w[51] = __byte_perm (w[ 1], w[ 2], selector); - w[50] = __byte_perm (w[ 0], w[ 1], selector); - w[49] = __byte_perm ( 0, w[ 0], selector); + w[63] = hc_byte_perm (w[13], w[14], selector); + w[62] = hc_byte_perm (w[12], w[13], selector); + w[61] = hc_byte_perm (w[11], w[12], selector); + w[60] = hc_byte_perm (w[10], w[11], selector); + w[59] = hc_byte_perm (w[ 9], w[10], selector); + w[58] = hc_byte_perm (w[ 8], w[ 9], selector); + w[57] = hc_byte_perm (w[ 7], w[ 8], selector); + w[56] = hc_byte_perm (w[ 6], w[ 7], selector); + w[55] = hc_byte_perm (w[ 5], w[ 6], selector); + w[54] = hc_byte_perm (w[ 4], w[ 5], selector); + w[53] = hc_byte_perm (w[ 3], w[ 4], selector); + w[52] = hc_byte_perm (w[ 2], w[ 3], selector); + w[51] = hc_byte_perm (w[ 1], w[ 2], selector); + w[50] = hc_byte_perm (w[ 0], w[ 1], selector); + w[49] = hc_byte_perm ( 0, w[ 0], selector); w[48] = 0; w[47] = 0; w[46] = 0; @@ -19436,20 +19436,20 @@ DECLSPEC void switch_buffer_by_offset_1x64_le (u32x *w, const u32 offset) break; case 50: - w[63] = __byte_perm (w[12], w[13], selector); - w[62] = __byte_perm (w[11], w[12], selector); - w[61] = __byte_perm (w[10], w[11], selector); - w[60] = __byte_perm (w[ 9], w[10], selector); - w[59] = __byte_perm (w[ 8], w[ 9], selector); - w[58] = __byte_perm (w[ 7], w[ 8], selector); - w[57] = __byte_perm (w[ 6], w[ 7], selector); - w[56] = __byte_perm (w[ 5], w[ 6], selector); - w[55] = __byte_perm (w[ 4], w[ 5], selector); - w[54] = __byte_perm (w[ 3], w[ 4], selector); - w[53] = __byte_perm (w[ 2], w[ 3], selector); - w[52] = __byte_perm (w[ 1], w[ 2], selector); - w[51] = __byte_perm (w[ 0], w[ 1], selector); - w[50] = __byte_perm ( 0, w[ 0], selector); + w[63] = hc_byte_perm (w[12], w[13], selector); + w[62] = hc_byte_perm (w[11], w[12], selector); + w[61] = hc_byte_perm (w[10], w[11], selector); + w[60] = hc_byte_perm (w[ 9], w[10], selector); + w[59] = hc_byte_perm (w[ 8], w[ 9], selector); + w[58] = hc_byte_perm (w[ 7], w[ 8], selector); + w[57] = hc_byte_perm (w[ 6], w[ 7], selector); + w[56] = hc_byte_perm (w[ 5], w[ 6], selector); + w[55] = hc_byte_perm (w[ 4], w[ 5], selector); + w[54] = hc_byte_perm (w[ 3], w[ 4], selector); + w[53] = hc_byte_perm (w[ 2], w[ 3], selector); + w[52] = hc_byte_perm (w[ 1], w[ 2], selector); + w[51] = hc_byte_perm (w[ 0], w[ 1], selector); + w[50] = hc_byte_perm ( 0, w[ 0], selector); w[49] = 0; w[48] = 0; w[47] = 0; @@ -19504,19 +19504,19 @@ DECLSPEC void switch_buffer_by_offset_1x64_le (u32x *w, const u32 offset) break; case 51: - w[63] = __byte_perm (w[11], w[12], selector); - w[62] = __byte_perm (w[10], w[11], selector); - w[61] = __byte_perm (w[ 9], w[10], selector); - w[60] = __byte_perm (w[ 8], w[ 9], selector); - w[59] = __byte_perm (w[ 7], w[ 8], selector); - w[58] = __byte_perm (w[ 6], w[ 7], selector); - w[57] = __byte_perm (w[ 5], w[ 6], selector); - w[56] = __byte_perm (w[ 4], w[ 5], selector); - w[55] = __byte_perm (w[ 3], w[ 4], selector); - w[54] = __byte_perm (w[ 2], w[ 3], selector); - w[53] = __byte_perm (w[ 1], w[ 2], selector); - w[52] = __byte_perm (w[ 0], w[ 1], selector); - w[51] = __byte_perm ( 0, w[ 0], selector); + w[63] = hc_byte_perm (w[11], w[12], selector); + w[62] = hc_byte_perm (w[10], w[11], selector); + w[61] = hc_byte_perm (w[ 9], w[10], selector); + w[60] = hc_byte_perm (w[ 8], w[ 9], selector); + w[59] = hc_byte_perm (w[ 7], w[ 8], selector); + w[58] = hc_byte_perm (w[ 6], w[ 7], selector); + w[57] = hc_byte_perm (w[ 5], w[ 6], selector); + w[56] = hc_byte_perm (w[ 4], w[ 5], selector); + w[55] = hc_byte_perm (w[ 3], w[ 4], selector); + w[54] = hc_byte_perm (w[ 2], w[ 3], selector); + w[53] = hc_byte_perm (w[ 1], w[ 2], selector); + w[52] = hc_byte_perm (w[ 0], w[ 1], selector); + w[51] = hc_byte_perm ( 0, w[ 0], selector); w[50] = 0; w[49] = 0; w[48] = 0; @@ -19572,18 +19572,18 @@ DECLSPEC void switch_buffer_by_offset_1x64_le (u32x *w, const u32 offset) break; case 52: - w[63] = __byte_perm (w[10], w[11], selector); - w[62] = __byte_perm (w[ 9], w[10], selector); - w[61] = __byte_perm (w[ 8], w[ 9], selector); - w[60] = __byte_perm (w[ 7], w[ 8], selector); - w[59] = __byte_perm (w[ 6], w[ 7], selector); - w[58] = __byte_perm (w[ 5], w[ 6], selector); - w[57] = __byte_perm (w[ 4], w[ 5], selector); - w[56] = __byte_perm (w[ 3], w[ 4], selector); - w[55] = __byte_perm (w[ 2], w[ 3], selector); - w[54] = __byte_perm (w[ 1], w[ 2], selector); - w[53] = __byte_perm (w[ 0], w[ 1], selector); - w[52] = __byte_perm ( 0, w[ 0], selector); + w[63] = hc_byte_perm (w[10], w[11], selector); + w[62] = hc_byte_perm (w[ 9], w[10], selector); + w[61] = hc_byte_perm (w[ 8], w[ 9], selector); + w[60] = hc_byte_perm (w[ 7], w[ 8], selector); + w[59] = hc_byte_perm (w[ 6], w[ 7], selector); + w[58] = hc_byte_perm (w[ 5], w[ 6], selector); + w[57] = hc_byte_perm (w[ 4], w[ 5], selector); + w[56] = hc_byte_perm (w[ 3], w[ 4], selector); + w[55] = hc_byte_perm (w[ 2], w[ 3], selector); + w[54] = hc_byte_perm (w[ 1], w[ 2], selector); + w[53] = hc_byte_perm (w[ 0], w[ 1], selector); + w[52] = hc_byte_perm ( 0, w[ 0], selector); w[51] = 0; w[50] = 0; w[49] = 0; @@ -19640,17 +19640,17 @@ DECLSPEC void switch_buffer_by_offset_1x64_le (u32x *w, const u32 offset) break; case 53: - w[63] = __byte_perm (w[ 9], w[10], selector); - w[62] = __byte_perm (w[ 8], w[ 9], selector); - w[61] = __byte_perm (w[ 7], w[ 8], selector); - w[60] = __byte_perm (w[ 6], w[ 7], selector); - w[59] = __byte_perm (w[ 5], w[ 6], selector); - w[58] = __byte_perm (w[ 4], w[ 5], selector); - w[57] = __byte_perm (w[ 3], w[ 4], selector); - w[56] = __byte_perm (w[ 2], w[ 3], selector); - w[55] = __byte_perm (w[ 1], w[ 2], selector); - w[54] = __byte_perm (w[ 0], w[ 1], selector); - w[53] = __byte_perm ( 0, w[ 0], selector); + w[63] = hc_byte_perm (w[ 9], w[10], selector); + w[62] = hc_byte_perm (w[ 8], w[ 9], selector); + w[61] = hc_byte_perm (w[ 7], w[ 8], selector); + w[60] = hc_byte_perm (w[ 6], w[ 7], selector); + w[59] = hc_byte_perm (w[ 5], w[ 6], selector); + w[58] = hc_byte_perm (w[ 4], w[ 5], selector); + w[57] = hc_byte_perm (w[ 3], w[ 4], selector); + w[56] = hc_byte_perm (w[ 2], w[ 3], selector); + w[55] = hc_byte_perm (w[ 1], w[ 2], selector); + w[54] = hc_byte_perm (w[ 0], w[ 1], selector); + w[53] = hc_byte_perm ( 0, w[ 0], selector); w[52] = 0; w[51] = 0; w[50] = 0; @@ -19708,16 +19708,16 @@ DECLSPEC void switch_buffer_by_offset_1x64_le (u32x *w, const u32 offset) break; case 54: - w[63] = __byte_perm (w[ 8], w[ 9], selector); - w[62] = __byte_perm (w[ 7], w[ 8], selector); - w[61] = __byte_perm (w[ 6], w[ 7], selector); - w[60] = __byte_perm (w[ 5], w[ 6], selector); - w[59] = __byte_perm (w[ 4], w[ 5], selector); - w[58] = __byte_perm (w[ 3], w[ 4], selector); - w[57] = __byte_perm (w[ 2], w[ 3], selector); - w[56] = __byte_perm (w[ 1], w[ 2], selector); - w[55] = __byte_perm (w[ 0], w[ 1], selector); - w[54] = __byte_perm ( 0, w[ 0], selector); + w[63] = hc_byte_perm (w[ 8], w[ 9], selector); + w[62] = hc_byte_perm (w[ 7], w[ 8], selector); + w[61] = hc_byte_perm (w[ 6], w[ 7], selector); + w[60] = hc_byte_perm (w[ 5], w[ 6], selector); + w[59] = hc_byte_perm (w[ 4], w[ 5], selector); + w[58] = hc_byte_perm (w[ 3], w[ 4], selector); + w[57] = hc_byte_perm (w[ 2], w[ 3], selector); + w[56] = hc_byte_perm (w[ 1], w[ 2], selector); + w[55] = hc_byte_perm (w[ 0], w[ 1], selector); + w[54] = hc_byte_perm ( 0, w[ 0], selector); w[53] = 0; w[52] = 0; w[51] = 0; @@ -19776,15 +19776,15 @@ DECLSPEC void switch_buffer_by_offset_1x64_le (u32x *w, const u32 offset) break; case 55: - w[63] = __byte_perm (w[ 7], w[ 8], selector); - w[62] = __byte_perm (w[ 6], w[ 7], selector); - w[61] = __byte_perm (w[ 5], w[ 6], selector); - w[60] = __byte_perm (w[ 4], w[ 5], selector); - w[59] = __byte_perm (w[ 3], w[ 4], selector); - w[58] = __byte_perm (w[ 2], w[ 3], selector); - w[57] = __byte_perm (w[ 1], w[ 2], selector); - w[56] = __byte_perm (w[ 0], w[ 1], selector); - w[55] = __byte_perm ( 0, w[ 0], selector); + w[63] = hc_byte_perm (w[ 7], w[ 8], selector); + w[62] = hc_byte_perm (w[ 6], w[ 7], selector); + w[61] = hc_byte_perm (w[ 5], w[ 6], selector); + w[60] = hc_byte_perm (w[ 4], w[ 5], selector); + w[59] = hc_byte_perm (w[ 3], w[ 4], selector); + w[58] = hc_byte_perm (w[ 2], w[ 3], selector); + w[57] = hc_byte_perm (w[ 1], w[ 2], selector); + w[56] = hc_byte_perm (w[ 0], w[ 1], selector); + w[55] = hc_byte_perm ( 0, w[ 0], selector); w[54] = 0; w[53] = 0; w[52] = 0; @@ -19844,14 +19844,14 @@ DECLSPEC void switch_buffer_by_offset_1x64_le (u32x *w, const u32 offset) break; case 56: - w[63] = __byte_perm (w[ 6], w[ 7], selector); - w[62] = __byte_perm (w[ 5], w[ 6], selector); - w[61] = __byte_perm (w[ 4], w[ 5], selector); - w[60] = __byte_perm (w[ 3], w[ 4], selector); - w[59] = __byte_perm (w[ 2], w[ 3], selector); - w[58] = __byte_perm (w[ 1], w[ 2], selector); - w[57] = __byte_perm (w[ 0], w[ 1], selector); - w[56] = __byte_perm ( 0, w[ 0], selector); + w[63] = hc_byte_perm (w[ 6], w[ 7], selector); + w[62] = hc_byte_perm (w[ 5], w[ 6], selector); + w[61] = hc_byte_perm (w[ 4], w[ 5], selector); + w[60] = hc_byte_perm (w[ 3], w[ 4], selector); + w[59] = hc_byte_perm (w[ 2], w[ 3], selector); + w[58] = hc_byte_perm (w[ 1], w[ 2], selector); + w[57] = hc_byte_perm (w[ 0], w[ 1], selector); + w[56] = hc_byte_perm ( 0, w[ 0], selector); w[55] = 0; w[54] = 0; w[53] = 0; @@ -19912,13 +19912,13 @@ DECLSPEC void switch_buffer_by_offset_1x64_le (u32x *w, const u32 offset) break; case 57: - w[63] = __byte_perm (w[ 5], w[ 6], selector); - w[62] = __byte_perm (w[ 4], w[ 5], selector); - w[61] = __byte_perm (w[ 3], w[ 4], selector); - w[60] = __byte_perm (w[ 2], w[ 3], selector); - w[59] = __byte_perm (w[ 1], w[ 2], selector); - w[58] = __byte_perm (w[ 0], w[ 1], selector); - w[57] = __byte_perm ( 0, w[ 0], selector); + w[63] = hc_byte_perm (w[ 5], w[ 6], selector); + w[62] = hc_byte_perm (w[ 4], w[ 5], selector); + w[61] = hc_byte_perm (w[ 3], w[ 4], selector); + w[60] = hc_byte_perm (w[ 2], w[ 3], selector); + w[59] = hc_byte_perm (w[ 1], w[ 2], selector); + w[58] = hc_byte_perm (w[ 0], w[ 1], selector); + w[57] = hc_byte_perm ( 0, w[ 0], selector); w[56] = 0; w[55] = 0; w[54] = 0; @@ -19980,12 +19980,12 @@ DECLSPEC void switch_buffer_by_offset_1x64_le (u32x *w, const u32 offset) break; case 58: - w[63] = __byte_perm (w[ 4], w[ 5], selector); - w[62] = __byte_perm (w[ 3], w[ 4], selector); - w[61] = __byte_perm (w[ 2], w[ 3], selector); - w[60] = __byte_perm (w[ 1], w[ 2], selector); - w[59] = __byte_perm (w[ 0], w[ 1], selector); - w[58] = __byte_perm ( 0, w[ 0], selector); + w[63] = hc_byte_perm (w[ 4], w[ 5], selector); + w[62] = hc_byte_perm (w[ 3], w[ 4], selector); + w[61] = hc_byte_perm (w[ 2], w[ 3], selector); + w[60] = hc_byte_perm (w[ 1], w[ 2], selector); + w[59] = hc_byte_perm (w[ 0], w[ 1], selector); + w[58] = hc_byte_perm ( 0, w[ 0], selector); w[57] = 0; w[56] = 0; w[55] = 0; @@ -20048,11 +20048,11 @@ DECLSPEC void switch_buffer_by_offset_1x64_le (u32x *w, const u32 offset) break; case 59: - w[63] = __byte_perm (w[ 3], w[ 4], selector); - w[62] = __byte_perm (w[ 2], w[ 3], selector); - w[61] = __byte_perm (w[ 1], w[ 2], selector); - w[60] = __byte_perm (w[ 0], w[ 1], selector); - w[59] = __byte_perm ( 0, w[ 0], selector); + w[63] = hc_byte_perm (w[ 3], w[ 4], selector); + w[62] = hc_byte_perm (w[ 2], w[ 3], selector); + w[61] = hc_byte_perm (w[ 1], w[ 2], selector); + w[60] = hc_byte_perm (w[ 0], w[ 1], selector); + w[59] = hc_byte_perm ( 0, w[ 0], selector); w[58] = 0; w[57] = 0; w[56] = 0; @@ -20116,10 +20116,10 @@ DECLSPEC void switch_buffer_by_offset_1x64_le (u32x *w, const u32 offset) break; case 60: - w[63] = __byte_perm (w[ 2], w[ 3], selector); - w[62] = __byte_perm (w[ 1], w[ 2], selector); - w[61] = __byte_perm (w[ 0], w[ 1], selector); - w[60] = __byte_perm ( 0, w[ 0], selector); + w[63] = hc_byte_perm (w[ 2], w[ 3], selector); + w[62] = hc_byte_perm (w[ 1], w[ 2], selector); + w[61] = hc_byte_perm (w[ 0], w[ 1], selector); + w[60] = hc_byte_perm ( 0, w[ 0], selector); w[59] = 0; w[58] = 0; w[57] = 0; @@ -20184,9 +20184,9 @@ DECLSPEC void switch_buffer_by_offset_1x64_le (u32x *w, const u32 offset) break; case 61: - w[63] = __byte_perm (w[ 1], w[ 2], selector); - w[62] = __byte_perm (w[ 0], w[ 1], selector); - w[61] = __byte_perm ( 0, w[ 0], selector); + w[63] = hc_byte_perm (w[ 1], w[ 2], selector); + w[62] = hc_byte_perm (w[ 0], w[ 1], selector); + w[61] = hc_byte_perm ( 0, w[ 0], selector); w[60] = 0; w[59] = 0; w[58] = 0; @@ -20252,8 +20252,8 @@ DECLSPEC void switch_buffer_by_offset_1x64_le (u32x *w, const u32 offset) break; case 62: - w[63] = __byte_perm (w[ 0], w[ 1], selector); - w[62] = __byte_perm ( 0, w[ 0], selector); + w[63] = hc_byte_perm (w[ 0], w[ 1], selector); + w[62] = hc_byte_perm ( 0, w[ 0], selector); w[61] = 0; w[60] = 0; w[59] = 0; @@ -20320,7 +20320,7 @@ DECLSPEC void switch_buffer_by_offset_1x64_le (u32x *w, const u32 offset) break; case 63: - w[63] = __byte_perm ( 0, w[ 0], selector); + w[63] = hc_byte_perm ( 0, w[ 0], selector); w[62] = 0; w[61] = 0; w[60] = 0; @@ -20398,271 +20398,271 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) switch (offset_switch) { case 0: - w[63] = amd_bytealign (w[62], w[63], offset); - w[62] = amd_bytealign (w[61], w[62], offset); - w[61] = amd_bytealign (w[60], w[61], offset); - w[60] = amd_bytealign (w[59], w[60], offset); - w[59] = amd_bytealign (w[58], w[59], offset); - w[58] = amd_bytealign (w[57], w[58], offset); - w[57] = amd_bytealign (w[56], w[57], offset); - w[56] = amd_bytealign (w[55], w[56], offset); - w[55] = amd_bytealign (w[54], w[55], offset); - w[54] = amd_bytealign (w[53], w[54], offset); - w[53] = amd_bytealign (w[52], w[53], offset); - w[52] = amd_bytealign (w[51], w[52], offset); - w[51] = amd_bytealign (w[50], w[51], offset); - w[50] = amd_bytealign (w[49], w[50], offset); - w[49] = amd_bytealign (w[48], w[49], offset); - w[48] = amd_bytealign (w[47], w[48], offset); - w[47] = amd_bytealign (w[46], w[47], offset); - w[46] = amd_bytealign (w[45], w[46], offset); - w[45] = amd_bytealign (w[44], w[45], offset); - w[44] = amd_bytealign (w[43], w[44], offset); - w[43] = amd_bytealign (w[42], w[43], offset); - w[42] = amd_bytealign (w[41], w[42], offset); - w[41] = amd_bytealign (w[40], w[41], offset); - w[40] = amd_bytealign (w[39], w[40], offset); - w[39] = amd_bytealign (w[38], w[39], offset); - w[38] = amd_bytealign (w[37], w[38], offset); - w[37] = amd_bytealign (w[36], w[37], offset); - w[36] = amd_bytealign (w[35], w[36], offset); - w[35] = amd_bytealign (w[34], w[35], offset); - w[34] = amd_bytealign (w[33], w[34], offset); - w[33] = amd_bytealign (w[32], w[33], offset); - w[32] = amd_bytealign (w[31], w[32], offset); - w[31] = amd_bytealign (w[30], w[31], offset); - w[30] = amd_bytealign (w[29], w[30], offset); - w[29] = amd_bytealign (w[28], w[29], offset); - w[28] = amd_bytealign (w[27], w[28], offset); - w[27] = amd_bytealign (w[26], w[27], offset); - w[26] = amd_bytealign (w[25], w[26], offset); - w[25] = amd_bytealign (w[24], w[25], offset); - w[24] = amd_bytealign (w[23], w[24], offset); - w[23] = amd_bytealign (w[22], w[23], offset); - w[22] = amd_bytealign (w[21], w[22], offset); - w[21] = amd_bytealign (w[20], w[21], offset); - w[20] = amd_bytealign (w[19], w[20], offset); - w[19] = amd_bytealign (w[18], w[19], offset); - w[18] = amd_bytealign (w[17], w[18], offset); - w[17] = amd_bytealign (w[16], w[17], offset); - w[16] = amd_bytealign (w[15], w[16], offset); - w[15] = amd_bytealign (w[14], w[15], offset); - w[14] = amd_bytealign (w[13], w[14], offset); - w[13] = amd_bytealign (w[12], w[13], offset); - w[12] = amd_bytealign (w[11], w[12], offset); - w[11] = amd_bytealign (w[10], w[11], offset); - w[10] = amd_bytealign (w[ 9], w[10], offset); - w[ 9] = amd_bytealign (w[ 8], w[ 9], offset); - w[ 8] = amd_bytealign (w[ 7], w[ 8], offset); - w[ 7] = amd_bytealign (w[ 6], w[ 7], offset); - w[ 6] = amd_bytealign (w[ 5], w[ 6], offset); - w[ 5] = amd_bytealign (w[ 4], w[ 5], offset); - w[ 4] = amd_bytealign (w[ 3], w[ 4], offset); - w[ 3] = amd_bytealign (w[ 2], w[ 3], offset); - w[ 2] = amd_bytealign (w[ 1], w[ 2], offset); - w[ 1] = amd_bytealign (w[ 0], w[ 1], offset); - w[ 0] = amd_bytealign ( 0, w[ 0], offset); + w[63] = hc_bytealign (w[62], w[63], offset); + w[62] = hc_bytealign (w[61], w[62], offset); + w[61] = hc_bytealign (w[60], w[61], offset); + w[60] = hc_bytealign (w[59], w[60], offset); + w[59] = hc_bytealign (w[58], w[59], offset); + w[58] = hc_bytealign (w[57], w[58], offset); + w[57] = hc_bytealign (w[56], w[57], offset); + w[56] = hc_bytealign (w[55], w[56], offset); + w[55] = hc_bytealign (w[54], w[55], offset); + w[54] = hc_bytealign (w[53], w[54], offset); + w[53] = hc_bytealign (w[52], w[53], offset); + w[52] = hc_bytealign (w[51], w[52], offset); + w[51] = hc_bytealign (w[50], w[51], offset); + w[50] = hc_bytealign (w[49], w[50], offset); + w[49] = hc_bytealign (w[48], w[49], offset); + w[48] = hc_bytealign (w[47], w[48], offset); + w[47] = hc_bytealign (w[46], w[47], offset); + w[46] = hc_bytealign (w[45], w[46], offset); + w[45] = hc_bytealign (w[44], w[45], offset); + w[44] = hc_bytealign (w[43], w[44], offset); + w[43] = hc_bytealign (w[42], w[43], offset); + w[42] = hc_bytealign (w[41], w[42], offset); + w[41] = hc_bytealign (w[40], w[41], offset); + w[40] = hc_bytealign (w[39], w[40], offset); + w[39] = hc_bytealign (w[38], w[39], offset); + w[38] = hc_bytealign (w[37], w[38], offset); + w[37] = hc_bytealign (w[36], w[37], offset); + w[36] = hc_bytealign (w[35], w[36], offset); + w[35] = hc_bytealign (w[34], w[35], offset); + w[34] = hc_bytealign (w[33], w[34], offset); + w[33] = hc_bytealign (w[32], w[33], offset); + w[32] = hc_bytealign (w[31], w[32], offset); + w[31] = hc_bytealign (w[30], w[31], offset); + w[30] = hc_bytealign (w[29], w[30], offset); + w[29] = hc_bytealign (w[28], w[29], offset); + w[28] = hc_bytealign (w[27], w[28], offset); + w[27] = hc_bytealign (w[26], w[27], offset); + w[26] = hc_bytealign (w[25], w[26], offset); + w[25] = hc_bytealign (w[24], w[25], offset); + w[24] = hc_bytealign (w[23], w[24], offset); + w[23] = hc_bytealign (w[22], w[23], offset); + w[22] = hc_bytealign (w[21], w[22], offset); + w[21] = hc_bytealign (w[20], w[21], offset); + w[20] = hc_bytealign (w[19], w[20], offset); + w[19] = hc_bytealign (w[18], w[19], offset); + w[18] = hc_bytealign (w[17], w[18], offset); + w[17] = hc_bytealign (w[16], w[17], offset); + w[16] = hc_bytealign (w[15], w[16], offset); + w[15] = hc_bytealign (w[14], w[15], offset); + w[14] = hc_bytealign (w[13], w[14], offset); + w[13] = hc_bytealign (w[12], w[13], offset); + w[12] = hc_bytealign (w[11], w[12], offset); + w[11] = hc_bytealign (w[10], w[11], offset); + w[10] = hc_bytealign (w[ 9], w[10], offset); + w[ 9] = hc_bytealign (w[ 8], w[ 9], offset); + w[ 8] = hc_bytealign (w[ 7], w[ 8], offset); + w[ 7] = hc_bytealign (w[ 6], w[ 7], offset); + w[ 6] = hc_bytealign (w[ 5], w[ 6], offset); + w[ 5] = hc_bytealign (w[ 4], w[ 5], offset); + w[ 4] = hc_bytealign (w[ 3], w[ 4], offset); + w[ 3] = hc_bytealign (w[ 2], w[ 3], offset); + w[ 2] = hc_bytealign (w[ 1], w[ 2], offset); + w[ 1] = hc_bytealign (w[ 0], w[ 1], offset); + w[ 0] = hc_bytealign ( 0, w[ 0], offset); break; case 1: - w[63] = amd_bytealign (w[61], w[62], offset); - w[62] = amd_bytealign (w[60], w[61], offset); - w[61] = amd_bytealign (w[59], w[60], offset); - w[60] = amd_bytealign (w[58], w[59], offset); - w[59] = amd_bytealign (w[57], w[58], offset); - w[58] = amd_bytealign (w[56], w[57], offset); - w[57] = amd_bytealign (w[55], w[56], offset); - w[56] = amd_bytealign (w[54], w[55], offset); - w[55] = amd_bytealign (w[53], w[54], offset); - w[54] = amd_bytealign (w[52], w[53], offset); - w[53] = amd_bytealign (w[51], w[52], offset); - w[52] = amd_bytealign (w[50], w[51], offset); - w[51] = amd_bytealign (w[49], w[50], offset); - w[50] = amd_bytealign (w[48], w[49], offset); - w[49] = amd_bytealign (w[47], w[48], offset); - w[48] = amd_bytealign (w[46], w[47], offset); - w[47] = amd_bytealign (w[45], w[46], offset); - w[46] = amd_bytealign (w[44], w[45], offset); - w[45] = amd_bytealign (w[43], w[44], offset); - w[44] = amd_bytealign (w[42], w[43], offset); - w[43] = amd_bytealign (w[41], w[42], offset); - w[42] = amd_bytealign (w[40], w[41], offset); - w[41] = amd_bytealign (w[39], w[40], offset); - w[40] = amd_bytealign (w[38], w[39], offset); - w[39] = amd_bytealign (w[37], w[38], offset); - w[38] = amd_bytealign (w[36], w[37], offset); - w[37] = amd_bytealign (w[35], w[36], offset); - w[36] = amd_bytealign (w[34], w[35], offset); - w[35] = amd_bytealign (w[33], w[34], offset); - w[34] = amd_bytealign (w[32], w[33], offset); - w[33] = amd_bytealign (w[31], w[32], offset); - w[32] = amd_bytealign (w[30], w[31], offset); - w[31] = amd_bytealign (w[29], w[30], offset); - w[30] = amd_bytealign (w[28], w[29], offset); - w[29] = amd_bytealign (w[27], w[28], offset); - w[28] = amd_bytealign (w[26], w[27], offset); - w[27] = amd_bytealign (w[25], w[26], offset); - w[26] = amd_bytealign (w[24], w[25], offset); - w[25] = amd_bytealign (w[23], w[24], offset); - w[24] = amd_bytealign (w[22], w[23], offset); - w[23] = amd_bytealign (w[21], w[22], offset); - w[22] = amd_bytealign (w[20], w[21], offset); - w[21] = amd_bytealign (w[19], w[20], offset); - w[20] = amd_bytealign (w[18], w[19], offset); - w[19] = amd_bytealign (w[17], w[18], offset); - w[18] = amd_bytealign (w[16], w[17], offset); - w[17] = amd_bytealign (w[15], w[16], offset); - w[16] = amd_bytealign (w[14], w[15], offset); - w[15] = amd_bytealign (w[13], w[14], offset); - w[14] = amd_bytealign (w[12], w[13], offset); - w[13] = amd_bytealign (w[11], w[12], offset); - w[12] = amd_bytealign (w[10], w[11], offset); - w[11] = amd_bytealign (w[ 9], w[10], offset); - w[10] = amd_bytealign (w[ 8], w[ 9], offset); - w[ 9] = amd_bytealign (w[ 7], w[ 8], offset); - w[ 8] = amd_bytealign (w[ 6], w[ 7], offset); - w[ 7] = amd_bytealign (w[ 5], w[ 6], offset); - w[ 6] = amd_bytealign (w[ 4], w[ 5], offset); - w[ 5] = amd_bytealign (w[ 3], w[ 4], offset); - w[ 4] = amd_bytealign (w[ 2], w[ 3], offset); - w[ 3] = amd_bytealign (w[ 1], w[ 2], offset); - w[ 2] = amd_bytealign (w[ 0], w[ 1], offset); - w[ 1] = amd_bytealign ( 0, w[ 0], offset); + w[63] = hc_bytealign (w[61], w[62], offset); + w[62] = hc_bytealign (w[60], w[61], offset); + w[61] = hc_bytealign (w[59], w[60], offset); + w[60] = hc_bytealign (w[58], w[59], offset); + w[59] = hc_bytealign (w[57], w[58], offset); + w[58] = hc_bytealign (w[56], w[57], offset); + w[57] = hc_bytealign (w[55], w[56], offset); + w[56] = hc_bytealign (w[54], w[55], offset); + w[55] = hc_bytealign (w[53], w[54], offset); + w[54] = hc_bytealign (w[52], w[53], offset); + w[53] = hc_bytealign (w[51], w[52], offset); + w[52] = hc_bytealign (w[50], w[51], offset); + w[51] = hc_bytealign (w[49], w[50], offset); + w[50] = hc_bytealign (w[48], w[49], offset); + w[49] = hc_bytealign (w[47], w[48], offset); + w[48] = hc_bytealign (w[46], w[47], offset); + w[47] = hc_bytealign (w[45], w[46], offset); + w[46] = hc_bytealign (w[44], w[45], offset); + w[45] = hc_bytealign (w[43], w[44], offset); + w[44] = hc_bytealign (w[42], w[43], offset); + w[43] = hc_bytealign (w[41], w[42], offset); + w[42] = hc_bytealign (w[40], w[41], offset); + w[41] = hc_bytealign (w[39], w[40], offset); + w[40] = hc_bytealign (w[38], w[39], offset); + w[39] = hc_bytealign (w[37], w[38], offset); + w[38] = hc_bytealign (w[36], w[37], offset); + w[37] = hc_bytealign (w[35], w[36], offset); + w[36] = hc_bytealign (w[34], w[35], offset); + w[35] = hc_bytealign (w[33], w[34], offset); + w[34] = hc_bytealign (w[32], w[33], offset); + w[33] = hc_bytealign (w[31], w[32], offset); + w[32] = hc_bytealign (w[30], w[31], offset); + w[31] = hc_bytealign (w[29], w[30], offset); + w[30] = hc_bytealign (w[28], w[29], offset); + w[29] = hc_bytealign (w[27], w[28], offset); + w[28] = hc_bytealign (w[26], w[27], offset); + w[27] = hc_bytealign (w[25], w[26], offset); + w[26] = hc_bytealign (w[24], w[25], offset); + w[25] = hc_bytealign (w[23], w[24], offset); + w[24] = hc_bytealign (w[22], w[23], offset); + w[23] = hc_bytealign (w[21], w[22], offset); + w[22] = hc_bytealign (w[20], w[21], offset); + w[21] = hc_bytealign (w[19], w[20], offset); + w[20] = hc_bytealign (w[18], w[19], offset); + w[19] = hc_bytealign (w[17], w[18], offset); + w[18] = hc_bytealign (w[16], w[17], offset); + w[17] = hc_bytealign (w[15], w[16], offset); + w[16] = hc_bytealign (w[14], w[15], offset); + w[15] = hc_bytealign (w[13], w[14], offset); + w[14] = hc_bytealign (w[12], w[13], offset); + w[13] = hc_bytealign (w[11], w[12], offset); + w[12] = hc_bytealign (w[10], w[11], offset); + w[11] = hc_bytealign (w[ 9], w[10], offset); + w[10] = hc_bytealign (w[ 8], w[ 9], offset); + w[ 9] = hc_bytealign (w[ 7], w[ 8], offset); + w[ 8] = hc_bytealign (w[ 6], w[ 7], offset); + w[ 7] = hc_bytealign (w[ 5], w[ 6], offset); + w[ 6] = hc_bytealign (w[ 4], w[ 5], offset); + w[ 5] = hc_bytealign (w[ 3], w[ 4], offset); + w[ 4] = hc_bytealign (w[ 2], w[ 3], offset); + w[ 3] = hc_bytealign (w[ 1], w[ 2], offset); + w[ 2] = hc_bytealign (w[ 0], w[ 1], offset); + w[ 1] = hc_bytealign ( 0, w[ 0], offset); w[ 0] = 0; break; case 2: - w[63] = amd_bytealign (w[60], w[61], offset); - w[62] = amd_bytealign (w[59], w[60], offset); - w[61] = amd_bytealign (w[58], w[59], offset); - w[60] = amd_bytealign (w[57], w[58], offset); - w[59] = amd_bytealign (w[56], w[57], offset); - w[58] = amd_bytealign (w[55], w[56], offset); - w[57] = amd_bytealign (w[54], w[55], offset); - w[56] = amd_bytealign (w[53], w[54], offset); - w[55] = amd_bytealign (w[52], w[53], offset); - w[54] = amd_bytealign (w[51], w[52], offset); - w[53] = amd_bytealign (w[50], w[51], offset); - w[52] = amd_bytealign (w[49], w[50], offset); - w[51] = amd_bytealign (w[48], w[49], offset); - w[50] = amd_bytealign (w[47], w[48], offset); - w[49] = amd_bytealign (w[46], w[47], offset); - w[48] = amd_bytealign (w[45], w[46], offset); - w[47] = amd_bytealign (w[44], w[45], offset); - w[46] = amd_bytealign (w[43], w[44], offset); - w[45] = amd_bytealign (w[42], w[43], offset); - w[44] = amd_bytealign (w[41], w[42], offset); - w[43] = amd_bytealign (w[40], w[41], offset); - w[42] = amd_bytealign (w[39], w[40], offset); - w[41] = amd_bytealign (w[38], w[39], offset); - w[40] = amd_bytealign (w[37], w[38], offset); - w[39] = amd_bytealign (w[36], w[37], offset); - w[38] = amd_bytealign (w[35], w[36], offset); - w[37] = amd_bytealign (w[34], w[35], offset); - w[36] = amd_bytealign (w[33], w[34], offset); - w[35] = amd_bytealign (w[32], w[33], offset); - w[34] = amd_bytealign (w[31], w[32], offset); - w[33] = amd_bytealign (w[30], w[31], offset); - w[32] = amd_bytealign (w[29], w[30], offset); - w[31] = amd_bytealign (w[28], w[29], offset); - w[30] = amd_bytealign (w[27], w[28], offset); - w[29] = amd_bytealign (w[26], w[27], offset); - w[28] = amd_bytealign (w[25], w[26], offset); - w[27] = amd_bytealign (w[24], w[25], offset); - w[26] = amd_bytealign (w[23], w[24], offset); - w[25] = amd_bytealign (w[22], w[23], offset); - w[24] = amd_bytealign (w[21], w[22], offset); - w[23] = amd_bytealign (w[20], w[21], offset); - w[22] = amd_bytealign (w[19], w[20], offset); - w[21] = amd_bytealign (w[18], w[19], offset); - w[20] = amd_bytealign (w[17], w[18], offset); - w[19] = amd_bytealign (w[16], w[17], offset); - w[18] = amd_bytealign (w[15], w[16], offset); - w[17] = amd_bytealign (w[14], w[15], offset); - w[16] = amd_bytealign (w[13], w[14], offset); - w[15] = amd_bytealign (w[12], w[13], offset); - w[14] = amd_bytealign (w[11], w[12], offset); - w[13] = amd_bytealign (w[10], w[11], offset); - w[12] = amd_bytealign (w[ 9], w[10], offset); - w[11] = amd_bytealign (w[ 8], w[ 9], offset); - w[10] = amd_bytealign (w[ 7], w[ 8], offset); - w[ 9] = amd_bytealign (w[ 6], w[ 7], offset); - w[ 8] = amd_bytealign (w[ 5], w[ 6], offset); - w[ 7] = amd_bytealign (w[ 4], w[ 5], offset); - w[ 6] = amd_bytealign (w[ 3], w[ 4], offset); - w[ 5] = amd_bytealign (w[ 2], w[ 3], offset); - w[ 4] = amd_bytealign (w[ 1], w[ 2], offset); - w[ 3] = amd_bytealign (w[ 0], w[ 1], offset); - w[ 2] = amd_bytealign ( 0, w[ 0], offset); + w[63] = hc_bytealign (w[60], w[61], offset); + w[62] = hc_bytealign (w[59], w[60], offset); + w[61] = hc_bytealign (w[58], w[59], offset); + w[60] = hc_bytealign (w[57], w[58], offset); + w[59] = hc_bytealign (w[56], w[57], offset); + w[58] = hc_bytealign (w[55], w[56], offset); + w[57] = hc_bytealign (w[54], w[55], offset); + w[56] = hc_bytealign (w[53], w[54], offset); + w[55] = hc_bytealign (w[52], w[53], offset); + w[54] = hc_bytealign (w[51], w[52], offset); + w[53] = hc_bytealign (w[50], w[51], offset); + w[52] = hc_bytealign (w[49], w[50], offset); + w[51] = hc_bytealign (w[48], w[49], offset); + w[50] = hc_bytealign (w[47], w[48], offset); + w[49] = hc_bytealign (w[46], w[47], offset); + w[48] = hc_bytealign (w[45], w[46], offset); + w[47] = hc_bytealign (w[44], w[45], offset); + w[46] = hc_bytealign (w[43], w[44], offset); + w[45] = hc_bytealign (w[42], w[43], offset); + w[44] = hc_bytealign (w[41], w[42], offset); + w[43] = hc_bytealign (w[40], w[41], offset); + w[42] = hc_bytealign (w[39], w[40], offset); + w[41] = hc_bytealign (w[38], w[39], offset); + w[40] = hc_bytealign (w[37], w[38], offset); + w[39] = hc_bytealign (w[36], w[37], offset); + w[38] = hc_bytealign (w[35], w[36], offset); + w[37] = hc_bytealign (w[34], w[35], offset); + w[36] = hc_bytealign (w[33], w[34], offset); + w[35] = hc_bytealign (w[32], w[33], offset); + w[34] = hc_bytealign (w[31], w[32], offset); + w[33] = hc_bytealign (w[30], w[31], offset); + w[32] = hc_bytealign (w[29], w[30], offset); + w[31] = hc_bytealign (w[28], w[29], offset); + w[30] = hc_bytealign (w[27], w[28], offset); + w[29] = hc_bytealign (w[26], w[27], offset); + w[28] = hc_bytealign (w[25], w[26], offset); + w[27] = hc_bytealign (w[24], w[25], offset); + w[26] = hc_bytealign (w[23], w[24], offset); + w[25] = hc_bytealign (w[22], w[23], offset); + w[24] = hc_bytealign (w[21], w[22], offset); + w[23] = hc_bytealign (w[20], w[21], offset); + w[22] = hc_bytealign (w[19], w[20], offset); + w[21] = hc_bytealign (w[18], w[19], offset); + w[20] = hc_bytealign (w[17], w[18], offset); + w[19] = hc_bytealign (w[16], w[17], offset); + w[18] = hc_bytealign (w[15], w[16], offset); + w[17] = hc_bytealign (w[14], w[15], offset); + w[16] = hc_bytealign (w[13], w[14], offset); + w[15] = hc_bytealign (w[12], w[13], offset); + w[14] = hc_bytealign (w[11], w[12], offset); + w[13] = hc_bytealign (w[10], w[11], offset); + w[12] = hc_bytealign (w[ 9], w[10], offset); + w[11] = hc_bytealign (w[ 8], w[ 9], offset); + w[10] = hc_bytealign (w[ 7], w[ 8], offset); + w[ 9] = hc_bytealign (w[ 6], w[ 7], offset); + w[ 8] = hc_bytealign (w[ 5], w[ 6], offset); + w[ 7] = hc_bytealign (w[ 4], w[ 5], offset); + w[ 6] = hc_bytealign (w[ 3], w[ 4], offset); + w[ 5] = hc_bytealign (w[ 2], w[ 3], offset); + w[ 4] = hc_bytealign (w[ 1], w[ 2], offset); + w[ 3] = hc_bytealign (w[ 0], w[ 1], offset); + w[ 2] = hc_bytealign ( 0, w[ 0], offset); w[ 1] = 0; w[ 0] = 0; break; case 3: - w[63] = amd_bytealign (w[59], w[60], offset); - w[62] = amd_bytealign (w[58], w[59], offset); - w[61] = amd_bytealign (w[57], w[58], offset); - w[60] = amd_bytealign (w[56], w[57], offset); - w[59] = amd_bytealign (w[55], w[56], offset); - w[58] = amd_bytealign (w[54], w[55], offset); - w[57] = amd_bytealign (w[53], w[54], offset); - w[56] = amd_bytealign (w[52], w[53], offset); - w[55] = amd_bytealign (w[51], w[52], offset); - w[54] = amd_bytealign (w[50], w[51], offset); - w[53] = amd_bytealign (w[49], w[50], offset); - w[52] = amd_bytealign (w[48], w[49], offset); - w[51] = amd_bytealign (w[47], w[48], offset); - w[50] = amd_bytealign (w[46], w[47], offset); - w[49] = amd_bytealign (w[45], w[46], offset); - w[48] = amd_bytealign (w[44], w[45], offset); - w[47] = amd_bytealign (w[43], w[44], offset); - w[46] = amd_bytealign (w[42], w[43], offset); - w[45] = amd_bytealign (w[41], w[42], offset); - w[44] = amd_bytealign (w[40], w[41], offset); - w[43] = amd_bytealign (w[39], w[40], offset); - w[42] = amd_bytealign (w[38], w[39], offset); - w[41] = amd_bytealign (w[37], w[38], offset); - w[40] = amd_bytealign (w[36], w[37], offset); - w[39] = amd_bytealign (w[35], w[36], offset); - w[38] = amd_bytealign (w[34], w[35], offset); - w[37] = amd_bytealign (w[33], w[34], offset); - w[36] = amd_bytealign (w[32], w[33], offset); - w[35] = amd_bytealign (w[31], w[32], offset); - w[34] = amd_bytealign (w[30], w[31], offset); - w[33] = amd_bytealign (w[29], w[30], offset); - w[32] = amd_bytealign (w[28], w[29], offset); - w[31] = amd_bytealign (w[27], w[28], offset); - w[30] = amd_bytealign (w[26], w[27], offset); - w[29] = amd_bytealign (w[25], w[26], offset); - w[28] = amd_bytealign (w[24], w[25], offset); - w[27] = amd_bytealign (w[23], w[24], offset); - w[26] = amd_bytealign (w[22], w[23], offset); - w[25] = amd_bytealign (w[21], w[22], offset); - w[24] = amd_bytealign (w[20], w[21], offset); - w[23] = amd_bytealign (w[19], w[20], offset); - w[22] = amd_bytealign (w[18], w[19], offset); - w[21] = amd_bytealign (w[17], w[18], offset); - w[20] = amd_bytealign (w[16], w[17], offset); - w[19] = amd_bytealign (w[15], w[16], offset); - w[18] = amd_bytealign (w[14], w[15], offset); - w[17] = amd_bytealign (w[13], w[14], offset); - w[16] = amd_bytealign (w[12], w[13], offset); - w[15] = amd_bytealign (w[11], w[12], offset); - w[14] = amd_bytealign (w[10], w[11], offset); - w[13] = amd_bytealign (w[ 9], w[10], offset); - w[12] = amd_bytealign (w[ 8], w[ 9], offset); - w[11] = amd_bytealign (w[ 7], w[ 8], offset); - w[10] = amd_bytealign (w[ 6], w[ 7], offset); - w[ 9] = amd_bytealign (w[ 5], w[ 6], offset); - w[ 8] = amd_bytealign (w[ 4], w[ 5], offset); - w[ 7] = amd_bytealign (w[ 3], w[ 4], offset); - w[ 6] = amd_bytealign (w[ 2], w[ 3], offset); - w[ 5] = amd_bytealign (w[ 1], w[ 2], offset); - w[ 4] = amd_bytealign (w[ 0], w[ 1], offset); - w[ 3] = amd_bytealign ( 0, w[ 0], offset); + w[63] = hc_bytealign (w[59], w[60], offset); + w[62] = hc_bytealign (w[58], w[59], offset); + w[61] = hc_bytealign (w[57], w[58], offset); + w[60] = hc_bytealign (w[56], w[57], offset); + w[59] = hc_bytealign (w[55], w[56], offset); + w[58] = hc_bytealign (w[54], w[55], offset); + w[57] = hc_bytealign (w[53], w[54], offset); + w[56] = hc_bytealign (w[52], w[53], offset); + w[55] = hc_bytealign (w[51], w[52], offset); + w[54] = hc_bytealign (w[50], w[51], offset); + w[53] = hc_bytealign (w[49], w[50], offset); + w[52] = hc_bytealign (w[48], w[49], offset); + w[51] = hc_bytealign (w[47], w[48], offset); + w[50] = hc_bytealign (w[46], w[47], offset); + w[49] = hc_bytealign (w[45], w[46], offset); + w[48] = hc_bytealign (w[44], w[45], offset); + w[47] = hc_bytealign (w[43], w[44], offset); + w[46] = hc_bytealign (w[42], w[43], offset); + w[45] = hc_bytealign (w[41], w[42], offset); + w[44] = hc_bytealign (w[40], w[41], offset); + w[43] = hc_bytealign (w[39], w[40], offset); + w[42] = hc_bytealign (w[38], w[39], offset); + w[41] = hc_bytealign (w[37], w[38], offset); + w[40] = hc_bytealign (w[36], w[37], offset); + w[39] = hc_bytealign (w[35], w[36], offset); + w[38] = hc_bytealign (w[34], w[35], offset); + w[37] = hc_bytealign (w[33], w[34], offset); + w[36] = hc_bytealign (w[32], w[33], offset); + w[35] = hc_bytealign (w[31], w[32], offset); + w[34] = hc_bytealign (w[30], w[31], offset); + w[33] = hc_bytealign (w[29], w[30], offset); + w[32] = hc_bytealign (w[28], w[29], offset); + w[31] = hc_bytealign (w[27], w[28], offset); + w[30] = hc_bytealign (w[26], w[27], offset); + w[29] = hc_bytealign (w[25], w[26], offset); + w[28] = hc_bytealign (w[24], w[25], offset); + w[27] = hc_bytealign (w[23], w[24], offset); + w[26] = hc_bytealign (w[22], w[23], offset); + w[25] = hc_bytealign (w[21], w[22], offset); + w[24] = hc_bytealign (w[20], w[21], offset); + w[23] = hc_bytealign (w[19], w[20], offset); + w[22] = hc_bytealign (w[18], w[19], offset); + w[21] = hc_bytealign (w[17], w[18], offset); + w[20] = hc_bytealign (w[16], w[17], offset); + w[19] = hc_bytealign (w[15], w[16], offset); + w[18] = hc_bytealign (w[14], w[15], offset); + w[17] = hc_bytealign (w[13], w[14], offset); + w[16] = hc_bytealign (w[12], w[13], offset); + w[15] = hc_bytealign (w[11], w[12], offset); + w[14] = hc_bytealign (w[10], w[11], offset); + w[13] = hc_bytealign (w[ 9], w[10], offset); + w[12] = hc_bytealign (w[ 8], w[ 9], offset); + w[11] = hc_bytealign (w[ 7], w[ 8], offset); + w[10] = hc_bytealign (w[ 6], w[ 7], offset); + w[ 9] = hc_bytealign (w[ 5], w[ 6], offset); + w[ 8] = hc_bytealign (w[ 4], w[ 5], offset); + w[ 7] = hc_bytealign (w[ 3], w[ 4], offset); + w[ 6] = hc_bytealign (w[ 2], w[ 3], offset); + w[ 5] = hc_bytealign (w[ 1], w[ 2], offset); + w[ 4] = hc_bytealign (w[ 0], w[ 1], offset); + w[ 3] = hc_bytealign ( 0, w[ 0], offset); w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; @@ -20670,66 +20670,66 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; case 4: - w[63] = amd_bytealign (w[58], w[59], offset); - w[62] = amd_bytealign (w[57], w[58], offset); - w[61] = amd_bytealign (w[56], w[57], offset); - w[60] = amd_bytealign (w[55], w[56], offset); - w[59] = amd_bytealign (w[54], w[55], offset); - w[58] = amd_bytealign (w[53], w[54], offset); - w[57] = amd_bytealign (w[52], w[53], offset); - w[56] = amd_bytealign (w[51], w[52], offset); - w[55] = amd_bytealign (w[50], w[51], offset); - w[54] = amd_bytealign (w[49], w[50], offset); - w[53] = amd_bytealign (w[48], w[49], offset); - w[52] = amd_bytealign (w[47], w[48], offset); - w[51] = amd_bytealign (w[46], w[47], offset); - w[50] = amd_bytealign (w[45], w[46], offset); - w[49] = amd_bytealign (w[44], w[45], offset); - w[48] = amd_bytealign (w[43], w[44], offset); - w[47] = amd_bytealign (w[42], w[43], offset); - w[46] = amd_bytealign (w[41], w[42], offset); - w[45] = amd_bytealign (w[40], w[41], offset); - w[44] = amd_bytealign (w[39], w[40], offset); - w[43] = amd_bytealign (w[38], w[39], offset); - w[42] = amd_bytealign (w[37], w[38], offset); - w[41] = amd_bytealign (w[36], w[37], offset); - w[40] = amd_bytealign (w[35], w[36], offset); - w[39] = amd_bytealign (w[34], w[35], offset); - w[38] = amd_bytealign (w[33], w[34], offset); - w[37] = amd_bytealign (w[32], w[33], offset); - w[36] = amd_bytealign (w[31], w[32], offset); - w[35] = amd_bytealign (w[30], w[31], offset); - w[34] = amd_bytealign (w[29], w[30], offset); - w[33] = amd_bytealign (w[28], w[29], offset); - w[32] = amd_bytealign (w[27], w[28], offset); - w[31] = amd_bytealign (w[26], w[27], offset); - w[30] = amd_bytealign (w[25], w[26], offset); - w[29] = amd_bytealign (w[24], w[25], offset); - w[28] = amd_bytealign (w[23], w[24], offset); - w[27] = amd_bytealign (w[22], w[23], offset); - w[26] = amd_bytealign (w[21], w[22], offset); - w[25] = amd_bytealign (w[20], w[21], offset); - w[24] = amd_bytealign (w[19], w[20], offset); - w[23] = amd_bytealign (w[18], w[19], offset); - w[22] = amd_bytealign (w[17], w[18], offset); - w[21] = amd_bytealign (w[16], w[17], offset); - w[20] = amd_bytealign (w[15], w[16], offset); - w[19] = amd_bytealign (w[14], w[15], offset); - w[18] = amd_bytealign (w[13], w[14], offset); - w[17] = amd_bytealign (w[12], w[13], offset); - w[16] = amd_bytealign (w[11], w[12], offset); - w[15] = amd_bytealign (w[10], w[11], offset); - w[14] = amd_bytealign (w[ 9], w[10], offset); - w[13] = amd_bytealign (w[ 8], w[ 9], offset); - w[12] = amd_bytealign (w[ 7], w[ 8], offset); - w[11] = amd_bytealign (w[ 6], w[ 7], offset); - w[10] = amd_bytealign (w[ 5], w[ 6], offset); - w[ 9] = amd_bytealign (w[ 4], w[ 5], offset); - w[ 8] = amd_bytealign (w[ 3], w[ 4], offset); - w[ 7] = amd_bytealign (w[ 2], w[ 3], offset); - w[ 6] = amd_bytealign (w[ 1], w[ 2], offset); - w[ 5] = amd_bytealign (w[ 0], w[ 1], offset); - w[ 4] = amd_bytealign ( 0, w[ 0], offset); + w[63] = hc_bytealign (w[58], w[59], offset); + w[62] = hc_bytealign (w[57], w[58], offset); + w[61] = hc_bytealign (w[56], w[57], offset); + w[60] = hc_bytealign (w[55], w[56], offset); + w[59] = hc_bytealign (w[54], w[55], offset); + w[58] = hc_bytealign (w[53], w[54], offset); + w[57] = hc_bytealign (w[52], w[53], offset); + w[56] = hc_bytealign (w[51], w[52], offset); + w[55] = hc_bytealign (w[50], w[51], offset); + w[54] = hc_bytealign (w[49], w[50], offset); + w[53] = hc_bytealign (w[48], w[49], offset); + w[52] = hc_bytealign (w[47], w[48], offset); + w[51] = hc_bytealign (w[46], w[47], offset); + w[50] = hc_bytealign (w[45], w[46], offset); + w[49] = hc_bytealign (w[44], w[45], offset); + w[48] = hc_bytealign (w[43], w[44], offset); + w[47] = hc_bytealign (w[42], w[43], offset); + w[46] = hc_bytealign (w[41], w[42], offset); + w[45] = hc_bytealign (w[40], w[41], offset); + w[44] = hc_bytealign (w[39], w[40], offset); + w[43] = hc_bytealign (w[38], w[39], offset); + w[42] = hc_bytealign (w[37], w[38], offset); + w[41] = hc_bytealign (w[36], w[37], offset); + w[40] = hc_bytealign (w[35], w[36], offset); + w[39] = hc_bytealign (w[34], w[35], offset); + w[38] = hc_bytealign (w[33], w[34], offset); + w[37] = hc_bytealign (w[32], w[33], offset); + w[36] = hc_bytealign (w[31], w[32], offset); + w[35] = hc_bytealign (w[30], w[31], offset); + w[34] = hc_bytealign (w[29], w[30], offset); + w[33] = hc_bytealign (w[28], w[29], offset); + w[32] = hc_bytealign (w[27], w[28], offset); + w[31] = hc_bytealign (w[26], w[27], offset); + w[30] = hc_bytealign (w[25], w[26], offset); + w[29] = hc_bytealign (w[24], w[25], offset); + w[28] = hc_bytealign (w[23], w[24], offset); + w[27] = hc_bytealign (w[22], w[23], offset); + w[26] = hc_bytealign (w[21], w[22], offset); + w[25] = hc_bytealign (w[20], w[21], offset); + w[24] = hc_bytealign (w[19], w[20], offset); + w[23] = hc_bytealign (w[18], w[19], offset); + w[22] = hc_bytealign (w[17], w[18], offset); + w[21] = hc_bytealign (w[16], w[17], offset); + w[20] = hc_bytealign (w[15], w[16], offset); + w[19] = hc_bytealign (w[14], w[15], offset); + w[18] = hc_bytealign (w[13], w[14], offset); + w[17] = hc_bytealign (w[12], w[13], offset); + w[16] = hc_bytealign (w[11], w[12], offset); + w[15] = hc_bytealign (w[10], w[11], offset); + w[14] = hc_bytealign (w[ 9], w[10], offset); + w[13] = hc_bytealign (w[ 8], w[ 9], offset); + w[12] = hc_bytealign (w[ 7], w[ 8], offset); + w[11] = hc_bytealign (w[ 6], w[ 7], offset); + w[10] = hc_bytealign (w[ 5], w[ 6], offset); + w[ 9] = hc_bytealign (w[ 4], w[ 5], offset); + w[ 8] = hc_bytealign (w[ 3], w[ 4], offset); + w[ 7] = hc_bytealign (w[ 2], w[ 3], offset); + w[ 6] = hc_bytealign (w[ 1], w[ 2], offset); + w[ 5] = hc_bytealign (w[ 0], w[ 1], offset); + w[ 4] = hc_bytealign ( 0, w[ 0], offset); w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; @@ -20738,65 +20738,65 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; case 5: - w[63] = amd_bytealign (w[57], w[58], offset); - w[62] = amd_bytealign (w[56], w[57], offset); - w[61] = amd_bytealign (w[55], w[56], offset); - w[60] = amd_bytealign (w[54], w[55], offset); - w[59] = amd_bytealign (w[53], w[54], offset); - w[58] = amd_bytealign (w[52], w[53], offset); - w[57] = amd_bytealign (w[51], w[52], offset); - w[56] = amd_bytealign (w[50], w[51], offset); - w[55] = amd_bytealign (w[49], w[50], offset); - w[54] = amd_bytealign (w[48], w[49], offset); - w[53] = amd_bytealign (w[47], w[48], offset); - w[52] = amd_bytealign (w[46], w[47], offset); - w[51] = amd_bytealign (w[45], w[46], offset); - w[50] = amd_bytealign (w[44], w[45], offset); - w[49] = amd_bytealign (w[43], w[44], offset); - w[48] = amd_bytealign (w[42], w[43], offset); - w[47] = amd_bytealign (w[41], w[42], offset); - w[46] = amd_bytealign (w[40], w[41], offset); - w[45] = amd_bytealign (w[39], w[40], offset); - w[44] = amd_bytealign (w[38], w[39], offset); - w[43] = amd_bytealign (w[37], w[38], offset); - w[42] = amd_bytealign (w[36], w[37], offset); - w[41] = amd_bytealign (w[35], w[36], offset); - w[40] = amd_bytealign (w[34], w[35], offset); - w[39] = amd_bytealign (w[33], w[34], offset); - w[38] = amd_bytealign (w[32], w[33], offset); - w[37] = amd_bytealign (w[31], w[32], offset); - w[36] = amd_bytealign (w[30], w[31], offset); - w[35] = amd_bytealign (w[29], w[30], offset); - w[34] = amd_bytealign (w[28], w[29], offset); - w[33] = amd_bytealign (w[27], w[28], offset); - w[32] = amd_bytealign (w[26], w[27], offset); - w[31] = amd_bytealign (w[25], w[26], offset); - w[30] = amd_bytealign (w[24], w[25], offset); - w[29] = amd_bytealign (w[23], w[24], offset); - w[28] = amd_bytealign (w[22], w[23], offset); - w[27] = amd_bytealign (w[21], w[22], offset); - w[26] = amd_bytealign (w[20], w[21], offset); - w[25] = amd_bytealign (w[19], w[20], offset); - w[24] = amd_bytealign (w[18], w[19], offset); - w[23] = amd_bytealign (w[17], w[18], offset); - w[22] = amd_bytealign (w[16], w[17], offset); - w[21] = amd_bytealign (w[15], w[16], offset); - w[20] = amd_bytealign (w[14], w[15], offset); - w[19] = amd_bytealign (w[13], w[14], offset); - w[18] = amd_bytealign (w[12], w[13], offset); - w[17] = amd_bytealign (w[11], w[12], offset); - w[16] = amd_bytealign (w[10], w[11], offset); - w[15] = amd_bytealign (w[ 9], w[10], offset); - w[14] = amd_bytealign (w[ 8], w[ 9], offset); - w[13] = amd_bytealign (w[ 7], w[ 8], offset); - w[12] = amd_bytealign (w[ 6], w[ 7], offset); - w[11] = amd_bytealign (w[ 5], w[ 6], offset); - w[10] = amd_bytealign (w[ 4], w[ 5], offset); - w[ 9] = amd_bytealign (w[ 3], w[ 4], offset); - w[ 8] = amd_bytealign (w[ 2], w[ 3], offset); - w[ 7] = amd_bytealign (w[ 1], w[ 2], offset); - w[ 6] = amd_bytealign (w[ 0], w[ 1], offset); - w[ 5] = amd_bytealign ( 0, w[ 0], offset); + w[63] = hc_bytealign (w[57], w[58], offset); + w[62] = hc_bytealign (w[56], w[57], offset); + w[61] = hc_bytealign (w[55], w[56], offset); + w[60] = hc_bytealign (w[54], w[55], offset); + w[59] = hc_bytealign (w[53], w[54], offset); + w[58] = hc_bytealign (w[52], w[53], offset); + w[57] = hc_bytealign (w[51], w[52], offset); + w[56] = hc_bytealign (w[50], w[51], offset); + w[55] = hc_bytealign (w[49], w[50], offset); + w[54] = hc_bytealign (w[48], w[49], offset); + w[53] = hc_bytealign (w[47], w[48], offset); + w[52] = hc_bytealign (w[46], w[47], offset); + w[51] = hc_bytealign (w[45], w[46], offset); + w[50] = hc_bytealign (w[44], w[45], offset); + w[49] = hc_bytealign (w[43], w[44], offset); + w[48] = hc_bytealign (w[42], w[43], offset); + w[47] = hc_bytealign (w[41], w[42], offset); + w[46] = hc_bytealign (w[40], w[41], offset); + w[45] = hc_bytealign (w[39], w[40], offset); + w[44] = hc_bytealign (w[38], w[39], offset); + w[43] = hc_bytealign (w[37], w[38], offset); + w[42] = hc_bytealign (w[36], w[37], offset); + w[41] = hc_bytealign (w[35], w[36], offset); + w[40] = hc_bytealign (w[34], w[35], offset); + w[39] = hc_bytealign (w[33], w[34], offset); + w[38] = hc_bytealign (w[32], w[33], offset); + w[37] = hc_bytealign (w[31], w[32], offset); + w[36] = hc_bytealign (w[30], w[31], offset); + w[35] = hc_bytealign (w[29], w[30], offset); + w[34] = hc_bytealign (w[28], w[29], offset); + w[33] = hc_bytealign (w[27], w[28], offset); + w[32] = hc_bytealign (w[26], w[27], offset); + w[31] = hc_bytealign (w[25], w[26], offset); + w[30] = hc_bytealign (w[24], w[25], offset); + w[29] = hc_bytealign (w[23], w[24], offset); + w[28] = hc_bytealign (w[22], w[23], offset); + w[27] = hc_bytealign (w[21], w[22], offset); + w[26] = hc_bytealign (w[20], w[21], offset); + w[25] = hc_bytealign (w[19], w[20], offset); + w[24] = hc_bytealign (w[18], w[19], offset); + w[23] = hc_bytealign (w[17], w[18], offset); + w[22] = hc_bytealign (w[16], w[17], offset); + w[21] = hc_bytealign (w[15], w[16], offset); + w[20] = hc_bytealign (w[14], w[15], offset); + w[19] = hc_bytealign (w[13], w[14], offset); + w[18] = hc_bytealign (w[12], w[13], offset); + w[17] = hc_bytealign (w[11], w[12], offset); + w[16] = hc_bytealign (w[10], w[11], offset); + w[15] = hc_bytealign (w[ 9], w[10], offset); + w[14] = hc_bytealign (w[ 8], w[ 9], offset); + w[13] = hc_bytealign (w[ 7], w[ 8], offset); + w[12] = hc_bytealign (w[ 6], w[ 7], offset); + w[11] = hc_bytealign (w[ 5], w[ 6], offset); + w[10] = hc_bytealign (w[ 4], w[ 5], offset); + w[ 9] = hc_bytealign (w[ 3], w[ 4], offset); + w[ 8] = hc_bytealign (w[ 2], w[ 3], offset); + w[ 7] = hc_bytealign (w[ 1], w[ 2], offset); + w[ 6] = hc_bytealign (w[ 0], w[ 1], offset); + w[ 5] = hc_bytealign ( 0, w[ 0], offset); w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; @@ -20806,64 +20806,64 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; case 6: - w[63] = amd_bytealign (w[56], w[57], offset); - w[62] = amd_bytealign (w[55], w[56], offset); - w[61] = amd_bytealign (w[54], w[55], offset); - w[60] = amd_bytealign (w[53], w[54], offset); - w[59] = amd_bytealign (w[52], w[53], offset); - w[58] = amd_bytealign (w[51], w[52], offset); - w[57] = amd_bytealign (w[50], w[51], offset); - w[56] = amd_bytealign (w[49], w[50], offset); - w[55] = amd_bytealign (w[48], w[49], offset); - w[54] = amd_bytealign (w[47], w[48], offset); - w[53] = amd_bytealign (w[46], w[47], offset); - w[52] = amd_bytealign (w[45], w[46], offset); - w[51] = amd_bytealign (w[44], w[45], offset); - w[50] = amd_bytealign (w[43], w[44], offset); - w[49] = amd_bytealign (w[42], w[43], offset); - w[48] = amd_bytealign (w[41], w[42], offset); - w[47] = amd_bytealign (w[40], w[41], offset); - w[46] = amd_bytealign (w[39], w[40], offset); - w[45] = amd_bytealign (w[38], w[39], offset); - w[44] = amd_bytealign (w[37], w[38], offset); - w[43] = amd_bytealign (w[36], w[37], offset); - w[42] = amd_bytealign (w[35], w[36], offset); - w[41] = amd_bytealign (w[34], w[35], offset); - w[40] = amd_bytealign (w[33], w[34], offset); - w[39] = amd_bytealign (w[32], w[33], offset); - w[38] = amd_bytealign (w[31], w[32], offset); - w[37] = amd_bytealign (w[30], w[31], offset); - w[36] = amd_bytealign (w[29], w[30], offset); - w[35] = amd_bytealign (w[28], w[29], offset); - w[34] = amd_bytealign (w[27], w[28], offset); - w[33] = amd_bytealign (w[26], w[27], offset); - w[32] = amd_bytealign (w[25], w[26], offset); - w[31] = amd_bytealign (w[24], w[25], offset); - w[30] = amd_bytealign (w[23], w[24], offset); - w[29] = amd_bytealign (w[22], w[23], offset); - w[28] = amd_bytealign (w[21], w[22], offset); - w[27] = amd_bytealign (w[20], w[21], offset); - w[26] = amd_bytealign (w[19], w[20], offset); - w[25] = amd_bytealign (w[18], w[19], offset); - w[24] = amd_bytealign (w[17], w[18], offset); - w[23] = amd_bytealign (w[16], w[17], offset); - w[22] = amd_bytealign (w[15], w[16], offset); - w[21] = amd_bytealign (w[14], w[15], offset); - w[20] = amd_bytealign (w[13], w[14], offset); - w[19] = amd_bytealign (w[12], w[13], offset); - w[18] = amd_bytealign (w[11], w[12], offset); - w[17] = amd_bytealign (w[10], w[11], offset); - w[16] = amd_bytealign (w[ 9], w[10], offset); - w[15] = amd_bytealign (w[ 8], w[ 9], offset); - w[14] = amd_bytealign (w[ 7], w[ 8], offset); - w[13] = amd_bytealign (w[ 6], w[ 7], offset); - w[12] = amd_bytealign (w[ 5], w[ 6], offset); - w[11] = amd_bytealign (w[ 4], w[ 5], offset); - w[10] = amd_bytealign (w[ 3], w[ 4], offset); - w[ 9] = amd_bytealign (w[ 2], w[ 3], offset); - w[ 8] = amd_bytealign (w[ 1], w[ 2], offset); - w[ 7] = amd_bytealign (w[ 0], w[ 1], offset); - w[ 6] = amd_bytealign ( 0, w[ 0], offset); + w[63] = hc_bytealign (w[56], w[57], offset); + w[62] = hc_bytealign (w[55], w[56], offset); + w[61] = hc_bytealign (w[54], w[55], offset); + w[60] = hc_bytealign (w[53], w[54], offset); + w[59] = hc_bytealign (w[52], w[53], offset); + w[58] = hc_bytealign (w[51], w[52], offset); + w[57] = hc_bytealign (w[50], w[51], offset); + w[56] = hc_bytealign (w[49], w[50], offset); + w[55] = hc_bytealign (w[48], w[49], offset); + w[54] = hc_bytealign (w[47], w[48], offset); + w[53] = hc_bytealign (w[46], w[47], offset); + w[52] = hc_bytealign (w[45], w[46], offset); + w[51] = hc_bytealign (w[44], w[45], offset); + w[50] = hc_bytealign (w[43], w[44], offset); + w[49] = hc_bytealign (w[42], w[43], offset); + w[48] = hc_bytealign (w[41], w[42], offset); + w[47] = hc_bytealign (w[40], w[41], offset); + w[46] = hc_bytealign (w[39], w[40], offset); + w[45] = hc_bytealign (w[38], w[39], offset); + w[44] = hc_bytealign (w[37], w[38], offset); + w[43] = hc_bytealign (w[36], w[37], offset); + w[42] = hc_bytealign (w[35], w[36], offset); + w[41] = hc_bytealign (w[34], w[35], offset); + w[40] = hc_bytealign (w[33], w[34], offset); + w[39] = hc_bytealign (w[32], w[33], offset); + w[38] = hc_bytealign (w[31], w[32], offset); + w[37] = hc_bytealign (w[30], w[31], offset); + w[36] = hc_bytealign (w[29], w[30], offset); + w[35] = hc_bytealign (w[28], w[29], offset); + w[34] = hc_bytealign (w[27], w[28], offset); + w[33] = hc_bytealign (w[26], w[27], offset); + w[32] = hc_bytealign (w[25], w[26], offset); + w[31] = hc_bytealign (w[24], w[25], offset); + w[30] = hc_bytealign (w[23], w[24], offset); + w[29] = hc_bytealign (w[22], w[23], offset); + w[28] = hc_bytealign (w[21], w[22], offset); + w[27] = hc_bytealign (w[20], w[21], offset); + w[26] = hc_bytealign (w[19], w[20], offset); + w[25] = hc_bytealign (w[18], w[19], offset); + w[24] = hc_bytealign (w[17], w[18], offset); + w[23] = hc_bytealign (w[16], w[17], offset); + w[22] = hc_bytealign (w[15], w[16], offset); + w[21] = hc_bytealign (w[14], w[15], offset); + w[20] = hc_bytealign (w[13], w[14], offset); + w[19] = hc_bytealign (w[12], w[13], offset); + w[18] = hc_bytealign (w[11], w[12], offset); + w[17] = hc_bytealign (w[10], w[11], offset); + w[16] = hc_bytealign (w[ 9], w[10], offset); + w[15] = hc_bytealign (w[ 8], w[ 9], offset); + w[14] = hc_bytealign (w[ 7], w[ 8], offset); + w[13] = hc_bytealign (w[ 6], w[ 7], offset); + w[12] = hc_bytealign (w[ 5], w[ 6], offset); + w[11] = hc_bytealign (w[ 4], w[ 5], offset); + w[10] = hc_bytealign (w[ 3], w[ 4], offset); + w[ 9] = hc_bytealign (w[ 2], w[ 3], offset); + w[ 8] = hc_bytealign (w[ 1], w[ 2], offset); + w[ 7] = hc_bytealign (w[ 0], w[ 1], offset); + w[ 6] = hc_bytealign ( 0, w[ 0], offset); w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; @@ -20874,63 +20874,63 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; case 7: - w[63] = amd_bytealign (w[55], w[56], offset); - w[62] = amd_bytealign (w[54], w[55], offset); - w[61] = amd_bytealign (w[53], w[54], offset); - w[60] = amd_bytealign (w[52], w[53], offset); - w[59] = amd_bytealign (w[51], w[52], offset); - w[58] = amd_bytealign (w[50], w[51], offset); - w[57] = amd_bytealign (w[49], w[50], offset); - w[56] = amd_bytealign (w[48], w[49], offset); - w[55] = amd_bytealign (w[47], w[48], offset); - w[54] = amd_bytealign (w[46], w[47], offset); - w[53] = amd_bytealign (w[45], w[46], offset); - w[52] = amd_bytealign (w[44], w[45], offset); - w[51] = amd_bytealign (w[43], w[44], offset); - w[50] = amd_bytealign (w[42], w[43], offset); - w[49] = amd_bytealign (w[41], w[42], offset); - w[48] = amd_bytealign (w[40], w[41], offset); - w[47] = amd_bytealign (w[39], w[40], offset); - w[46] = amd_bytealign (w[38], w[39], offset); - w[45] = amd_bytealign (w[37], w[38], offset); - w[44] = amd_bytealign (w[36], w[37], offset); - w[43] = amd_bytealign (w[35], w[36], offset); - w[42] = amd_bytealign (w[34], w[35], offset); - w[41] = amd_bytealign (w[33], w[34], offset); - w[40] = amd_bytealign (w[32], w[33], offset); - w[39] = amd_bytealign (w[31], w[32], offset); - w[38] = amd_bytealign (w[30], w[31], offset); - w[37] = amd_bytealign (w[29], w[30], offset); - w[36] = amd_bytealign (w[28], w[29], offset); - w[35] = amd_bytealign (w[27], w[28], offset); - w[34] = amd_bytealign (w[26], w[27], offset); - w[33] = amd_bytealign (w[25], w[26], offset); - w[32] = amd_bytealign (w[24], w[25], offset); - w[31] = amd_bytealign (w[23], w[24], offset); - w[30] = amd_bytealign (w[22], w[23], offset); - w[29] = amd_bytealign (w[21], w[22], offset); - w[28] = amd_bytealign (w[20], w[21], offset); - w[27] = amd_bytealign (w[19], w[20], offset); - w[26] = amd_bytealign (w[18], w[19], offset); - w[25] = amd_bytealign (w[17], w[18], offset); - w[24] = amd_bytealign (w[16], w[17], offset); - w[23] = amd_bytealign (w[15], w[16], offset); - w[22] = amd_bytealign (w[14], w[15], offset); - w[21] = amd_bytealign (w[13], w[14], offset); - w[20] = amd_bytealign (w[12], w[13], offset); - w[19] = amd_bytealign (w[11], w[12], offset); - w[18] = amd_bytealign (w[10], w[11], offset); - w[17] = amd_bytealign (w[ 9], w[10], offset); - w[16] = amd_bytealign (w[ 8], w[ 9], offset); - w[15] = amd_bytealign (w[ 7], w[ 8], offset); - w[14] = amd_bytealign (w[ 6], w[ 7], offset); - w[13] = amd_bytealign (w[ 5], w[ 6], offset); - w[12] = amd_bytealign (w[ 4], w[ 5], offset); - w[11] = amd_bytealign (w[ 3], w[ 4], offset); - w[10] = amd_bytealign (w[ 2], w[ 3], offset); - w[ 9] = amd_bytealign (w[ 1], w[ 2], offset); - w[ 8] = amd_bytealign (w[ 0], w[ 1], offset); - w[ 7] = amd_bytealign ( 0, w[ 0], offset); + w[63] = hc_bytealign (w[55], w[56], offset); + w[62] = hc_bytealign (w[54], w[55], offset); + w[61] = hc_bytealign (w[53], w[54], offset); + w[60] = hc_bytealign (w[52], w[53], offset); + w[59] = hc_bytealign (w[51], w[52], offset); + w[58] = hc_bytealign (w[50], w[51], offset); + w[57] = hc_bytealign (w[49], w[50], offset); + w[56] = hc_bytealign (w[48], w[49], offset); + w[55] = hc_bytealign (w[47], w[48], offset); + w[54] = hc_bytealign (w[46], w[47], offset); + w[53] = hc_bytealign (w[45], w[46], offset); + w[52] = hc_bytealign (w[44], w[45], offset); + w[51] = hc_bytealign (w[43], w[44], offset); + w[50] = hc_bytealign (w[42], w[43], offset); + w[49] = hc_bytealign (w[41], w[42], offset); + w[48] = hc_bytealign (w[40], w[41], offset); + w[47] = hc_bytealign (w[39], w[40], offset); + w[46] = hc_bytealign (w[38], w[39], offset); + w[45] = hc_bytealign (w[37], w[38], offset); + w[44] = hc_bytealign (w[36], w[37], offset); + w[43] = hc_bytealign (w[35], w[36], offset); + w[42] = hc_bytealign (w[34], w[35], offset); + w[41] = hc_bytealign (w[33], w[34], offset); + w[40] = hc_bytealign (w[32], w[33], offset); + w[39] = hc_bytealign (w[31], w[32], offset); + w[38] = hc_bytealign (w[30], w[31], offset); + w[37] = hc_bytealign (w[29], w[30], offset); + w[36] = hc_bytealign (w[28], w[29], offset); + w[35] = hc_bytealign (w[27], w[28], offset); + w[34] = hc_bytealign (w[26], w[27], offset); + w[33] = hc_bytealign (w[25], w[26], offset); + w[32] = hc_bytealign (w[24], w[25], offset); + w[31] = hc_bytealign (w[23], w[24], offset); + w[30] = hc_bytealign (w[22], w[23], offset); + w[29] = hc_bytealign (w[21], w[22], offset); + w[28] = hc_bytealign (w[20], w[21], offset); + w[27] = hc_bytealign (w[19], w[20], offset); + w[26] = hc_bytealign (w[18], w[19], offset); + w[25] = hc_bytealign (w[17], w[18], offset); + w[24] = hc_bytealign (w[16], w[17], offset); + w[23] = hc_bytealign (w[15], w[16], offset); + w[22] = hc_bytealign (w[14], w[15], offset); + w[21] = hc_bytealign (w[13], w[14], offset); + w[20] = hc_bytealign (w[12], w[13], offset); + w[19] = hc_bytealign (w[11], w[12], offset); + w[18] = hc_bytealign (w[10], w[11], offset); + w[17] = hc_bytealign (w[ 9], w[10], offset); + w[16] = hc_bytealign (w[ 8], w[ 9], offset); + w[15] = hc_bytealign (w[ 7], w[ 8], offset); + w[14] = hc_bytealign (w[ 6], w[ 7], offset); + w[13] = hc_bytealign (w[ 5], w[ 6], offset); + w[12] = hc_bytealign (w[ 4], w[ 5], offset); + w[11] = hc_bytealign (w[ 3], w[ 4], offset); + w[10] = hc_bytealign (w[ 2], w[ 3], offset); + w[ 9] = hc_bytealign (w[ 1], w[ 2], offset); + w[ 8] = hc_bytealign (w[ 0], w[ 1], offset); + w[ 7] = hc_bytealign ( 0, w[ 0], offset); w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; @@ -20942,62 +20942,62 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; case 8: - w[63] = amd_bytealign (w[54], w[55], offset); - w[62] = amd_bytealign (w[53], w[54], offset); - w[61] = amd_bytealign (w[52], w[53], offset); - w[60] = amd_bytealign (w[51], w[52], offset); - w[59] = amd_bytealign (w[50], w[51], offset); - w[58] = amd_bytealign (w[49], w[50], offset); - w[57] = amd_bytealign (w[48], w[49], offset); - w[56] = amd_bytealign (w[47], w[48], offset); - w[55] = amd_bytealign (w[46], w[47], offset); - w[54] = amd_bytealign (w[45], w[46], offset); - w[53] = amd_bytealign (w[44], w[45], offset); - w[52] = amd_bytealign (w[43], w[44], offset); - w[51] = amd_bytealign (w[42], w[43], offset); - w[50] = amd_bytealign (w[41], w[42], offset); - w[49] = amd_bytealign (w[40], w[41], offset); - w[48] = amd_bytealign (w[39], w[40], offset); - w[47] = amd_bytealign (w[38], w[39], offset); - w[46] = amd_bytealign (w[37], w[38], offset); - w[45] = amd_bytealign (w[36], w[37], offset); - w[44] = amd_bytealign (w[35], w[36], offset); - w[43] = amd_bytealign (w[34], w[35], offset); - w[42] = amd_bytealign (w[33], w[34], offset); - w[41] = amd_bytealign (w[32], w[33], offset); - w[40] = amd_bytealign (w[31], w[32], offset); - w[39] = amd_bytealign (w[30], w[31], offset); - w[38] = amd_bytealign (w[29], w[30], offset); - w[37] = amd_bytealign (w[28], w[29], offset); - w[36] = amd_bytealign (w[27], w[28], offset); - w[35] = amd_bytealign (w[26], w[27], offset); - w[34] = amd_bytealign (w[25], w[26], offset); - w[33] = amd_bytealign (w[24], w[25], offset); - w[32] = amd_bytealign (w[23], w[24], offset); - w[31] = amd_bytealign (w[22], w[23], offset); - w[30] = amd_bytealign (w[21], w[22], offset); - w[29] = amd_bytealign (w[20], w[21], offset); - w[28] = amd_bytealign (w[19], w[20], offset); - w[27] = amd_bytealign (w[18], w[19], offset); - w[26] = amd_bytealign (w[17], w[18], offset); - w[25] = amd_bytealign (w[16], w[17], offset); - w[24] = amd_bytealign (w[15], w[16], offset); - w[23] = amd_bytealign (w[14], w[15], offset); - w[22] = amd_bytealign (w[13], w[14], offset); - w[21] = amd_bytealign (w[12], w[13], offset); - w[20] = amd_bytealign (w[11], w[12], offset); - w[19] = amd_bytealign (w[10], w[11], offset); - w[18] = amd_bytealign (w[ 9], w[10], offset); - w[17] = amd_bytealign (w[ 8], w[ 9], offset); - w[16] = amd_bytealign (w[ 7], w[ 8], offset); - w[15] = amd_bytealign (w[ 6], w[ 7], offset); - w[14] = amd_bytealign (w[ 5], w[ 6], offset); - w[13] = amd_bytealign (w[ 4], w[ 5], offset); - w[12] = amd_bytealign (w[ 3], w[ 4], offset); - w[11] = amd_bytealign (w[ 2], w[ 3], offset); - w[10] = amd_bytealign (w[ 1], w[ 2], offset); - w[ 9] = amd_bytealign (w[ 0], w[ 1], offset); - w[ 8] = amd_bytealign ( 0, w[ 0], offset); + w[63] = hc_bytealign (w[54], w[55], offset); + w[62] = hc_bytealign (w[53], w[54], offset); + w[61] = hc_bytealign (w[52], w[53], offset); + w[60] = hc_bytealign (w[51], w[52], offset); + w[59] = hc_bytealign (w[50], w[51], offset); + w[58] = hc_bytealign (w[49], w[50], offset); + w[57] = hc_bytealign (w[48], w[49], offset); + w[56] = hc_bytealign (w[47], w[48], offset); + w[55] = hc_bytealign (w[46], w[47], offset); + w[54] = hc_bytealign (w[45], w[46], offset); + w[53] = hc_bytealign (w[44], w[45], offset); + w[52] = hc_bytealign (w[43], w[44], offset); + w[51] = hc_bytealign (w[42], w[43], offset); + w[50] = hc_bytealign (w[41], w[42], offset); + w[49] = hc_bytealign (w[40], w[41], offset); + w[48] = hc_bytealign (w[39], w[40], offset); + w[47] = hc_bytealign (w[38], w[39], offset); + w[46] = hc_bytealign (w[37], w[38], offset); + w[45] = hc_bytealign (w[36], w[37], offset); + w[44] = hc_bytealign (w[35], w[36], offset); + w[43] = hc_bytealign (w[34], w[35], offset); + w[42] = hc_bytealign (w[33], w[34], offset); + w[41] = hc_bytealign (w[32], w[33], offset); + w[40] = hc_bytealign (w[31], w[32], offset); + w[39] = hc_bytealign (w[30], w[31], offset); + w[38] = hc_bytealign (w[29], w[30], offset); + w[37] = hc_bytealign (w[28], w[29], offset); + w[36] = hc_bytealign (w[27], w[28], offset); + w[35] = hc_bytealign (w[26], w[27], offset); + w[34] = hc_bytealign (w[25], w[26], offset); + w[33] = hc_bytealign (w[24], w[25], offset); + w[32] = hc_bytealign (w[23], w[24], offset); + w[31] = hc_bytealign (w[22], w[23], offset); + w[30] = hc_bytealign (w[21], w[22], offset); + w[29] = hc_bytealign (w[20], w[21], offset); + w[28] = hc_bytealign (w[19], w[20], offset); + w[27] = hc_bytealign (w[18], w[19], offset); + w[26] = hc_bytealign (w[17], w[18], offset); + w[25] = hc_bytealign (w[16], w[17], offset); + w[24] = hc_bytealign (w[15], w[16], offset); + w[23] = hc_bytealign (w[14], w[15], offset); + w[22] = hc_bytealign (w[13], w[14], offset); + w[21] = hc_bytealign (w[12], w[13], offset); + w[20] = hc_bytealign (w[11], w[12], offset); + w[19] = hc_bytealign (w[10], w[11], offset); + w[18] = hc_bytealign (w[ 9], w[10], offset); + w[17] = hc_bytealign (w[ 8], w[ 9], offset); + w[16] = hc_bytealign (w[ 7], w[ 8], offset); + w[15] = hc_bytealign (w[ 6], w[ 7], offset); + w[14] = hc_bytealign (w[ 5], w[ 6], offset); + w[13] = hc_bytealign (w[ 4], w[ 5], offset); + w[12] = hc_bytealign (w[ 3], w[ 4], offset); + w[11] = hc_bytealign (w[ 2], w[ 3], offset); + w[10] = hc_bytealign (w[ 1], w[ 2], offset); + w[ 9] = hc_bytealign (w[ 0], w[ 1], offset); + w[ 8] = hc_bytealign ( 0, w[ 0], offset); w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; @@ -21010,61 +21010,61 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; case 9: - w[63] = amd_bytealign (w[53], w[54], offset); - w[62] = amd_bytealign (w[52], w[53], offset); - w[61] = amd_bytealign (w[51], w[52], offset); - w[60] = amd_bytealign (w[50], w[51], offset); - w[59] = amd_bytealign (w[49], w[50], offset); - w[58] = amd_bytealign (w[48], w[49], offset); - w[57] = amd_bytealign (w[47], w[48], offset); - w[56] = amd_bytealign (w[46], w[47], offset); - w[55] = amd_bytealign (w[45], w[46], offset); - w[54] = amd_bytealign (w[44], w[45], offset); - w[53] = amd_bytealign (w[43], w[44], offset); - w[52] = amd_bytealign (w[42], w[43], offset); - w[51] = amd_bytealign (w[41], w[42], offset); - w[50] = amd_bytealign (w[40], w[41], offset); - w[49] = amd_bytealign (w[39], w[40], offset); - w[48] = amd_bytealign (w[38], w[39], offset); - w[47] = amd_bytealign (w[37], w[38], offset); - w[46] = amd_bytealign (w[36], w[37], offset); - w[45] = amd_bytealign (w[35], w[36], offset); - w[44] = amd_bytealign (w[34], w[35], offset); - w[43] = amd_bytealign (w[33], w[34], offset); - w[42] = amd_bytealign (w[32], w[33], offset); - w[41] = amd_bytealign (w[31], w[32], offset); - w[40] = amd_bytealign (w[30], w[31], offset); - w[39] = amd_bytealign (w[29], w[30], offset); - w[38] = amd_bytealign (w[28], w[29], offset); - w[37] = amd_bytealign (w[27], w[28], offset); - w[36] = amd_bytealign (w[26], w[27], offset); - w[35] = amd_bytealign (w[25], w[26], offset); - w[34] = amd_bytealign (w[24], w[25], offset); - w[33] = amd_bytealign (w[23], w[24], offset); - w[32] = amd_bytealign (w[22], w[23], offset); - w[31] = amd_bytealign (w[21], w[22], offset); - w[30] = amd_bytealign (w[20], w[21], offset); - w[29] = amd_bytealign (w[19], w[20], offset); - w[28] = amd_bytealign (w[18], w[19], offset); - w[27] = amd_bytealign (w[17], w[18], offset); - w[26] = amd_bytealign (w[16], w[17], offset); - w[25] = amd_bytealign (w[15], w[16], offset); - w[24] = amd_bytealign (w[14], w[15], offset); - w[23] = amd_bytealign (w[13], w[14], offset); - w[22] = amd_bytealign (w[12], w[13], offset); - w[21] = amd_bytealign (w[11], w[12], offset); - w[20] = amd_bytealign (w[10], w[11], offset); - w[19] = amd_bytealign (w[ 9], w[10], offset); - w[18] = amd_bytealign (w[ 8], w[ 9], offset); - w[17] = amd_bytealign (w[ 7], w[ 8], offset); - w[16] = amd_bytealign (w[ 6], w[ 7], offset); - w[15] = amd_bytealign (w[ 5], w[ 6], offset); - w[14] = amd_bytealign (w[ 4], w[ 5], offset); - w[13] = amd_bytealign (w[ 3], w[ 4], offset); - w[12] = amd_bytealign (w[ 2], w[ 3], offset); - w[11] = amd_bytealign (w[ 1], w[ 2], offset); - w[10] = amd_bytealign (w[ 0], w[ 1], offset); - w[ 9] = amd_bytealign ( 0, w[ 0], offset); + w[63] = hc_bytealign (w[53], w[54], offset); + w[62] = hc_bytealign (w[52], w[53], offset); + w[61] = hc_bytealign (w[51], w[52], offset); + w[60] = hc_bytealign (w[50], w[51], offset); + w[59] = hc_bytealign (w[49], w[50], offset); + w[58] = hc_bytealign (w[48], w[49], offset); + w[57] = hc_bytealign (w[47], w[48], offset); + w[56] = hc_bytealign (w[46], w[47], offset); + w[55] = hc_bytealign (w[45], w[46], offset); + w[54] = hc_bytealign (w[44], w[45], offset); + w[53] = hc_bytealign (w[43], w[44], offset); + w[52] = hc_bytealign (w[42], w[43], offset); + w[51] = hc_bytealign (w[41], w[42], offset); + w[50] = hc_bytealign (w[40], w[41], offset); + w[49] = hc_bytealign (w[39], w[40], offset); + w[48] = hc_bytealign (w[38], w[39], offset); + w[47] = hc_bytealign (w[37], w[38], offset); + w[46] = hc_bytealign (w[36], w[37], offset); + w[45] = hc_bytealign (w[35], w[36], offset); + w[44] = hc_bytealign (w[34], w[35], offset); + w[43] = hc_bytealign (w[33], w[34], offset); + w[42] = hc_bytealign (w[32], w[33], offset); + w[41] = hc_bytealign (w[31], w[32], offset); + w[40] = hc_bytealign (w[30], w[31], offset); + w[39] = hc_bytealign (w[29], w[30], offset); + w[38] = hc_bytealign (w[28], w[29], offset); + w[37] = hc_bytealign (w[27], w[28], offset); + w[36] = hc_bytealign (w[26], w[27], offset); + w[35] = hc_bytealign (w[25], w[26], offset); + w[34] = hc_bytealign (w[24], w[25], offset); + w[33] = hc_bytealign (w[23], w[24], offset); + w[32] = hc_bytealign (w[22], w[23], offset); + w[31] = hc_bytealign (w[21], w[22], offset); + w[30] = hc_bytealign (w[20], w[21], offset); + w[29] = hc_bytealign (w[19], w[20], offset); + w[28] = hc_bytealign (w[18], w[19], offset); + w[27] = hc_bytealign (w[17], w[18], offset); + w[26] = hc_bytealign (w[16], w[17], offset); + w[25] = hc_bytealign (w[15], w[16], offset); + w[24] = hc_bytealign (w[14], w[15], offset); + w[23] = hc_bytealign (w[13], w[14], offset); + w[22] = hc_bytealign (w[12], w[13], offset); + w[21] = hc_bytealign (w[11], w[12], offset); + w[20] = hc_bytealign (w[10], w[11], offset); + w[19] = hc_bytealign (w[ 9], w[10], offset); + w[18] = hc_bytealign (w[ 8], w[ 9], offset); + w[17] = hc_bytealign (w[ 7], w[ 8], offset); + w[16] = hc_bytealign (w[ 6], w[ 7], offset); + w[15] = hc_bytealign (w[ 5], w[ 6], offset); + w[14] = hc_bytealign (w[ 4], w[ 5], offset); + w[13] = hc_bytealign (w[ 3], w[ 4], offset); + w[12] = hc_bytealign (w[ 2], w[ 3], offset); + w[11] = hc_bytealign (w[ 1], w[ 2], offset); + w[10] = hc_bytealign (w[ 0], w[ 1], offset); + w[ 9] = hc_bytealign ( 0, w[ 0], offset); w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; @@ -21078,60 +21078,60 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; case 10: - w[63] = amd_bytealign (w[52], w[53], offset); - w[62] = amd_bytealign (w[51], w[52], offset); - w[61] = amd_bytealign (w[50], w[51], offset); - w[60] = amd_bytealign (w[49], w[50], offset); - w[59] = amd_bytealign (w[48], w[49], offset); - w[58] = amd_bytealign (w[47], w[48], offset); - w[57] = amd_bytealign (w[46], w[47], offset); - w[56] = amd_bytealign (w[45], w[46], offset); - w[55] = amd_bytealign (w[44], w[45], offset); - w[54] = amd_bytealign (w[43], w[44], offset); - w[53] = amd_bytealign (w[42], w[43], offset); - w[52] = amd_bytealign (w[41], w[42], offset); - w[51] = amd_bytealign (w[40], w[41], offset); - w[50] = amd_bytealign (w[39], w[40], offset); - w[49] = amd_bytealign (w[38], w[39], offset); - w[48] = amd_bytealign (w[37], w[38], offset); - w[47] = amd_bytealign (w[36], w[37], offset); - w[46] = amd_bytealign (w[35], w[36], offset); - w[45] = amd_bytealign (w[34], w[35], offset); - w[44] = amd_bytealign (w[33], w[34], offset); - w[43] = amd_bytealign (w[32], w[33], offset); - w[42] = amd_bytealign (w[31], w[32], offset); - w[41] = amd_bytealign (w[30], w[31], offset); - w[40] = amd_bytealign (w[29], w[30], offset); - w[39] = amd_bytealign (w[28], w[29], offset); - w[38] = amd_bytealign (w[27], w[28], offset); - w[37] = amd_bytealign (w[26], w[27], offset); - w[36] = amd_bytealign (w[25], w[26], offset); - w[35] = amd_bytealign (w[24], w[25], offset); - w[34] = amd_bytealign (w[23], w[24], offset); - w[33] = amd_bytealign (w[22], w[23], offset); - w[32] = amd_bytealign (w[21], w[22], offset); - w[31] = amd_bytealign (w[20], w[21], offset); - w[30] = amd_bytealign (w[19], w[20], offset); - w[29] = amd_bytealign (w[18], w[19], offset); - w[28] = amd_bytealign (w[17], w[18], offset); - w[27] = amd_bytealign (w[16], w[17], offset); - w[26] = amd_bytealign (w[15], w[16], offset); - w[25] = amd_bytealign (w[14], w[15], offset); - w[24] = amd_bytealign (w[13], w[14], offset); - w[23] = amd_bytealign (w[12], w[13], offset); - w[22] = amd_bytealign (w[11], w[12], offset); - w[21] = amd_bytealign (w[10], w[11], offset); - w[20] = amd_bytealign (w[ 9], w[10], offset); - w[19] = amd_bytealign (w[ 8], w[ 9], offset); - w[18] = amd_bytealign (w[ 7], w[ 8], offset); - w[17] = amd_bytealign (w[ 6], w[ 7], offset); - w[16] = amd_bytealign (w[ 5], w[ 6], offset); - w[15] = amd_bytealign (w[ 4], w[ 5], offset); - w[14] = amd_bytealign (w[ 3], w[ 4], offset); - w[13] = amd_bytealign (w[ 2], w[ 3], offset); - w[12] = amd_bytealign (w[ 1], w[ 2], offset); - w[11] = amd_bytealign (w[ 0], w[ 1], offset); - w[10] = amd_bytealign ( 0, w[ 0], offset); + w[63] = hc_bytealign (w[52], w[53], offset); + w[62] = hc_bytealign (w[51], w[52], offset); + w[61] = hc_bytealign (w[50], w[51], offset); + w[60] = hc_bytealign (w[49], w[50], offset); + w[59] = hc_bytealign (w[48], w[49], offset); + w[58] = hc_bytealign (w[47], w[48], offset); + w[57] = hc_bytealign (w[46], w[47], offset); + w[56] = hc_bytealign (w[45], w[46], offset); + w[55] = hc_bytealign (w[44], w[45], offset); + w[54] = hc_bytealign (w[43], w[44], offset); + w[53] = hc_bytealign (w[42], w[43], offset); + w[52] = hc_bytealign (w[41], w[42], offset); + w[51] = hc_bytealign (w[40], w[41], offset); + w[50] = hc_bytealign (w[39], w[40], offset); + w[49] = hc_bytealign (w[38], w[39], offset); + w[48] = hc_bytealign (w[37], w[38], offset); + w[47] = hc_bytealign (w[36], w[37], offset); + w[46] = hc_bytealign (w[35], w[36], offset); + w[45] = hc_bytealign (w[34], w[35], offset); + w[44] = hc_bytealign (w[33], w[34], offset); + w[43] = hc_bytealign (w[32], w[33], offset); + w[42] = hc_bytealign (w[31], w[32], offset); + w[41] = hc_bytealign (w[30], w[31], offset); + w[40] = hc_bytealign (w[29], w[30], offset); + w[39] = hc_bytealign (w[28], w[29], offset); + w[38] = hc_bytealign (w[27], w[28], offset); + w[37] = hc_bytealign (w[26], w[27], offset); + w[36] = hc_bytealign (w[25], w[26], offset); + w[35] = hc_bytealign (w[24], w[25], offset); + w[34] = hc_bytealign (w[23], w[24], offset); + w[33] = hc_bytealign (w[22], w[23], offset); + w[32] = hc_bytealign (w[21], w[22], offset); + w[31] = hc_bytealign (w[20], w[21], offset); + w[30] = hc_bytealign (w[19], w[20], offset); + w[29] = hc_bytealign (w[18], w[19], offset); + w[28] = hc_bytealign (w[17], w[18], offset); + w[27] = hc_bytealign (w[16], w[17], offset); + w[26] = hc_bytealign (w[15], w[16], offset); + w[25] = hc_bytealign (w[14], w[15], offset); + w[24] = hc_bytealign (w[13], w[14], offset); + w[23] = hc_bytealign (w[12], w[13], offset); + w[22] = hc_bytealign (w[11], w[12], offset); + w[21] = hc_bytealign (w[10], w[11], offset); + w[20] = hc_bytealign (w[ 9], w[10], offset); + w[19] = hc_bytealign (w[ 8], w[ 9], offset); + w[18] = hc_bytealign (w[ 7], w[ 8], offset); + w[17] = hc_bytealign (w[ 6], w[ 7], offset); + w[16] = hc_bytealign (w[ 5], w[ 6], offset); + w[15] = hc_bytealign (w[ 4], w[ 5], offset); + w[14] = hc_bytealign (w[ 3], w[ 4], offset); + w[13] = hc_bytealign (w[ 2], w[ 3], offset); + w[12] = hc_bytealign (w[ 1], w[ 2], offset); + w[11] = hc_bytealign (w[ 0], w[ 1], offset); + w[10] = hc_bytealign ( 0, w[ 0], offset); w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; @@ -21146,59 +21146,59 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; case 11: - w[63] = amd_bytealign (w[51], w[52], offset); - w[62] = amd_bytealign (w[50], w[51], offset); - w[61] = amd_bytealign (w[49], w[50], offset); - w[60] = amd_bytealign (w[48], w[49], offset); - w[59] = amd_bytealign (w[47], w[48], offset); - w[58] = amd_bytealign (w[46], w[47], offset); - w[57] = amd_bytealign (w[45], w[46], offset); - w[56] = amd_bytealign (w[44], w[45], offset); - w[55] = amd_bytealign (w[43], w[44], offset); - w[54] = amd_bytealign (w[42], w[43], offset); - w[53] = amd_bytealign (w[41], w[42], offset); - w[52] = amd_bytealign (w[40], w[41], offset); - w[51] = amd_bytealign (w[39], w[40], offset); - w[50] = amd_bytealign (w[38], w[39], offset); - w[49] = amd_bytealign (w[37], w[38], offset); - w[48] = amd_bytealign (w[36], w[37], offset); - w[47] = amd_bytealign (w[35], w[36], offset); - w[46] = amd_bytealign (w[34], w[35], offset); - w[45] = amd_bytealign (w[33], w[34], offset); - w[44] = amd_bytealign (w[32], w[33], offset); - w[43] = amd_bytealign (w[31], w[32], offset); - w[42] = amd_bytealign (w[30], w[31], offset); - w[41] = amd_bytealign (w[29], w[30], offset); - w[40] = amd_bytealign (w[28], w[29], offset); - w[39] = amd_bytealign (w[27], w[28], offset); - w[38] = amd_bytealign (w[26], w[27], offset); - w[37] = amd_bytealign (w[25], w[26], offset); - w[36] = amd_bytealign (w[24], w[25], offset); - w[35] = amd_bytealign (w[23], w[24], offset); - w[34] = amd_bytealign (w[22], w[23], offset); - w[33] = amd_bytealign (w[21], w[22], offset); - w[32] = amd_bytealign (w[20], w[21], offset); - w[31] = amd_bytealign (w[19], w[20], offset); - w[30] = amd_bytealign (w[18], w[19], offset); - w[29] = amd_bytealign (w[17], w[18], offset); - w[28] = amd_bytealign (w[16], w[17], offset); - w[27] = amd_bytealign (w[15], w[16], offset); - w[26] = amd_bytealign (w[14], w[15], offset); - w[25] = amd_bytealign (w[13], w[14], offset); - w[24] = amd_bytealign (w[12], w[13], offset); - w[23] = amd_bytealign (w[11], w[12], offset); - w[22] = amd_bytealign (w[10], w[11], offset); - w[21] = amd_bytealign (w[ 9], w[10], offset); - w[20] = amd_bytealign (w[ 8], w[ 9], offset); - w[19] = amd_bytealign (w[ 7], w[ 8], offset); - w[18] = amd_bytealign (w[ 6], w[ 7], offset); - w[17] = amd_bytealign (w[ 5], w[ 6], offset); - w[16] = amd_bytealign (w[ 4], w[ 5], offset); - w[15] = amd_bytealign (w[ 3], w[ 4], offset); - w[14] = amd_bytealign (w[ 2], w[ 3], offset); - w[13] = amd_bytealign (w[ 1], w[ 2], offset); - w[12] = amd_bytealign (w[ 0], w[ 1], offset); - w[11] = amd_bytealign ( 0, w[ 0], offset); + w[63] = hc_bytealign (w[51], w[52], offset); + w[62] = hc_bytealign (w[50], w[51], offset); + w[61] = hc_bytealign (w[49], w[50], offset); + w[60] = hc_bytealign (w[48], w[49], offset); + w[59] = hc_bytealign (w[47], w[48], offset); + w[58] = hc_bytealign (w[46], w[47], offset); + w[57] = hc_bytealign (w[45], w[46], offset); + w[56] = hc_bytealign (w[44], w[45], offset); + w[55] = hc_bytealign (w[43], w[44], offset); + w[54] = hc_bytealign (w[42], w[43], offset); + w[53] = hc_bytealign (w[41], w[42], offset); + w[52] = hc_bytealign (w[40], w[41], offset); + w[51] = hc_bytealign (w[39], w[40], offset); + w[50] = hc_bytealign (w[38], w[39], offset); + w[49] = hc_bytealign (w[37], w[38], offset); + w[48] = hc_bytealign (w[36], w[37], offset); + w[47] = hc_bytealign (w[35], w[36], offset); + w[46] = hc_bytealign (w[34], w[35], offset); + w[45] = hc_bytealign (w[33], w[34], offset); + w[44] = hc_bytealign (w[32], w[33], offset); + w[43] = hc_bytealign (w[31], w[32], offset); + w[42] = hc_bytealign (w[30], w[31], offset); + w[41] = hc_bytealign (w[29], w[30], offset); + w[40] = hc_bytealign (w[28], w[29], offset); + w[39] = hc_bytealign (w[27], w[28], offset); + w[38] = hc_bytealign (w[26], w[27], offset); + w[37] = hc_bytealign (w[25], w[26], offset); + w[36] = hc_bytealign (w[24], w[25], offset); + w[35] = hc_bytealign (w[23], w[24], offset); + w[34] = hc_bytealign (w[22], w[23], offset); + w[33] = hc_bytealign (w[21], w[22], offset); + w[32] = hc_bytealign (w[20], w[21], offset); + w[31] = hc_bytealign (w[19], w[20], offset); + w[30] = hc_bytealign (w[18], w[19], offset); + w[29] = hc_bytealign (w[17], w[18], offset); + w[28] = hc_bytealign (w[16], w[17], offset); + w[27] = hc_bytealign (w[15], w[16], offset); + w[26] = hc_bytealign (w[14], w[15], offset); + w[25] = hc_bytealign (w[13], w[14], offset); + w[24] = hc_bytealign (w[12], w[13], offset); + w[23] = hc_bytealign (w[11], w[12], offset); + w[22] = hc_bytealign (w[10], w[11], offset); + w[21] = hc_bytealign (w[ 9], w[10], offset); + w[20] = hc_bytealign (w[ 8], w[ 9], offset); + w[19] = hc_bytealign (w[ 7], w[ 8], offset); + w[18] = hc_bytealign (w[ 6], w[ 7], offset); + w[17] = hc_bytealign (w[ 5], w[ 6], offset); + w[16] = hc_bytealign (w[ 4], w[ 5], offset); + w[15] = hc_bytealign (w[ 3], w[ 4], offset); + w[14] = hc_bytealign (w[ 2], w[ 3], offset); + w[13] = hc_bytealign (w[ 1], w[ 2], offset); + w[12] = hc_bytealign (w[ 0], w[ 1], offset); + w[11] = hc_bytealign ( 0, w[ 0], offset); w[10] = 0; w[ 9] = 0; w[ 8] = 0; @@ -21214,58 +21214,58 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; case 12: - w[63] = amd_bytealign (w[50], w[51], offset); - w[62] = amd_bytealign (w[49], w[50], offset); - w[61] = amd_bytealign (w[48], w[49], offset); - w[60] = amd_bytealign (w[47], w[48], offset); - w[59] = amd_bytealign (w[46], w[47], offset); - w[58] = amd_bytealign (w[45], w[46], offset); - w[57] = amd_bytealign (w[44], w[45], offset); - w[56] = amd_bytealign (w[43], w[44], offset); - w[55] = amd_bytealign (w[42], w[43], offset); - w[54] = amd_bytealign (w[41], w[42], offset); - w[53] = amd_bytealign (w[40], w[41], offset); - w[52] = amd_bytealign (w[39], w[40], offset); - w[51] = amd_bytealign (w[38], w[39], offset); - w[50] = amd_bytealign (w[37], w[38], offset); - w[49] = amd_bytealign (w[36], w[37], offset); - w[48] = amd_bytealign (w[35], w[36], offset); - w[47] = amd_bytealign (w[34], w[35], offset); - w[46] = amd_bytealign (w[33], w[34], offset); - w[45] = amd_bytealign (w[32], w[33], offset); - w[44] = amd_bytealign (w[31], w[32], offset); - w[43] = amd_bytealign (w[30], w[31], offset); - w[42] = amd_bytealign (w[29], w[30], offset); - w[41] = amd_bytealign (w[28], w[29], offset); - w[40] = amd_bytealign (w[27], w[28], offset); - w[39] = amd_bytealign (w[26], w[27], offset); - w[38] = amd_bytealign (w[25], w[26], offset); - w[37] = amd_bytealign (w[24], w[25], offset); - w[36] = amd_bytealign (w[23], w[24], offset); - w[35] = amd_bytealign (w[22], w[23], offset); - w[34] = amd_bytealign (w[21], w[22], offset); - w[33] = amd_bytealign (w[20], w[21], offset); - w[32] = amd_bytealign (w[19], w[20], offset); - w[31] = amd_bytealign (w[18], w[19], offset); - w[30] = amd_bytealign (w[17], w[18], offset); - w[29] = amd_bytealign (w[16], w[17], offset); - w[28] = amd_bytealign (w[15], w[16], offset); - w[27] = amd_bytealign (w[14], w[15], offset); - w[26] = amd_bytealign (w[13], w[14], offset); - w[25] = amd_bytealign (w[12], w[13], offset); - w[24] = amd_bytealign (w[11], w[12], offset); - w[23] = amd_bytealign (w[10], w[11], offset); - w[22] = amd_bytealign (w[ 9], w[10], offset); - w[21] = amd_bytealign (w[ 8], w[ 9], offset); - w[20] = amd_bytealign (w[ 7], w[ 8], offset); - w[19] = amd_bytealign (w[ 6], w[ 7], offset); - w[18] = amd_bytealign (w[ 5], w[ 6], offset); - w[17] = amd_bytealign (w[ 4], w[ 5], offset); - w[16] = amd_bytealign (w[ 3], w[ 4], offset); - w[15] = amd_bytealign (w[ 2], w[ 3], offset); - w[14] = amd_bytealign (w[ 1], w[ 2], offset); - w[13] = amd_bytealign (w[ 0], w[ 1], offset); - w[12] = amd_bytealign ( 0, w[ 0], offset); + w[63] = hc_bytealign (w[50], w[51], offset); + w[62] = hc_bytealign (w[49], w[50], offset); + w[61] = hc_bytealign (w[48], w[49], offset); + w[60] = hc_bytealign (w[47], w[48], offset); + w[59] = hc_bytealign (w[46], w[47], offset); + w[58] = hc_bytealign (w[45], w[46], offset); + w[57] = hc_bytealign (w[44], w[45], offset); + w[56] = hc_bytealign (w[43], w[44], offset); + w[55] = hc_bytealign (w[42], w[43], offset); + w[54] = hc_bytealign (w[41], w[42], offset); + w[53] = hc_bytealign (w[40], w[41], offset); + w[52] = hc_bytealign (w[39], w[40], offset); + w[51] = hc_bytealign (w[38], w[39], offset); + w[50] = hc_bytealign (w[37], w[38], offset); + w[49] = hc_bytealign (w[36], w[37], offset); + w[48] = hc_bytealign (w[35], w[36], offset); + w[47] = hc_bytealign (w[34], w[35], offset); + w[46] = hc_bytealign (w[33], w[34], offset); + w[45] = hc_bytealign (w[32], w[33], offset); + w[44] = hc_bytealign (w[31], w[32], offset); + w[43] = hc_bytealign (w[30], w[31], offset); + w[42] = hc_bytealign (w[29], w[30], offset); + w[41] = hc_bytealign (w[28], w[29], offset); + w[40] = hc_bytealign (w[27], w[28], offset); + w[39] = hc_bytealign (w[26], w[27], offset); + w[38] = hc_bytealign (w[25], w[26], offset); + w[37] = hc_bytealign (w[24], w[25], offset); + w[36] = hc_bytealign (w[23], w[24], offset); + w[35] = hc_bytealign (w[22], w[23], offset); + w[34] = hc_bytealign (w[21], w[22], offset); + w[33] = hc_bytealign (w[20], w[21], offset); + w[32] = hc_bytealign (w[19], w[20], offset); + w[31] = hc_bytealign (w[18], w[19], offset); + w[30] = hc_bytealign (w[17], w[18], offset); + w[29] = hc_bytealign (w[16], w[17], offset); + w[28] = hc_bytealign (w[15], w[16], offset); + w[27] = hc_bytealign (w[14], w[15], offset); + w[26] = hc_bytealign (w[13], w[14], offset); + w[25] = hc_bytealign (w[12], w[13], offset); + w[24] = hc_bytealign (w[11], w[12], offset); + w[23] = hc_bytealign (w[10], w[11], offset); + w[22] = hc_bytealign (w[ 9], w[10], offset); + w[21] = hc_bytealign (w[ 8], w[ 9], offset); + w[20] = hc_bytealign (w[ 7], w[ 8], offset); + w[19] = hc_bytealign (w[ 6], w[ 7], offset); + w[18] = hc_bytealign (w[ 5], w[ 6], offset); + w[17] = hc_bytealign (w[ 4], w[ 5], offset); + w[16] = hc_bytealign (w[ 3], w[ 4], offset); + w[15] = hc_bytealign (w[ 2], w[ 3], offset); + w[14] = hc_bytealign (w[ 1], w[ 2], offset); + w[13] = hc_bytealign (w[ 0], w[ 1], offset); + w[12] = hc_bytealign ( 0, w[ 0], offset); w[11] = 0; w[10] = 0; w[ 9] = 0; @@ -21282,57 +21282,57 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; case 13: - w[63] = amd_bytealign (w[49], w[50], offset); - w[62] = amd_bytealign (w[48], w[49], offset); - w[61] = amd_bytealign (w[47], w[48], offset); - w[60] = amd_bytealign (w[46], w[47], offset); - w[59] = amd_bytealign (w[45], w[46], offset); - w[58] = amd_bytealign (w[44], w[45], offset); - w[57] = amd_bytealign (w[43], w[44], offset); - w[56] = amd_bytealign (w[42], w[43], offset); - w[55] = amd_bytealign (w[41], w[42], offset); - w[54] = amd_bytealign (w[40], w[41], offset); - w[53] = amd_bytealign (w[39], w[40], offset); - w[52] = amd_bytealign (w[38], w[39], offset); - w[51] = amd_bytealign (w[37], w[38], offset); - w[50] = amd_bytealign (w[36], w[37], offset); - w[49] = amd_bytealign (w[35], w[36], offset); - w[48] = amd_bytealign (w[34], w[35], offset); - w[47] = amd_bytealign (w[33], w[34], offset); - w[46] = amd_bytealign (w[32], w[33], offset); - w[45] = amd_bytealign (w[31], w[32], offset); - w[44] = amd_bytealign (w[30], w[31], offset); - w[43] = amd_bytealign (w[29], w[30], offset); - w[42] = amd_bytealign (w[28], w[29], offset); - w[41] = amd_bytealign (w[27], w[28], offset); - w[40] = amd_bytealign (w[26], w[27], offset); - w[39] = amd_bytealign (w[25], w[26], offset); - w[38] = amd_bytealign (w[24], w[25], offset); - w[37] = amd_bytealign (w[23], w[24], offset); - w[36] = amd_bytealign (w[22], w[23], offset); - w[35] = amd_bytealign (w[21], w[22], offset); - w[34] = amd_bytealign (w[20], w[21], offset); - w[33] = amd_bytealign (w[19], w[20], offset); - w[32] = amd_bytealign (w[18], w[19], offset); - w[31] = amd_bytealign (w[17], w[18], offset); - w[30] = amd_bytealign (w[16], w[17], offset); - w[29] = amd_bytealign (w[15], w[16], offset); - w[28] = amd_bytealign (w[14], w[15], offset); - w[27] = amd_bytealign (w[13], w[14], offset); - w[26] = amd_bytealign (w[12], w[13], offset); - w[25] = amd_bytealign (w[11], w[12], offset); - w[24] = amd_bytealign (w[10], w[11], offset); - w[23] = amd_bytealign (w[ 9], w[10], offset); - w[22] = amd_bytealign (w[ 8], w[ 9], offset); - w[21] = amd_bytealign (w[ 7], w[ 8], offset); - w[20] = amd_bytealign (w[ 6], w[ 7], offset); - w[19] = amd_bytealign (w[ 5], w[ 6], offset); - w[18] = amd_bytealign (w[ 4], w[ 5], offset); - w[17] = amd_bytealign (w[ 3], w[ 4], offset); - w[16] = amd_bytealign (w[ 2], w[ 3], offset); - w[15] = amd_bytealign (w[ 1], w[ 2], offset); - w[14] = amd_bytealign (w[ 0], w[ 1], offset); - w[13] = amd_bytealign ( 0, w[ 0], offset); + w[63] = hc_bytealign (w[49], w[50], offset); + w[62] = hc_bytealign (w[48], w[49], offset); + w[61] = hc_bytealign (w[47], w[48], offset); + w[60] = hc_bytealign (w[46], w[47], offset); + w[59] = hc_bytealign (w[45], w[46], offset); + w[58] = hc_bytealign (w[44], w[45], offset); + w[57] = hc_bytealign (w[43], w[44], offset); + w[56] = hc_bytealign (w[42], w[43], offset); + w[55] = hc_bytealign (w[41], w[42], offset); + w[54] = hc_bytealign (w[40], w[41], offset); + w[53] = hc_bytealign (w[39], w[40], offset); + w[52] = hc_bytealign (w[38], w[39], offset); + w[51] = hc_bytealign (w[37], w[38], offset); + w[50] = hc_bytealign (w[36], w[37], offset); + w[49] = hc_bytealign (w[35], w[36], offset); + w[48] = hc_bytealign (w[34], w[35], offset); + w[47] = hc_bytealign (w[33], w[34], offset); + w[46] = hc_bytealign (w[32], w[33], offset); + w[45] = hc_bytealign (w[31], w[32], offset); + w[44] = hc_bytealign (w[30], w[31], offset); + w[43] = hc_bytealign (w[29], w[30], offset); + w[42] = hc_bytealign (w[28], w[29], offset); + w[41] = hc_bytealign (w[27], w[28], offset); + w[40] = hc_bytealign (w[26], w[27], offset); + w[39] = hc_bytealign (w[25], w[26], offset); + w[38] = hc_bytealign (w[24], w[25], offset); + w[37] = hc_bytealign (w[23], w[24], offset); + w[36] = hc_bytealign (w[22], w[23], offset); + w[35] = hc_bytealign (w[21], w[22], offset); + w[34] = hc_bytealign (w[20], w[21], offset); + w[33] = hc_bytealign (w[19], w[20], offset); + w[32] = hc_bytealign (w[18], w[19], offset); + w[31] = hc_bytealign (w[17], w[18], offset); + w[30] = hc_bytealign (w[16], w[17], offset); + w[29] = hc_bytealign (w[15], w[16], offset); + w[28] = hc_bytealign (w[14], w[15], offset); + w[27] = hc_bytealign (w[13], w[14], offset); + w[26] = hc_bytealign (w[12], w[13], offset); + w[25] = hc_bytealign (w[11], w[12], offset); + w[24] = hc_bytealign (w[10], w[11], offset); + w[23] = hc_bytealign (w[ 9], w[10], offset); + w[22] = hc_bytealign (w[ 8], w[ 9], offset); + w[21] = hc_bytealign (w[ 7], w[ 8], offset); + w[20] = hc_bytealign (w[ 6], w[ 7], offset); + w[19] = hc_bytealign (w[ 5], w[ 6], offset); + w[18] = hc_bytealign (w[ 4], w[ 5], offset); + w[17] = hc_bytealign (w[ 3], w[ 4], offset); + w[16] = hc_bytealign (w[ 2], w[ 3], offset); + w[15] = hc_bytealign (w[ 1], w[ 2], offset); + w[14] = hc_bytealign (w[ 0], w[ 1], offset); + w[13] = hc_bytealign ( 0, w[ 0], offset); w[12] = 0; w[11] = 0; w[10] = 0; @@ -21350,56 +21350,56 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; case 14: - w[63] = amd_bytealign (w[48], w[49], offset); - w[62] = amd_bytealign (w[47], w[48], offset); - w[61] = amd_bytealign (w[46], w[47], offset); - w[60] = amd_bytealign (w[45], w[46], offset); - w[59] = amd_bytealign (w[44], w[45], offset); - w[58] = amd_bytealign (w[43], w[44], offset); - w[57] = amd_bytealign (w[42], w[43], offset); - w[56] = amd_bytealign (w[41], w[42], offset); - w[55] = amd_bytealign (w[40], w[41], offset); - w[54] = amd_bytealign (w[39], w[40], offset); - w[53] = amd_bytealign (w[38], w[39], offset); - w[52] = amd_bytealign (w[37], w[38], offset); - w[51] = amd_bytealign (w[36], w[37], offset); - w[50] = amd_bytealign (w[35], w[36], offset); - w[49] = amd_bytealign (w[34], w[35], offset); - w[48] = amd_bytealign (w[33], w[34], offset); - w[47] = amd_bytealign (w[32], w[33], offset); - w[46] = amd_bytealign (w[31], w[32], offset); - w[45] = amd_bytealign (w[30], w[31], offset); - w[44] = amd_bytealign (w[29], w[30], offset); - w[43] = amd_bytealign (w[28], w[29], offset); - w[42] = amd_bytealign (w[27], w[28], offset); - w[41] = amd_bytealign (w[26], w[27], offset); - w[40] = amd_bytealign (w[25], w[26], offset); - w[39] = amd_bytealign (w[24], w[25], offset); - w[38] = amd_bytealign (w[23], w[24], offset); - w[37] = amd_bytealign (w[22], w[23], offset); - w[36] = amd_bytealign (w[21], w[22], offset); - w[35] = amd_bytealign (w[20], w[21], offset); - w[34] = amd_bytealign (w[19], w[20], offset); - w[33] = amd_bytealign (w[18], w[19], offset); - w[32] = amd_bytealign (w[17], w[18], offset); - w[31] = amd_bytealign (w[16], w[17], offset); - w[30] = amd_bytealign (w[15], w[16], offset); - w[29] = amd_bytealign (w[14], w[15], offset); - w[28] = amd_bytealign (w[13], w[14], offset); - w[27] = amd_bytealign (w[12], w[13], offset); - w[26] = amd_bytealign (w[11], w[12], offset); - w[25] = amd_bytealign (w[10], w[11], offset); - w[24] = amd_bytealign (w[ 9], w[10], offset); - w[23] = amd_bytealign (w[ 8], w[ 9], offset); - w[22] = amd_bytealign (w[ 7], w[ 8], offset); - w[21] = amd_bytealign (w[ 6], w[ 7], offset); - w[20] = amd_bytealign (w[ 5], w[ 6], offset); - w[19] = amd_bytealign (w[ 4], w[ 5], offset); - w[18] = amd_bytealign (w[ 3], w[ 4], offset); - w[17] = amd_bytealign (w[ 2], w[ 3], offset); - w[16] = amd_bytealign (w[ 1], w[ 2], offset); - w[15] = amd_bytealign (w[ 0], w[ 1], offset); - w[14] = amd_bytealign ( 0, w[ 0], offset); + w[63] = hc_bytealign (w[48], w[49], offset); + w[62] = hc_bytealign (w[47], w[48], offset); + w[61] = hc_bytealign (w[46], w[47], offset); + w[60] = hc_bytealign (w[45], w[46], offset); + w[59] = hc_bytealign (w[44], w[45], offset); + w[58] = hc_bytealign (w[43], w[44], offset); + w[57] = hc_bytealign (w[42], w[43], offset); + w[56] = hc_bytealign (w[41], w[42], offset); + w[55] = hc_bytealign (w[40], w[41], offset); + w[54] = hc_bytealign (w[39], w[40], offset); + w[53] = hc_bytealign (w[38], w[39], offset); + w[52] = hc_bytealign (w[37], w[38], offset); + w[51] = hc_bytealign (w[36], w[37], offset); + w[50] = hc_bytealign (w[35], w[36], offset); + w[49] = hc_bytealign (w[34], w[35], offset); + w[48] = hc_bytealign (w[33], w[34], offset); + w[47] = hc_bytealign (w[32], w[33], offset); + w[46] = hc_bytealign (w[31], w[32], offset); + w[45] = hc_bytealign (w[30], w[31], offset); + w[44] = hc_bytealign (w[29], w[30], offset); + w[43] = hc_bytealign (w[28], w[29], offset); + w[42] = hc_bytealign (w[27], w[28], offset); + w[41] = hc_bytealign (w[26], w[27], offset); + w[40] = hc_bytealign (w[25], w[26], offset); + w[39] = hc_bytealign (w[24], w[25], offset); + w[38] = hc_bytealign (w[23], w[24], offset); + w[37] = hc_bytealign (w[22], w[23], offset); + w[36] = hc_bytealign (w[21], w[22], offset); + w[35] = hc_bytealign (w[20], w[21], offset); + w[34] = hc_bytealign (w[19], w[20], offset); + w[33] = hc_bytealign (w[18], w[19], offset); + w[32] = hc_bytealign (w[17], w[18], offset); + w[31] = hc_bytealign (w[16], w[17], offset); + w[30] = hc_bytealign (w[15], w[16], offset); + w[29] = hc_bytealign (w[14], w[15], offset); + w[28] = hc_bytealign (w[13], w[14], offset); + w[27] = hc_bytealign (w[12], w[13], offset); + w[26] = hc_bytealign (w[11], w[12], offset); + w[25] = hc_bytealign (w[10], w[11], offset); + w[24] = hc_bytealign (w[ 9], w[10], offset); + w[23] = hc_bytealign (w[ 8], w[ 9], offset); + w[22] = hc_bytealign (w[ 7], w[ 8], offset); + w[21] = hc_bytealign (w[ 6], w[ 7], offset); + w[20] = hc_bytealign (w[ 5], w[ 6], offset); + w[19] = hc_bytealign (w[ 4], w[ 5], offset); + w[18] = hc_bytealign (w[ 3], w[ 4], offset); + w[17] = hc_bytealign (w[ 2], w[ 3], offset); + w[16] = hc_bytealign (w[ 1], w[ 2], offset); + w[15] = hc_bytealign (w[ 0], w[ 1], offset); + w[14] = hc_bytealign ( 0, w[ 0], offset); w[13] = 0; w[12] = 0; w[11] = 0; @@ -21418,55 +21418,55 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; case 15: - w[63] = amd_bytealign (w[47], w[48], offset); - w[62] = amd_bytealign (w[46], w[47], offset); - w[61] = amd_bytealign (w[45], w[46], offset); - w[60] = amd_bytealign (w[44], w[45], offset); - w[59] = amd_bytealign (w[43], w[44], offset); - w[58] = amd_bytealign (w[42], w[43], offset); - w[57] = amd_bytealign (w[41], w[42], offset); - w[56] = amd_bytealign (w[40], w[41], offset); - w[55] = amd_bytealign (w[39], w[40], offset); - w[54] = amd_bytealign (w[38], w[39], offset); - w[53] = amd_bytealign (w[37], w[38], offset); - w[52] = amd_bytealign (w[36], w[37], offset); - w[51] = amd_bytealign (w[35], w[36], offset); - w[50] = amd_bytealign (w[34], w[35], offset); - w[49] = amd_bytealign (w[33], w[34], offset); - w[48] = amd_bytealign (w[32], w[33], offset); - w[47] = amd_bytealign (w[31], w[32], offset); - w[46] = amd_bytealign (w[30], w[31], offset); - w[45] = amd_bytealign (w[29], w[30], offset); - w[44] = amd_bytealign (w[28], w[29], offset); - w[43] = amd_bytealign (w[27], w[28], offset); - w[42] = amd_bytealign (w[26], w[27], offset); - w[41] = amd_bytealign (w[25], w[26], offset); - w[40] = amd_bytealign (w[24], w[25], offset); - w[39] = amd_bytealign (w[23], w[24], offset); - w[38] = amd_bytealign (w[22], w[23], offset); - w[37] = amd_bytealign (w[21], w[22], offset); - w[36] = amd_bytealign (w[20], w[21], offset); - w[35] = amd_bytealign (w[19], w[20], offset); - w[34] = amd_bytealign (w[18], w[19], offset); - w[33] = amd_bytealign (w[17], w[18], offset); - w[32] = amd_bytealign (w[16], w[17], offset); - w[31] = amd_bytealign (w[15], w[16], offset); - w[30] = amd_bytealign (w[14], w[15], offset); - w[29] = amd_bytealign (w[13], w[14], offset); - w[28] = amd_bytealign (w[12], w[13], offset); - w[27] = amd_bytealign (w[11], w[12], offset); - w[26] = amd_bytealign (w[10], w[11], offset); - w[25] = amd_bytealign (w[ 9], w[10], offset); - w[24] = amd_bytealign (w[ 8], w[ 9], offset); - w[23] = amd_bytealign (w[ 7], w[ 8], offset); - w[22] = amd_bytealign (w[ 6], w[ 7], offset); - w[21] = amd_bytealign (w[ 5], w[ 6], offset); - w[20] = amd_bytealign (w[ 4], w[ 5], offset); - w[19] = amd_bytealign (w[ 3], w[ 4], offset); - w[18] = amd_bytealign (w[ 2], w[ 3], offset); - w[17] = amd_bytealign (w[ 1], w[ 2], offset); - w[16] = amd_bytealign (w[ 0], w[ 1], offset); - w[15] = amd_bytealign ( 0, w[ 0], offset); + w[63] = hc_bytealign (w[47], w[48], offset); + w[62] = hc_bytealign (w[46], w[47], offset); + w[61] = hc_bytealign (w[45], w[46], offset); + w[60] = hc_bytealign (w[44], w[45], offset); + w[59] = hc_bytealign (w[43], w[44], offset); + w[58] = hc_bytealign (w[42], w[43], offset); + w[57] = hc_bytealign (w[41], w[42], offset); + w[56] = hc_bytealign (w[40], w[41], offset); + w[55] = hc_bytealign (w[39], w[40], offset); + w[54] = hc_bytealign (w[38], w[39], offset); + w[53] = hc_bytealign (w[37], w[38], offset); + w[52] = hc_bytealign (w[36], w[37], offset); + w[51] = hc_bytealign (w[35], w[36], offset); + w[50] = hc_bytealign (w[34], w[35], offset); + w[49] = hc_bytealign (w[33], w[34], offset); + w[48] = hc_bytealign (w[32], w[33], offset); + w[47] = hc_bytealign (w[31], w[32], offset); + w[46] = hc_bytealign (w[30], w[31], offset); + w[45] = hc_bytealign (w[29], w[30], offset); + w[44] = hc_bytealign (w[28], w[29], offset); + w[43] = hc_bytealign (w[27], w[28], offset); + w[42] = hc_bytealign (w[26], w[27], offset); + w[41] = hc_bytealign (w[25], w[26], offset); + w[40] = hc_bytealign (w[24], w[25], offset); + w[39] = hc_bytealign (w[23], w[24], offset); + w[38] = hc_bytealign (w[22], w[23], offset); + w[37] = hc_bytealign (w[21], w[22], offset); + w[36] = hc_bytealign (w[20], w[21], offset); + w[35] = hc_bytealign (w[19], w[20], offset); + w[34] = hc_bytealign (w[18], w[19], offset); + w[33] = hc_bytealign (w[17], w[18], offset); + w[32] = hc_bytealign (w[16], w[17], offset); + w[31] = hc_bytealign (w[15], w[16], offset); + w[30] = hc_bytealign (w[14], w[15], offset); + w[29] = hc_bytealign (w[13], w[14], offset); + w[28] = hc_bytealign (w[12], w[13], offset); + w[27] = hc_bytealign (w[11], w[12], offset); + w[26] = hc_bytealign (w[10], w[11], offset); + w[25] = hc_bytealign (w[ 9], w[10], offset); + w[24] = hc_bytealign (w[ 8], w[ 9], offset); + w[23] = hc_bytealign (w[ 7], w[ 8], offset); + w[22] = hc_bytealign (w[ 6], w[ 7], offset); + w[21] = hc_bytealign (w[ 5], w[ 6], offset); + w[20] = hc_bytealign (w[ 4], w[ 5], offset); + w[19] = hc_bytealign (w[ 3], w[ 4], offset); + w[18] = hc_bytealign (w[ 2], w[ 3], offset); + w[17] = hc_bytealign (w[ 1], w[ 2], offset); + w[16] = hc_bytealign (w[ 0], w[ 1], offset); + w[15] = hc_bytealign ( 0, w[ 0], offset); w[14] = 0; w[13] = 0; w[12] = 0; @@ -21486,54 +21486,54 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; case 16: - w[63] = amd_bytealign (w[46], w[47], offset); - w[62] = amd_bytealign (w[45], w[46], offset); - w[61] = amd_bytealign (w[44], w[45], offset); - w[60] = amd_bytealign (w[43], w[44], offset); - w[59] = amd_bytealign (w[42], w[43], offset); - w[58] = amd_bytealign (w[41], w[42], offset); - w[57] = amd_bytealign (w[40], w[41], offset); - w[56] = amd_bytealign (w[39], w[40], offset); - w[55] = amd_bytealign (w[38], w[39], offset); - w[54] = amd_bytealign (w[37], w[38], offset); - w[53] = amd_bytealign (w[36], w[37], offset); - w[52] = amd_bytealign (w[35], w[36], offset); - w[51] = amd_bytealign (w[34], w[35], offset); - w[50] = amd_bytealign (w[33], w[34], offset); - w[49] = amd_bytealign (w[32], w[33], offset); - w[48] = amd_bytealign (w[31], w[32], offset); - w[47] = amd_bytealign (w[30], w[31], offset); - w[46] = amd_bytealign (w[29], w[30], offset); - w[45] = amd_bytealign (w[28], w[29], offset); - w[44] = amd_bytealign (w[27], w[28], offset); - w[43] = amd_bytealign (w[26], w[27], offset); - w[42] = amd_bytealign (w[25], w[26], offset); - w[41] = amd_bytealign (w[24], w[25], offset); - w[40] = amd_bytealign (w[23], w[24], offset); - w[39] = amd_bytealign (w[22], w[23], offset); - w[38] = amd_bytealign (w[21], w[22], offset); - w[37] = amd_bytealign (w[20], w[21], offset); - w[36] = amd_bytealign (w[19], w[20], offset); - w[35] = amd_bytealign (w[18], w[19], offset); - w[34] = amd_bytealign (w[17], w[18], offset); - w[33] = amd_bytealign (w[16], w[17], offset); - w[32] = amd_bytealign (w[15], w[16], offset); - w[31] = amd_bytealign (w[14], w[15], offset); - w[30] = amd_bytealign (w[13], w[14], offset); - w[29] = amd_bytealign (w[12], w[13], offset); - w[28] = amd_bytealign (w[11], w[12], offset); - w[27] = amd_bytealign (w[10], w[11], offset); - w[26] = amd_bytealign (w[ 9], w[10], offset); - w[25] = amd_bytealign (w[ 8], w[ 9], offset); - w[24] = amd_bytealign (w[ 7], w[ 8], offset); - w[23] = amd_bytealign (w[ 6], w[ 7], offset); - w[22] = amd_bytealign (w[ 5], w[ 6], offset); - w[21] = amd_bytealign (w[ 4], w[ 5], offset); - w[20] = amd_bytealign (w[ 3], w[ 4], offset); - w[19] = amd_bytealign (w[ 2], w[ 3], offset); - w[18] = amd_bytealign (w[ 1], w[ 2], offset); - w[17] = amd_bytealign (w[ 0], w[ 1], offset); - w[16] = amd_bytealign ( 0, w[ 0], offset); + w[63] = hc_bytealign (w[46], w[47], offset); + w[62] = hc_bytealign (w[45], w[46], offset); + w[61] = hc_bytealign (w[44], w[45], offset); + w[60] = hc_bytealign (w[43], w[44], offset); + w[59] = hc_bytealign (w[42], w[43], offset); + w[58] = hc_bytealign (w[41], w[42], offset); + w[57] = hc_bytealign (w[40], w[41], offset); + w[56] = hc_bytealign (w[39], w[40], offset); + w[55] = hc_bytealign (w[38], w[39], offset); + w[54] = hc_bytealign (w[37], w[38], offset); + w[53] = hc_bytealign (w[36], w[37], offset); + w[52] = hc_bytealign (w[35], w[36], offset); + w[51] = hc_bytealign (w[34], w[35], offset); + w[50] = hc_bytealign (w[33], w[34], offset); + w[49] = hc_bytealign (w[32], w[33], offset); + w[48] = hc_bytealign (w[31], w[32], offset); + w[47] = hc_bytealign (w[30], w[31], offset); + w[46] = hc_bytealign (w[29], w[30], offset); + w[45] = hc_bytealign (w[28], w[29], offset); + w[44] = hc_bytealign (w[27], w[28], offset); + w[43] = hc_bytealign (w[26], w[27], offset); + w[42] = hc_bytealign (w[25], w[26], offset); + w[41] = hc_bytealign (w[24], w[25], offset); + w[40] = hc_bytealign (w[23], w[24], offset); + w[39] = hc_bytealign (w[22], w[23], offset); + w[38] = hc_bytealign (w[21], w[22], offset); + w[37] = hc_bytealign (w[20], w[21], offset); + w[36] = hc_bytealign (w[19], w[20], offset); + w[35] = hc_bytealign (w[18], w[19], offset); + w[34] = hc_bytealign (w[17], w[18], offset); + w[33] = hc_bytealign (w[16], w[17], offset); + w[32] = hc_bytealign (w[15], w[16], offset); + w[31] = hc_bytealign (w[14], w[15], offset); + w[30] = hc_bytealign (w[13], w[14], offset); + w[29] = hc_bytealign (w[12], w[13], offset); + w[28] = hc_bytealign (w[11], w[12], offset); + w[27] = hc_bytealign (w[10], w[11], offset); + w[26] = hc_bytealign (w[ 9], w[10], offset); + w[25] = hc_bytealign (w[ 8], w[ 9], offset); + w[24] = hc_bytealign (w[ 7], w[ 8], offset); + w[23] = hc_bytealign (w[ 6], w[ 7], offset); + w[22] = hc_bytealign (w[ 5], w[ 6], offset); + w[21] = hc_bytealign (w[ 4], w[ 5], offset); + w[20] = hc_bytealign (w[ 3], w[ 4], offset); + w[19] = hc_bytealign (w[ 2], w[ 3], offset); + w[18] = hc_bytealign (w[ 1], w[ 2], offset); + w[17] = hc_bytealign (w[ 0], w[ 1], offset); + w[16] = hc_bytealign ( 0, w[ 0], offset); w[15] = 0; w[14] = 0; w[13] = 0; @@ -21554,53 +21554,53 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; case 17: - w[63] = amd_bytealign (w[45], w[46], offset); - w[62] = amd_bytealign (w[44], w[45], offset); - w[61] = amd_bytealign (w[43], w[44], offset); - w[60] = amd_bytealign (w[42], w[43], offset); - w[59] = amd_bytealign (w[41], w[42], offset); - w[58] = amd_bytealign (w[40], w[41], offset); - w[57] = amd_bytealign (w[39], w[40], offset); - w[56] = amd_bytealign (w[38], w[39], offset); - w[55] = amd_bytealign (w[37], w[38], offset); - w[54] = amd_bytealign (w[36], w[37], offset); - w[53] = amd_bytealign (w[35], w[36], offset); - w[52] = amd_bytealign (w[34], w[35], offset); - w[51] = amd_bytealign (w[33], w[34], offset); - w[50] = amd_bytealign (w[32], w[33], offset); - w[49] = amd_bytealign (w[31], w[32], offset); - w[48] = amd_bytealign (w[30], w[31], offset); - w[47] = amd_bytealign (w[29], w[30], offset); - w[46] = amd_bytealign (w[28], w[29], offset); - w[45] = amd_bytealign (w[27], w[28], offset); - w[44] = amd_bytealign (w[26], w[27], offset); - w[43] = amd_bytealign (w[25], w[26], offset); - w[42] = amd_bytealign (w[24], w[25], offset); - w[41] = amd_bytealign (w[23], w[24], offset); - w[40] = amd_bytealign (w[22], w[23], offset); - w[39] = amd_bytealign (w[21], w[22], offset); - w[38] = amd_bytealign (w[20], w[21], offset); - w[37] = amd_bytealign (w[19], w[20], offset); - w[36] = amd_bytealign (w[18], w[19], offset); - w[35] = amd_bytealign (w[17], w[18], offset); - w[34] = amd_bytealign (w[16], w[17], offset); - w[33] = amd_bytealign (w[15], w[16], offset); - w[32] = amd_bytealign (w[14], w[15], offset); - w[31] = amd_bytealign (w[13], w[14], offset); - w[30] = amd_bytealign (w[12], w[13], offset); - w[29] = amd_bytealign (w[11], w[12], offset); - w[28] = amd_bytealign (w[10], w[11], offset); - w[27] = amd_bytealign (w[ 9], w[10], offset); - w[26] = amd_bytealign (w[ 8], w[ 9], offset); - w[25] = amd_bytealign (w[ 7], w[ 8], offset); - w[24] = amd_bytealign (w[ 6], w[ 7], offset); - w[23] = amd_bytealign (w[ 5], w[ 6], offset); - w[22] = amd_bytealign (w[ 4], w[ 5], offset); - w[21] = amd_bytealign (w[ 3], w[ 4], offset); - w[20] = amd_bytealign (w[ 2], w[ 3], offset); - w[19] = amd_bytealign (w[ 1], w[ 2], offset); - w[18] = amd_bytealign (w[ 0], w[ 1], offset); - w[17] = amd_bytealign ( 0, w[ 0], offset); + w[63] = hc_bytealign (w[45], w[46], offset); + w[62] = hc_bytealign (w[44], w[45], offset); + w[61] = hc_bytealign (w[43], w[44], offset); + w[60] = hc_bytealign (w[42], w[43], offset); + w[59] = hc_bytealign (w[41], w[42], offset); + w[58] = hc_bytealign (w[40], w[41], offset); + w[57] = hc_bytealign (w[39], w[40], offset); + w[56] = hc_bytealign (w[38], w[39], offset); + w[55] = hc_bytealign (w[37], w[38], offset); + w[54] = hc_bytealign (w[36], w[37], offset); + w[53] = hc_bytealign (w[35], w[36], offset); + w[52] = hc_bytealign (w[34], w[35], offset); + w[51] = hc_bytealign (w[33], w[34], offset); + w[50] = hc_bytealign (w[32], w[33], offset); + w[49] = hc_bytealign (w[31], w[32], offset); + w[48] = hc_bytealign (w[30], w[31], offset); + w[47] = hc_bytealign (w[29], w[30], offset); + w[46] = hc_bytealign (w[28], w[29], offset); + w[45] = hc_bytealign (w[27], w[28], offset); + w[44] = hc_bytealign (w[26], w[27], offset); + w[43] = hc_bytealign (w[25], w[26], offset); + w[42] = hc_bytealign (w[24], w[25], offset); + w[41] = hc_bytealign (w[23], w[24], offset); + w[40] = hc_bytealign (w[22], w[23], offset); + w[39] = hc_bytealign (w[21], w[22], offset); + w[38] = hc_bytealign (w[20], w[21], offset); + w[37] = hc_bytealign (w[19], w[20], offset); + w[36] = hc_bytealign (w[18], w[19], offset); + w[35] = hc_bytealign (w[17], w[18], offset); + w[34] = hc_bytealign (w[16], w[17], offset); + w[33] = hc_bytealign (w[15], w[16], offset); + w[32] = hc_bytealign (w[14], w[15], offset); + w[31] = hc_bytealign (w[13], w[14], offset); + w[30] = hc_bytealign (w[12], w[13], offset); + w[29] = hc_bytealign (w[11], w[12], offset); + w[28] = hc_bytealign (w[10], w[11], offset); + w[27] = hc_bytealign (w[ 9], w[10], offset); + w[26] = hc_bytealign (w[ 8], w[ 9], offset); + w[25] = hc_bytealign (w[ 7], w[ 8], offset); + w[24] = hc_bytealign (w[ 6], w[ 7], offset); + w[23] = hc_bytealign (w[ 5], w[ 6], offset); + w[22] = hc_bytealign (w[ 4], w[ 5], offset); + w[21] = hc_bytealign (w[ 3], w[ 4], offset); + w[20] = hc_bytealign (w[ 2], w[ 3], offset); + w[19] = hc_bytealign (w[ 1], w[ 2], offset); + w[18] = hc_bytealign (w[ 0], w[ 1], offset); + w[17] = hc_bytealign ( 0, w[ 0], offset); w[16] = 0; w[15] = 0; w[14] = 0; @@ -21622,52 +21622,52 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; case 18: - w[63] = amd_bytealign (w[44], w[45], offset); - w[62] = amd_bytealign (w[43], w[44], offset); - w[61] = amd_bytealign (w[42], w[43], offset); - w[60] = amd_bytealign (w[41], w[42], offset); - w[59] = amd_bytealign (w[40], w[41], offset); - w[58] = amd_bytealign (w[39], w[40], offset); - w[57] = amd_bytealign (w[38], w[39], offset); - w[56] = amd_bytealign (w[37], w[38], offset); - w[55] = amd_bytealign (w[36], w[37], offset); - w[54] = amd_bytealign (w[35], w[36], offset); - w[53] = amd_bytealign (w[34], w[35], offset); - w[52] = amd_bytealign (w[33], w[34], offset); - w[51] = amd_bytealign (w[32], w[33], offset); - w[50] = amd_bytealign (w[31], w[32], offset); - w[49] = amd_bytealign (w[30], w[31], offset); - w[48] = amd_bytealign (w[29], w[30], offset); - w[47] = amd_bytealign (w[28], w[29], offset); - w[46] = amd_bytealign (w[27], w[28], offset); - w[45] = amd_bytealign (w[26], w[27], offset); - w[44] = amd_bytealign (w[25], w[26], offset); - w[43] = amd_bytealign (w[24], w[25], offset); - w[42] = amd_bytealign (w[23], w[24], offset); - w[41] = amd_bytealign (w[22], w[23], offset); - w[40] = amd_bytealign (w[21], w[22], offset); - w[39] = amd_bytealign (w[20], w[21], offset); - w[38] = amd_bytealign (w[19], w[20], offset); - w[37] = amd_bytealign (w[18], w[19], offset); - w[36] = amd_bytealign (w[17], w[18], offset); - w[35] = amd_bytealign (w[16], w[17], offset); - w[34] = amd_bytealign (w[15], w[16], offset); - w[33] = amd_bytealign (w[14], w[15], offset); - w[32] = amd_bytealign (w[13], w[14], offset); - w[31] = amd_bytealign (w[12], w[13], offset); - w[30] = amd_bytealign (w[11], w[12], offset); - w[29] = amd_bytealign (w[10], w[11], offset); - w[28] = amd_bytealign (w[ 9], w[10], offset); - w[27] = amd_bytealign (w[ 8], w[ 9], offset); - w[26] = amd_bytealign (w[ 7], w[ 8], offset); - w[25] = amd_bytealign (w[ 6], w[ 7], offset); - w[24] = amd_bytealign (w[ 5], w[ 6], offset); - w[23] = amd_bytealign (w[ 4], w[ 5], offset); - w[22] = amd_bytealign (w[ 3], w[ 4], offset); - w[21] = amd_bytealign (w[ 2], w[ 3], offset); - w[20] = amd_bytealign (w[ 1], w[ 2], offset); - w[19] = amd_bytealign (w[ 0], w[ 1], offset); - w[18] = amd_bytealign ( 0, w[ 0], offset); + w[63] = hc_bytealign (w[44], w[45], offset); + w[62] = hc_bytealign (w[43], w[44], offset); + w[61] = hc_bytealign (w[42], w[43], offset); + w[60] = hc_bytealign (w[41], w[42], offset); + w[59] = hc_bytealign (w[40], w[41], offset); + w[58] = hc_bytealign (w[39], w[40], offset); + w[57] = hc_bytealign (w[38], w[39], offset); + w[56] = hc_bytealign (w[37], w[38], offset); + w[55] = hc_bytealign (w[36], w[37], offset); + w[54] = hc_bytealign (w[35], w[36], offset); + w[53] = hc_bytealign (w[34], w[35], offset); + w[52] = hc_bytealign (w[33], w[34], offset); + w[51] = hc_bytealign (w[32], w[33], offset); + w[50] = hc_bytealign (w[31], w[32], offset); + w[49] = hc_bytealign (w[30], w[31], offset); + w[48] = hc_bytealign (w[29], w[30], offset); + w[47] = hc_bytealign (w[28], w[29], offset); + w[46] = hc_bytealign (w[27], w[28], offset); + w[45] = hc_bytealign (w[26], w[27], offset); + w[44] = hc_bytealign (w[25], w[26], offset); + w[43] = hc_bytealign (w[24], w[25], offset); + w[42] = hc_bytealign (w[23], w[24], offset); + w[41] = hc_bytealign (w[22], w[23], offset); + w[40] = hc_bytealign (w[21], w[22], offset); + w[39] = hc_bytealign (w[20], w[21], offset); + w[38] = hc_bytealign (w[19], w[20], offset); + w[37] = hc_bytealign (w[18], w[19], offset); + w[36] = hc_bytealign (w[17], w[18], offset); + w[35] = hc_bytealign (w[16], w[17], offset); + w[34] = hc_bytealign (w[15], w[16], offset); + w[33] = hc_bytealign (w[14], w[15], offset); + w[32] = hc_bytealign (w[13], w[14], offset); + w[31] = hc_bytealign (w[12], w[13], offset); + w[30] = hc_bytealign (w[11], w[12], offset); + w[29] = hc_bytealign (w[10], w[11], offset); + w[28] = hc_bytealign (w[ 9], w[10], offset); + w[27] = hc_bytealign (w[ 8], w[ 9], offset); + w[26] = hc_bytealign (w[ 7], w[ 8], offset); + w[25] = hc_bytealign (w[ 6], w[ 7], offset); + w[24] = hc_bytealign (w[ 5], w[ 6], offset); + w[23] = hc_bytealign (w[ 4], w[ 5], offset); + w[22] = hc_bytealign (w[ 3], w[ 4], offset); + w[21] = hc_bytealign (w[ 2], w[ 3], offset); + w[20] = hc_bytealign (w[ 1], w[ 2], offset); + w[19] = hc_bytealign (w[ 0], w[ 1], offset); + w[18] = hc_bytealign ( 0, w[ 0], offset); w[17] = 0; w[16] = 0; w[15] = 0; @@ -21690,51 +21690,51 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; case 19: - w[63] = amd_bytealign (w[43], w[44], offset); - w[62] = amd_bytealign (w[42], w[43], offset); - w[61] = amd_bytealign (w[41], w[42], offset); - w[60] = amd_bytealign (w[40], w[41], offset); - w[59] = amd_bytealign (w[39], w[40], offset); - w[58] = amd_bytealign (w[38], w[39], offset); - w[57] = amd_bytealign (w[37], w[38], offset); - w[56] = amd_bytealign (w[36], w[37], offset); - w[55] = amd_bytealign (w[35], w[36], offset); - w[54] = amd_bytealign (w[34], w[35], offset); - w[53] = amd_bytealign (w[33], w[34], offset); - w[52] = amd_bytealign (w[32], w[33], offset); - w[51] = amd_bytealign (w[31], w[32], offset); - w[50] = amd_bytealign (w[30], w[31], offset); - w[49] = amd_bytealign (w[29], w[30], offset); - w[48] = amd_bytealign (w[28], w[29], offset); - w[47] = amd_bytealign (w[27], w[28], offset); - w[46] = amd_bytealign (w[26], w[27], offset); - w[45] = amd_bytealign (w[25], w[26], offset); - w[44] = amd_bytealign (w[24], w[25], offset); - w[43] = amd_bytealign (w[23], w[24], offset); - w[42] = amd_bytealign (w[22], w[23], offset); - w[41] = amd_bytealign (w[21], w[22], offset); - w[40] = amd_bytealign (w[20], w[21], offset); - w[39] = amd_bytealign (w[19], w[20], offset); - w[38] = amd_bytealign (w[18], w[19], offset); - w[37] = amd_bytealign (w[17], w[18], offset); - w[36] = amd_bytealign (w[16], w[17], offset); - w[35] = amd_bytealign (w[15], w[16], offset); - w[34] = amd_bytealign (w[14], w[15], offset); - w[33] = amd_bytealign (w[13], w[14], offset); - w[32] = amd_bytealign (w[12], w[13], offset); - w[31] = amd_bytealign (w[11], w[12], offset); - w[30] = amd_bytealign (w[10], w[11], offset); - w[29] = amd_bytealign (w[ 9], w[10], offset); - w[28] = amd_bytealign (w[ 8], w[ 9], offset); - w[27] = amd_bytealign (w[ 7], w[ 8], offset); - w[26] = amd_bytealign (w[ 6], w[ 7], offset); - w[25] = amd_bytealign (w[ 5], w[ 6], offset); - w[24] = amd_bytealign (w[ 4], w[ 5], offset); - w[23] = amd_bytealign (w[ 3], w[ 4], offset); - w[22] = amd_bytealign (w[ 2], w[ 3], offset); - w[21] = amd_bytealign (w[ 1], w[ 2], offset); - w[20] = amd_bytealign (w[ 0], w[ 1], offset); - w[19] = amd_bytealign ( 0, w[ 0], offset); + w[63] = hc_bytealign (w[43], w[44], offset); + w[62] = hc_bytealign (w[42], w[43], offset); + w[61] = hc_bytealign (w[41], w[42], offset); + w[60] = hc_bytealign (w[40], w[41], offset); + w[59] = hc_bytealign (w[39], w[40], offset); + w[58] = hc_bytealign (w[38], w[39], offset); + w[57] = hc_bytealign (w[37], w[38], offset); + w[56] = hc_bytealign (w[36], w[37], offset); + w[55] = hc_bytealign (w[35], w[36], offset); + w[54] = hc_bytealign (w[34], w[35], offset); + w[53] = hc_bytealign (w[33], w[34], offset); + w[52] = hc_bytealign (w[32], w[33], offset); + w[51] = hc_bytealign (w[31], w[32], offset); + w[50] = hc_bytealign (w[30], w[31], offset); + w[49] = hc_bytealign (w[29], w[30], offset); + w[48] = hc_bytealign (w[28], w[29], offset); + w[47] = hc_bytealign (w[27], w[28], offset); + w[46] = hc_bytealign (w[26], w[27], offset); + w[45] = hc_bytealign (w[25], w[26], offset); + w[44] = hc_bytealign (w[24], w[25], offset); + w[43] = hc_bytealign (w[23], w[24], offset); + w[42] = hc_bytealign (w[22], w[23], offset); + w[41] = hc_bytealign (w[21], w[22], offset); + w[40] = hc_bytealign (w[20], w[21], offset); + w[39] = hc_bytealign (w[19], w[20], offset); + w[38] = hc_bytealign (w[18], w[19], offset); + w[37] = hc_bytealign (w[17], w[18], offset); + w[36] = hc_bytealign (w[16], w[17], offset); + w[35] = hc_bytealign (w[15], w[16], offset); + w[34] = hc_bytealign (w[14], w[15], offset); + w[33] = hc_bytealign (w[13], w[14], offset); + w[32] = hc_bytealign (w[12], w[13], offset); + w[31] = hc_bytealign (w[11], w[12], offset); + w[30] = hc_bytealign (w[10], w[11], offset); + w[29] = hc_bytealign (w[ 9], w[10], offset); + w[28] = hc_bytealign (w[ 8], w[ 9], offset); + w[27] = hc_bytealign (w[ 7], w[ 8], offset); + w[26] = hc_bytealign (w[ 6], w[ 7], offset); + w[25] = hc_bytealign (w[ 5], w[ 6], offset); + w[24] = hc_bytealign (w[ 4], w[ 5], offset); + w[23] = hc_bytealign (w[ 3], w[ 4], offset); + w[22] = hc_bytealign (w[ 2], w[ 3], offset); + w[21] = hc_bytealign (w[ 1], w[ 2], offset); + w[20] = hc_bytealign (w[ 0], w[ 1], offset); + w[19] = hc_bytealign ( 0, w[ 0], offset); w[18] = 0; w[17] = 0; w[16] = 0; @@ -21758,50 +21758,50 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; case 20: - w[63] = amd_bytealign (w[42], w[43], offset); - w[62] = amd_bytealign (w[41], w[42], offset); - w[61] = amd_bytealign (w[40], w[41], offset); - w[60] = amd_bytealign (w[39], w[40], offset); - w[59] = amd_bytealign (w[38], w[39], offset); - w[58] = amd_bytealign (w[37], w[38], offset); - w[57] = amd_bytealign (w[36], w[37], offset); - w[56] = amd_bytealign (w[35], w[36], offset); - w[55] = amd_bytealign (w[34], w[35], offset); - w[54] = amd_bytealign (w[33], w[34], offset); - w[53] = amd_bytealign (w[32], w[33], offset); - w[52] = amd_bytealign (w[31], w[32], offset); - w[51] = amd_bytealign (w[30], w[31], offset); - w[50] = amd_bytealign (w[29], w[30], offset); - w[49] = amd_bytealign (w[28], w[29], offset); - w[48] = amd_bytealign (w[27], w[28], offset); - w[47] = amd_bytealign (w[26], w[27], offset); - w[46] = amd_bytealign (w[25], w[26], offset); - w[45] = amd_bytealign (w[24], w[25], offset); - w[44] = amd_bytealign (w[23], w[24], offset); - w[43] = amd_bytealign (w[22], w[23], offset); - w[42] = amd_bytealign (w[21], w[22], offset); - w[41] = amd_bytealign (w[20], w[21], offset); - w[40] = amd_bytealign (w[19], w[20], offset); - w[39] = amd_bytealign (w[18], w[19], offset); - w[38] = amd_bytealign (w[17], w[18], offset); - w[37] = amd_bytealign (w[16], w[17], offset); - w[36] = amd_bytealign (w[15], w[16], offset); - w[35] = amd_bytealign (w[14], w[15], offset); - w[34] = amd_bytealign (w[13], w[14], offset); - w[33] = amd_bytealign (w[12], w[13], offset); - w[32] = amd_bytealign (w[11], w[12], offset); - w[31] = amd_bytealign (w[10], w[11], offset); - w[30] = amd_bytealign (w[ 9], w[10], offset); - w[29] = amd_bytealign (w[ 8], w[ 9], offset); - w[28] = amd_bytealign (w[ 7], w[ 8], offset); - w[27] = amd_bytealign (w[ 6], w[ 7], offset); - w[26] = amd_bytealign (w[ 5], w[ 6], offset); - w[25] = amd_bytealign (w[ 4], w[ 5], offset); - w[24] = amd_bytealign (w[ 3], w[ 4], offset); - w[23] = amd_bytealign (w[ 2], w[ 3], offset); - w[22] = amd_bytealign (w[ 1], w[ 2], offset); - w[21] = amd_bytealign (w[ 0], w[ 1], offset); - w[20] = amd_bytealign ( 0, w[ 0], offset); + w[63] = hc_bytealign (w[42], w[43], offset); + w[62] = hc_bytealign (w[41], w[42], offset); + w[61] = hc_bytealign (w[40], w[41], offset); + w[60] = hc_bytealign (w[39], w[40], offset); + w[59] = hc_bytealign (w[38], w[39], offset); + w[58] = hc_bytealign (w[37], w[38], offset); + w[57] = hc_bytealign (w[36], w[37], offset); + w[56] = hc_bytealign (w[35], w[36], offset); + w[55] = hc_bytealign (w[34], w[35], offset); + w[54] = hc_bytealign (w[33], w[34], offset); + w[53] = hc_bytealign (w[32], w[33], offset); + w[52] = hc_bytealign (w[31], w[32], offset); + w[51] = hc_bytealign (w[30], w[31], offset); + w[50] = hc_bytealign (w[29], w[30], offset); + w[49] = hc_bytealign (w[28], w[29], offset); + w[48] = hc_bytealign (w[27], w[28], offset); + w[47] = hc_bytealign (w[26], w[27], offset); + w[46] = hc_bytealign (w[25], w[26], offset); + w[45] = hc_bytealign (w[24], w[25], offset); + w[44] = hc_bytealign (w[23], w[24], offset); + w[43] = hc_bytealign (w[22], w[23], offset); + w[42] = hc_bytealign (w[21], w[22], offset); + w[41] = hc_bytealign (w[20], w[21], offset); + w[40] = hc_bytealign (w[19], w[20], offset); + w[39] = hc_bytealign (w[18], w[19], offset); + w[38] = hc_bytealign (w[17], w[18], offset); + w[37] = hc_bytealign (w[16], w[17], offset); + w[36] = hc_bytealign (w[15], w[16], offset); + w[35] = hc_bytealign (w[14], w[15], offset); + w[34] = hc_bytealign (w[13], w[14], offset); + w[33] = hc_bytealign (w[12], w[13], offset); + w[32] = hc_bytealign (w[11], w[12], offset); + w[31] = hc_bytealign (w[10], w[11], offset); + w[30] = hc_bytealign (w[ 9], w[10], offset); + w[29] = hc_bytealign (w[ 8], w[ 9], offset); + w[28] = hc_bytealign (w[ 7], w[ 8], offset); + w[27] = hc_bytealign (w[ 6], w[ 7], offset); + w[26] = hc_bytealign (w[ 5], w[ 6], offset); + w[25] = hc_bytealign (w[ 4], w[ 5], offset); + w[24] = hc_bytealign (w[ 3], w[ 4], offset); + w[23] = hc_bytealign (w[ 2], w[ 3], offset); + w[22] = hc_bytealign (w[ 1], w[ 2], offset); + w[21] = hc_bytealign (w[ 0], w[ 1], offset); + w[20] = hc_bytealign ( 0, w[ 0], offset); w[19] = 0; w[18] = 0; w[17] = 0; @@ -21826,49 +21826,49 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; case 21: - w[63] = amd_bytealign (w[41], w[42], offset); - w[62] = amd_bytealign (w[40], w[41], offset); - w[61] = amd_bytealign (w[39], w[40], offset); - w[60] = amd_bytealign (w[38], w[39], offset); - w[59] = amd_bytealign (w[37], w[38], offset); - w[58] = amd_bytealign (w[36], w[37], offset); - w[57] = amd_bytealign (w[35], w[36], offset); - w[56] = amd_bytealign (w[34], w[35], offset); - w[55] = amd_bytealign (w[33], w[34], offset); - w[54] = amd_bytealign (w[32], w[33], offset); - w[53] = amd_bytealign (w[31], w[32], offset); - w[52] = amd_bytealign (w[30], w[31], offset); - w[51] = amd_bytealign (w[29], w[30], offset); - w[50] = amd_bytealign (w[28], w[29], offset); - w[49] = amd_bytealign (w[27], w[28], offset); - w[48] = amd_bytealign (w[26], w[27], offset); - w[47] = amd_bytealign (w[25], w[26], offset); - w[46] = amd_bytealign (w[24], w[25], offset); - w[45] = amd_bytealign (w[23], w[24], offset); - w[44] = amd_bytealign (w[22], w[23], offset); - w[43] = amd_bytealign (w[21], w[22], offset); - w[42] = amd_bytealign (w[20], w[21], offset); - w[41] = amd_bytealign (w[19], w[20], offset); - w[40] = amd_bytealign (w[18], w[19], offset); - w[39] = amd_bytealign (w[17], w[18], offset); - w[38] = amd_bytealign (w[16], w[17], offset); - w[37] = amd_bytealign (w[15], w[16], offset); - w[36] = amd_bytealign (w[14], w[15], offset); - w[35] = amd_bytealign (w[13], w[14], offset); - w[34] = amd_bytealign (w[12], w[13], offset); - w[33] = amd_bytealign (w[11], w[12], offset); - w[32] = amd_bytealign (w[10], w[11], offset); - w[31] = amd_bytealign (w[ 9], w[10], offset); - w[30] = amd_bytealign (w[ 8], w[ 9], offset); - w[29] = amd_bytealign (w[ 7], w[ 8], offset); - w[28] = amd_bytealign (w[ 6], w[ 7], offset); - w[27] = amd_bytealign (w[ 5], w[ 6], offset); - w[26] = amd_bytealign (w[ 4], w[ 5], offset); - w[25] = amd_bytealign (w[ 3], w[ 4], offset); - w[24] = amd_bytealign (w[ 2], w[ 3], offset); - w[23] = amd_bytealign (w[ 1], w[ 2], offset); - w[22] = amd_bytealign (w[ 0], w[ 1], offset); - w[21] = amd_bytealign ( 0, w[ 0], offset); + w[63] = hc_bytealign (w[41], w[42], offset); + w[62] = hc_bytealign (w[40], w[41], offset); + w[61] = hc_bytealign (w[39], w[40], offset); + w[60] = hc_bytealign (w[38], w[39], offset); + w[59] = hc_bytealign (w[37], w[38], offset); + w[58] = hc_bytealign (w[36], w[37], offset); + w[57] = hc_bytealign (w[35], w[36], offset); + w[56] = hc_bytealign (w[34], w[35], offset); + w[55] = hc_bytealign (w[33], w[34], offset); + w[54] = hc_bytealign (w[32], w[33], offset); + w[53] = hc_bytealign (w[31], w[32], offset); + w[52] = hc_bytealign (w[30], w[31], offset); + w[51] = hc_bytealign (w[29], w[30], offset); + w[50] = hc_bytealign (w[28], w[29], offset); + w[49] = hc_bytealign (w[27], w[28], offset); + w[48] = hc_bytealign (w[26], w[27], offset); + w[47] = hc_bytealign (w[25], w[26], offset); + w[46] = hc_bytealign (w[24], w[25], offset); + w[45] = hc_bytealign (w[23], w[24], offset); + w[44] = hc_bytealign (w[22], w[23], offset); + w[43] = hc_bytealign (w[21], w[22], offset); + w[42] = hc_bytealign (w[20], w[21], offset); + w[41] = hc_bytealign (w[19], w[20], offset); + w[40] = hc_bytealign (w[18], w[19], offset); + w[39] = hc_bytealign (w[17], w[18], offset); + w[38] = hc_bytealign (w[16], w[17], offset); + w[37] = hc_bytealign (w[15], w[16], offset); + w[36] = hc_bytealign (w[14], w[15], offset); + w[35] = hc_bytealign (w[13], w[14], offset); + w[34] = hc_bytealign (w[12], w[13], offset); + w[33] = hc_bytealign (w[11], w[12], offset); + w[32] = hc_bytealign (w[10], w[11], offset); + w[31] = hc_bytealign (w[ 9], w[10], offset); + w[30] = hc_bytealign (w[ 8], w[ 9], offset); + w[29] = hc_bytealign (w[ 7], w[ 8], offset); + w[28] = hc_bytealign (w[ 6], w[ 7], offset); + w[27] = hc_bytealign (w[ 5], w[ 6], offset); + w[26] = hc_bytealign (w[ 4], w[ 5], offset); + w[25] = hc_bytealign (w[ 3], w[ 4], offset); + w[24] = hc_bytealign (w[ 2], w[ 3], offset); + w[23] = hc_bytealign (w[ 1], w[ 2], offset); + w[22] = hc_bytealign (w[ 0], w[ 1], offset); + w[21] = hc_bytealign ( 0, w[ 0], offset); w[20] = 0; w[19] = 0; w[18] = 0; @@ -21894,48 +21894,48 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; case 22: - w[63] = amd_bytealign (w[40], w[41], offset); - w[62] = amd_bytealign (w[39], w[40], offset); - w[61] = amd_bytealign (w[38], w[39], offset); - w[60] = amd_bytealign (w[37], w[38], offset); - w[59] = amd_bytealign (w[36], w[37], offset); - w[58] = amd_bytealign (w[35], w[36], offset); - w[57] = amd_bytealign (w[34], w[35], offset); - w[56] = amd_bytealign (w[33], w[34], offset); - w[55] = amd_bytealign (w[32], w[33], offset); - w[54] = amd_bytealign (w[31], w[32], offset); - w[53] = amd_bytealign (w[30], w[31], offset); - w[52] = amd_bytealign (w[29], w[30], offset); - w[51] = amd_bytealign (w[28], w[29], offset); - w[50] = amd_bytealign (w[27], w[28], offset); - w[49] = amd_bytealign (w[26], w[27], offset); - w[48] = amd_bytealign (w[25], w[26], offset); - w[47] = amd_bytealign (w[24], w[25], offset); - w[46] = amd_bytealign (w[23], w[24], offset); - w[45] = amd_bytealign (w[22], w[23], offset); - w[44] = amd_bytealign (w[21], w[22], offset); - w[43] = amd_bytealign (w[20], w[21], offset); - w[42] = amd_bytealign (w[19], w[20], offset); - w[41] = amd_bytealign (w[18], w[19], offset); - w[40] = amd_bytealign (w[17], w[18], offset); - w[39] = amd_bytealign (w[16], w[17], offset); - w[38] = amd_bytealign (w[15], w[16], offset); - w[37] = amd_bytealign (w[14], w[15], offset); - w[36] = amd_bytealign (w[13], w[14], offset); - w[35] = amd_bytealign (w[12], w[13], offset); - w[34] = amd_bytealign (w[11], w[12], offset); - w[33] = amd_bytealign (w[10], w[11], offset); - w[32] = amd_bytealign (w[ 9], w[10], offset); - w[31] = amd_bytealign (w[ 8], w[ 9], offset); - w[30] = amd_bytealign (w[ 7], w[ 8], offset); - w[29] = amd_bytealign (w[ 6], w[ 7], offset); - w[28] = amd_bytealign (w[ 5], w[ 6], offset); - w[27] = amd_bytealign (w[ 4], w[ 5], offset); - w[26] = amd_bytealign (w[ 3], w[ 4], offset); - w[25] = amd_bytealign (w[ 2], w[ 3], offset); - w[24] = amd_bytealign (w[ 1], w[ 2], offset); - w[23] = amd_bytealign (w[ 0], w[ 1], offset); - w[22] = amd_bytealign ( 0, w[ 0], offset); + w[63] = hc_bytealign (w[40], w[41], offset); + w[62] = hc_bytealign (w[39], w[40], offset); + w[61] = hc_bytealign (w[38], w[39], offset); + w[60] = hc_bytealign (w[37], w[38], offset); + w[59] = hc_bytealign (w[36], w[37], offset); + w[58] = hc_bytealign (w[35], w[36], offset); + w[57] = hc_bytealign (w[34], w[35], offset); + w[56] = hc_bytealign (w[33], w[34], offset); + w[55] = hc_bytealign (w[32], w[33], offset); + w[54] = hc_bytealign (w[31], w[32], offset); + w[53] = hc_bytealign (w[30], w[31], offset); + w[52] = hc_bytealign (w[29], w[30], offset); + w[51] = hc_bytealign (w[28], w[29], offset); + w[50] = hc_bytealign (w[27], w[28], offset); + w[49] = hc_bytealign (w[26], w[27], offset); + w[48] = hc_bytealign (w[25], w[26], offset); + w[47] = hc_bytealign (w[24], w[25], offset); + w[46] = hc_bytealign (w[23], w[24], offset); + w[45] = hc_bytealign (w[22], w[23], offset); + w[44] = hc_bytealign (w[21], w[22], offset); + w[43] = hc_bytealign (w[20], w[21], offset); + w[42] = hc_bytealign (w[19], w[20], offset); + w[41] = hc_bytealign (w[18], w[19], offset); + w[40] = hc_bytealign (w[17], w[18], offset); + w[39] = hc_bytealign (w[16], w[17], offset); + w[38] = hc_bytealign (w[15], w[16], offset); + w[37] = hc_bytealign (w[14], w[15], offset); + w[36] = hc_bytealign (w[13], w[14], offset); + w[35] = hc_bytealign (w[12], w[13], offset); + w[34] = hc_bytealign (w[11], w[12], offset); + w[33] = hc_bytealign (w[10], w[11], offset); + w[32] = hc_bytealign (w[ 9], w[10], offset); + w[31] = hc_bytealign (w[ 8], w[ 9], offset); + w[30] = hc_bytealign (w[ 7], w[ 8], offset); + w[29] = hc_bytealign (w[ 6], w[ 7], offset); + w[28] = hc_bytealign (w[ 5], w[ 6], offset); + w[27] = hc_bytealign (w[ 4], w[ 5], offset); + w[26] = hc_bytealign (w[ 3], w[ 4], offset); + w[25] = hc_bytealign (w[ 2], w[ 3], offset); + w[24] = hc_bytealign (w[ 1], w[ 2], offset); + w[23] = hc_bytealign (w[ 0], w[ 1], offset); + w[22] = hc_bytealign ( 0, w[ 0], offset); w[21] = 0; w[20] = 0; w[19] = 0; @@ -21962,47 +21962,47 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; case 23: - w[63] = amd_bytealign (w[39], w[40], offset); - w[62] = amd_bytealign (w[38], w[39], offset); - w[61] = amd_bytealign (w[37], w[38], offset); - w[60] = amd_bytealign (w[36], w[37], offset); - w[59] = amd_bytealign (w[35], w[36], offset); - w[58] = amd_bytealign (w[34], w[35], offset); - w[57] = amd_bytealign (w[33], w[34], offset); - w[56] = amd_bytealign (w[32], w[33], offset); - w[55] = amd_bytealign (w[31], w[32], offset); - w[54] = amd_bytealign (w[30], w[31], offset); - w[53] = amd_bytealign (w[29], w[30], offset); - w[52] = amd_bytealign (w[28], w[29], offset); - w[51] = amd_bytealign (w[27], w[28], offset); - w[50] = amd_bytealign (w[26], w[27], offset); - w[49] = amd_bytealign (w[25], w[26], offset); - w[48] = amd_bytealign (w[24], w[25], offset); - w[47] = amd_bytealign (w[23], w[24], offset); - w[46] = amd_bytealign (w[22], w[23], offset); - w[45] = amd_bytealign (w[21], w[22], offset); - w[44] = amd_bytealign (w[20], w[21], offset); - w[43] = amd_bytealign (w[19], w[20], offset); - w[42] = amd_bytealign (w[18], w[19], offset); - w[41] = amd_bytealign (w[17], w[18], offset); - w[40] = amd_bytealign (w[16], w[17], offset); - w[39] = amd_bytealign (w[15], w[16], offset); - w[38] = amd_bytealign (w[14], w[15], offset); - w[37] = amd_bytealign (w[13], w[14], offset); - w[36] = amd_bytealign (w[12], w[13], offset); - w[35] = amd_bytealign (w[11], w[12], offset); - w[34] = amd_bytealign (w[10], w[11], offset); - w[33] = amd_bytealign (w[ 9], w[10], offset); - w[32] = amd_bytealign (w[ 8], w[ 9], offset); - w[31] = amd_bytealign (w[ 7], w[ 8], offset); - w[30] = amd_bytealign (w[ 6], w[ 7], offset); - w[29] = amd_bytealign (w[ 5], w[ 6], offset); - w[28] = amd_bytealign (w[ 4], w[ 5], offset); - w[27] = amd_bytealign (w[ 3], w[ 4], offset); - w[26] = amd_bytealign (w[ 2], w[ 3], offset); - w[25] = amd_bytealign (w[ 1], w[ 2], offset); - w[24] = amd_bytealign (w[ 0], w[ 1], offset); - w[23] = amd_bytealign ( 0, w[ 0], offset); + w[63] = hc_bytealign (w[39], w[40], offset); + w[62] = hc_bytealign (w[38], w[39], offset); + w[61] = hc_bytealign (w[37], w[38], offset); + w[60] = hc_bytealign (w[36], w[37], offset); + w[59] = hc_bytealign (w[35], w[36], offset); + w[58] = hc_bytealign (w[34], w[35], offset); + w[57] = hc_bytealign (w[33], w[34], offset); + w[56] = hc_bytealign (w[32], w[33], offset); + w[55] = hc_bytealign (w[31], w[32], offset); + w[54] = hc_bytealign (w[30], w[31], offset); + w[53] = hc_bytealign (w[29], w[30], offset); + w[52] = hc_bytealign (w[28], w[29], offset); + w[51] = hc_bytealign (w[27], w[28], offset); + w[50] = hc_bytealign (w[26], w[27], offset); + w[49] = hc_bytealign (w[25], w[26], offset); + w[48] = hc_bytealign (w[24], w[25], offset); + w[47] = hc_bytealign (w[23], w[24], offset); + w[46] = hc_bytealign (w[22], w[23], offset); + w[45] = hc_bytealign (w[21], w[22], offset); + w[44] = hc_bytealign (w[20], w[21], offset); + w[43] = hc_bytealign (w[19], w[20], offset); + w[42] = hc_bytealign (w[18], w[19], offset); + w[41] = hc_bytealign (w[17], w[18], offset); + w[40] = hc_bytealign (w[16], w[17], offset); + w[39] = hc_bytealign (w[15], w[16], offset); + w[38] = hc_bytealign (w[14], w[15], offset); + w[37] = hc_bytealign (w[13], w[14], offset); + w[36] = hc_bytealign (w[12], w[13], offset); + w[35] = hc_bytealign (w[11], w[12], offset); + w[34] = hc_bytealign (w[10], w[11], offset); + w[33] = hc_bytealign (w[ 9], w[10], offset); + w[32] = hc_bytealign (w[ 8], w[ 9], offset); + w[31] = hc_bytealign (w[ 7], w[ 8], offset); + w[30] = hc_bytealign (w[ 6], w[ 7], offset); + w[29] = hc_bytealign (w[ 5], w[ 6], offset); + w[28] = hc_bytealign (w[ 4], w[ 5], offset); + w[27] = hc_bytealign (w[ 3], w[ 4], offset); + w[26] = hc_bytealign (w[ 2], w[ 3], offset); + w[25] = hc_bytealign (w[ 1], w[ 2], offset); + w[24] = hc_bytealign (w[ 0], w[ 1], offset); + w[23] = hc_bytealign ( 0, w[ 0], offset); w[22] = 0; w[21] = 0; w[20] = 0; @@ -22030,46 +22030,46 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; case 24: - w[63] = amd_bytealign (w[38], w[39], offset); - w[62] = amd_bytealign (w[37], w[38], offset); - w[61] = amd_bytealign (w[36], w[37], offset); - w[60] = amd_bytealign (w[35], w[36], offset); - w[59] = amd_bytealign (w[34], w[35], offset); - w[58] = amd_bytealign (w[33], w[34], offset); - w[57] = amd_bytealign (w[32], w[33], offset); - w[56] = amd_bytealign (w[31], w[32], offset); - w[55] = amd_bytealign (w[30], w[31], offset); - w[54] = amd_bytealign (w[29], w[30], offset); - w[53] = amd_bytealign (w[28], w[29], offset); - w[52] = amd_bytealign (w[27], w[28], offset); - w[51] = amd_bytealign (w[26], w[27], offset); - w[50] = amd_bytealign (w[25], w[26], offset); - w[49] = amd_bytealign (w[24], w[25], offset); - w[48] = amd_bytealign (w[23], w[24], offset); - w[47] = amd_bytealign (w[22], w[23], offset); - w[46] = amd_bytealign (w[21], w[22], offset); - w[45] = amd_bytealign (w[20], w[21], offset); - w[44] = amd_bytealign (w[19], w[20], offset); - w[43] = amd_bytealign (w[18], w[19], offset); - w[42] = amd_bytealign (w[17], w[18], offset); - w[41] = amd_bytealign (w[16], w[17], offset); - w[40] = amd_bytealign (w[15], w[16], offset); - w[39] = amd_bytealign (w[14], w[15], offset); - w[38] = amd_bytealign (w[13], w[14], offset); - w[37] = amd_bytealign (w[12], w[13], offset); - w[36] = amd_bytealign (w[11], w[12], offset); - w[35] = amd_bytealign (w[10], w[11], offset); - w[34] = amd_bytealign (w[ 9], w[10], offset); - w[33] = amd_bytealign (w[ 8], w[ 9], offset); - w[32] = amd_bytealign (w[ 7], w[ 8], offset); - w[31] = amd_bytealign (w[ 6], w[ 7], offset); - w[30] = amd_bytealign (w[ 5], w[ 6], offset); - w[29] = amd_bytealign (w[ 4], w[ 5], offset); - w[28] = amd_bytealign (w[ 3], w[ 4], offset); - w[27] = amd_bytealign (w[ 2], w[ 3], offset); - w[26] = amd_bytealign (w[ 1], w[ 2], offset); - w[25] = amd_bytealign (w[ 0], w[ 1], offset); - w[24] = amd_bytealign ( 0, w[ 0], offset); + w[63] = hc_bytealign (w[38], w[39], offset); + w[62] = hc_bytealign (w[37], w[38], offset); + w[61] = hc_bytealign (w[36], w[37], offset); + w[60] = hc_bytealign (w[35], w[36], offset); + w[59] = hc_bytealign (w[34], w[35], offset); + w[58] = hc_bytealign (w[33], w[34], offset); + w[57] = hc_bytealign (w[32], w[33], offset); + w[56] = hc_bytealign (w[31], w[32], offset); + w[55] = hc_bytealign (w[30], w[31], offset); + w[54] = hc_bytealign (w[29], w[30], offset); + w[53] = hc_bytealign (w[28], w[29], offset); + w[52] = hc_bytealign (w[27], w[28], offset); + w[51] = hc_bytealign (w[26], w[27], offset); + w[50] = hc_bytealign (w[25], w[26], offset); + w[49] = hc_bytealign (w[24], w[25], offset); + w[48] = hc_bytealign (w[23], w[24], offset); + w[47] = hc_bytealign (w[22], w[23], offset); + w[46] = hc_bytealign (w[21], w[22], offset); + w[45] = hc_bytealign (w[20], w[21], offset); + w[44] = hc_bytealign (w[19], w[20], offset); + w[43] = hc_bytealign (w[18], w[19], offset); + w[42] = hc_bytealign (w[17], w[18], offset); + w[41] = hc_bytealign (w[16], w[17], offset); + w[40] = hc_bytealign (w[15], w[16], offset); + w[39] = hc_bytealign (w[14], w[15], offset); + w[38] = hc_bytealign (w[13], w[14], offset); + w[37] = hc_bytealign (w[12], w[13], offset); + w[36] = hc_bytealign (w[11], w[12], offset); + w[35] = hc_bytealign (w[10], w[11], offset); + w[34] = hc_bytealign (w[ 9], w[10], offset); + w[33] = hc_bytealign (w[ 8], w[ 9], offset); + w[32] = hc_bytealign (w[ 7], w[ 8], offset); + w[31] = hc_bytealign (w[ 6], w[ 7], offset); + w[30] = hc_bytealign (w[ 5], w[ 6], offset); + w[29] = hc_bytealign (w[ 4], w[ 5], offset); + w[28] = hc_bytealign (w[ 3], w[ 4], offset); + w[27] = hc_bytealign (w[ 2], w[ 3], offset); + w[26] = hc_bytealign (w[ 1], w[ 2], offset); + w[25] = hc_bytealign (w[ 0], w[ 1], offset); + w[24] = hc_bytealign ( 0, w[ 0], offset); w[23] = 0; w[22] = 0; w[21] = 0; @@ -22098,45 +22098,45 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; case 25: - w[63] = amd_bytealign (w[37], w[38], offset); - w[62] = amd_bytealign (w[36], w[37], offset); - w[61] = amd_bytealign (w[35], w[36], offset); - w[60] = amd_bytealign (w[34], w[35], offset); - w[59] = amd_bytealign (w[33], w[34], offset); - w[58] = amd_bytealign (w[32], w[33], offset); - w[57] = amd_bytealign (w[31], w[32], offset); - w[56] = amd_bytealign (w[30], w[31], offset); - w[55] = amd_bytealign (w[29], w[30], offset); - w[54] = amd_bytealign (w[28], w[29], offset); - w[53] = amd_bytealign (w[27], w[28], offset); - w[52] = amd_bytealign (w[26], w[27], offset); - w[51] = amd_bytealign (w[25], w[26], offset); - w[50] = amd_bytealign (w[24], w[25], offset); - w[49] = amd_bytealign (w[23], w[24], offset); - w[48] = amd_bytealign (w[22], w[23], offset); - w[47] = amd_bytealign (w[21], w[22], offset); - w[46] = amd_bytealign (w[20], w[21], offset); - w[45] = amd_bytealign (w[19], w[20], offset); - w[44] = amd_bytealign (w[18], w[19], offset); - w[43] = amd_bytealign (w[17], w[18], offset); - w[42] = amd_bytealign (w[16], w[17], offset); - w[41] = amd_bytealign (w[15], w[16], offset); - w[40] = amd_bytealign (w[14], w[15], offset); - w[39] = amd_bytealign (w[13], w[14], offset); - w[38] = amd_bytealign (w[12], w[13], offset); - w[37] = amd_bytealign (w[11], w[12], offset); - w[36] = amd_bytealign (w[10], w[11], offset); - w[35] = amd_bytealign (w[ 9], w[10], offset); - w[34] = amd_bytealign (w[ 8], w[ 9], offset); - w[33] = amd_bytealign (w[ 7], w[ 8], offset); - w[32] = amd_bytealign (w[ 6], w[ 7], offset); - w[31] = amd_bytealign (w[ 5], w[ 6], offset); - w[30] = amd_bytealign (w[ 4], w[ 5], offset); - w[29] = amd_bytealign (w[ 3], w[ 4], offset); - w[28] = amd_bytealign (w[ 2], w[ 3], offset); - w[27] = amd_bytealign (w[ 1], w[ 2], offset); - w[26] = amd_bytealign (w[ 0], w[ 1], offset); - w[25] = amd_bytealign ( 0, w[ 0], offset); + w[63] = hc_bytealign (w[37], w[38], offset); + w[62] = hc_bytealign (w[36], w[37], offset); + w[61] = hc_bytealign (w[35], w[36], offset); + w[60] = hc_bytealign (w[34], w[35], offset); + w[59] = hc_bytealign (w[33], w[34], offset); + w[58] = hc_bytealign (w[32], w[33], offset); + w[57] = hc_bytealign (w[31], w[32], offset); + w[56] = hc_bytealign (w[30], w[31], offset); + w[55] = hc_bytealign (w[29], w[30], offset); + w[54] = hc_bytealign (w[28], w[29], offset); + w[53] = hc_bytealign (w[27], w[28], offset); + w[52] = hc_bytealign (w[26], w[27], offset); + w[51] = hc_bytealign (w[25], w[26], offset); + w[50] = hc_bytealign (w[24], w[25], offset); + w[49] = hc_bytealign (w[23], w[24], offset); + w[48] = hc_bytealign (w[22], w[23], offset); + w[47] = hc_bytealign (w[21], w[22], offset); + w[46] = hc_bytealign (w[20], w[21], offset); + w[45] = hc_bytealign (w[19], w[20], offset); + w[44] = hc_bytealign (w[18], w[19], offset); + w[43] = hc_bytealign (w[17], w[18], offset); + w[42] = hc_bytealign (w[16], w[17], offset); + w[41] = hc_bytealign (w[15], w[16], offset); + w[40] = hc_bytealign (w[14], w[15], offset); + w[39] = hc_bytealign (w[13], w[14], offset); + w[38] = hc_bytealign (w[12], w[13], offset); + w[37] = hc_bytealign (w[11], w[12], offset); + w[36] = hc_bytealign (w[10], w[11], offset); + w[35] = hc_bytealign (w[ 9], w[10], offset); + w[34] = hc_bytealign (w[ 8], w[ 9], offset); + w[33] = hc_bytealign (w[ 7], w[ 8], offset); + w[32] = hc_bytealign (w[ 6], w[ 7], offset); + w[31] = hc_bytealign (w[ 5], w[ 6], offset); + w[30] = hc_bytealign (w[ 4], w[ 5], offset); + w[29] = hc_bytealign (w[ 3], w[ 4], offset); + w[28] = hc_bytealign (w[ 2], w[ 3], offset); + w[27] = hc_bytealign (w[ 1], w[ 2], offset); + w[26] = hc_bytealign (w[ 0], w[ 1], offset); + w[25] = hc_bytealign ( 0, w[ 0], offset); w[24] = 0; w[23] = 0; w[22] = 0; @@ -22166,44 +22166,44 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; case 26: - w[63] = amd_bytealign (w[36], w[37], offset); - w[62] = amd_bytealign (w[35], w[36], offset); - w[61] = amd_bytealign (w[34], w[35], offset); - w[60] = amd_bytealign (w[33], w[34], offset); - w[59] = amd_bytealign (w[32], w[33], offset); - w[58] = amd_bytealign (w[31], w[32], offset); - w[57] = amd_bytealign (w[30], w[31], offset); - w[56] = amd_bytealign (w[29], w[30], offset); - w[55] = amd_bytealign (w[28], w[29], offset); - w[54] = amd_bytealign (w[27], w[28], offset); - w[53] = amd_bytealign (w[26], w[27], offset); - w[52] = amd_bytealign (w[25], w[26], offset); - w[51] = amd_bytealign (w[24], w[25], offset); - w[50] = amd_bytealign (w[23], w[24], offset); - w[49] = amd_bytealign (w[22], w[23], offset); - w[48] = amd_bytealign (w[21], w[22], offset); - w[47] = amd_bytealign (w[20], w[21], offset); - w[46] = amd_bytealign (w[19], w[20], offset); - w[45] = amd_bytealign (w[18], w[19], offset); - w[44] = amd_bytealign (w[17], w[18], offset); - w[43] = amd_bytealign (w[16], w[17], offset); - w[42] = amd_bytealign (w[15], w[16], offset); - w[41] = amd_bytealign (w[14], w[15], offset); - w[40] = amd_bytealign (w[13], w[14], offset); - w[39] = amd_bytealign (w[12], w[13], offset); - w[38] = amd_bytealign (w[11], w[12], offset); - w[37] = amd_bytealign (w[10], w[11], offset); - w[36] = amd_bytealign (w[ 9], w[10], offset); - w[35] = amd_bytealign (w[ 8], w[ 9], offset); - w[34] = amd_bytealign (w[ 7], w[ 8], offset); - w[33] = amd_bytealign (w[ 6], w[ 7], offset); - w[32] = amd_bytealign (w[ 5], w[ 6], offset); - w[31] = amd_bytealign (w[ 4], w[ 5], offset); - w[30] = amd_bytealign (w[ 3], w[ 4], offset); - w[29] = amd_bytealign (w[ 2], w[ 3], offset); - w[28] = amd_bytealign (w[ 1], w[ 2], offset); - w[27] = amd_bytealign (w[ 0], w[ 1], offset); - w[26] = amd_bytealign ( 0, w[ 0], offset); + w[63] = hc_bytealign (w[36], w[37], offset); + w[62] = hc_bytealign (w[35], w[36], offset); + w[61] = hc_bytealign (w[34], w[35], offset); + w[60] = hc_bytealign (w[33], w[34], offset); + w[59] = hc_bytealign (w[32], w[33], offset); + w[58] = hc_bytealign (w[31], w[32], offset); + w[57] = hc_bytealign (w[30], w[31], offset); + w[56] = hc_bytealign (w[29], w[30], offset); + w[55] = hc_bytealign (w[28], w[29], offset); + w[54] = hc_bytealign (w[27], w[28], offset); + w[53] = hc_bytealign (w[26], w[27], offset); + w[52] = hc_bytealign (w[25], w[26], offset); + w[51] = hc_bytealign (w[24], w[25], offset); + w[50] = hc_bytealign (w[23], w[24], offset); + w[49] = hc_bytealign (w[22], w[23], offset); + w[48] = hc_bytealign (w[21], w[22], offset); + w[47] = hc_bytealign (w[20], w[21], offset); + w[46] = hc_bytealign (w[19], w[20], offset); + w[45] = hc_bytealign (w[18], w[19], offset); + w[44] = hc_bytealign (w[17], w[18], offset); + w[43] = hc_bytealign (w[16], w[17], offset); + w[42] = hc_bytealign (w[15], w[16], offset); + w[41] = hc_bytealign (w[14], w[15], offset); + w[40] = hc_bytealign (w[13], w[14], offset); + w[39] = hc_bytealign (w[12], w[13], offset); + w[38] = hc_bytealign (w[11], w[12], offset); + w[37] = hc_bytealign (w[10], w[11], offset); + w[36] = hc_bytealign (w[ 9], w[10], offset); + w[35] = hc_bytealign (w[ 8], w[ 9], offset); + w[34] = hc_bytealign (w[ 7], w[ 8], offset); + w[33] = hc_bytealign (w[ 6], w[ 7], offset); + w[32] = hc_bytealign (w[ 5], w[ 6], offset); + w[31] = hc_bytealign (w[ 4], w[ 5], offset); + w[30] = hc_bytealign (w[ 3], w[ 4], offset); + w[29] = hc_bytealign (w[ 2], w[ 3], offset); + w[28] = hc_bytealign (w[ 1], w[ 2], offset); + w[27] = hc_bytealign (w[ 0], w[ 1], offset); + w[26] = hc_bytealign ( 0, w[ 0], offset); w[25] = 0; w[24] = 0; w[23] = 0; @@ -22234,43 +22234,43 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; case 27: - w[63] = amd_bytealign (w[35], w[36], offset); - w[62] = amd_bytealign (w[34], w[35], offset); - w[61] = amd_bytealign (w[33], w[34], offset); - w[60] = amd_bytealign (w[32], w[33], offset); - w[59] = amd_bytealign (w[31], w[32], offset); - w[58] = amd_bytealign (w[30], w[31], offset); - w[57] = amd_bytealign (w[29], w[30], offset); - w[56] = amd_bytealign (w[28], w[29], offset); - w[55] = amd_bytealign (w[27], w[28], offset); - w[54] = amd_bytealign (w[26], w[27], offset); - w[53] = amd_bytealign (w[25], w[26], offset); - w[52] = amd_bytealign (w[24], w[25], offset); - w[51] = amd_bytealign (w[23], w[24], offset); - w[50] = amd_bytealign (w[22], w[23], offset); - w[49] = amd_bytealign (w[21], w[22], offset); - w[48] = amd_bytealign (w[20], w[21], offset); - w[47] = amd_bytealign (w[19], w[20], offset); - w[46] = amd_bytealign (w[18], w[19], offset); - w[45] = amd_bytealign (w[17], w[18], offset); - w[44] = amd_bytealign (w[16], w[17], offset); - w[43] = amd_bytealign (w[15], w[16], offset); - w[42] = amd_bytealign (w[14], w[15], offset); - w[41] = amd_bytealign (w[13], w[14], offset); - w[40] = amd_bytealign (w[12], w[13], offset); - w[39] = amd_bytealign (w[11], w[12], offset); - w[38] = amd_bytealign (w[10], w[11], offset); - w[37] = amd_bytealign (w[ 9], w[10], offset); - w[36] = amd_bytealign (w[ 8], w[ 9], offset); - w[35] = amd_bytealign (w[ 7], w[ 8], offset); - w[34] = amd_bytealign (w[ 6], w[ 7], offset); - w[33] = amd_bytealign (w[ 5], w[ 6], offset); - w[32] = amd_bytealign (w[ 4], w[ 5], offset); - w[31] = amd_bytealign (w[ 3], w[ 4], offset); - w[30] = amd_bytealign (w[ 2], w[ 3], offset); - w[29] = amd_bytealign (w[ 1], w[ 2], offset); - w[28] = amd_bytealign (w[ 0], w[ 1], offset); - w[27] = amd_bytealign ( 0, w[ 0], offset); + w[63] = hc_bytealign (w[35], w[36], offset); + w[62] = hc_bytealign (w[34], w[35], offset); + w[61] = hc_bytealign (w[33], w[34], offset); + w[60] = hc_bytealign (w[32], w[33], offset); + w[59] = hc_bytealign (w[31], w[32], offset); + w[58] = hc_bytealign (w[30], w[31], offset); + w[57] = hc_bytealign (w[29], w[30], offset); + w[56] = hc_bytealign (w[28], w[29], offset); + w[55] = hc_bytealign (w[27], w[28], offset); + w[54] = hc_bytealign (w[26], w[27], offset); + w[53] = hc_bytealign (w[25], w[26], offset); + w[52] = hc_bytealign (w[24], w[25], offset); + w[51] = hc_bytealign (w[23], w[24], offset); + w[50] = hc_bytealign (w[22], w[23], offset); + w[49] = hc_bytealign (w[21], w[22], offset); + w[48] = hc_bytealign (w[20], w[21], offset); + w[47] = hc_bytealign (w[19], w[20], offset); + w[46] = hc_bytealign (w[18], w[19], offset); + w[45] = hc_bytealign (w[17], w[18], offset); + w[44] = hc_bytealign (w[16], w[17], offset); + w[43] = hc_bytealign (w[15], w[16], offset); + w[42] = hc_bytealign (w[14], w[15], offset); + w[41] = hc_bytealign (w[13], w[14], offset); + w[40] = hc_bytealign (w[12], w[13], offset); + w[39] = hc_bytealign (w[11], w[12], offset); + w[38] = hc_bytealign (w[10], w[11], offset); + w[37] = hc_bytealign (w[ 9], w[10], offset); + w[36] = hc_bytealign (w[ 8], w[ 9], offset); + w[35] = hc_bytealign (w[ 7], w[ 8], offset); + w[34] = hc_bytealign (w[ 6], w[ 7], offset); + w[33] = hc_bytealign (w[ 5], w[ 6], offset); + w[32] = hc_bytealign (w[ 4], w[ 5], offset); + w[31] = hc_bytealign (w[ 3], w[ 4], offset); + w[30] = hc_bytealign (w[ 2], w[ 3], offset); + w[29] = hc_bytealign (w[ 1], w[ 2], offset); + w[28] = hc_bytealign (w[ 0], w[ 1], offset); + w[27] = hc_bytealign ( 0, w[ 0], offset); w[26] = 0; w[25] = 0; w[24] = 0; @@ -22302,42 +22302,42 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; case 28: - w[63] = amd_bytealign (w[34], w[35], offset); - w[62] = amd_bytealign (w[33], w[34], offset); - w[61] = amd_bytealign (w[32], w[33], offset); - w[60] = amd_bytealign (w[31], w[32], offset); - w[59] = amd_bytealign (w[30], w[31], offset); - w[58] = amd_bytealign (w[29], w[30], offset); - w[57] = amd_bytealign (w[28], w[29], offset); - w[56] = amd_bytealign (w[27], w[28], offset); - w[55] = amd_bytealign (w[26], w[27], offset); - w[54] = amd_bytealign (w[25], w[26], offset); - w[53] = amd_bytealign (w[24], w[25], offset); - w[52] = amd_bytealign (w[23], w[24], offset); - w[51] = amd_bytealign (w[22], w[23], offset); - w[50] = amd_bytealign (w[21], w[22], offset); - w[49] = amd_bytealign (w[20], w[21], offset); - w[48] = amd_bytealign (w[19], w[20], offset); - w[47] = amd_bytealign (w[18], w[19], offset); - w[46] = amd_bytealign (w[17], w[18], offset); - w[45] = amd_bytealign (w[16], w[17], offset); - w[44] = amd_bytealign (w[15], w[16], offset); - w[43] = amd_bytealign (w[14], w[15], offset); - w[42] = amd_bytealign (w[13], w[14], offset); - w[41] = amd_bytealign (w[12], w[13], offset); - w[40] = amd_bytealign (w[11], w[12], offset); - w[39] = amd_bytealign (w[10], w[11], offset); - w[38] = amd_bytealign (w[ 9], w[10], offset); - w[37] = amd_bytealign (w[ 8], w[ 9], offset); - w[36] = amd_bytealign (w[ 7], w[ 8], offset); - w[35] = amd_bytealign (w[ 6], w[ 7], offset); - w[34] = amd_bytealign (w[ 5], w[ 6], offset); - w[33] = amd_bytealign (w[ 4], w[ 5], offset); - w[32] = amd_bytealign (w[ 3], w[ 4], offset); - w[31] = amd_bytealign (w[ 2], w[ 3], offset); - w[30] = amd_bytealign (w[ 1], w[ 2], offset); - w[29] = amd_bytealign (w[ 0], w[ 1], offset); - w[28] = amd_bytealign ( 0, w[ 0], offset); + w[63] = hc_bytealign (w[34], w[35], offset); + w[62] = hc_bytealign (w[33], w[34], offset); + w[61] = hc_bytealign (w[32], w[33], offset); + w[60] = hc_bytealign (w[31], w[32], offset); + w[59] = hc_bytealign (w[30], w[31], offset); + w[58] = hc_bytealign (w[29], w[30], offset); + w[57] = hc_bytealign (w[28], w[29], offset); + w[56] = hc_bytealign (w[27], w[28], offset); + w[55] = hc_bytealign (w[26], w[27], offset); + w[54] = hc_bytealign (w[25], w[26], offset); + w[53] = hc_bytealign (w[24], w[25], offset); + w[52] = hc_bytealign (w[23], w[24], offset); + w[51] = hc_bytealign (w[22], w[23], offset); + w[50] = hc_bytealign (w[21], w[22], offset); + w[49] = hc_bytealign (w[20], w[21], offset); + w[48] = hc_bytealign (w[19], w[20], offset); + w[47] = hc_bytealign (w[18], w[19], offset); + w[46] = hc_bytealign (w[17], w[18], offset); + w[45] = hc_bytealign (w[16], w[17], offset); + w[44] = hc_bytealign (w[15], w[16], offset); + w[43] = hc_bytealign (w[14], w[15], offset); + w[42] = hc_bytealign (w[13], w[14], offset); + w[41] = hc_bytealign (w[12], w[13], offset); + w[40] = hc_bytealign (w[11], w[12], offset); + w[39] = hc_bytealign (w[10], w[11], offset); + w[38] = hc_bytealign (w[ 9], w[10], offset); + w[37] = hc_bytealign (w[ 8], w[ 9], offset); + w[36] = hc_bytealign (w[ 7], w[ 8], offset); + w[35] = hc_bytealign (w[ 6], w[ 7], offset); + w[34] = hc_bytealign (w[ 5], w[ 6], offset); + w[33] = hc_bytealign (w[ 4], w[ 5], offset); + w[32] = hc_bytealign (w[ 3], w[ 4], offset); + w[31] = hc_bytealign (w[ 2], w[ 3], offset); + w[30] = hc_bytealign (w[ 1], w[ 2], offset); + w[29] = hc_bytealign (w[ 0], w[ 1], offset); + w[28] = hc_bytealign ( 0, w[ 0], offset); w[27] = 0; w[26] = 0; w[25] = 0; @@ -22370,41 +22370,41 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; case 29: - w[63] = amd_bytealign (w[33], w[34], offset); - w[62] = amd_bytealign (w[32], w[33], offset); - w[61] = amd_bytealign (w[31], w[32], offset); - w[60] = amd_bytealign (w[30], w[31], offset); - w[59] = amd_bytealign (w[29], w[30], offset); - w[58] = amd_bytealign (w[28], w[29], offset); - w[57] = amd_bytealign (w[27], w[28], offset); - w[56] = amd_bytealign (w[26], w[27], offset); - w[55] = amd_bytealign (w[25], w[26], offset); - w[54] = amd_bytealign (w[24], w[25], offset); - w[53] = amd_bytealign (w[23], w[24], offset); - w[52] = amd_bytealign (w[22], w[23], offset); - w[51] = amd_bytealign (w[21], w[22], offset); - w[50] = amd_bytealign (w[20], w[21], offset); - w[49] = amd_bytealign (w[19], w[20], offset); - w[48] = amd_bytealign (w[18], w[19], offset); - w[47] = amd_bytealign (w[17], w[18], offset); - w[46] = amd_bytealign (w[16], w[17], offset); - w[45] = amd_bytealign (w[15], w[16], offset); - w[44] = amd_bytealign (w[14], w[15], offset); - w[43] = amd_bytealign (w[13], w[14], offset); - w[42] = amd_bytealign (w[12], w[13], offset); - w[41] = amd_bytealign (w[11], w[12], offset); - w[40] = amd_bytealign (w[10], w[11], offset); - w[39] = amd_bytealign (w[ 9], w[10], offset); - w[38] = amd_bytealign (w[ 8], w[ 9], offset); - w[37] = amd_bytealign (w[ 7], w[ 8], offset); - w[36] = amd_bytealign (w[ 6], w[ 7], offset); - w[35] = amd_bytealign (w[ 5], w[ 6], offset); - w[34] = amd_bytealign (w[ 4], w[ 5], offset); - w[33] = amd_bytealign (w[ 3], w[ 4], offset); - w[32] = amd_bytealign (w[ 2], w[ 3], offset); - w[31] = amd_bytealign (w[ 1], w[ 2], offset); - w[30] = amd_bytealign (w[ 0], w[ 1], offset); - w[29] = amd_bytealign ( 0, w[ 0], offset); + w[63] = hc_bytealign (w[33], w[34], offset); + w[62] = hc_bytealign (w[32], w[33], offset); + w[61] = hc_bytealign (w[31], w[32], offset); + w[60] = hc_bytealign (w[30], w[31], offset); + w[59] = hc_bytealign (w[29], w[30], offset); + w[58] = hc_bytealign (w[28], w[29], offset); + w[57] = hc_bytealign (w[27], w[28], offset); + w[56] = hc_bytealign (w[26], w[27], offset); + w[55] = hc_bytealign (w[25], w[26], offset); + w[54] = hc_bytealign (w[24], w[25], offset); + w[53] = hc_bytealign (w[23], w[24], offset); + w[52] = hc_bytealign (w[22], w[23], offset); + w[51] = hc_bytealign (w[21], w[22], offset); + w[50] = hc_bytealign (w[20], w[21], offset); + w[49] = hc_bytealign (w[19], w[20], offset); + w[48] = hc_bytealign (w[18], w[19], offset); + w[47] = hc_bytealign (w[17], w[18], offset); + w[46] = hc_bytealign (w[16], w[17], offset); + w[45] = hc_bytealign (w[15], w[16], offset); + w[44] = hc_bytealign (w[14], w[15], offset); + w[43] = hc_bytealign (w[13], w[14], offset); + w[42] = hc_bytealign (w[12], w[13], offset); + w[41] = hc_bytealign (w[11], w[12], offset); + w[40] = hc_bytealign (w[10], w[11], offset); + w[39] = hc_bytealign (w[ 9], w[10], offset); + w[38] = hc_bytealign (w[ 8], w[ 9], offset); + w[37] = hc_bytealign (w[ 7], w[ 8], offset); + w[36] = hc_bytealign (w[ 6], w[ 7], offset); + w[35] = hc_bytealign (w[ 5], w[ 6], offset); + w[34] = hc_bytealign (w[ 4], w[ 5], offset); + w[33] = hc_bytealign (w[ 3], w[ 4], offset); + w[32] = hc_bytealign (w[ 2], w[ 3], offset); + w[31] = hc_bytealign (w[ 1], w[ 2], offset); + w[30] = hc_bytealign (w[ 0], w[ 1], offset); + w[29] = hc_bytealign ( 0, w[ 0], offset); w[28] = 0; w[27] = 0; w[26] = 0; @@ -22438,40 +22438,40 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; case 30: - w[63] = amd_bytealign (w[32], w[33], offset); - w[62] = amd_bytealign (w[31], w[32], offset); - w[61] = amd_bytealign (w[30], w[31], offset); - w[60] = amd_bytealign (w[29], w[30], offset); - w[59] = amd_bytealign (w[28], w[29], offset); - w[58] = amd_bytealign (w[27], w[28], offset); - w[57] = amd_bytealign (w[26], w[27], offset); - w[56] = amd_bytealign (w[25], w[26], offset); - w[55] = amd_bytealign (w[24], w[25], offset); - w[54] = amd_bytealign (w[23], w[24], offset); - w[53] = amd_bytealign (w[22], w[23], offset); - w[52] = amd_bytealign (w[21], w[22], offset); - w[51] = amd_bytealign (w[20], w[21], offset); - w[50] = amd_bytealign (w[19], w[20], offset); - w[49] = amd_bytealign (w[18], w[19], offset); - w[48] = amd_bytealign (w[17], w[18], offset); - w[47] = amd_bytealign (w[16], w[17], offset); - w[46] = amd_bytealign (w[15], w[16], offset); - w[45] = amd_bytealign (w[14], w[15], offset); - w[44] = amd_bytealign (w[13], w[14], offset); - w[43] = amd_bytealign (w[12], w[13], offset); - w[42] = amd_bytealign (w[11], w[12], offset); - w[41] = amd_bytealign (w[10], w[11], offset); - w[40] = amd_bytealign (w[ 9], w[10], offset); - w[39] = amd_bytealign (w[ 8], w[ 9], offset); - w[38] = amd_bytealign (w[ 7], w[ 8], offset); - w[37] = amd_bytealign (w[ 6], w[ 7], offset); - w[36] = amd_bytealign (w[ 5], w[ 6], offset); - w[35] = amd_bytealign (w[ 4], w[ 5], offset); - w[34] = amd_bytealign (w[ 3], w[ 4], offset); - w[33] = amd_bytealign (w[ 2], w[ 3], offset); - w[32] = amd_bytealign (w[ 1], w[ 2], offset); - w[31] = amd_bytealign (w[ 0], w[ 1], offset); - w[30] = amd_bytealign ( 0, w[ 0], offset); + w[63] = hc_bytealign (w[32], w[33], offset); + w[62] = hc_bytealign (w[31], w[32], offset); + w[61] = hc_bytealign (w[30], w[31], offset); + w[60] = hc_bytealign (w[29], w[30], offset); + w[59] = hc_bytealign (w[28], w[29], offset); + w[58] = hc_bytealign (w[27], w[28], offset); + w[57] = hc_bytealign (w[26], w[27], offset); + w[56] = hc_bytealign (w[25], w[26], offset); + w[55] = hc_bytealign (w[24], w[25], offset); + w[54] = hc_bytealign (w[23], w[24], offset); + w[53] = hc_bytealign (w[22], w[23], offset); + w[52] = hc_bytealign (w[21], w[22], offset); + w[51] = hc_bytealign (w[20], w[21], offset); + w[50] = hc_bytealign (w[19], w[20], offset); + w[49] = hc_bytealign (w[18], w[19], offset); + w[48] = hc_bytealign (w[17], w[18], offset); + w[47] = hc_bytealign (w[16], w[17], offset); + w[46] = hc_bytealign (w[15], w[16], offset); + w[45] = hc_bytealign (w[14], w[15], offset); + w[44] = hc_bytealign (w[13], w[14], offset); + w[43] = hc_bytealign (w[12], w[13], offset); + w[42] = hc_bytealign (w[11], w[12], offset); + w[41] = hc_bytealign (w[10], w[11], offset); + w[40] = hc_bytealign (w[ 9], w[10], offset); + w[39] = hc_bytealign (w[ 8], w[ 9], offset); + w[38] = hc_bytealign (w[ 7], w[ 8], offset); + w[37] = hc_bytealign (w[ 6], w[ 7], offset); + w[36] = hc_bytealign (w[ 5], w[ 6], offset); + w[35] = hc_bytealign (w[ 4], w[ 5], offset); + w[34] = hc_bytealign (w[ 3], w[ 4], offset); + w[33] = hc_bytealign (w[ 2], w[ 3], offset); + w[32] = hc_bytealign (w[ 1], w[ 2], offset); + w[31] = hc_bytealign (w[ 0], w[ 1], offset); + w[30] = hc_bytealign ( 0, w[ 0], offset); w[29] = 0; w[28] = 0; w[27] = 0; @@ -22506,39 +22506,39 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; case 31: - w[63] = amd_bytealign (w[31], w[32], offset); - w[62] = amd_bytealign (w[30], w[31], offset); - w[61] = amd_bytealign (w[29], w[30], offset); - w[60] = amd_bytealign (w[28], w[29], offset); - w[59] = amd_bytealign (w[27], w[28], offset); - w[58] = amd_bytealign (w[26], w[27], offset); - w[57] = amd_bytealign (w[25], w[26], offset); - w[56] = amd_bytealign (w[24], w[25], offset); - w[55] = amd_bytealign (w[23], w[24], offset); - w[54] = amd_bytealign (w[22], w[23], offset); - w[53] = amd_bytealign (w[21], w[22], offset); - w[52] = amd_bytealign (w[20], w[21], offset); - w[51] = amd_bytealign (w[19], w[20], offset); - w[50] = amd_bytealign (w[18], w[19], offset); - w[49] = amd_bytealign (w[17], w[18], offset); - w[48] = amd_bytealign (w[16], w[17], offset); - w[47] = amd_bytealign (w[15], w[16], offset); - w[46] = amd_bytealign (w[14], w[15], offset); - w[45] = amd_bytealign (w[13], w[14], offset); - w[44] = amd_bytealign (w[12], w[13], offset); - w[43] = amd_bytealign (w[11], w[12], offset); - w[42] = amd_bytealign (w[10], w[11], offset); - w[41] = amd_bytealign (w[ 9], w[10], offset); - w[40] = amd_bytealign (w[ 8], w[ 9], offset); - w[39] = amd_bytealign (w[ 7], w[ 8], offset); - w[38] = amd_bytealign (w[ 6], w[ 7], offset); - w[37] = amd_bytealign (w[ 5], w[ 6], offset); - w[36] = amd_bytealign (w[ 4], w[ 5], offset); - w[35] = amd_bytealign (w[ 3], w[ 4], offset); - w[34] = amd_bytealign (w[ 2], w[ 3], offset); - w[33] = amd_bytealign (w[ 1], w[ 2], offset); - w[32] = amd_bytealign (w[ 0], w[ 1], offset); - w[31] = amd_bytealign ( 0, w[ 0], offset); + w[63] = hc_bytealign (w[31], w[32], offset); + w[62] = hc_bytealign (w[30], w[31], offset); + w[61] = hc_bytealign (w[29], w[30], offset); + w[60] = hc_bytealign (w[28], w[29], offset); + w[59] = hc_bytealign (w[27], w[28], offset); + w[58] = hc_bytealign (w[26], w[27], offset); + w[57] = hc_bytealign (w[25], w[26], offset); + w[56] = hc_bytealign (w[24], w[25], offset); + w[55] = hc_bytealign (w[23], w[24], offset); + w[54] = hc_bytealign (w[22], w[23], offset); + w[53] = hc_bytealign (w[21], w[22], offset); + w[52] = hc_bytealign (w[20], w[21], offset); + w[51] = hc_bytealign (w[19], w[20], offset); + w[50] = hc_bytealign (w[18], w[19], offset); + w[49] = hc_bytealign (w[17], w[18], offset); + w[48] = hc_bytealign (w[16], w[17], offset); + w[47] = hc_bytealign (w[15], w[16], offset); + w[46] = hc_bytealign (w[14], w[15], offset); + w[45] = hc_bytealign (w[13], w[14], offset); + w[44] = hc_bytealign (w[12], w[13], offset); + w[43] = hc_bytealign (w[11], w[12], offset); + w[42] = hc_bytealign (w[10], w[11], offset); + w[41] = hc_bytealign (w[ 9], w[10], offset); + w[40] = hc_bytealign (w[ 8], w[ 9], offset); + w[39] = hc_bytealign (w[ 7], w[ 8], offset); + w[38] = hc_bytealign (w[ 6], w[ 7], offset); + w[37] = hc_bytealign (w[ 5], w[ 6], offset); + w[36] = hc_bytealign (w[ 4], w[ 5], offset); + w[35] = hc_bytealign (w[ 3], w[ 4], offset); + w[34] = hc_bytealign (w[ 2], w[ 3], offset); + w[33] = hc_bytealign (w[ 1], w[ 2], offset); + w[32] = hc_bytealign (w[ 0], w[ 1], offset); + w[31] = hc_bytealign ( 0, w[ 0], offset); w[30] = 0; w[29] = 0; w[28] = 0; @@ -22574,38 +22574,38 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; case 32: - w[63] = amd_bytealign (w[30], w[31], offset); - w[62] = amd_bytealign (w[29], w[30], offset); - w[61] = amd_bytealign (w[28], w[29], offset); - w[60] = amd_bytealign (w[27], w[28], offset); - w[59] = amd_bytealign (w[26], w[27], offset); - w[58] = amd_bytealign (w[25], w[26], offset); - w[57] = amd_bytealign (w[24], w[25], offset); - w[56] = amd_bytealign (w[23], w[24], offset); - w[55] = amd_bytealign (w[22], w[23], offset); - w[54] = amd_bytealign (w[21], w[22], offset); - w[53] = amd_bytealign (w[20], w[21], offset); - w[52] = amd_bytealign (w[19], w[20], offset); - w[51] = amd_bytealign (w[18], w[19], offset); - w[50] = amd_bytealign (w[17], w[18], offset); - w[49] = amd_bytealign (w[16], w[17], offset); - w[48] = amd_bytealign (w[15], w[16], offset); - w[47] = amd_bytealign (w[14], w[15], offset); - w[46] = amd_bytealign (w[13], w[14], offset); - w[45] = amd_bytealign (w[12], w[13], offset); - w[44] = amd_bytealign (w[11], w[12], offset); - w[43] = amd_bytealign (w[10], w[11], offset); - w[42] = amd_bytealign (w[ 9], w[10], offset); - w[41] = amd_bytealign (w[ 8], w[ 9], offset); - w[40] = amd_bytealign (w[ 7], w[ 8], offset); - w[39] = amd_bytealign (w[ 6], w[ 7], offset); - w[38] = amd_bytealign (w[ 5], w[ 6], offset); - w[37] = amd_bytealign (w[ 4], w[ 5], offset); - w[36] = amd_bytealign (w[ 3], w[ 4], offset); - w[35] = amd_bytealign (w[ 2], w[ 3], offset); - w[34] = amd_bytealign (w[ 1], w[ 2], offset); - w[33] = amd_bytealign (w[ 0], w[ 1], offset); - w[32] = amd_bytealign ( 0, w[ 0], offset); + w[63] = hc_bytealign (w[30], w[31], offset); + w[62] = hc_bytealign (w[29], w[30], offset); + w[61] = hc_bytealign (w[28], w[29], offset); + w[60] = hc_bytealign (w[27], w[28], offset); + w[59] = hc_bytealign (w[26], w[27], offset); + w[58] = hc_bytealign (w[25], w[26], offset); + w[57] = hc_bytealign (w[24], w[25], offset); + w[56] = hc_bytealign (w[23], w[24], offset); + w[55] = hc_bytealign (w[22], w[23], offset); + w[54] = hc_bytealign (w[21], w[22], offset); + w[53] = hc_bytealign (w[20], w[21], offset); + w[52] = hc_bytealign (w[19], w[20], offset); + w[51] = hc_bytealign (w[18], w[19], offset); + w[50] = hc_bytealign (w[17], w[18], offset); + w[49] = hc_bytealign (w[16], w[17], offset); + w[48] = hc_bytealign (w[15], w[16], offset); + w[47] = hc_bytealign (w[14], w[15], offset); + w[46] = hc_bytealign (w[13], w[14], offset); + w[45] = hc_bytealign (w[12], w[13], offset); + w[44] = hc_bytealign (w[11], w[12], offset); + w[43] = hc_bytealign (w[10], w[11], offset); + w[42] = hc_bytealign (w[ 9], w[10], offset); + w[41] = hc_bytealign (w[ 8], w[ 9], offset); + w[40] = hc_bytealign (w[ 7], w[ 8], offset); + w[39] = hc_bytealign (w[ 6], w[ 7], offset); + w[38] = hc_bytealign (w[ 5], w[ 6], offset); + w[37] = hc_bytealign (w[ 4], w[ 5], offset); + w[36] = hc_bytealign (w[ 3], w[ 4], offset); + w[35] = hc_bytealign (w[ 2], w[ 3], offset); + w[34] = hc_bytealign (w[ 1], w[ 2], offset); + w[33] = hc_bytealign (w[ 0], w[ 1], offset); + w[32] = hc_bytealign ( 0, w[ 0], offset); w[31] = 0; w[30] = 0; w[29] = 0; @@ -22642,37 +22642,37 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; case 33: - w[63] = amd_bytealign (w[29], w[30], offset); - w[62] = amd_bytealign (w[28], w[29], offset); - w[61] = amd_bytealign (w[27], w[28], offset); - w[60] = amd_bytealign (w[26], w[27], offset); - w[59] = amd_bytealign (w[25], w[26], offset); - w[58] = amd_bytealign (w[24], w[25], offset); - w[57] = amd_bytealign (w[23], w[24], offset); - w[56] = amd_bytealign (w[22], w[23], offset); - w[55] = amd_bytealign (w[21], w[22], offset); - w[54] = amd_bytealign (w[20], w[21], offset); - w[53] = amd_bytealign (w[19], w[20], offset); - w[52] = amd_bytealign (w[18], w[19], offset); - w[51] = amd_bytealign (w[17], w[18], offset); - w[50] = amd_bytealign (w[16], w[17], offset); - w[49] = amd_bytealign (w[15], w[16], offset); - w[48] = amd_bytealign (w[14], w[15], offset); - w[47] = amd_bytealign (w[13], w[14], offset); - w[46] = amd_bytealign (w[12], w[13], offset); - w[45] = amd_bytealign (w[11], w[12], offset); - w[44] = amd_bytealign (w[10], w[11], offset); - w[43] = amd_bytealign (w[ 9], w[10], offset); - w[42] = amd_bytealign (w[ 8], w[ 9], offset); - w[41] = amd_bytealign (w[ 7], w[ 8], offset); - w[40] = amd_bytealign (w[ 6], w[ 7], offset); - w[39] = amd_bytealign (w[ 5], w[ 6], offset); - w[38] = amd_bytealign (w[ 4], w[ 5], offset); - w[37] = amd_bytealign (w[ 3], w[ 4], offset); - w[36] = amd_bytealign (w[ 2], w[ 3], offset); - w[35] = amd_bytealign (w[ 1], w[ 2], offset); - w[34] = amd_bytealign (w[ 0], w[ 1], offset); - w[33] = amd_bytealign ( 0, w[ 0], offset); + w[63] = hc_bytealign (w[29], w[30], offset); + w[62] = hc_bytealign (w[28], w[29], offset); + w[61] = hc_bytealign (w[27], w[28], offset); + w[60] = hc_bytealign (w[26], w[27], offset); + w[59] = hc_bytealign (w[25], w[26], offset); + w[58] = hc_bytealign (w[24], w[25], offset); + w[57] = hc_bytealign (w[23], w[24], offset); + w[56] = hc_bytealign (w[22], w[23], offset); + w[55] = hc_bytealign (w[21], w[22], offset); + w[54] = hc_bytealign (w[20], w[21], offset); + w[53] = hc_bytealign (w[19], w[20], offset); + w[52] = hc_bytealign (w[18], w[19], offset); + w[51] = hc_bytealign (w[17], w[18], offset); + w[50] = hc_bytealign (w[16], w[17], offset); + w[49] = hc_bytealign (w[15], w[16], offset); + w[48] = hc_bytealign (w[14], w[15], offset); + w[47] = hc_bytealign (w[13], w[14], offset); + w[46] = hc_bytealign (w[12], w[13], offset); + w[45] = hc_bytealign (w[11], w[12], offset); + w[44] = hc_bytealign (w[10], w[11], offset); + w[43] = hc_bytealign (w[ 9], w[10], offset); + w[42] = hc_bytealign (w[ 8], w[ 9], offset); + w[41] = hc_bytealign (w[ 7], w[ 8], offset); + w[40] = hc_bytealign (w[ 6], w[ 7], offset); + w[39] = hc_bytealign (w[ 5], w[ 6], offset); + w[38] = hc_bytealign (w[ 4], w[ 5], offset); + w[37] = hc_bytealign (w[ 3], w[ 4], offset); + w[36] = hc_bytealign (w[ 2], w[ 3], offset); + w[35] = hc_bytealign (w[ 1], w[ 2], offset); + w[34] = hc_bytealign (w[ 0], w[ 1], offset); + w[33] = hc_bytealign ( 0, w[ 0], offset); w[32] = 0; w[31] = 0; w[30] = 0; @@ -22710,36 +22710,36 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; case 34: - w[63] = amd_bytealign (w[28], w[29], offset); - w[62] = amd_bytealign (w[27], w[28], offset); - w[61] = amd_bytealign (w[26], w[27], offset); - w[60] = amd_bytealign (w[25], w[26], offset); - w[59] = amd_bytealign (w[24], w[25], offset); - w[58] = amd_bytealign (w[23], w[24], offset); - w[57] = amd_bytealign (w[22], w[23], offset); - w[56] = amd_bytealign (w[21], w[22], offset); - w[55] = amd_bytealign (w[20], w[21], offset); - w[54] = amd_bytealign (w[19], w[20], offset); - w[53] = amd_bytealign (w[18], w[19], offset); - w[52] = amd_bytealign (w[17], w[18], offset); - w[51] = amd_bytealign (w[16], w[17], offset); - w[50] = amd_bytealign (w[15], w[16], offset); - w[49] = amd_bytealign (w[14], w[15], offset); - w[48] = amd_bytealign (w[13], w[14], offset); - w[47] = amd_bytealign (w[12], w[13], offset); - w[46] = amd_bytealign (w[11], w[12], offset); - w[45] = amd_bytealign (w[10], w[11], offset); - w[44] = amd_bytealign (w[ 9], w[10], offset); - w[43] = amd_bytealign (w[ 8], w[ 9], offset); - w[42] = amd_bytealign (w[ 7], w[ 8], offset); - w[41] = amd_bytealign (w[ 6], w[ 7], offset); - w[40] = amd_bytealign (w[ 5], w[ 6], offset); - w[39] = amd_bytealign (w[ 4], w[ 5], offset); - w[38] = amd_bytealign (w[ 3], w[ 4], offset); - w[37] = amd_bytealign (w[ 2], w[ 3], offset); - w[36] = amd_bytealign (w[ 1], w[ 2], offset); - w[35] = amd_bytealign (w[ 0], w[ 1], offset); - w[34] = amd_bytealign ( 0, w[ 0], offset); + w[63] = hc_bytealign (w[28], w[29], offset); + w[62] = hc_bytealign (w[27], w[28], offset); + w[61] = hc_bytealign (w[26], w[27], offset); + w[60] = hc_bytealign (w[25], w[26], offset); + w[59] = hc_bytealign (w[24], w[25], offset); + w[58] = hc_bytealign (w[23], w[24], offset); + w[57] = hc_bytealign (w[22], w[23], offset); + w[56] = hc_bytealign (w[21], w[22], offset); + w[55] = hc_bytealign (w[20], w[21], offset); + w[54] = hc_bytealign (w[19], w[20], offset); + w[53] = hc_bytealign (w[18], w[19], offset); + w[52] = hc_bytealign (w[17], w[18], offset); + w[51] = hc_bytealign (w[16], w[17], offset); + w[50] = hc_bytealign (w[15], w[16], offset); + w[49] = hc_bytealign (w[14], w[15], offset); + w[48] = hc_bytealign (w[13], w[14], offset); + w[47] = hc_bytealign (w[12], w[13], offset); + w[46] = hc_bytealign (w[11], w[12], offset); + w[45] = hc_bytealign (w[10], w[11], offset); + w[44] = hc_bytealign (w[ 9], w[10], offset); + w[43] = hc_bytealign (w[ 8], w[ 9], offset); + w[42] = hc_bytealign (w[ 7], w[ 8], offset); + w[41] = hc_bytealign (w[ 6], w[ 7], offset); + w[40] = hc_bytealign (w[ 5], w[ 6], offset); + w[39] = hc_bytealign (w[ 4], w[ 5], offset); + w[38] = hc_bytealign (w[ 3], w[ 4], offset); + w[37] = hc_bytealign (w[ 2], w[ 3], offset); + w[36] = hc_bytealign (w[ 1], w[ 2], offset); + w[35] = hc_bytealign (w[ 0], w[ 1], offset); + w[34] = hc_bytealign ( 0, w[ 0], offset); w[33] = 0; w[32] = 0; w[31] = 0; @@ -22778,35 +22778,35 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; case 35: - w[63] = amd_bytealign (w[27], w[28], offset); - w[62] = amd_bytealign (w[26], w[27], offset); - w[61] = amd_bytealign (w[25], w[26], offset); - w[60] = amd_bytealign (w[24], w[25], offset); - w[59] = amd_bytealign (w[23], w[24], offset); - w[58] = amd_bytealign (w[22], w[23], offset); - w[57] = amd_bytealign (w[21], w[22], offset); - w[56] = amd_bytealign (w[20], w[21], offset); - w[55] = amd_bytealign (w[19], w[20], offset); - w[54] = amd_bytealign (w[18], w[19], offset); - w[53] = amd_bytealign (w[17], w[18], offset); - w[52] = amd_bytealign (w[16], w[17], offset); - w[51] = amd_bytealign (w[15], w[16], offset); - w[50] = amd_bytealign (w[14], w[15], offset); - w[49] = amd_bytealign (w[13], w[14], offset); - w[48] = amd_bytealign (w[12], w[13], offset); - w[47] = amd_bytealign (w[11], w[12], offset); - w[46] = amd_bytealign (w[10], w[11], offset); - w[45] = amd_bytealign (w[ 9], w[10], offset); - w[44] = amd_bytealign (w[ 8], w[ 9], offset); - w[43] = amd_bytealign (w[ 7], w[ 8], offset); - w[42] = amd_bytealign (w[ 6], w[ 7], offset); - w[41] = amd_bytealign (w[ 5], w[ 6], offset); - w[40] = amd_bytealign (w[ 4], w[ 5], offset); - w[39] = amd_bytealign (w[ 3], w[ 4], offset); - w[38] = amd_bytealign (w[ 2], w[ 3], offset); - w[37] = amd_bytealign (w[ 1], w[ 2], offset); - w[36] = amd_bytealign (w[ 0], w[ 1], offset); - w[35] = amd_bytealign ( 0, w[ 0], offset); + w[63] = hc_bytealign (w[27], w[28], offset); + w[62] = hc_bytealign (w[26], w[27], offset); + w[61] = hc_bytealign (w[25], w[26], offset); + w[60] = hc_bytealign (w[24], w[25], offset); + w[59] = hc_bytealign (w[23], w[24], offset); + w[58] = hc_bytealign (w[22], w[23], offset); + w[57] = hc_bytealign (w[21], w[22], offset); + w[56] = hc_bytealign (w[20], w[21], offset); + w[55] = hc_bytealign (w[19], w[20], offset); + w[54] = hc_bytealign (w[18], w[19], offset); + w[53] = hc_bytealign (w[17], w[18], offset); + w[52] = hc_bytealign (w[16], w[17], offset); + w[51] = hc_bytealign (w[15], w[16], offset); + w[50] = hc_bytealign (w[14], w[15], offset); + w[49] = hc_bytealign (w[13], w[14], offset); + w[48] = hc_bytealign (w[12], w[13], offset); + w[47] = hc_bytealign (w[11], w[12], offset); + w[46] = hc_bytealign (w[10], w[11], offset); + w[45] = hc_bytealign (w[ 9], w[10], offset); + w[44] = hc_bytealign (w[ 8], w[ 9], offset); + w[43] = hc_bytealign (w[ 7], w[ 8], offset); + w[42] = hc_bytealign (w[ 6], w[ 7], offset); + w[41] = hc_bytealign (w[ 5], w[ 6], offset); + w[40] = hc_bytealign (w[ 4], w[ 5], offset); + w[39] = hc_bytealign (w[ 3], w[ 4], offset); + w[38] = hc_bytealign (w[ 2], w[ 3], offset); + w[37] = hc_bytealign (w[ 1], w[ 2], offset); + w[36] = hc_bytealign (w[ 0], w[ 1], offset); + w[35] = hc_bytealign ( 0, w[ 0], offset); w[34] = 0; w[33] = 0; w[32] = 0; @@ -22846,34 +22846,34 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; case 36: - w[63] = amd_bytealign (w[26], w[27], offset); - w[62] = amd_bytealign (w[25], w[26], offset); - w[61] = amd_bytealign (w[24], w[25], offset); - w[60] = amd_bytealign (w[23], w[24], offset); - w[59] = amd_bytealign (w[22], w[23], offset); - w[58] = amd_bytealign (w[21], w[22], offset); - w[57] = amd_bytealign (w[20], w[21], offset); - w[56] = amd_bytealign (w[19], w[20], offset); - w[55] = amd_bytealign (w[18], w[19], offset); - w[54] = amd_bytealign (w[17], w[18], offset); - w[53] = amd_bytealign (w[16], w[17], offset); - w[52] = amd_bytealign (w[15], w[16], offset); - w[51] = amd_bytealign (w[14], w[15], offset); - w[50] = amd_bytealign (w[13], w[14], offset); - w[49] = amd_bytealign (w[12], w[13], offset); - w[48] = amd_bytealign (w[11], w[12], offset); - w[47] = amd_bytealign (w[10], w[11], offset); - w[46] = amd_bytealign (w[ 9], w[10], offset); - w[45] = amd_bytealign (w[ 8], w[ 9], offset); - w[44] = amd_bytealign (w[ 7], w[ 8], offset); - w[43] = amd_bytealign (w[ 6], w[ 7], offset); - w[42] = amd_bytealign (w[ 5], w[ 6], offset); - w[41] = amd_bytealign (w[ 4], w[ 5], offset); - w[40] = amd_bytealign (w[ 3], w[ 4], offset); - w[39] = amd_bytealign (w[ 2], w[ 3], offset); - w[38] = amd_bytealign (w[ 1], w[ 2], offset); - w[37] = amd_bytealign (w[ 0], w[ 1], offset); - w[36] = amd_bytealign ( 0, w[ 0], offset); + w[63] = hc_bytealign (w[26], w[27], offset); + w[62] = hc_bytealign (w[25], w[26], offset); + w[61] = hc_bytealign (w[24], w[25], offset); + w[60] = hc_bytealign (w[23], w[24], offset); + w[59] = hc_bytealign (w[22], w[23], offset); + w[58] = hc_bytealign (w[21], w[22], offset); + w[57] = hc_bytealign (w[20], w[21], offset); + w[56] = hc_bytealign (w[19], w[20], offset); + w[55] = hc_bytealign (w[18], w[19], offset); + w[54] = hc_bytealign (w[17], w[18], offset); + w[53] = hc_bytealign (w[16], w[17], offset); + w[52] = hc_bytealign (w[15], w[16], offset); + w[51] = hc_bytealign (w[14], w[15], offset); + w[50] = hc_bytealign (w[13], w[14], offset); + w[49] = hc_bytealign (w[12], w[13], offset); + w[48] = hc_bytealign (w[11], w[12], offset); + w[47] = hc_bytealign (w[10], w[11], offset); + w[46] = hc_bytealign (w[ 9], w[10], offset); + w[45] = hc_bytealign (w[ 8], w[ 9], offset); + w[44] = hc_bytealign (w[ 7], w[ 8], offset); + w[43] = hc_bytealign (w[ 6], w[ 7], offset); + w[42] = hc_bytealign (w[ 5], w[ 6], offset); + w[41] = hc_bytealign (w[ 4], w[ 5], offset); + w[40] = hc_bytealign (w[ 3], w[ 4], offset); + w[39] = hc_bytealign (w[ 2], w[ 3], offset); + w[38] = hc_bytealign (w[ 1], w[ 2], offset); + w[37] = hc_bytealign (w[ 0], w[ 1], offset); + w[36] = hc_bytealign ( 0, w[ 0], offset); w[35] = 0; w[34] = 0; w[33] = 0; @@ -22914,33 +22914,33 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; case 37: - w[63] = amd_bytealign (w[25], w[26], offset); - w[62] = amd_bytealign (w[24], w[25], offset); - w[61] = amd_bytealign (w[23], w[24], offset); - w[60] = amd_bytealign (w[22], w[23], offset); - w[59] = amd_bytealign (w[21], w[22], offset); - w[58] = amd_bytealign (w[20], w[21], offset); - w[57] = amd_bytealign (w[19], w[20], offset); - w[56] = amd_bytealign (w[18], w[19], offset); - w[55] = amd_bytealign (w[17], w[18], offset); - w[54] = amd_bytealign (w[16], w[17], offset); - w[53] = amd_bytealign (w[15], w[16], offset); - w[52] = amd_bytealign (w[14], w[15], offset); - w[51] = amd_bytealign (w[13], w[14], offset); - w[50] = amd_bytealign (w[12], w[13], offset); - w[49] = amd_bytealign (w[11], w[12], offset); - w[48] = amd_bytealign (w[10], w[11], offset); - w[47] = amd_bytealign (w[ 9], w[10], offset); - w[46] = amd_bytealign (w[ 8], w[ 9], offset); - w[45] = amd_bytealign (w[ 7], w[ 8], offset); - w[44] = amd_bytealign (w[ 6], w[ 7], offset); - w[43] = amd_bytealign (w[ 5], w[ 6], offset); - w[42] = amd_bytealign (w[ 4], w[ 5], offset); - w[41] = amd_bytealign (w[ 3], w[ 4], offset); - w[40] = amd_bytealign (w[ 2], w[ 3], offset); - w[39] = amd_bytealign (w[ 1], w[ 2], offset); - w[38] = amd_bytealign (w[ 0], w[ 1], offset); - w[37] = amd_bytealign ( 0, w[ 0], offset); + w[63] = hc_bytealign (w[25], w[26], offset); + w[62] = hc_bytealign (w[24], w[25], offset); + w[61] = hc_bytealign (w[23], w[24], offset); + w[60] = hc_bytealign (w[22], w[23], offset); + w[59] = hc_bytealign (w[21], w[22], offset); + w[58] = hc_bytealign (w[20], w[21], offset); + w[57] = hc_bytealign (w[19], w[20], offset); + w[56] = hc_bytealign (w[18], w[19], offset); + w[55] = hc_bytealign (w[17], w[18], offset); + w[54] = hc_bytealign (w[16], w[17], offset); + w[53] = hc_bytealign (w[15], w[16], offset); + w[52] = hc_bytealign (w[14], w[15], offset); + w[51] = hc_bytealign (w[13], w[14], offset); + w[50] = hc_bytealign (w[12], w[13], offset); + w[49] = hc_bytealign (w[11], w[12], offset); + w[48] = hc_bytealign (w[10], w[11], offset); + w[47] = hc_bytealign (w[ 9], w[10], offset); + w[46] = hc_bytealign (w[ 8], w[ 9], offset); + w[45] = hc_bytealign (w[ 7], w[ 8], offset); + w[44] = hc_bytealign (w[ 6], w[ 7], offset); + w[43] = hc_bytealign (w[ 5], w[ 6], offset); + w[42] = hc_bytealign (w[ 4], w[ 5], offset); + w[41] = hc_bytealign (w[ 3], w[ 4], offset); + w[40] = hc_bytealign (w[ 2], w[ 3], offset); + w[39] = hc_bytealign (w[ 1], w[ 2], offset); + w[38] = hc_bytealign (w[ 0], w[ 1], offset); + w[37] = hc_bytealign ( 0, w[ 0], offset); w[36] = 0; w[35] = 0; w[34] = 0; @@ -22982,32 +22982,32 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; case 38: - w[63] = amd_bytealign (w[24], w[25], offset); - w[62] = amd_bytealign (w[23], w[24], offset); - w[61] = amd_bytealign (w[22], w[23], offset); - w[60] = amd_bytealign (w[21], w[22], offset); - w[59] = amd_bytealign (w[20], w[21], offset); - w[58] = amd_bytealign (w[19], w[20], offset); - w[57] = amd_bytealign (w[18], w[19], offset); - w[56] = amd_bytealign (w[17], w[18], offset); - w[55] = amd_bytealign (w[16], w[17], offset); - w[54] = amd_bytealign (w[15], w[16], offset); - w[53] = amd_bytealign (w[14], w[15], offset); - w[52] = amd_bytealign (w[13], w[14], offset); - w[51] = amd_bytealign (w[12], w[13], offset); - w[50] = amd_bytealign (w[11], w[12], offset); - w[49] = amd_bytealign (w[10], w[11], offset); - w[48] = amd_bytealign (w[ 9], w[10], offset); - w[47] = amd_bytealign (w[ 8], w[ 9], offset); - w[46] = amd_bytealign (w[ 7], w[ 8], offset); - w[45] = amd_bytealign (w[ 6], w[ 7], offset); - w[44] = amd_bytealign (w[ 5], w[ 6], offset); - w[43] = amd_bytealign (w[ 4], w[ 5], offset); - w[42] = amd_bytealign (w[ 3], w[ 4], offset); - w[41] = amd_bytealign (w[ 2], w[ 3], offset); - w[40] = amd_bytealign (w[ 1], w[ 2], offset); - w[39] = amd_bytealign (w[ 0], w[ 1], offset); - w[38] = amd_bytealign ( 0, w[ 0], offset); + w[63] = hc_bytealign (w[24], w[25], offset); + w[62] = hc_bytealign (w[23], w[24], offset); + w[61] = hc_bytealign (w[22], w[23], offset); + w[60] = hc_bytealign (w[21], w[22], offset); + w[59] = hc_bytealign (w[20], w[21], offset); + w[58] = hc_bytealign (w[19], w[20], offset); + w[57] = hc_bytealign (w[18], w[19], offset); + w[56] = hc_bytealign (w[17], w[18], offset); + w[55] = hc_bytealign (w[16], w[17], offset); + w[54] = hc_bytealign (w[15], w[16], offset); + w[53] = hc_bytealign (w[14], w[15], offset); + w[52] = hc_bytealign (w[13], w[14], offset); + w[51] = hc_bytealign (w[12], w[13], offset); + w[50] = hc_bytealign (w[11], w[12], offset); + w[49] = hc_bytealign (w[10], w[11], offset); + w[48] = hc_bytealign (w[ 9], w[10], offset); + w[47] = hc_bytealign (w[ 8], w[ 9], offset); + w[46] = hc_bytealign (w[ 7], w[ 8], offset); + w[45] = hc_bytealign (w[ 6], w[ 7], offset); + w[44] = hc_bytealign (w[ 5], w[ 6], offset); + w[43] = hc_bytealign (w[ 4], w[ 5], offset); + w[42] = hc_bytealign (w[ 3], w[ 4], offset); + w[41] = hc_bytealign (w[ 2], w[ 3], offset); + w[40] = hc_bytealign (w[ 1], w[ 2], offset); + w[39] = hc_bytealign (w[ 0], w[ 1], offset); + w[38] = hc_bytealign ( 0, w[ 0], offset); w[37] = 0; w[36] = 0; w[35] = 0; @@ -23050,31 +23050,31 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; case 39: - w[63] = amd_bytealign (w[23], w[24], offset); - w[62] = amd_bytealign (w[22], w[23], offset); - w[61] = amd_bytealign (w[21], w[22], offset); - w[60] = amd_bytealign (w[20], w[21], offset); - w[59] = amd_bytealign (w[19], w[20], offset); - w[58] = amd_bytealign (w[18], w[19], offset); - w[57] = amd_bytealign (w[17], w[18], offset); - w[56] = amd_bytealign (w[16], w[17], offset); - w[55] = amd_bytealign (w[15], w[16], offset); - w[54] = amd_bytealign (w[14], w[15], offset); - w[53] = amd_bytealign (w[13], w[14], offset); - w[52] = amd_bytealign (w[12], w[13], offset); - w[51] = amd_bytealign (w[11], w[12], offset); - w[50] = amd_bytealign (w[10], w[11], offset); - w[49] = amd_bytealign (w[ 9], w[10], offset); - w[48] = amd_bytealign (w[ 8], w[ 9], offset); - w[47] = amd_bytealign (w[ 7], w[ 8], offset); - w[46] = amd_bytealign (w[ 6], w[ 7], offset); - w[45] = amd_bytealign (w[ 5], w[ 6], offset); - w[44] = amd_bytealign (w[ 4], w[ 5], offset); - w[43] = amd_bytealign (w[ 3], w[ 4], offset); - w[42] = amd_bytealign (w[ 2], w[ 3], offset); - w[41] = amd_bytealign (w[ 1], w[ 2], offset); - w[40] = amd_bytealign (w[ 0], w[ 1], offset); - w[39] = amd_bytealign ( 0, w[ 0], offset); + w[63] = hc_bytealign (w[23], w[24], offset); + w[62] = hc_bytealign (w[22], w[23], offset); + w[61] = hc_bytealign (w[21], w[22], offset); + w[60] = hc_bytealign (w[20], w[21], offset); + w[59] = hc_bytealign (w[19], w[20], offset); + w[58] = hc_bytealign (w[18], w[19], offset); + w[57] = hc_bytealign (w[17], w[18], offset); + w[56] = hc_bytealign (w[16], w[17], offset); + w[55] = hc_bytealign (w[15], w[16], offset); + w[54] = hc_bytealign (w[14], w[15], offset); + w[53] = hc_bytealign (w[13], w[14], offset); + w[52] = hc_bytealign (w[12], w[13], offset); + w[51] = hc_bytealign (w[11], w[12], offset); + w[50] = hc_bytealign (w[10], w[11], offset); + w[49] = hc_bytealign (w[ 9], w[10], offset); + w[48] = hc_bytealign (w[ 8], w[ 9], offset); + w[47] = hc_bytealign (w[ 7], w[ 8], offset); + w[46] = hc_bytealign (w[ 6], w[ 7], offset); + w[45] = hc_bytealign (w[ 5], w[ 6], offset); + w[44] = hc_bytealign (w[ 4], w[ 5], offset); + w[43] = hc_bytealign (w[ 3], w[ 4], offset); + w[42] = hc_bytealign (w[ 2], w[ 3], offset); + w[41] = hc_bytealign (w[ 1], w[ 2], offset); + w[40] = hc_bytealign (w[ 0], w[ 1], offset); + w[39] = hc_bytealign ( 0, w[ 0], offset); w[38] = 0; w[37] = 0; w[36] = 0; @@ -23118,30 +23118,30 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; case 40: - w[63] = amd_bytealign (w[22], w[23], offset); - w[62] = amd_bytealign (w[21], w[22], offset); - w[61] = amd_bytealign (w[20], w[21], offset); - w[60] = amd_bytealign (w[19], w[20], offset); - w[59] = amd_bytealign (w[18], w[19], offset); - w[58] = amd_bytealign (w[17], w[18], offset); - w[57] = amd_bytealign (w[16], w[17], offset); - w[56] = amd_bytealign (w[15], w[16], offset); - w[55] = amd_bytealign (w[14], w[15], offset); - w[54] = amd_bytealign (w[13], w[14], offset); - w[53] = amd_bytealign (w[12], w[13], offset); - w[52] = amd_bytealign (w[11], w[12], offset); - w[51] = amd_bytealign (w[10], w[11], offset); - w[50] = amd_bytealign (w[ 9], w[10], offset); - w[49] = amd_bytealign (w[ 8], w[ 9], offset); - w[48] = amd_bytealign (w[ 7], w[ 8], offset); - w[47] = amd_bytealign (w[ 6], w[ 7], offset); - w[46] = amd_bytealign (w[ 5], w[ 6], offset); - w[45] = amd_bytealign (w[ 4], w[ 5], offset); - w[44] = amd_bytealign (w[ 3], w[ 4], offset); - w[43] = amd_bytealign (w[ 2], w[ 3], offset); - w[42] = amd_bytealign (w[ 1], w[ 2], offset); - w[41] = amd_bytealign (w[ 0], w[ 1], offset); - w[40] = amd_bytealign ( 0, w[ 0], offset); + w[63] = hc_bytealign (w[22], w[23], offset); + w[62] = hc_bytealign (w[21], w[22], offset); + w[61] = hc_bytealign (w[20], w[21], offset); + w[60] = hc_bytealign (w[19], w[20], offset); + w[59] = hc_bytealign (w[18], w[19], offset); + w[58] = hc_bytealign (w[17], w[18], offset); + w[57] = hc_bytealign (w[16], w[17], offset); + w[56] = hc_bytealign (w[15], w[16], offset); + w[55] = hc_bytealign (w[14], w[15], offset); + w[54] = hc_bytealign (w[13], w[14], offset); + w[53] = hc_bytealign (w[12], w[13], offset); + w[52] = hc_bytealign (w[11], w[12], offset); + w[51] = hc_bytealign (w[10], w[11], offset); + w[50] = hc_bytealign (w[ 9], w[10], offset); + w[49] = hc_bytealign (w[ 8], w[ 9], offset); + w[48] = hc_bytealign (w[ 7], w[ 8], offset); + w[47] = hc_bytealign (w[ 6], w[ 7], offset); + w[46] = hc_bytealign (w[ 5], w[ 6], offset); + w[45] = hc_bytealign (w[ 4], w[ 5], offset); + w[44] = hc_bytealign (w[ 3], w[ 4], offset); + w[43] = hc_bytealign (w[ 2], w[ 3], offset); + w[42] = hc_bytealign (w[ 1], w[ 2], offset); + w[41] = hc_bytealign (w[ 0], w[ 1], offset); + w[40] = hc_bytealign ( 0, w[ 0], offset); w[39] = 0; w[38] = 0; w[37] = 0; @@ -23186,29 +23186,29 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; case 41: - w[63] = amd_bytealign (w[21], w[22], offset); - w[62] = amd_bytealign (w[20], w[21], offset); - w[61] = amd_bytealign (w[19], w[20], offset); - w[60] = amd_bytealign (w[18], w[19], offset); - w[59] = amd_bytealign (w[17], w[18], offset); - w[58] = amd_bytealign (w[16], w[17], offset); - w[57] = amd_bytealign (w[15], w[16], offset); - w[56] = amd_bytealign (w[14], w[15], offset); - w[55] = amd_bytealign (w[13], w[14], offset); - w[54] = amd_bytealign (w[12], w[13], offset); - w[53] = amd_bytealign (w[11], w[12], offset); - w[52] = amd_bytealign (w[10], w[11], offset); - w[51] = amd_bytealign (w[ 9], w[10], offset); - w[50] = amd_bytealign (w[ 8], w[ 9], offset); - w[49] = amd_bytealign (w[ 7], w[ 8], offset); - w[48] = amd_bytealign (w[ 6], w[ 7], offset); - w[47] = amd_bytealign (w[ 5], w[ 6], offset); - w[46] = amd_bytealign (w[ 4], w[ 5], offset); - w[45] = amd_bytealign (w[ 3], w[ 4], offset); - w[44] = amd_bytealign (w[ 2], w[ 3], offset); - w[43] = amd_bytealign (w[ 1], w[ 2], offset); - w[42] = amd_bytealign (w[ 0], w[ 1], offset); - w[41] = amd_bytealign ( 0, w[ 0], offset); + w[63] = hc_bytealign (w[21], w[22], offset); + w[62] = hc_bytealign (w[20], w[21], offset); + w[61] = hc_bytealign (w[19], w[20], offset); + w[60] = hc_bytealign (w[18], w[19], offset); + w[59] = hc_bytealign (w[17], w[18], offset); + w[58] = hc_bytealign (w[16], w[17], offset); + w[57] = hc_bytealign (w[15], w[16], offset); + w[56] = hc_bytealign (w[14], w[15], offset); + w[55] = hc_bytealign (w[13], w[14], offset); + w[54] = hc_bytealign (w[12], w[13], offset); + w[53] = hc_bytealign (w[11], w[12], offset); + w[52] = hc_bytealign (w[10], w[11], offset); + w[51] = hc_bytealign (w[ 9], w[10], offset); + w[50] = hc_bytealign (w[ 8], w[ 9], offset); + w[49] = hc_bytealign (w[ 7], w[ 8], offset); + w[48] = hc_bytealign (w[ 6], w[ 7], offset); + w[47] = hc_bytealign (w[ 5], w[ 6], offset); + w[46] = hc_bytealign (w[ 4], w[ 5], offset); + w[45] = hc_bytealign (w[ 3], w[ 4], offset); + w[44] = hc_bytealign (w[ 2], w[ 3], offset); + w[43] = hc_bytealign (w[ 1], w[ 2], offset); + w[42] = hc_bytealign (w[ 0], w[ 1], offset); + w[41] = hc_bytealign ( 0, w[ 0], offset); w[40] = 0; w[39] = 0; w[38] = 0; @@ -23254,28 +23254,28 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; case 42: - w[63] = amd_bytealign (w[20], w[21], offset); - w[62] = amd_bytealign (w[19], w[20], offset); - w[61] = amd_bytealign (w[18], w[19], offset); - w[60] = amd_bytealign (w[17], w[18], offset); - w[59] = amd_bytealign (w[16], w[17], offset); - w[58] = amd_bytealign (w[15], w[16], offset); - w[57] = amd_bytealign (w[14], w[15], offset); - w[56] = amd_bytealign (w[13], w[14], offset); - w[55] = amd_bytealign (w[12], w[13], offset); - w[54] = amd_bytealign (w[11], w[12], offset); - w[53] = amd_bytealign (w[10], w[11], offset); - w[52] = amd_bytealign (w[ 9], w[10], offset); - w[51] = amd_bytealign (w[ 8], w[ 9], offset); - w[50] = amd_bytealign (w[ 7], w[ 8], offset); - w[49] = amd_bytealign (w[ 6], w[ 7], offset); - w[48] = amd_bytealign (w[ 5], w[ 6], offset); - w[47] = amd_bytealign (w[ 4], w[ 5], offset); - w[46] = amd_bytealign (w[ 3], w[ 4], offset); - w[45] = amd_bytealign (w[ 2], w[ 3], offset); - w[44] = amd_bytealign (w[ 1], w[ 2], offset); - w[43] = amd_bytealign (w[ 0], w[ 1], offset); - w[42] = amd_bytealign ( 0, w[ 0], offset); + w[63] = hc_bytealign (w[20], w[21], offset); + w[62] = hc_bytealign (w[19], w[20], offset); + w[61] = hc_bytealign (w[18], w[19], offset); + w[60] = hc_bytealign (w[17], w[18], offset); + w[59] = hc_bytealign (w[16], w[17], offset); + w[58] = hc_bytealign (w[15], w[16], offset); + w[57] = hc_bytealign (w[14], w[15], offset); + w[56] = hc_bytealign (w[13], w[14], offset); + w[55] = hc_bytealign (w[12], w[13], offset); + w[54] = hc_bytealign (w[11], w[12], offset); + w[53] = hc_bytealign (w[10], w[11], offset); + w[52] = hc_bytealign (w[ 9], w[10], offset); + w[51] = hc_bytealign (w[ 8], w[ 9], offset); + w[50] = hc_bytealign (w[ 7], w[ 8], offset); + w[49] = hc_bytealign (w[ 6], w[ 7], offset); + w[48] = hc_bytealign (w[ 5], w[ 6], offset); + w[47] = hc_bytealign (w[ 4], w[ 5], offset); + w[46] = hc_bytealign (w[ 3], w[ 4], offset); + w[45] = hc_bytealign (w[ 2], w[ 3], offset); + w[44] = hc_bytealign (w[ 1], w[ 2], offset); + w[43] = hc_bytealign (w[ 0], w[ 1], offset); + w[42] = hc_bytealign ( 0, w[ 0], offset); w[41] = 0; w[40] = 0; w[39] = 0; @@ -23322,27 +23322,27 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; case 43: - w[63] = amd_bytealign (w[19], w[20], offset); - w[62] = amd_bytealign (w[18], w[19], offset); - w[61] = amd_bytealign (w[17], w[18], offset); - w[60] = amd_bytealign (w[16], w[17], offset); - w[59] = amd_bytealign (w[15], w[16], offset); - w[58] = amd_bytealign (w[14], w[15], offset); - w[57] = amd_bytealign (w[13], w[14], offset); - w[56] = amd_bytealign (w[12], w[13], offset); - w[55] = amd_bytealign (w[11], w[12], offset); - w[54] = amd_bytealign (w[10], w[11], offset); - w[53] = amd_bytealign (w[ 9], w[10], offset); - w[52] = amd_bytealign (w[ 8], w[ 9], offset); - w[51] = amd_bytealign (w[ 7], w[ 8], offset); - w[50] = amd_bytealign (w[ 6], w[ 7], offset); - w[49] = amd_bytealign (w[ 5], w[ 6], offset); - w[48] = amd_bytealign (w[ 4], w[ 5], offset); - w[47] = amd_bytealign (w[ 3], w[ 4], offset); - w[46] = amd_bytealign (w[ 2], w[ 3], offset); - w[45] = amd_bytealign (w[ 1], w[ 2], offset); - w[44] = amd_bytealign (w[ 0], w[ 1], offset); - w[43] = amd_bytealign ( 0, w[ 0], offset); + w[63] = hc_bytealign (w[19], w[20], offset); + w[62] = hc_bytealign (w[18], w[19], offset); + w[61] = hc_bytealign (w[17], w[18], offset); + w[60] = hc_bytealign (w[16], w[17], offset); + w[59] = hc_bytealign (w[15], w[16], offset); + w[58] = hc_bytealign (w[14], w[15], offset); + w[57] = hc_bytealign (w[13], w[14], offset); + w[56] = hc_bytealign (w[12], w[13], offset); + w[55] = hc_bytealign (w[11], w[12], offset); + w[54] = hc_bytealign (w[10], w[11], offset); + w[53] = hc_bytealign (w[ 9], w[10], offset); + w[52] = hc_bytealign (w[ 8], w[ 9], offset); + w[51] = hc_bytealign (w[ 7], w[ 8], offset); + w[50] = hc_bytealign (w[ 6], w[ 7], offset); + w[49] = hc_bytealign (w[ 5], w[ 6], offset); + w[48] = hc_bytealign (w[ 4], w[ 5], offset); + w[47] = hc_bytealign (w[ 3], w[ 4], offset); + w[46] = hc_bytealign (w[ 2], w[ 3], offset); + w[45] = hc_bytealign (w[ 1], w[ 2], offset); + w[44] = hc_bytealign (w[ 0], w[ 1], offset); + w[43] = hc_bytealign ( 0, w[ 0], offset); w[42] = 0; w[41] = 0; w[40] = 0; @@ -23390,26 +23390,26 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; case 44: - w[63] = amd_bytealign (w[18], w[19], offset); - w[62] = amd_bytealign (w[17], w[18], offset); - w[61] = amd_bytealign (w[16], w[17], offset); - w[60] = amd_bytealign (w[15], w[16], offset); - w[59] = amd_bytealign (w[14], w[15], offset); - w[58] = amd_bytealign (w[13], w[14], offset); - w[57] = amd_bytealign (w[12], w[13], offset); - w[56] = amd_bytealign (w[11], w[12], offset); - w[55] = amd_bytealign (w[10], w[11], offset); - w[54] = amd_bytealign (w[ 9], w[10], offset); - w[53] = amd_bytealign (w[ 8], w[ 9], offset); - w[52] = amd_bytealign (w[ 7], w[ 8], offset); - w[51] = amd_bytealign (w[ 6], w[ 7], offset); - w[50] = amd_bytealign (w[ 5], w[ 6], offset); - w[49] = amd_bytealign (w[ 4], w[ 5], offset); - w[48] = amd_bytealign (w[ 3], w[ 4], offset); - w[47] = amd_bytealign (w[ 2], w[ 3], offset); - w[46] = amd_bytealign (w[ 1], w[ 2], offset); - w[45] = amd_bytealign (w[ 0], w[ 1], offset); - w[44] = amd_bytealign ( 0, w[ 0], offset); + w[63] = hc_bytealign (w[18], w[19], offset); + w[62] = hc_bytealign (w[17], w[18], offset); + w[61] = hc_bytealign (w[16], w[17], offset); + w[60] = hc_bytealign (w[15], w[16], offset); + w[59] = hc_bytealign (w[14], w[15], offset); + w[58] = hc_bytealign (w[13], w[14], offset); + w[57] = hc_bytealign (w[12], w[13], offset); + w[56] = hc_bytealign (w[11], w[12], offset); + w[55] = hc_bytealign (w[10], w[11], offset); + w[54] = hc_bytealign (w[ 9], w[10], offset); + w[53] = hc_bytealign (w[ 8], w[ 9], offset); + w[52] = hc_bytealign (w[ 7], w[ 8], offset); + w[51] = hc_bytealign (w[ 6], w[ 7], offset); + w[50] = hc_bytealign (w[ 5], w[ 6], offset); + w[49] = hc_bytealign (w[ 4], w[ 5], offset); + w[48] = hc_bytealign (w[ 3], w[ 4], offset); + w[47] = hc_bytealign (w[ 2], w[ 3], offset); + w[46] = hc_bytealign (w[ 1], w[ 2], offset); + w[45] = hc_bytealign (w[ 0], w[ 1], offset); + w[44] = hc_bytealign ( 0, w[ 0], offset); w[43] = 0; w[42] = 0; w[41] = 0; @@ -23458,25 +23458,25 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; case 45: - w[63] = amd_bytealign (w[17], w[18], offset); - w[62] = amd_bytealign (w[16], w[17], offset); - w[61] = amd_bytealign (w[15], w[16], offset); - w[60] = amd_bytealign (w[14], w[15], offset); - w[59] = amd_bytealign (w[13], w[14], offset); - w[58] = amd_bytealign (w[12], w[13], offset); - w[57] = amd_bytealign (w[11], w[12], offset); - w[56] = amd_bytealign (w[10], w[11], offset); - w[55] = amd_bytealign (w[ 9], w[10], offset); - w[54] = amd_bytealign (w[ 8], w[ 9], offset); - w[53] = amd_bytealign (w[ 7], w[ 8], offset); - w[52] = amd_bytealign (w[ 6], w[ 7], offset); - w[51] = amd_bytealign (w[ 5], w[ 6], offset); - w[50] = amd_bytealign (w[ 4], w[ 5], offset); - w[49] = amd_bytealign (w[ 3], w[ 4], offset); - w[48] = amd_bytealign (w[ 2], w[ 3], offset); - w[47] = amd_bytealign (w[ 1], w[ 2], offset); - w[46] = amd_bytealign (w[ 0], w[ 1], offset); - w[45] = amd_bytealign ( 0, w[ 0], offset); + w[63] = hc_bytealign (w[17], w[18], offset); + w[62] = hc_bytealign (w[16], w[17], offset); + w[61] = hc_bytealign (w[15], w[16], offset); + w[60] = hc_bytealign (w[14], w[15], offset); + w[59] = hc_bytealign (w[13], w[14], offset); + w[58] = hc_bytealign (w[12], w[13], offset); + w[57] = hc_bytealign (w[11], w[12], offset); + w[56] = hc_bytealign (w[10], w[11], offset); + w[55] = hc_bytealign (w[ 9], w[10], offset); + w[54] = hc_bytealign (w[ 8], w[ 9], offset); + w[53] = hc_bytealign (w[ 7], w[ 8], offset); + w[52] = hc_bytealign (w[ 6], w[ 7], offset); + w[51] = hc_bytealign (w[ 5], w[ 6], offset); + w[50] = hc_bytealign (w[ 4], w[ 5], offset); + w[49] = hc_bytealign (w[ 3], w[ 4], offset); + w[48] = hc_bytealign (w[ 2], w[ 3], offset); + w[47] = hc_bytealign (w[ 1], w[ 2], offset); + w[46] = hc_bytealign (w[ 0], w[ 1], offset); + w[45] = hc_bytealign ( 0, w[ 0], offset); w[44] = 0; w[43] = 0; w[42] = 0; @@ -23526,24 +23526,24 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; case 46: - w[63] = amd_bytealign (w[16], w[17], offset); - w[62] = amd_bytealign (w[15], w[16], offset); - w[61] = amd_bytealign (w[14], w[15], offset); - w[60] = amd_bytealign (w[13], w[14], offset); - w[59] = amd_bytealign (w[12], w[13], offset); - w[58] = amd_bytealign (w[11], w[12], offset); - w[57] = amd_bytealign (w[10], w[11], offset); - w[56] = amd_bytealign (w[ 9], w[10], offset); - w[55] = amd_bytealign (w[ 8], w[ 9], offset); - w[54] = amd_bytealign (w[ 7], w[ 8], offset); - w[53] = amd_bytealign (w[ 6], w[ 7], offset); - w[52] = amd_bytealign (w[ 5], w[ 6], offset); - w[51] = amd_bytealign (w[ 4], w[ 5], offset); - w[50] = amd_bytealign (w[ 3], w[ 4], offset); - w[49] = amd_bytealign (w[ 2], w[ 3], offset); - w[48] = amd_bytealign (w[ 1], w[ 2], offset); - w[47] = amd_bytealign (w[ 0], w[ 1], offset); - w[46] = amd_bytealign ( 0, w[ 0], offset); + w[63] = hc_bytealign (w[16], w[17], offset); + w[62] = hc_bytealign (w[15], w[16], offset); + w[61] = hc_bytealign (w[14], w[15], offset); + w[60] = hc_bytealign (w[13], w[14], offset); + w[59] = hc_bytealign (w[12], w[13], offset); + w[58] = hc_bytealign (w[11], w[12], offset); + w[57] = hc_bytealign (w[10], w[11], offset); + w[56] = hc_bytealign (w[ 9], w[10], offset); + w[55] = hc_bytealign (w[ 8], w[ 9], offset); + w[54] = hc_bytealign (w[ 7], w[ 8], offset); + w[53] = hc_bytealign (w[ 6], w[ 7], offset); + w[52] = hc_bytealign (w[ 5], w[ 6], offset); + w[51] = hc_bytealign (w[ 4], w[ 5], offset); + w[50] = hc_bytealign (w[ 3], w[ 4], offset); + w[49] = hc_bytealign (w[ 2], w[ 3], offset); + w[48] = hc_bytealign (w[ 1], w[ 2], offset); + w[47] = hc_bytealign (w[ 0], w[ 1], offset); + w[46] = hc_bytealign ( 0, w[ 0], offset); w[45] = 0; w[44] = 0; w[43] = 0; @@ -23594,23 +23594,23 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; case 47: - w[63] = amd_bytealign (w[15], w[16], offset); - w[62] = amd_bytealign (w[14], w[15], offset); - w[61] = amd_bytealign (w[13], w[14], offset); - w[60] = amd_bytealign (w[12], w[13], offset); - w[59] = amd_bytealign (w[11], w[12], offset); - w[58] = amd_bytealign (w[10], w[11], offset); - w[57] = amd_bytealign (w[ 9], w[10], offset); - w[56] = amd_bytealign (w[ 8], w[ 9], offset); - w[55] = amd_bytealign (w[ 7], w[ 8], offset); - w[54] = amd_bytealign (w[ 6], w[ 7], offset); - w[53] = amd_bytealign (w[ 5], w[ 6], offset); - w[52] = amd_bytealign (w[ 4], w[ 5], offset); - w[51] = amd_bytealign (w[ 3], w[ 4], offset); - w[50] = amd_bytealign (w[ 2], w[ 3], offset); - w[49] = amd_bytealign (w[ 1], w[ 2], offset); - w[48] = amd_bytealign (w[ 0], w[ 1], offset); - w[47] = amd_bytealign ( 0, w[ 0], offset); + w[63] = hc_bytealign (w[15], w[16], offset); + w[62] = hc_bytealign (w[14], w[15], offset); + w[61] = hc_bytealign (w[13], w[14], offset); + w[60] = hc_bytealign (w[12], w[13], offset); + w[59] = hc_bytealign (w[11], w[12], offset); + w[58] = hc_bytealign (w[10], w[11], offset); + w[57] = hc_bytealign (w[ 9], w[10], offset); + w[56] = hc_bytealign (w[ 8], w[ 9], offset); + w[55] = hc_bytealign (w[ 7], w[ 8], offset); + w[54] = hc_bytealign (w[ 6], w[ 7], offset); + w[53] = hc_bytealign (w[ 5], w[ 6], offset); + w[52] = hc_bytealign (w[ 4], w[ 5], offset); + w[51] = hc_bytealign (w[ 3], w[ 4], offset); + w[50] = hc_bytealign (w[ 2], w[ 3], offset); + w[49] = hc_bytealign (w[ 1], w[ 2], offset); + w[48] = hc_bytealign (w[ 0], w[ 1], offset); + w[47] = hc_bytealign ( 0, w[ 0], offset); w[46] = 0; w[45] = 0; w[44] = 0; @@ -23662,22 +23662,22 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; case 48: - w[63] = amd_bytealign (w[14], w[15], offset); - w[62] = amd_bytealign (w[13], w[14], offset); - w[61] = amd_bytealign (w[12], w[13], offset); - w[60] = amd_bytealign (w[11], w[12], offset); - w[59] = amd_bytealign (w[10], w[11], offset); - w[58] = amd_bytealign (w[ 9], w[10], offset); - w[57] = amd_bytealign (w[ 8], w[ 9], offset); - w[56] = amd_bytealign (w[ 7], w[ 8], offset); - w[55] = amd_bytealign (w[ 6], w[ 7], offset); - w[54] = amd_bytealign (w[ 5], w[ 6], offset); - w[53] = amd_bytealign (w[ 4], w[ 5], offset); - w[52] = amd_bytealign (w[ 3], w[ 4], offset); - w[51] = amd_bytealign (w[ 2], w[ 3], offset); - w[50] = amd_bytealign (w[ 1], w[ 2], offset); - w[49] = amd_bytealign (w[ 0], w[ 1], offset); - w[48] = amd_bytealign ( 0, w[ 0], offset); + w[63] = hc_bytealign (w[14], w[15], offset); + w[62] = hc_bytealign (w[13], w[14], offset); + w[61] = hc_bytealign (w[12], w[13], offset); + w[60] = hc_bytealign (w[11], w[12], offset); + w[59] = hc_bytealign (w[10], w[11], offset); + w[58] = hc_bytealign (w[ 9], w[10], offset); + w[57] = hc_bytealign (w[ 8], w[ 9], offset); + w[56] = hc_bytealign (w[ 7], w[ 8], offset); + w[55] = hc_bytealign (w[ 6], w[ 7], offset); + w[54] = hc_bytealign (w[ 5], w[ 6], offset); + w[53] = hc_bytealign (w[ 4], w[ 5], offset); + w[52] = hc_bytealign (w[ 3], w[ 4], offset); + w[51] = hc_bytealign (w[ 2], w[ 3], offset); + w[50] = hc_bytealign (w[ 1], w[ 2], offset); + w[49] = hc_bytealign (w[ 0], w[ 1], offset); + w[48] = hc_bytealign ( 0, w[ 0], offset); w[47] = 0; w[46] = 0; w[45] = 0; @@ -23730,21 +23730,21 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; case 49: - w[63] = amd_bytealign (w[13], w[14], offset); - w[62] = amd_bytealign (w[12], w[13], offset); - w[61] = amd_bytealign (w[11], w[12], offset); - w[60] = amd_bytealign (w[10], w[11], offset); - w[59] = amd_bytealign (w[ 9], w[10], offset); - w[58] = amd_bytealign (w[ 8], w[ 9], offset); - w[57] = amd_bytealign (w[ 7], w[ 8], offset); - w[56] = amd_bytealign (w[ 6], w[ 7], offset); - w[55] = amd_bytealign (w[ 5], w[ 6], offset); - w[54] = amd_bytealign (w[ 4], w[ 5], offset); - w[53] = amd_bytealign (w[ 3], w[ 4], offset); - w[52] = amd_bytealign (w[ 2], w[ 3], offset); - w[51] = amd_bytealign (w[ 1], w[ 2], offset); - w[50] = amd_bytealign (w[ 0], w[ 1], offset); - w[49] = amd_bytealign ( 0, w[ 0], offset); + w[63] = hc_bytealign (w[13], w[14], offset); + w[62] = hc_bytealign (w[12], w[13], offset); + w[61] = hc_bytealign (w[11], w[12], offset); + w[60] = hc_bytealign (w[10], w[11], offset); + w[59] = hc_bytealign (w[ 9], w[10], offset); + w[58] = hc_bytealign (w[ 8], w[ 9], offset); + w[57] = hc_bytealign (w[ 7], w[ 8], offset); + w[56] = hc_bytealign (w[ 6], w[ 7], offset); + w[55] = hc_bytealign (w[ 5], w[ 6], offset); + w[54] = hc_bytealign (w[ 4], w[ 5], offset); + w[53] = hc_bytealign (w[ 3], w[ 4], offset); + w[52] = hc_bytealign (w[ 2], w[ 3], offset); + w[51] = hc_bytealign (w[ 1], w[ 2], offset); + w[50] = hc_bytealign (w[ 0], w[ 1], offset); + w[49] = hc_bytealign ( 0, w[ 0], offset); w[48] = 0; w[47] = 0; w[46] = 0; @@ -23798,20 +23798,20 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; case 50: - w[63] = amd_bytealign (w[12], w[13], offset); - w[62] = amd_bytealign (w[11], w[12], offset); - w[61] = amd_bytealign (w[10], w[11], offset); - w[60] = amd_bytealign (w[ 9], w[10], offset); - w[59] = amd_bytealign (w[ 8], w[ 9], offset); - w[58] = amd_bytealign (w[ 7], w[ 8], offset); - w[57] = amd_bytealign (w[ 6], w[ 7], offset); - w[56] = amd_bytealign (w[ 5], w[ 6], offset); - w[55] = amd_bytealign (w[ 4], w[ 5], offset); - w[54] = amd_bytealign (w[ 3], w[ 4], offset); - w[53] = amd_bytealign (w[ 2], w[ 3], offset); - w[52] = amd_bytealign (w[ 1], w[ 2], offset); - w[51] = amd_bytealign (w[ 0], w[ 1], offset); - w[50] = amd_bytealign ( 0, w[ 0], offset); + w[63] = hc_bytealign (w[12], w[13], offset); + w[62] = hc_bytealign (w[11], w[12], offset); + w[61] = hc_bytealign (w[10], w[11], offset); + w[60] = hc_bytealign (w[ 9], w[10], offset); + w[59] = hc_bytealign (w[ 8], w[ 9], offset); + w[58] = hc_bytealign (w[ 7], w[ 8], offset); + w[57] = hc_bytealign (w[ 6], w[ 7], offset); + w[56] = hc_bytealign (w[ 5], w[ 6], offset); + w[55] = hc_bytealign (w[ 4], w[ 5], offset); + w[54] = hc_bytealign (w[ 3], w[ 4], offset); + w[53] = hc_bytealign (w[ 2], w[ 3], offset); + w[52] = hc_bytealign (w[ 1], w[ 2], offset); + w[51] = hc_bytealign (w[ 0], w[ 1], offset); + w[50] = hc_bytealign ( 0, w[ 0], offset); w[49] = 0; w[48] = 0; w[47] = 0; @@ -23866,19 +23866,19 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; case 51: - w[63] = amd_bytealign (w[11], w[12], offset); - w[62] = amd_bytealign (w[10], w[11], offset); - w[61] = amd_bytealign (w[ 9], w[10], offset); - w[60] = amd_bytealign (w[ 8], w[ 9], offset); - w[59] = amd_bytealign (w[ 7], w[ 8], offset); - w[58] = amd_bytealign (w[ 6], w[ 7], offset); - w[57] = amd_bytealign (w[ 5], w[ 6], offset); - w[56] = amd_bytealign (w[ 4], w[ 5], offset); - w[55] = amd_bytealign (w[ 3], w[ 4], offset); - w[54] = amd_bytealign (w[ 2], w[ 3], offset); - w[53] = amd_bytealign (w[ 1], w[ 2], offset); - w[52] = amd_bytealign (w[ 0], w[ 1], offset); - w[51] = amd_bytealign ( 0, w[ 0], offset); + w[63] = hc_bytealign (w[11], w[12], offset); + w[62] = hc_bytealign (w[10], w[11], offset); + w[61] = hc_bytealign (w[ 9], w[10], offset); + w[60] = hc_bytealign (w[ 8], w[ 9], offset); + w[59] = hc_bytealign (w[ 7], w[ 8], offset); + w[58] = hc_bytealign (w[ 6], w[ 7], offset); + w[57] = hc_bytealign (w[ 5], w[ 6], offset); + w[56] = hc_bytealign (w[ 4], w[ 5], offset); + w[55] = hc_bytealign (w[ 3], w[ 4], offset); + w[54] = hc_bytealign (w[ 2], w[ 3], offset); + w[53] = hc_bytealign (w[ 1], w[ 2], offset); + w[52] = hc_bytealign (w[ 0], w[ 1], offset); + w[51] = hc_bytealign ( 0, w[ 0], offset); w[50] = 0; w[49] = 0; w[48] = 0; @@ -23934,18 +23934,18 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; case 52: - w[63] = amd_bytealign (w[10], w[11], offset); - w[62] = amd_bytealign (w[ 9], w[10], offset); - w[61] = amd_bytealign (w[ 8], w[ 9], offset); - w[60] = amd_bytealign (w[ 7], w[ 8], offset); - w[59] = amd_bytealign (w[ 6], w[ 7], offset); - w[58] = amd_bytealign (w[ 5], w[ 6], offset); - w[57] = amd_bytealign (w[ 4], w[ 5], offset); - w[56] = amd_bytealign (w[ 3], w[ 4], offset); - w[55] = amd_bytealign (w[ 2], w[ 3], offset); - w[54] = amd_bytealign (w[ 1], w[ 2], offset); - w[53] = amd_bytealign (w[ 0], w[ 1], offset); - w[52] = amd_bytealign ( 0, w[ 0], offset); + w[63] = hc_bytealign (w[10], w[11], offset); + w[62] = hc_bytealign (w[ 9], w[10], offset); + w[61] = hc_bytealign (w[ 8], w[ 9], offset); + w[60] = hc_bytealign (w[ 7], w[ 8], offset); + w[59] = hc_bytealign (w[ 6], w[ 7], offset); + w[58] = hc_bytealign (w[ 5], w[ 6], offset); + w[57] = hc_bytealign (w[ 4], w[ 5], offset); + w[56] = hc_bytealign (w[ 3], w[ 4], offset); + w[55] = hc_bytealign (w[ 2], w[ 3], offset); + w[54] = hc_bytealign (w[ 1], w[ 2], offset); + w[53] = hc_bytealign (w[ 0], w[ 1], offset); + w[52] = hc_bytealign ( 0, w[ 0], offset); w[51] = 0; w[50] = 0; w[49] = 0; @@ -24002,17 +24002,17 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; case 53: - w[63] = amd_bytealign (w[ 9], w[10], offset); - w[62] = amd_bytealign (w[ 8], w[ 9], offset); - w[61] = amd_bytealign (w[ 7], w[ 8], offset); - w[60] = amd_bytealign (w[ 6], w[ 7], offset); - w[59] = amd_bytealign (w[ 5], w[ 6], offset); - w[58] = amd_bytealign (w[ 4], w[ 5], offset); - w[57] = amd_bytealign (w[ 3], w[ 4], offset); - w[56] = amd_bytealign (w[ 2], w[ 3], offset); - w[55] = amd_bytealign (w[ 1], w[ 2], offset); - w[54] = amd_bytealign (w[ 0], w[ 1], offset); - w[53] = amd_bytealign ( 0, w[ 0], offset); + w[63] = hc_bytealign (w[ 9], w[10], offset); + w[62] = hc_bytealign (w[ 8], w[ 9], offset); + w[61] = hc_bytealign (w[ 7], w[ 8], offset); + w[60] = hc_bytealign (w[ 6], w[ 7], offset); + w[59] = hc_bytealign (w[ 5], w[ 6], offset); + w[58] = hc_bytealign (w[ 4], w[ 5], offset); + w[57] = hc_bytealign (w[ 3], w[ 4], offset); + w[56] = hc_bytealign (w[ 2], w[ 3], offset); + w[55] = hc_bytealign (w[ 1], w[ 2], offset); + w[54] = hc_bytealign (w[ 0], w[ 1], offset); + w[53] = hc_bytealign ( 0, w[ 0], offset); w[52] = 0; w[51] = 0; w[50] = 0; @@ -24070,16 +24070,16 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; case 54: - w[63] = amd_bytealign (w[ 8], w[ 9], offset); - w[62] = amd_bytealign (w[ 7], w[ 8], offset); - w[61] = amd_bytealign (w[ 6], w[ 7], offset); - w[60] = amd_bytealign (w[ 5], w[ 6], offset); - w[59] = amd_bytealign (w[ 4], w[ 5], offset); - w[58] = amd_bytealign (w[ 3], w[ 4], offset); - w[57] = amd_bytealign (w[ 2], w[ 3], offset); - w[56] = amd_bytealign (w[ 1], w[ 2], offset); - w[55] = amd_bytealign (w[ 0], w[ 1], offset); - w[54] = amd_bytealign ( 0, w[ 0], offset); + w[63] = hc_bytealign (w[ 8], w[ 9], offset); + w[62] = hc_bytealign (w[ 7], w[ 8], offset); + w[61] = hc_bytealign (w[ 6], w[ 7], offset); + w[60] = hc_bytealign (w[ 5], w[ 6], offset); + w[59] = hc_bytealign (w[ 4], w[ 5], offset); + w[58] = hc_bytealign (w[ 3], w[ 4], offset); + w[57] = hc_bytealign (w[ 2], w[ 3], offset); + w[56] = hc_bytealign (w[ 1], w[ 2], offset); + w[55] = hc_bytealign (w[ 0], w[ 1], offset); + w[54] = hc_bytealign ( 0, w[ 0], offset); w[53] = 0; w[52] = 0; w[51] = 0; @@ -24138,15 +24138,15 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; case 55: - w[63] = amd_bytealign (w[ 7], w[ 8], offset); - w[62] = amd_bytealign (w[ 6], w[ 7], offset); - w[61] = amd_bytealign (w[ 5], w[ 6], offset); - w[60] = amd_bytealign (w[ 4], w[ 5], offset); - w[59] = amd_bytealign (w[ 3], w[ 4], offset); - w[58] = amd_bytealign (w[ 2], w[ 3], offset); - w[57] = amd_bytealign (w[ 1], w[ 2], offset); - w[56] = amd_bytealign (w[ 0], w[ 1], offset); - w[55] = amd_bytealign ( 0, w[ 0], offset); + w[63] = hc_bytealign (w[ 7], w[ 8], offset); + w[62] = hc_bytealign (w[ 6], w[ 7], offset); + w[61] = hc_bytealign (w[ 5], w[ 6], offset); + w[60] = hc_bytealign (w[ 4], w[ 5], offset); + w[59] = hc_bytealign (w[ 3], w[ 4], offset); + w[58] = hc_bytealign (w[ 2], w[ 3], offset); + w[57] = hc_bytealign (w[ 1], w[ 2], offset); + w[56] = hc_bytealign (w[ 0], w[ 1], offset); + w[55] = hc_bytealign ( 0, w[ 0], offset); w[54] = 0; w[53] = 0; w[52] = 0; @@ -24206,14 +24206,14 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; case 56: - w[63] = amd_bytealign (w[ 6], w[ 7], offset); - w[62] = amd_bytealign (w[ 5], w[ 6], offset); - w[61] = amd_bytealign (w[ 4], w[ 5], offset); - w[60] = amd_bytealign (w[ 3], w[ 4], offset); - w[59] = amd_bytealign (w[ 2], w[ 3], offset); - w[58] = amd_bytealign (w[ 1], w[ 2], offset); - w[57] = amd_bytealign (w[ 0], w[ 1], offset); - w[56] = amd_bytealign ( 0, w[ 0], offset); + w[63] = hc_bytealign (w[ 6], w[ 7], offset); + w[62] = hc_bytealign (w[ 5], w[ 6], offset); + w[61] = hc_bytealign (w[ 4], w[ 5], offset); + w[60] = hc_bytealign (w[ 3], w[ 4], offset); + w[59] = hc_bytealign (w[ 2], w[ 3], offset); + w[58] = hc_bytealign (w[ 1], w[ 2], offset); + w[57] = hc_bytealign (w[ 0], w[ 1], offset); + w[56] = hc_bytealign ( 0, w[ 0], offset); w[55] = 0; w[54] = 0; w[53] = 0; @@ -24274,13 +24274,13 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; case 57: - w[63] = amd_bytealign (w[ 5], w[ 6], offset); - w[62] = amd_bytealign (w[ 4], w[ 5], offset); - w[61] = amd_bytealign (w[ 3], w[ 4], offset); - w[60] = amd_bytealign (w[ 2], w[ 3], offset); - w[59] = amd_bytealign (w[ 1], w[ 2], offset); - w[58] = amd_bytealign (w[ 0], w[ 1], offset); - w[57] = amd_bytealign ( 0, w[ 0], offset); + w[63] = hc_bytealign (w[ 5], w[ 6], offset); + w[62] = hc_bytealign (w[ 4], w[ 5], offset); + w[61] = hc_bytealign (w[ 3], w[ 4], offset); + w[60] = hc_bytealign (w[ 2], w[ 3], offset); + w[59] = hc_bytealign (w[ 1], w[ 2], offset); + w[58] = hc_bytealign (w[ 0], w[ 1], offset); + w[57] = hc_bytealign ( 0, w[ 0], offset); w[56] = 0; w[55] = 0; w[54] = 0; @@ -24342,12 +24342,12 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; case 58: - w[63] = amd_bytealign (w[ 4], w[ 5], offset); - w[62] = amd_bytealign (w[ 3], w[ 4], offset); - w[61] = amd_bytealign (w[ 2], w[ 3], offset); - w[60] = amd_bytealign (w[ 1], w[ 2], offset); - w[59] = amd_bytealign (w[ 0], w[ 1], offset); - w[58] = amd_bytealign ( 0, w[ 0], offset); + w[63] = hc_bytealign (w[ 4], w[ 5], offset); + w[62] = hc_bytealign (w[ 3], w[ 4], offset); + w[61] = hc_bytealign (w[ 2], w[ 3], offset); + w[60] = hc_bytealign (w[ 1], w[ 2], offset); + w[59] = hc_bytealign (w[ 0], w[ 1], offset); + w[58] = hc_bytealign ( 0, w[ 0], offset); w[57] = 0; w[56] = 0; w[55] = 0; @@ -24410,11 +24410,11 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; case 59: - w[63] = amd_bytealign (w[ 3], w[ 4], offset); - w[62] = amd_bytealign (w[ 2], w[ 3], offset); - w[61] = amd_bytealign (w[ 1], w[ 2], offset); - w[60] = amd_bytealign (w[ 0], w[ 1], offset); - w[59] = amd_bytealign ( 0, w[ 0], offset); + w[63] = hc_bytealign (w[ 3], w[ 4], offset); + w[62] = hc_bytealign (w[ 2], w[ 3], offset); + w[61] = hc_bytealign (w[ 1], w[ 2], offset); + w[60] = hc_bytealign (w[ 0], w[ 1], offset); + w[59] = hc_bytealign ( 0, w[ 0], offset); w[58] = 0; w[57] = 0; w[56] = 0; @@ -24478,10 +24478,10 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; case 60: - w[63] = amd_bytealign (w[ 2], w[ 3], offset); - w[62] = amd_bytealign (w[ 1], w[ 2], offset); - w[61] = amd_bytealign (w[ 0], w[ 1], offset); - w[60] = amd_bytealign ( 0, w[ 0], offset); + w[63] = hc_bytealign (w[ 2], w[ 3], offset); + w[62] = hc_bytealign (w[ 1], w[ 2], offset); + w[61] = hc_bytealign (w[ 0], w[ 1], offset); + w[60] = hc_bytealign ( 0, w[ 0], offset); w[59] = 0; w[58] = 0; w[57] = 0; @@ -24546,9 +24546,9 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; case 61: - w[63] = amd_bytealign (w[ 1], w[ 2], offset); - w[62] = amd_bytealign (w[ 0], w[ 1], offset); - w[61] = amd_bytealign ( 0, w[ 0], offset); + w[63] = hc_bytealign (w[ 1], w[ 2], offset); + w[62] = hc_bytealign (w[ 0], w[ 1], offset); + w[61] = hc_bytealign ( 0, w[ 0], offset); w[60] = 0; w[59] = 0; w[58] = 0; @@ -24614,8 +24614,8 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; case 62: - w[63] = amd_bytealign (w[ 0], w[ 1], offset); - w[62] = amd_bytealign ( 0, w[ 0], offset); + w[63] = hc_bytealign (w[ 0], w[ 1], offset); + w[62] = hc_bytealign ( 0, w[ 0], offset); w[61] = 0; w[60] = 0; w[59] = 0; @@ -24682,7 +24682,7 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; case 63: - w[63] = amd_bytealign ( 0, w[ 0], offset); + w[63] = hc_bytealign ( 0, w[ 0], offset); w[62] = 0; w[61] = 0; w[60] = 0; @@ -24764,271 +24764,271 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) switch (offset_switch) { case 0: - w[63] = __byte_perm (w[63], w[62], selector); - w[62] = __byte_perm (w[62], w[61], selector); - w[61] = __byte_perm (w[61], w[60], selector); - w[60] = __byte_perm (w[60], w[59], selector); - w[59] = __byte_perm (w[59], w[58], selector); - w[58] = __byte_perm (w[58], w[57], selector); - w[57] = __byte_perm (w[57], w[56], selector); - w[56] = __byte_perm (w[56], w[55], selector); - w[55] = __byte_perm (w[55], w[54], selector); - w[54] = __byte_perm (w[54], w[53], selector); - w[53] = __byte_perm (w[53], w[52], selector); - w[52] = __byte_perm (w[52], w[51], selector); - w[51] = __byte_perm (w[51], w[50], selector); - w[50] = __byte_perm (w[50], w[49], selector); - w[49] = __byte_perm (w[49], w[48], selector); - w[48] = __byte_perm (w[48], w[47], selector); - w[47] = __byte_perm (w[47], w[46], selector); - w[46] = __byte_perm (w[46], w[45], selector); - w[45] = __byte_perm (w[45], w[44], selector); - w[44] = __byte_perm (w[44], w[43], selector); - w[43] = __byte_perm (w[43], w[42], selector); - w[42] = __byte_perm (w[42], w[41], selector); - w[41] = __byte_perm (w[41], w[40], selector); - w[40] = __byte_perm (w[40], w[39], selector); - w[39] = __byte_perm (w[39], w[38], selector); - w[38] = __byte_perm (w[38], w[37], selector); - w[37] = __byte_perm (w[37], w[36], selector); - w[36] = __byte_perm (w[36], w[35], selector); - w[35] = __byte_perm (w[35], w[34], selector); - w[34] = __byte_perm (w[34], w[33], selector); - w[33] = __byte_perm (w[33], w[32], selector); - w[32] = __byte_perm (w[32], w[31], selector); - w[31] = __byte_perm (w[31], w[30], selector); - w[30] = __byte_perm (w[30], w[29], selector); - w[29] = __byte_perm (w[29], w[28], selector); - w[28] = __byte_perm (w[28], w[27], selector); - w[27] = __byte_perm (w[27], w[26], selector); - w[26] = __byte_perm (w[26], w[25], selector); - w[25] = __byte_perm (w[25], w[24], selector); - w[24] = __byte_perm (w[24], w[23], selector); - w[23] = __byte_perm (w[23], w[22], selector); - w[22] = __byte_perm (w[22], w[21], selector); - w[21] = __byte_perm (w[21], w[20], selector); - w[20] = __byte_perm (w[20], w[19], selector); - w[19] = __byte_perm (w[19], w[18], selector); - w[18] = __byte_perm (w[18], w[17], selector); - w[17] = __byte_perm (w[17], w[16], selector); - w[16] = __byte_perm (w[16], w[15], selector); - w[15] = __byte_perm (w[15], w[14], selector); - w[14] = __byte_perm (w[14], w[13], selector); - w[13] = __byte_perm (w[13], w[12], selector); - w[12] = __byte_perm (w[12], w[11], selector); - w[11] = __byte_perm (w[11], w[10], selector); - w[10] = __byte_perm (w[10], w[ 9], selector); - w[ 9] = __byte_perm (w[ 9], w[ 8], selector); - w[ 8] = __byte_perm (w[ 8], w[ 7], selector); - w[ 7] = __byte_perm (w[ 7], w[ 6], selector); - w[ 6] = __byte_perm (w[ 6], w[ 5], selector); - w[ 5] = __byte_perm (w[ 5], w[ 4], selector); - w[ 4] = __byte_perm (w[ 4], w[ 3], selector); - w[ 3] = __byte_perm (w[ 3], w[ 2], selector); - w[ 2] = __byte_perm (w[ 2], w[ 1], selector); - w[ 1] = __byte_perm (w[ 1], w[ 0], selector); - w[ 0] = __byte_perm (w[ 0], 0, selector); + w[63] = hc_byte_perm (w[63], w[62], selector); + w[62] = hc_byte_perm (w[62], w[61], selector); + w[61] = hc_byte_perm (w[61], w[60], selector); + w[60] = hc_byte_perm (w[60], w[59], selector); + w[59] = hc_byte_perm (w[59], w[58], selector); + w[58] = hc_byte_perm (w[58], w[57], selector); + w[57] = hc_byte_perm (w[57], w[56], selector); + w[56] = hc_byte_perm (w[56], w[55], selector); + w[55] = hc_byte_perm (w[55], w[54], selector); + w[54] = hc_byte_perm (w[54], w[53], selector); + w[53] = hc_byte_perm (w[53], w[52], selector); + w[52] = hc_byte_perm (w[52], w[51], selector); + w[51] = hc_byte_perm (w[51], w[50], selector); + w[50] = hc_byte_perm (w[50], w[49], selector); + w[49] = hc_byte_perm (w[49], w[48], selector); + w[48] = hc_byte_perm (w[48], w[47], selector); + w[47] = hc_byte_perm (w[47], w[46], selector); + w[46] = hc_byte_perm (w[46], w[45], selector); + w[45] = hc_byte_perm (w[45], w[44], selector); + w[44] = hc_byte_perm (w[44], w[43], selector); + w[43] = hc_byte_perm (w[43], w[42], selector); + w[42] = hc_byte_perm (w[42], w[41], selector); + w[41] = hc_byte_perm (w[41], w[40], selector); + w[40] = hc_byte_perm (w[40], w[39], selector); + w[39] = hc_byte_perm (w[39], w[38], selector); + w[38] = hc_byte_perm (w[38], w[37], selector); + w[37] = hc_byte_perm (w[37], w[36], selector); + w[36] = hc_byte_perm (w[36], w[35], selector); + w[35] = hc_byte_perm (w[35], w[34], selector); + w[34] = hc_byte_perm (w[34], w[33], selector); + w[33] = hc_byte_perm (w[33], w[32], selector); + w[32] = hc_byte_perm (w[32], w[31], selector); + w[31] = hc_byte_perm (w[31], w[30], selector); + w[30] = hc_byte_perm (w[30], w[29], selector); + w[29] = hc_byte_perm (w[29], w[28], selector); + w[28] = hc_byte_perm (w[28], w[27], selector); + w[27] = hc_byte_perm (w[27], w[26], selector); + w[26] = hc_byte_perm (w[26], w[25], selector); + w[25] = hc_byte_perm (w[25], w[24], selector); + w[24] = hc_byte_perm (w[24], w[23], selector); + w[23] = hc_byte_perm (w[23], w[22], selector); + w[22] = hc_byte_perm (w[22], w[21], selector); + w[21] = hc_byte_perm (w[21], w[20], selector); + w[20] = hc_byte_perm (w[20], w[19], selector); + w[19] = hc_byte_perm (w[19], w[18], selector); + w[18] = hc_byte_perm (w[18], w[17], selector); + w[17] = hc_byte_perm (w[17], w[16], selector); + w[16] = hc_byte_perm (w[16], w[15], selector); + w[15] = hc_byte_perm (w[15], w[14], selector); + w[14] = hc_byte_perm (w[14], w[13], selector); + w[13] = hc_byte_perm (w[13], w[12], selector); + w[12] = hc_byte_perm (w[12], w[11], selector); + w[11] = hc_byte_perm (w[11], w[10], selector); + w[10] = hc_byte_perm (w[10], w[ 9], selector); + w[ 9] = hc_byte_perm (w[ 9], w[ 8], selector); + w[ 8] = hc_byte_perm (w[ 8], w[ 7], selector); + w[ 7] = hc_byte_perm (w[ 7], w[ 6], selector); + w[ 6] = hc_byte_perm (w[ 6], w[ 5], selector); + w[ 5] = hc_byte_perm (w[ 5], w[ 4], selector); + w[ 4] = hc_byte_perm (w[ 4], w[ 3], selector); + w[ 3] = hc_byte_perm (w[ 3], w[ 2], selector); + w[ 2] = hc_byte_perm (w[ 2], w[ 1], selector); + w[ 1] = hc_byte_perm (w[ 1], w[ 0], selector); + w[ 0] = hc_byte_perm (w[ 0], 0, selector); break; case 1: - w[63] = __byte_perm (w[62], w[61], selector); - w[62] = __byte_perm (w[61], w[60], selector); - w[61] = __byte_perm (w[60], w[59], selector); - w[60] = __byte_perm (w[59], w[58], selector); - w[59] = __byte_perm (w[58], w[57], selector); - w[58] = __byte_perm (w[57], w[56], selector); - w[57] = __byte_perm (w[56], w[55], selector); - w[56] = __byte_perm (w[55], w[54], selector); - w[55] = __byte_perm (w[54], w[53], selector); - w[54] = __byte_perm (w[53], w[52], selector); - w[53] = __byte_perm (w[52], w[51], selector); - w[52] = __byte_perm (w[51], w[50], selector); - w[51] = __byte_perm (w[50], w[49], selector); - w[50] = __byte_perm (w[49], w[48], selector); - w[49] = __byte_perm (w[48], w[47], selector); - w[48] = __byte_perm (w[47], w[46], selector); - w[47] = __byte_perm (w[46], w[45], selector); - w[46] = __byte_perm (w[45], w[44], selector); - w[45] = __byte_perm (w[44], w[43], selector); - w[44] = __byte_perm (w[43], w[42], selector); - w[43] = __byte_perm (w[42], w[41], selector); - w[42] = __byte_perm (w[41], w[40], selector); - w[41] = __byte_perm (w[40], w[39], selector); - w[40] = __byte_perm (w[39], w[38], selector); - w[39] = __byte_perm (w[38], w[37], selector); - w[38] = __byte_perm (w[37], w[36], selector); - w[37] = __byte_perm (w[36], w[35], selector); - w[36] = __byte_perm (w[35], w[34], selector); - w[35] = __byte_perm (w[34], w[33], selector); - w[34] = __byte_perm (w[33], w[32], selector); - w[33] = __byte_perm (w[32], w[31], selector); - w[32] = __byte_perm (w[31], w[30], selector); - w[31] = __byte_perm (w[30], w[29], selector); - w[30] = __byte_perm (w[29], w[28], selector); - w[29] = __byte_perm (w[28], w[27], selector); - w[28] = __byte_perm (w[27], w[26], selector); - w[27] = __byte_perm (w[26], w[25], selector); - w[26] = __byte_perm (w[25], w[24], selector); - w[25] = __byte_perm (w[24], w[23], selector); - w[24] = __byte_perm (w[23], w[22], selector); - w[23] = __byte_perm (w[22], w[21], selector); - w[22] = __byte_perm (w[21], w[20], selector); - w[21] = __byte_perm (w[20], w[19], selector); - w[20] = __byte_perm (w[19], w[18], selector); - w[19] = __byte_perm (w[18], w[17], selector); - w[18] = __byte_perm (w[17], w[16], selector); - w[17] = __byte_perm (w[16], w[15], selector); - w[16] = __byte_perm (w[15], w[14], selector); - w[15] = __byte_perm (w[14], w[13], selector); - w[14] = __byte_perm (w[13], w[12], selector); - w[13] = __byte_perm (w[12], w[11], selector); - w[12] = __byte_perm (w[11], w[10], selector); - w[11] = __byte_perm (w[10], w[ 9], selector); - w[10] = __byte_perm (w[ 9], w[ 8], selector); - w[ 9] = __byte_perm (w[ 8], w[ 7], selector); - w[ 8] = __byte_perm (w[ 7], w[ 6], selector); - w[ 7] = __byte_perm (w[ 6], w[ 5], selector); - w[ 6] = __byte_perm (w[ 5], w[ 4], selector); - w[ 5] = __byte_perm (w[ 4], w[ 3], selector); - w[ 4] = __byte_perm (w[ 3], w[ 2], selector); - w[ 3] = __byte_perm (w[ 2], w[ 1], selector); - w[ 2] = __byte_perm (w[ 1], w[ 0], selector); - w[ 1] = __byte_perm (w[ 0], 0, selector); + w[63] = hc_byte_perm (w[62], w[61], selector); + w[62] = hc_byte_perm (w[61], w[60], selector); + w[61] = hc_byte_perm (w[60], w[59], selector); + w[60] = hc_byte_perm (w[59], w[58], selector); + w[59] = hc_byte_perm (w[58], w[57], selector); + w[58] = hc_byte_perm (w[57], w[56], selector); + w[57] = hc_byte_perm (w[56], w[55], selector); + w[56] = hc_byte_perm (w[55], w[54], selector); + w[55] = hc_byte_perm (w[54], w[53], selector); + w[54] = hc_byte_perm (w[53], w[52], selector); + w[53] = hc_byte_perm (w[52], w[51], selector); + w[52] = hc_byte_perm (w[51], w[50], selector); + w[51] = hc_byte_perm (w[50], w[49], selector); + w[50] = hc_byte_perm (w[49], w[48], selector); + w[49] = hc_byte_perm (w[48], w[47], selector); + w[48] = hc_byte_perm (w[47], w[46], selector); + w[47] = hc_byte_perm (w[46], w[45], selector); + w[46] = hc_byte_perm (w[45], w[44], selector); + w[45] = hc_byte_perm (w[44], w[43], selector); + w[44] = hc_byte_perm (w[43], w[42], selector); + w[43] = hc_byte_perm (w[42], w[41], selector); + w[42] = hc_byte_perm (w[41], w[40], selector); + w[41] = hc_byte_perm (w[40], w[39], selector); + w[40] = hc_byte_perm (w[39], w[38], selector); + w[39] = hc_byte_perm (w[38], w[37], selector); + w[38] = hc_byte_perm (w[37], w[36], selector); + w[37] = hc_byte_perm (w[36], w[35], selector); + w[36] = hc_byte_perm (w[35], w[34], selector); + w[35] = hc_byte_perm (w[34], w[33], selector); + w[34] = hc_byte_perm (w[33], w[32], selector); + w[33] = hc_byte_perm (w[32], w[31], selector); + w[32] = hc_byte_perm (w[31], w[30], selector); + w[31] = hc_byte_perm (w[30], w[29], selector); + w[30] = hc_byte_perm (w[29], w[28], selector); + w[29] = hc_byte_perm (w[28], w[27], selector); + w[28] = hc_byte_perm (w[27], w[26], selector); + w[27] = hc_byte_perm (w[26], w[25], selector); + w[26] = hc_byte_perm (w[25], w[24], selector); + w[25] = hc_byte_perm (w[24], w[23], selector); + w[24] = hc_byte_perm (w[23], w[22], selector); + w[23] = hc_byte_perm (w[22], w[21], selector); + w[22] = hc_byte_perm (w[21], w[20], selector); + w[21] = hc_byte_perm (w[20], w[19], selector); + w[20] = hc_byte_perm (w[19], w[18], selector); + w[19] = hc_byte_perm (w[18], w[17], selector); + w[18] = hc_byte_perm (w[17], w[16], selector); + w[17] = hc_byte_perm (w[16], w[15], selector); + w[16] = hc_byte_perm (w[15], w[14], selector); + w[15] = hc_byte_perm (w[14], w[13], selector); + w[14] = hc_byte_perm (w[13], w[12], selector); + w[13] = hc_byte_perm (w[12], w[11], selector); + w[12] = hc_byte_perm (w[11], w[10], selector); + w[11] = hc_byte_perm (w[10], w[ 9], selector); + w[10] = hc_byte_perm (w[ 9], w[ 8], selector); + w[ 9] = hc_byte_perm (w[ 8], w[ 7], selector); + w[ 8] = hc_byte_perm (w[ 7], w[ 6], selector); + w[ 7] = hc_byte_perm (w[ 6], w[ 5], selector); + w[ 6] = hc_byte_perm (w[ 5], w[ 4], selector); + w[ 5] = hc_byte_perm (w[ 4], w[ 3], selector); + w[ 4] = hc_byte_perm (w[ 3], w[ 2], selector); + w[ 3] = hc_byte_perm (w[ 2], w[ 1], selector); + w[ 2] = hc_byte_perm (w[ 1], w[ 0], selector); + w[ 1] = hc_byte_perm (w[ 0], 0, selector); w[ 0] = 0; break; case 2: - w[63] = __byte_perm (w[61], w[60], selector); - w[62] = __byte_perm (w[60], w[59], selector); - w[61] = __byte_perm (w[59], w[58], selector); - w[60] = __byte_perm (w[58], w[57], selector); - w[59] = __byte_perm (w[57], w[56], selector); - w[58] = __byte_perm (w[56], w[55], selector); - w[57] = __byte_perm (w[55], w[54], selector); - w[56] = __byte_perm (w[54], w[53], selector); - w[55] = __byte_perm (w[53], w[52], selector); - w[54] = __byte_perm (w[52], w[51], selector); - w[53] = __byte_perm (w[51], w[50], selector); - w[52] = __byte_perm (w[50], w[49], selector); - w[51] = __byte_perm (w[49], w[48], selector); - w[50] = __byte_perm (w[48], w[47], selector); - w[49] = __byte_perm (w[47], w[46], selector); - w[48] = __byte_perm (w[46], w[45], selector); - w[47] = __byte_perm (w[45], w[44], selector); - w[46] = __byte_perm (w[44], w[43], selector); - w[45] = __byte_perm (w[43], w[42], selector); - w[44] = __byte_perm (w[42], w[41], selector); - w[43] = __byte_perm (w[41], w[40], selector); - w[42] = __byte_perm (w[40], w[39], selector); - w[41] = __byte_perm (w[39], w[38], selector); - w[40] = __byte_perm (w[38], w[37], selector); - w[39] = __byte_perm (w[37], w[36], selector); - w[38] = __byte_perm (w[36], w[35], selector); - w[37] = __byte_perm (w[35], w[34], selector); - w[36] = __byte_perm (w[34], w[33], selector); - w[35] = __byte_perm (w[33], w[32], selector); - w[34] = __byte_perm (w[32], w[31], selector); - w[33] = __byte_perm (w[31], w[30], selector); - w[32] = __byte_perm (w[30], w[29], selector); - w[31] = __byte_perm (w[29], w[28], selector); - w[30] = __byte_perm (w[28], w[27], selector); - w[29] = __byte_perm (w[27], w[26], selector); - w[28] = __byte_perm (w[26], w[25], selector); - w[27] = __byte_perm (w[25], w[24], selector); - w[26] = __byte_perm (w[24], w[23], selector); - w[25] = __byte_perm (w[23], w[22], selector); - w[24] = __byte_perm (w[22], w[21], selector); - w[23] = __byte_perm (w[21], w[20], selector); - w[22] = __byte_perm (w[20], w[19], selector); - w[21] = __byte_perm (w[19], w[18], selector); - w[20] = __byte_perm (w[18], w[17], selector); - w[19] = __byte_perm (w[17], w[16], selector); - w[18] = __byte_perm (w[16], w[15], selector); - w[17] = __byte_perm (w[15], w[14], selector); - w[16] = __byte_perm (w[14], w[13], selector); - w[15] = __byte_perm (w[13], w[12], selector); - w[14] = __byte_perm (w[12], w[11], selector); - w[13] = __byte_perm (w[11], w[10], selector); - w[12] = __byte_perm (w[10], w[ 9], selector); - w[11] = __byte_perm (w[ 9], w[ 8], selector); - w[10] = __byte_perm (w[ 8], w[ 7], selector); - w[ 9] = __byte_perm (w[ 7], w[ 6], selector); - w[ 8] = __byte_perm (w[ 6], w[ 5], selector); - w[ 7] = __byte_perm (w[ 5], w[ 4], selector); - w[ 6] = __byte_perm (w[ 4], w[ 3], selector); - w[ 5] = __byte_perm (w[ 3], w[ 2], selector); - w[ 4] = __byte_perm (w[ 2], w[ 1], selector); - w[ 3] = __byte_perm (w[ 1], w[ 0], selector); - w[ 2] = __byte_perm (w[ 0], 0, selector); + w[63] = hc_byte_perm (w[61], w[60], selector); + w[62] = hc_byte_perm (w[60], w[59], selector); + w[61] = hc_byte_perm (w[59], w[58], selector); + w[60] = hc_byte_perm (w[58], w[57], selector); + w[59] = hc_byte_perm (w[57], w[56], selector); + w[58] = hc_byte_perm (w[56], w[55], selector); + w[57] = hc_byte_perm (w[55], w[54], selector); + w[56] = hc_byte_perm (w[54], w[53], selector); + w[55] = hc_byte_perm (w[53], w[52], selector); + w[54] = hc_byte_perm (w[52], w[51], selector); + w[53] = hc_byte_perm (w[51], w[50], selector); + w[52] = hc_byte_perm (w[50], w[49], selector); + w[51] = hc_byte_perm (w[49], w[48], selector); + w[50] = hc_byte_perm (w[48], w[47], selector); + w[49] = hc_byte_perm (w[47], w[46], selector); + w[48] = hc_byte_perm (w[46], w[45], selector); + w[47] = hc_byte_perm (w[45], w[44], selector); + w[46] = hc_byte_perm (w[44], w[43], selector); + w[45] = hc_byte_perm (w[43], w[42], selector); + w[44] = hc_byte_perm (w[42], w[41], selector); + w[43] = hc_byte_perm (w[41], w[40], selector); + w[42] = hc_byte_perm (w[40], w[39], selector); + w[41] = hc_byte_perm (w[39], w[38], selector); + w[40] = hc_byte_perm (w[38], w[37], selector); + w[39] = hc_byte_perm (w[37], w[36], selector); + w[38] = hc_byte_perm (w[36], w[35], selector); + w[37] = hc_byte_perm (w[35], w[34], selector); + w[36] = hc_byte_perm (w[34], w[33], selector); + w[35] = hc_byte_perm (w[33], w[32], selector); + w[34] = hc_byte_perm (w[32], w[31], selector); + w[33] = hc_byte_perm (w[31], w[30], selector); + w[32] = hc_byte_perm (w[30], w[29], selector); + w[31] = hc_byte_perm (w[29], w[28], selector); + w[30] = hc_byte_perm (w[28], w[27], selector); + w[29] = hc_byte_perm (w[27], w[26], selector); + w[28] = hc_byte_perm (w[26], w[25], selector); + w[27] = hc_byte_perm (w[25], w[24], selector); + w[26] = hc_byte_perm (w[24], w[23], selector); + w[25] = hc_byte_perm (w[23], w[22], selector); + w[24] = hc_byte_perm (w[22], w[21], selector); + w[23] = hc_byte_perm (w[21], w[20], selector); + w[22] = hc_byte_perm (w[20], w[19], selector); + w[21] = hc_byte_perm (w[19], w[18], selector); + w[20] = hc_byte_perm (w[18], w[17], selector); + w[19] = hc_byte_perm (w[17], w[16], selector); + w[18] = hc_byte_perm (w[16], w[15], selector); + w[17] = hc_byte_perm (w[15], w[14], selector); + w[16] = hc_byte_perm (w[14], w[13], selector); + w[15] = hc_byte_perm (w[13], w[12], selector); + w[14] = hc_byte_perm (w[12], w[11], selector); + w[13] = hc_byte_perm (w[11], w[10], selector); + w[12] = hc_byte_perm (w[10], w[ 9], selector); + w[11] = hc_byte_perm (w[ 9], w[ 8], selector); + w[10] = hc_byte_perm (w[ 8], w[ 7], selector); + w[ 9] = hc_byte_perm (w[ 7], w[ 6], selector); + w[ 8] = hc_byte_perm (w[ 6], w[ 5], selector); + w[ 7] = hc_byte_perm (w[ 5], w[ 4], selector); + w[ 6] = hc_byte_perm (w[ 4], w[ 3], selector); + w[ 5] = hc_byte_perm (w[ 3], w[ 2], selector); + w[ 4] = hc_byte_perm (w[ 2], w[ 1], selector); + w[ 3] = hc_byte_perm (w[ 1], w[ 0], selector); + w[ 2] = hc_byte_perm (w[ 0], 0, selector); w[ 1] = 0; w[ 0] = 0; break; case 3: - w[63] = __byte_perm (w[60], w[59], selector); - w[62] = __byte_perm (w[59], w[58], selector); - w[61] = __byte_perm (w[58], w[57], selector); - w[60] = __byte_perm (w[57], w[56], selector); - w[59] = __byte_perm (w[56], w[55], selector); - w[58] = __byte_perm (w[55], w[54], selector); - w[57] = __byte_perm (w[54], w[53], selector); - w[56] = __byte_perm (w[53], w[52], selector); - w[55] = __byte_perm (w[52], w[51], selector); - w[54] = __byte_perm (w[51], w[50], selector); - w[53] = __byte_perm (w[50], w[49], selector); - w[52] = __byte_perm (w[49], w[48], selector); - w[51] = __byte_perm (w[48], w[47], selector); - w[50] = __byte_perm (w[47], w[46], selector); - w[49] = __byte_perm (w[46], w[45], selector); - w[48] = __byte_perm (w[45], w[44], selector); - w[47] = __byte_perm (w[44], w[43], selector); - w[46] = __byte_perm (w[43], w[42], selector); - w[45] = __byte_perm (w[42], w[41], selector); - w[44] = __byte_perm (w[41], w[40], selector); - w[43] = __byte_perm (w[40], w[39], selector); - w[42] = __byte_perm (w[39], w[38], selector); - w[41] = __byte_perm (w[38], w[37], selector); - w[40] = __byte_perm (w[37], w[36], selector); - w[39] = __byte_perm (w[36], w[35], selector); - w[38] = __byte_perm (w[35], w[34], selector); - w[37] = __byte_perm (w[34], w[33], selector); - w[36] = __byte_perm (w[33], w[32], selector); - w[35] = __byte_perm (w[32], w[31], selector); - w[34] = __byte_perm (w[31], w[30], selector); - w[33] = __byte_perm (w[30], w[29], selector); - w[32] = __byte_perm (w[29], w[28], selector); - w[31] = __byte_perm (w[28], w[27], selector); - w[30] = __byte_perm (w[27], w[26], selector); - w[29] = __byte_perm (w[26], w[25], selector); - w[28] = __byte_perm (w[25], w[24], selector); - w[27] = __byte_perm (w[24], w[23], selector); - w[26] = __byte_perm (w[23], w[22], selector); - w[25] = __byte_perm (w[22], w[21], selector); - w[24] = __byte_perm (w[21], w[20], selector); - w[23] = __byte_perm (w[20], w[19], selector); - w[22] = __byte_perm (w[19], w[18], selector); - w[21] = __byte_perm (w[18], w[17], selector); - w[20] = __byte_perm (w[17], w[16], selector); - w[19] = __byte_perm (w[16], w[15], selector); - w[18] = __byte_perm (w[15], w[14], selector); - w[17] = __byte_perm (w[14], w[13], selector); - w[16] = __byte_perm (w[13], w[12], selector); - w[15] = __byte_perm (w[12], w[11], selector); - w[14] = __byte_perm (w[11], w[10], selector); - w[13] = __byte_perm (w[10], w[ 9], selector); - w[12] = __byte_perm (w[ 9], w[ 8], selector); - w[11] = __byte_perm (w[ 8], w[ 7], selector); - w[10] = __byte_perm (w[ 7], w[ 6], selector); - w[ 9] = __byte_perm (w[ 6], w[ 5], selector); - w[ 8] = __byte_perm (w[ 5], w[ 4], selector); - w[ 7] = __byte_perm (w[ 4], w[ 3], selector); - w[ 6] = __byte_perm (w[ 3], w[ 2], selector); - w[ 5] = __byte_perm (w[ 2], w[ 1], selector); - w[ 4] = __byte_perm (w[ 1], w[ 0], selector); - w[ 3] = __byte_perm (w[ 0], 0, selector); + w[63] = hc_byte_perm (w[60], w[59], selector); + w[62] = hc_byte_perm (w[59], w[58], selector); + w[61] = hc_byte_perm (w[58], w[57], selector); + w[60] = hc_byte_perm (w[57], w[56], selector); + w[59] = hc_byte_perm (w[56], w[55], selector); + w[58] = hc_byte_perm (w[55], w[54], selector); + w[57] = hc_byte_perm (w[54], w[53], selector); + w[56] = hc_byte_perm (w[53], w[52], selector); + w[55] = hc_byte_perm (w[52], w[51], selector); + w[54] = hc_byte_perm (w[51], w[50], selector); + w[53] = hc_byte_perm (w[50], w[49], selector); + w[52] = hc_byte_perm (w[49], w[48], selector); + w[51] = hc_byte_perm (w[48], w[47], selector); + w[50] = hc_byte_perm (w[47], w[46], selector); + w[49] = hc_byte_perm (w[46], w[45], selector); + w[48] = hc_byte_perm (w[45], w[44], selector); + w[47] = hc_byte_perm (w[44], w[43], selector); + w[46] = hc_byte_perm (w[43], w[42], selector); + w[45] = hc_byte_perm (w[42], w[41], selector); + w[44] = hc_byte_perm (w[41], w[40], selector); + w[43] = hc_byte_perm (w[40], w[39], selector); + w[42] = hc_byte_perm (w[39], w[38], selector); + w[41] = hc_byte_perm (w[38], w[37], selector); + w[40] = hc_byte_perm (w[37], w[36], selector); + w[39] = hc_byte_perm (w[36], w[35], selector); + w[38] = hc_byte_perm (w[35], w[34], selector); + w[37] = hc_byte_perm (w[34], w[33], selector); + w[36] = hc_byte_perm (w[33], w[32], selector); + w[35] = hc_byte_perm (w[32], w[31], selector); + w[34] = hc_byte_perm (w[31], w[30], selector); + w[33] = hc_byte_perm (w[30], w[29], selector); + w[32] = hc_byte_perm (w[29], w[28], selector); + w[31] = hc_byte_perm (w[28], w[27], selector); + w[30] = hc_byte_perm (w[27], w[26], selector); + w[29] = hc_byte_perm (w[26], w[25], selector); + w[28] = hc_byte_perm (w[25], w[24], selector); + w[27] = hc_byte_perm (w[24], w[23], selector); + w[26] = hc_byte_perm (w[23], w[22], selector); + w[25] = hc_byte_perm (w[22], w[21], selector); + w[24] = hc_byte_perm (w[21], w[20], selector); + w[23] = hc_byte_perm (w[20], w[19], selector); + w[22] = hc_byte_perm (w[19], w[18], selector); + w[21] = hc_byte_perm (w[18], w[17], selector); + w[20] = hc_byte_perm (w[17], w[16], selector); + w[19] = hc_byte_perm (w[16], w[15], selector); + w[18] = hc_byte_perm (w[15], w[14], selector); + w[17] = hc_byte_perm (w[14], w[13], selector); + w[16] = hc_byte_perm (w[13], w[12], selector); + w[15] = hc_byte_perm (w[12], w[11], selector); + w[14] = hc_byte_perm (w[11], w[10], selector); + w[13] = hc_byte_perm (w[10], w[ 9], selector); + w[12] = hc_byte_perm (w[ 9], w[ 8], selector); + w[11] = hc_byte_perm (w[ 8], w[ 7], selector); + w[10] = hc_byte_perm (w[ 7], w[ 6], selector); + w[ 9] = hc_byte_perm (w[ 6], w[ 5], selector); + w[ 8] = hc_byte_perm (w[ 5], w[ 4], selector); + w[ 7] = hc_byte_perm (w[ 4], w[ 3], selector); + w[ 6] = hc_byte_perm (w[ 3], w[ 2], selector); + w[ 5] = hc_byte_perm (w[ 2], w[ 1], selector); + w[ 4] = hc_byte_perm (w[ 1], w[ 0], selector); + w[ 3] = hc_byte_perm (w[ 0], 0, selector); w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; @@ -25036,66 +25036,66 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; case 4: - w[63] = __byte_perm (w[59], w[58], selector); - w[62] = __byte_perm (w[58], w[57], selector); - w[61] = __byte_perm (w[57], w[56], selector); - w[60] = __byte_perm (w[56], w[55], selector); - w[59] = __byte_perm (w[55], w[54], selector); - w[58] = __byte_perm (w[54], w[53], selector); - w[57] = __byte_perm (w[53], w[52], selector); - w[56] = __byte_perm (w[52], w[51], selector); - w[55] = __byte_perm (w[51], w[50], selector); - w[54] = __byte_perm (w[50], w[49], selector); - w[53] = __byte_perm (w[49], w[48], selector); - w[52] = __byte_perm (w[48], w[47], selector); - w[51] = __byte_perm (w[47], w[46], selector); - w[50] = __byte_perm (w[46], w[45], selector); - w[49] = __byte_perm (w[45], w[44], selector); - w[48] = __byte_perm (w[44], w[43], selector); - w[47] = __byte_perm (w[43], w[42], selector); - w[46] = __byte_perm (w[42], w[41], selector); - w[45] = __byte_perm (w[41], w[40], selector); - w[44] = __byte_perm (w[40], w[39], selector); - w[43] = __byte_perm (w[39], w[38], selector); - w[42] = __byte_perm (w[38], w[37], selector); - w[41] = __byte_perm (w[37], w[36], selector); - w[40] = __byte_perm (w[36], w[35], selector); - w[39] = __byte_perm (w[35], w[34], selector); - w[38] = __byte_perm (w[34], w[33], selector); - w[37] = __byte_perm (w[33], w[32], selector); - w[36] = __byte_perm (w[32], w[31], selector); - w[35] = __byte_perm (w[31], w[30], selector); - w[34] = __byte_perm (w[30], w[29], selector); - w[33] = __byte_perm (w[29], w[28], selector); - w[32] = __byte_perm (w[28], w[27], selector); - w[31] = __byte_perm (w[27], w[26], selector); - w[30] = __byte_perm (w[26], w[25], selector); - w[29] = __byte_perm (w[25], w[24], selector); - w[28] = __byte_perm (w[24], w[23], selector); - w[27] = __byte_perm (w[23], w[22], selector); - w[26] = __byte_perm (w[22], w[21], selector); - w[25] = __byte_perm (w[21], w[20], selector); - w[24] = __byte_perm (w[20], w[19], selector); - w[23] = __byte_perm (w[19], w[18], selector); - w[22] = __byte_perm (w[18], w[17], selector); - w[21] = __byte_perm (w[17], w[16], selector); - w[20] = __byte_perm (w[16], w[15], selector); - w[19] = __byte_perm (w[15], w[14], selector); - w[18] = __byte_perm (w[14], w[13], selector); - w[17] = __byte_perm (w[13], w[12], selector); - w[16] = __byte_perm (w[12], w[11], selector); - w[15] = __byte_perm (w[11], w[10], selector); - w[14] = __byte_perm (w[10], w[ 9], selector); - w[13] = __byte_perm (w[ 9], w[ 8], selector); - w[12] = __byte_perm (w[ 8], w[ 7], selector); - w[11] = __byte_perm (w[ 7], w[ 6], selector); - w[10] = __byte_perm (w[ 6], w[ 5], selector); - w[ 9] = __byte_perm (w[ 5], w[ 4], selector); - w[ 8] = __byte_perm (w[ 4], w[ 3], selector); - w[ 7] = __byte_perm (w[ 3], w[ 2], selector); - w[ 6] = __byte_perm (w[ 2], w[ 1], selector); - w[ 5] = __byte_perm (w[ 1], w[ 0], selector); - w[ 4] = __byte_perm (w[ 0], 0, selector); + w[63] = hc_byte_perm (w[59], w[58], selector); + w[62] = hc_byte_perm (w[58], w[57], selector); + w[61] = hc_byte_perm (w[57], w[56], selector); + w[60] = hc_byte_perm (w[56], w[55], selector); + w[59] = hc_byte_perm (w[55], w[54], selector); + w[58] = hc_byte_perm (w[54], w[53], selector); + w[57] = hc_byte_perm (w[53], w[52], selector); + w[56] = hc_byte_perm (w[52], w[51], selector); + w[55] = hc_byte_perm (w[51], w[50], selector); + w[54] = hc_byte_perm (w[50], w[49], selector); + w[53] = hc_byte_perm (w[49], w[48], selector); + w[52] = hc_byte_perm (w[48], w[47], selector); + w[51] = hc_byte_perm (w[47], w[46], selector); + w[50] = hc_byte_perm (w[46], w[45], selector); + w[49] = hc_byte_perm (w[45], w[44], selector); + w[48] = hc_byte_perm (w[44], w[43], selector); + w[47] = hc_byte_perm (w[43], w[42], selector); + w[46] = hc_byte_perm (w[42], w[41], selector); + w[45] = hc_byte_perm (w[41], w[40], selector); + w[44] = hc_byte_perm (w[40], w[39], selector); + w[43] = hc_byte_perm (w[39], w[38], selector); + w[42] = hc_byte_perm (w[38], w[37], selector); + w[41] = hc_byte_perm (w[37], w[36], selector); + w[40] = hc_byte_perm (w[36], w[35], selector); + w[39] = hc_byte_perm (w[35], w[34], selector); + w[38] = hc_byte_perm (w[34], w[33], selector); + w[37] = hc_byte_perm (w[33], w[32], selector); + w[36] = hc_byte_perm (w[32], w[31], selector); + w[35] = hc_byte_perm (w[31], w[30], selector); + w[34] = hc_byte_perm (w[30], w[29], selector); + w[33] = hc_byte_perm (w[29], w[28], selector); + w[32] = hc_byte_perm (w[28], w[27], selector); + w[31] = hc_byte_perm (w[27], w[26], selector); + w[30] = hc_byte_perm (w[26], w[25], selector); + w[29] = hc_byte_perm (w[25], w[24], selector); + w[28] = hc_byte_perm (w[24], w[23], selector); + w[27] = hc_byte_perm (w[23], w[22], selector); + w[26] = hc_byte_perm (w[22], w[21], selector); + w[25] = hc_byte_perm (w[21], w[20], selector); + w[24] = hc_byte_perm (w[20], w[19], selector); + w[23] = hc_byte_perm (w[19], w[18], selector); + w[22] = hc_byte_perm (w[18], w[17], selector); + w[21] = hc_byte_perm (w[17], w[16], selector); + w[20] = hc_byte_perm (w[16], w[15], selector); + w[19] = hc_byte_perm (w[15], w[14], selector); + w[18] = hc_byte_perm (w[14], w[13], selector); + w[17] = hc_byte_perm (w[13], w[12], selector); + w[16] = hc_byte_perm (w[12], w[11], selector); + w[15] = hc_byte_perm (w[11], w[10], selector); + w[14] = hc_byte_perm (w[10], w[ 9], selector); + w[13] = hc_byte_perm (w[ 9], w[ 8], selector); + w[12] = hc_byte_perm (w[ 8], w[ 7], selector); + w[11] = hc_byte_perm (w[ 7], w[ 6], selector); + w[10] = hc_byte_perm (w[ 6], w[ 5], selector); + w[ 9] = hc_byte_perm (w[ 5], w[ 4], selector); + w[ 8] = hc_byte_perm (w[ 4], w[ 3], selector); + w[ 7] = hc_byte_perm (w[ 3], w[ 2], selector); + w[ 6] = hc_byte_perm (w[ 2], w[ 1], selector); + w[ 5] = hc_byte_perm (w[ 1], w[ 0], selector); + w[ 4] = hc_byte_perm (w[ 0], 0, selector); w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; @@ -25104,65 +25104,65 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; case 5: - w[63] = __byte_perm (w[58], w[57], selector); - w[62] = __byte_perm (w[57], w[56], selector); - w[61] = __byte_perm (w[56], w[55], selector); - w[60] = __byte_perm (w[55], w[54], selector); - w[59] = __byte_perm (w[54], w[53], selector); - w[58] = __byte_perm (w[53], w[52], selector); - w[57] = __byte_perm (w[52], w[51], selector); - w[56] = __byte_perm (w[51], w[50], selector); - w[55] = __byte_perm (w[50], w[49], selector); - w[54] = __byte_perm (w[49], w[48], selector); - w[53] = __byte_perm (w[48], w[47], selector); - w[52] = __byte_perm (w[47], w[46], selector); - w[51] = __byte_perm (w[46], w[45], selector); - w[50] = __byte_perm (w[45], w[44], selector); - w[49] = __byte_perm (w[44], w[43], selector); - w[48] = __byte_perm (w[43], w[42], selector); - w[47] = __byte_perm (w[42], w[41], selector); - w[46] = __byte_perm (w[41], w[40], selector); - w[45] = __byte_perm (w[40], w[39], selector); - w[44] = __byte_perm (w[39], w[38], selector); - w[43] = __byte_perm (w[38], w[37], selector); - w[42] = __byte_perm (w[37], w[36], selector); - w[41] = __byte_perm (w[36], w[35], selector); - w[40] = __byte_perm (w[35], w[34], selector); - w[39] = __byte_perm (w[34], w[33], selector); - w[38] = __byte_perm (w[33], w[32], selector); - w[37] = __byte_perm (w[32], w[31], selector); - w[36] = __byte_perm (w[31], w[30], selector); - w[35] = __byte_perm (w[30], w[29], selector); - w[34] = __byte_perm (w[29], w[28], selector); - w[33] = __byte_perm (w[28], w[27], selector); - w[32] = __byte_perm (w[27], w[26], selector); - w[31] = __byte_perm (w[26], w[25], selector); - w[30] = __byte_perm (w[25], w[24], selector); - w[29] = __byte_perm (w[24], w[23], selector); - w[28] = __byte_perm (w[23], w[22], selector); - w[27] = __byte_perm (w[22], w[21], selector); - w[26] = __byte_perm (w[21], w[20], selector); - w[25] = __byte_perm (w[20], w[19], selector); - w[24] = __byte_perm (w[19], w[18], selector); - w[23] = __byte_perm (w[18], w[17], selector); - w[22] = __byte_perm (w[17], w[16], selector); - w[21] = __byte_perm (w[16], w[15], selector); - w[20] = __byte_perm (w[15], w[14], selector); - w[19] = __byte_perm (w[14], w[13], selector); - w[18] = __byte_perm (w[13], w[12], selector); - w[17] = __byte_perm (w[12], w[11], selector); - w[16] = __byte_perm (w[11], w[10], selector); - w[15] = __byte_perm (w[10], w[ 9], selector); - w[14] = __byte_perm (w[ 9], w[ 8], selector); - w[13] = __byte_perm (w[ 8], w[ 7], selector); - w[12] = __byte_perm (w[ 7], w[ 6], selector); - w[11] = __byte_perm (w[ 6], w[ 5], selector); - w[10] = __byte_perm (w[ 5], w[ 4], selector); - w[ 9] = __byte_perm (w[ 4], w[ 3], selector); - w[ 8] = __byte_perm (w[ 3], w[ 2], selector); - w[ 7] = __byte_perm (w[ 2], w[ 1], selector); - w[ 6] = __byte_perm (w[ 1], w[ 0], selector); - w[ 5] = __byte_perm (w[ 0], 0, selector); + w[63] = hc_byte_perm (w[58], w[57], selector); + w[62] = hc_byte_perm (w[57], w[56], selector); + w[61] = hc_byte_perm (w[56], w[55], selector); + w[60] = hc_byte_perm (w[55], w[54], selector); + w[59] = hc_byte_perm (w[54], w[53], selector); + w[58] = hc_byte_perm (w[53], w[52], selector); + w[57] = hc_byte_perm (w[52], w[51], selector); + w[56] = hc_byte_perm (w[51], w[50], selector); + w[55] = hc_byte_perm (w[50], w[49], selector); + w[54] = hc_byte_perm (w[49], w[48], selector); + w[53] = hc_byte_perm (w[48], w[47], selector); + w[52] = hc_byte_perm (w[47], w[46], selector); + w[51] = hc_byte_perm (w[46], w[45], selector); + w[50] = hc_byte_perm (w[45], w[44], selector); + w[49] = hc_byte_perm (w[44], w[43], selector); + w[48] = hc_byte_perm (w[43], w[42], selector); + w[47] = hc_byte_perm (w[42], w[41], selector); + w[46] = hc_byte_perm (w[41], w[40], selector); + w[45] = hc_byte_perm (w[40], w[39], selector); + w[44] = hc_byte_perm (w[39], w[38], selector); + w[43] = hc_byte_perm (w[38], w[37], selector); + w[42] = hc_byte_perm (w[37], w[36], selector); + w[41] = hc_byte_perm (w[36], w[35], selector); + w[40] = hc_byte_perm (w[35], w[34], selector); + w[39] = hc_byte_perm (w[34], w[33], selector); + w[38] = hc_byte_perm (w[33], w[32], selector); + w[37] = hc_byte_perm (w[32], w[31], selector); + w[36] = hc_byte_perm (w[31], w[30], selector); + w[35] = hc_byte_perm (w[30], w[29], selector); + w[34] = hc_byte_perm (w[29], w[28], selector); + w[33] = hc_byte_perm (w[28], w[27], selector); + w[32] = hc_byte_perm (w[27], w[26], selector); + w[31] = hc_byte_perm (w[26], w[25], selector); + w[30] = hc_byte_perm (w[25], w[24], selector); + w[29] = hc_byte_perm (w[24], w[23], selector); + w[28] = hc_byte_perm (w[23], w[22], selector); + w[27] = hc_byte_perm (w[22], w[21], selector); + w[26] = hc_byte_perm (w[21], w[20], selector); + w[25] = hc_byte_perm (w[20], w[19], selector); + w[24] = hc_byte_perm (w[19], w[18], selector); + w[23] = hc_byte_perm (w[18], w[17], selector); + w[22] = hc_byte_perm (w[17], w[16], selector); + w[21] = hc_byte_perm (w[16], w[15], selector); + w[20] = hc_byte_perm (w[15], w[14], selector); + w[19] = hc_byte_perm (w[14], w[13], selector); + w[18] = hc_byte_perm (w[13], w[12], selector); + w[17] = hc_byte_perm (w[12], w[11], selector); + w[16] = hc_byte_perm (w[11], w[10], selector); + w[15] = hc_byte_perm (w[10], w[ 9], selector); + w[14] = hc_byte_perm (w[ 9], w[ 8], selector); + w[13] = hc_byte_perm (w[ 8], w[ 7], selector); + w[12] = hc_byte_perm (w[ 7], w[ 6], selector); + w[11] = hc_byte_perm (w[ 6], w[ 5], selector); + w[10] = hc_byte_perm (w[ 5], w[ 4], selector); + w[ 9] = hc_byte_perm (w[ 4], w[ 3], selector); + w[ 8] = hc_byte_perm (w[ 3], w[ 2], selector); + w[ 7] = hc_byte_perm (w[ 2], w[ 1], selector); + w[ 6] = hc_byte_perm (w[ 1], w[ 0], selector); + w[ 5] = hc_byte_perm (w[ 0], 0, selector); w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; @@ -25172,64 +25172,64 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; case 6: - w[63] = __byte_perm (w[57], w[56], selector); - w[62] = __byte_perm (w[56], w[55], selector); - w[61] = __byte_perm (w[55], w[54], selector); - w[60] = __byte_perm (w[54], w[53], selector); - w[59] = __byte_perm (w[53], w[52], selector); - w[58] = __byte_perm (w[52], w[51], selector); - w[57] = __byte_perm (w[51], w[50], selector); - w[56] = __byte_perm (w[50], w[49], selector); - w[55] = __byte_perm (w[49], w[48], selector); - w[54] = __byte_perm (w[48], w[47], selector); - w[53] = __byte_perm (w[47], w[46], selector); - w[52] = __byte_perm (w[46], w[45], selector); - w[51] = __byte_perm (w[45], w[44], selector); - w[50] = __byte_perm (w[44], w[43], selector); - w[49] = __byte_perm (w[43], w[42], selector); - w[48] = __byte_perm (w[42], w[41], selector); - w[47] = __byte_perm (w[41], w[40], selector); - w[46] = __byte_perm (w[40], w[39], selector); - w[45] = __byte_perm (w[39], w[38], selector); - w[44] = __byte_perm (w[38], w[37], selector); - w[43] = __byte_perm (w[37], w[36], selector); - w[42] = __byte_perm (w[36], w[35], selector); - w[41] = __byte_perm (w[35], w[34], selector); - w[40] = __byte_perm (w[34], w[33], selector); - w[39] = __byte_perm (w[33], w[32], selector); - w[38] = __byte_perm (w[32], w[31], selector); - w[37] = __byte_perm (w[31], w[30], selector); - w[36] = __byte_perm (w[30], w[29], selector); - w[35] = __byte_perm (w[29], w[28], selector); - w[34] = __byte_perm (w[28], w[27], selector); - w[33] = __byte_perm (w[27], w[26], selector); - w[32] = __byte_perm (w[26], w[25], selector); - w[31] = __byte_perm (w[25], w[24], selector); - w[30] = __byte_perm (w[24], w[23], selector); - w[29] = __byte_perm (w[23], w[22], selector); - w[28] = __byte_perm (w[22], w[21], selector); - w[27] = __byte_perm (w[21], w[20], selector); - w[26] = __byte_perm (w[20], w[19], selector); - w[25] = __byte_perm (w[19], w[18], selector); - w[24] = __byte_perm (w[18], w[17], selector); - w[23] = __byte_perm (w[17], w[16], selector); - w[22] = __byte_perm (w[16], w[15], selector); - w[21] = __byte_perm (w[15], w[14], selector); - w[20] = __byte_perm (w[14], w[13], selector); - w[19] = __byte_perm (w[13], w[12], selector); - w[18] = __byte_perm (w[12], w[11], selector); - w[17] = __byte_perm (w[11], w[10], selector); - w[16] = __byte_perm (w[10], w[ 9], selector); - w[15] = __byte_perm (w[ 9], w[ 8], selector); - w[14] = __byte_perm (w[ 8], w[ 7], selector); - w[13] = __byte_perm (w[ 7], w[ 6], selector); - w[12] = __byte_perm (w[ 6], w[ 5], selector); - w[11] = __byte_perm (w[ 5], w[ 4], selector); - w[10] = __byte_perm (w[ 4], w[ 3], selector); - w[ 9] = __byte_perm (w[ 3], w[ 2], selector); - w[ 8] = __byte_perm (w[ 2], w[ 1], selector); - w[ 7] = __byte_perm (w[ 1], w[ 0], selector); - w[ 6] = __byte_perm (w[ 0], 0, selector); + w[63] = hc_byte_perm (w[57], w[56], selector); + w[62] = hc_byte_perm (w[56], w[55], selector); + w[61] = hc_byte_perm (w[55], w[54], selector); + w[60] = hc_byte_perm (w[54], w[53], selector); + w[59] = hc_byte_perm (w[53], w[52], selector); + w[58] = hc_byte_perm (w[52], w[51], selector); + w[57] = hc_byte_perm (w[51], w[50], selector); + w[56] = hc_byte_perm (w[50], w[49], selector); + w[55] = hc_byte_perm (w[49], w[48], selector); + w[54] = hc_byte_perm (w[48], w[47], selector); + w[53] = hc_byte_perm (w[47], w[46], selector); + w[52] = hc_byte_perm (w[46], w[45], selector); + w[51] = hc_byte_perm (w[45], w[44], selector); + w[50] = hc_byte_perm (w[44], w[43], selector); + w[49] = hc_byte_perm (w[43], w[42], selector); + w[48] = hc_byte_perm (w[42], w[41], selector); + w[47] = hc_byte_perm (w[41], w[40], selector); + w[46] = hc_byte_perm (w[40], w[39], selector); + w[45] = hc_byte_perm (w[39], w[38], selector); + w[44] = hc_byte_perm (w[38], w[37], selector); + w[43] = hc_byte_perm (w[37], w[36], selector); + w[42] = hc_byte_perm (w[36], w[35], selector); + w[41] = hc_byte_perm (w[35], w[34], selector); + w[40] = hc_byte_perm (w[34], w[33], selector); + w[39] = hc_byte_perm (w[33], w[32], selector); + w[38] = hc_byte_perm (w[32], w[31], selector); + w[37] = hc_byte_perm (w[31], w[30], selector); + w[36] = hc_byte_perm (w[30], w[29], selector); + w[35] = hc_byte_perm (w[29], w[28], selector); + w[34] = hc_byte_perm (w[28], w[27], selector); + w[33] = hc_byte_perm (w[27], w[26], selector); + w[32] = hc_byte_perm (w[26], w[25], selector); + w[31] = hc_byte_perm (w[25], w[24], selector); + w[30] = hc_byte_perm (w[24], w[23], selector); + w[29] = hc_byte_perm (w[23], w[22], selector); + w[28] = hc_byte_perm (w[22], w[21], selector); + w[27] = hc_byte_perm (w[21], w[20], selector); + w[26] = hc_byte_perm (w[20], w[19], selector); + w[25] = hc_byte_perm (w[19], w[18], selector); + w[24] = hc_byte_perm (w[18], w[17], selector); + w[23] = hc_byte_perm (w[17], w[16], selector); + w[22] = hc_byte_perm (w[16], w[15], selector); + w[21] = hc_byte_perm (w[15], w[14], selector); + w[20] = hc_byte_perm (w[14], w[13], selector); + w[19] = hc_byte_perm (w[13], w[12], selector); + w[18] = hc_byte_perm (w[12], w[11], selector); + w[17] = hc_byte_perm (w[11], w[10], selector); + w[16] = hc_byte_perm (w[10], w[ 9], selector); + w[15] = hc_byte_perm (w[ 9], w[ 8], selector); + w[14] = hc_byte_perm (w[ 8], w[ 7], selector); + w[13] = hc_byte_perm (w[ 7], w[ 6], selector); + w[12] = hc_byte_perm (w[ 6], w[ 5], selector); + w[11] = hc_byte_perm (w[ 5], w[ 4], selector); + w[10] = hc_byte_perm (w[ 4], w[ 3], selector); + w[ 9] = hc_byte_perm (w[ 3], w[ 2], selector); + w[ 8] = hc_byte_perm (w[ 2], w[ 1], selector); + w[ 7] = hc_byte_perm (w[ 1], w[ 0], selector); + w[ 6] = hc_byte_perm (w[ 0], 0, selector); w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; @@ -25240,63 +25240,63 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; case 7: - w[63] = __byte_perm (w[56], w[55], selector); - w[62] = __byte_perm (w[55], w[54], selector); - w[61] = __byte_perm (w[54], w[53], selector); - w[60] = __byte_perm (w[53], w[52], selector); - w[59] = __byte_perm (w[52], w[51], selector); - w[58] = __byte_perm (w[51], w[50], selector); - w[57] = __byte_perm (w[50], w[49], selector); - w[56] = __byte_perm (w[49], w[48], selector); - w[55] = __byte_perm (w[48], w[47], selector); - w[54] = __byte_perm (w[47], w[46], selector); - w[53] = __byte_perm (w[46], w[45], selector); - w[52] = __byte_perm (w[45], w[44], selector); - w[51] = __byte_perm (w[44], w[43], selector); - w[50] = __byte_perm (w[43], w[42], selector); - w[49] = __byte_perm (w[42], w[41], selector); - w[48] = __byte_perm (w[41], w[40], selector); - w[47] = __byte_perm (w[40], w[39], selector); - w[46] = __byte_perm (w[39], w[38], selector); - w[45] = __byte_perm (w[38], w[37], selector); - w[44] = __byte_perm (w[37], w[36], selector); - w[43] = __byte_perm (w[36], w[35], selector); - w[42] = __byte_perm (w[35], w[34], selector); - w[41] = __byte_perm (w[34], w[33], selector); - w[40] = __byte_perm (w[33], w[32], selector); - w[39] = __byte_perm (w[32], w[31], selector); - w[38] = __byte_perm (w[31], w[30], selector); - w[37] = __byte_perm (w[30], w[29], selector); - w[36] = __byte_perm (w[29], w[28], selector); - w[35] = __byte_perm (w[28], w[27], selector); - w[34] = __byte_perm (w[27], w[26], selector); - w[33] = __byte_perm (w[26], w[25], selector); - w[32] = __byte_perm (w[25], w[24], selector); - w[31] = __byte_perm (w[24], w[23], selector); - w[30] = __byte_perm (w[23], w[22], selector); - w[29] = __byte_perm (w[22], w[21], selector); - w[28] = __byte_perm (w[21], w[20], selector); - w[27] = __byte_perm (w[20], w[19], selector); - w[26] = __byte_perm (w[19], w[18], selector); - w[25] = __byte_perm (w[18], w[17], selector); - w[24] = __byte_perm (w[17], w[16], selector); - w[23] = __byte_perm (w[16], w[15], selector); - w[22] = __byte_perm (w[15], w[14], selector); - w[21] = __byte_perm (w[14], w[13], selector); - w[20] = __byte_perm (w[13], w[12], selector); - w[19] = __byte_perm (w[12], w[11], selector); - w[18] = __byte_perm (w[11], w[10], selector); - w[17] = __byte_perm (w[10], w[ 9], selector); - w[16] = __byte_perm (w[ 9], w[ 8], selector); - w[15] = __byte_perm (w[ 8], w[ 7], selector); - w[14] = __byte_perm (w[ 7], w[ 6], selector); - w[13] = __byte_perm (w[ 6], w[ 5], selector); - w[12] = __byte_perm (w[ 5], w[ 4], selector); - w[11] = __byte_perm (w[ 4], w[ 3], selector); - w[10] = __byte_perm (w[ 3], w[ 2], selector); - w[ 9] = __byte_perm (w[ 2], w[ 1], selector); - w[ 8] = __byte_perm (w[ 1], w[ 0], selector); - w[ 7] = __byte_perm (w[ 0], 0, selector); + w[63] = hc_byte_perm (w[56], w[55], selector); + w[62] = hc_byte_perm (w[55], w[54], selector); + w[61] = hc_byte_perm (w[54], w[53], selector); + w[60] = hc_byte_perm (w[53], w[52], selector); + w[59] = hc_byte_perm (w[52], w[51], selector); + w[58] = hc_byte_perm (w[51], w[50], selector); + w[57] = hc_byte_perm (w[50], w[49], selector); + w[56] = hc_byte_perm (w[49], w[48], selector); + w[55] = hc_byte_perm (w[48], w[47], selector); + w[54] = hc_byte_perm (w[47], w[46], selector); + w[53] = hc_byte_perm (w[46], w[45], selector); + w[52] = hc_byte_perm (w[45], w[44], selector); + w[51] = hc_byte_perm (w[44], w[43], selector); + w[50] = hc_byte_perm (w[43], w[42], selector); + w[49] = hc_byte_perm (w[42], w[41], selector); + w[48] = hc_byte_perm (w[41], w[40], selector); + w[47] = hc_byte_perm (w[40], w[39], selector); + w[46] = hc_byte_perm (w[39], w[38], selector); + w[45] = hc_byte_perm (w[38], w[37], selector); + w[44] = hc_byte_perm (w[37], w[36], selector); + w[43] = hc_byte_perm (w[36], w[35], selector); + w[42] = hc_byte_perm (w[35], w[34], selector); + w[41] = hc_byte_perm (w[34], w[33], selector); + w[40] = hc_byte_perm (w[33], w[32], selector); + w[39] = hc_byte_perm (w[32], w[31], selector); + w[38] = hc_byte_perm (w[31], w[30], selector); + w[37] = hc_byte_perm (w[30], w[29], selector); + w[36] = hc_byte_perm (w[29], w[28], selector); + w[35] = hc_byte_perm (w[28], w[27], selector); + w[34] = hc_byte_perm (w[27], w[26], selector); + w[33] = hc_byte_perm (w[26], w[25], selector); + w[32] = hc_byte_perm (w[25], w[24], selector); + w[31] = hc_byte_perm (w[24], w[23], selector); + w[30] = hc_byte_perm (w[23], w[22], selector); + w[29] = hc_byte_perm (w[22], w[21], selector); + w[28] = hc_byte_perm (w[21], w[20], selector); + w[27] = hc_byte_perm (w[20], w[19], selector); + w[26] = hc_byte_perm (w[19], w[18], selector); + w[25] = hc_byte_perm (w[18], w[17], selector); + w[24] = hc_byte_perm (w[17], w[16], selector); + w[23] = hc_byte_perm (w[16], w[15], selector); + w[22] = hc_byte_perm (w[15], w[14], selector); + w[21] = hc_byte_perm (w[14], w[13], selector); + w[20] = hc_byte_perm (w[13], w[12], selector); + w[19] = hc_byte_perm (w[12], w[11], selector); + w[18] = hc_byte_perm (w[11], w[10], selector); + w[17] = hc_byte_perm (w[10], w[ 9], selector); + w[16] = hc_byte_perm (w[ 9], w[ 8], selector); + w[15] = hc_byte_perm (w[ 8], w[ 7], selector); + w[14] = hc_byte_perm (w[ 7], w[ 6], selector); + w[13] = hc_byte_perm (w[ 6], w[ 5], selector); + w[12] = hc_byte_perm (w[ 5], w[ 4], selector); + w[11] = hc_byte_perm (w[ 4], w[ 3], selector); + w[10] = hc_byte_perm (w[ 3], w[ 2], selector); + w[ 9] = hc_byte_perm (w[ 2], w[ 1], selector); + w[ 8] = hc_byte_perm (w[ 1], w[ 0], selector); + w[ 7] = hc_byte_perm (w[ 0], 0, selector); w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; @@ -25308,62 +25308,62 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; case 8: - w[63] = __byte_perm (w[55], w[54], selector); - w[62] = __byte_perm (w[54], w[53], selector); - w[61] = __byte_perm (w[53], w[52], selector); - w[60] = __byte_perm (w[52], w[51], selector); - w[59] = __byte_perm (w[51], w[50], selector); - w[58] = __byte_perm (w[50], w[49], selector); - w[57] = __byte_perm (w[49], w[48], selector); - w[56] = __byte_perm (w[48], w[47], selector); - w[55] = __byte_perm (w[47], w[46], selector); - w[54] = __byte_perm (w[46], w[45], selector); - w[53] = __byte_perm (w[45], w[44], selector); - w[52] = __byte_perm (w[44], w[43], selector); - w[51] = __byte_perm (w[43], w[42], selector); - w[50] = __byte_perm (w[42], w[41], selector); - w[49] = __byte_perm (w[41], w[40], selector); - w[48] = __byte_perm (w[40], w[39], selector); - w[47] = __byte_perm (w[39], w[38], selector); - w[46] = __byte_perm (w[38], w[37], selector); - w[45] = __byte_perm (w[37], w[36], selector); - w[44] = __byte_perm (w[36], w[35], selector); - w[43] = __byte_perm (w[35], w[34], selector); - w[42] = __byte_perm (w[34], w[33], selector); - w[41] = __byte_perm (w[33], w[32], selector); - w[40] = __byte_perm (w[32], w[31], selector); - w[39] = __byte_perm (w[31], w[30], selector); - w[38] = __byte_perm (w[30], w[29], selector); - w[37] = __byte_perm (w[29], w[28], selector); - w[36] = __byte_perm (w[28], w[27], selector); - w[35] = __byte_perm (w[27], w[26], selector); - w[34] = __byte_perm (w[26], w[25], selector); - w[33] = __byte_perm (w[25], w[24], selector); - w[32] = __byte_perm (w[24], w[23], selector); - w[31] = __byte_perm (w[23], w[22], selector); - w[30] = __byte_perm (w[22], w[21], selector); - w[29] = __byte_perm (w[21], w[20], selector); - w[28] = __byte_perm (w[20], w[19], selector); - w[27] = __byte_perm (w[19], w[18], selector); - w[26] = __byte_perm (w[18], w[17], selector); - w[25] = __byte_perm (w[17], w[16], selector); - w[24] = __byte_perm (w[16], w[15], selector); - w[23] = __byte_perm (w[15], w[14], selector); - w[22] = __byte_perm (w[14], w[13], selector); - w[21] = __byte_perm (w[13], w[12], selector); - w[20] = __byte_perm (w[12], w[11], selector); - w[19] = __byte_perm (w[11], w[10], selector); - w[18] = __byte_perm (w[10], w[ 9], selector); - w[17] = __byte_perm (w[ 9], w[ 8], selector); - w[16] = __byte_perm (w[ 8], w[ 7], selector); - w[15] = __byte_perm (w[ 7], w[ 6], selector); - w[14] = __byte_perm (w[ 6], w[ 5], selector); - w[13] = __byte_perm (w[ 5], w[ 4], selector); - w[12] = __byte_perm (w[ 4], w[ 3], selector); - w[11] = __byte_perm (w[ 3], w[ 2], selector); - w[10] = __byte_perm (w[ 2], w[ 1], selector); - w[ 9] = __byte_perm (w[ 1], w[ 0], selector); - w[ 8] = __byte_perm (w[ 0], 0, selector); + w[63] = hc_byte_perm (w[55], w[54], selector); + w[62] = hc_byte_perm (w[54], w[53], selector); + w[61] = hc_byte_perm (w[53], w[52], selector); + w[60] = hc_byte_perm (w[52], w[51], selector); + w[59] = hc_byte_perm (w[51], w[50], selector); + w[58] = hc_byte_perm (w[50], w[49], selector); + w[57] = hc_byte_perm (w[49], w[48], selector); + w[56] = hc_byte_perm (w[48], w[47], selector); + w[55] = hc_byte_perm (w[47], w[46], selector); + w[54] = hc_byte_perm (w[46], w[45], selector); + w[53] = hc_byte_perm (w[45], w[44], selector); + w[52] = hc_byte_perm (w[44], w[43], selector); + w[51] = hc_byte_perm (w[43], w[42], selector); + w[50] = hc_byte_perm (w[42], w[41], selector); + w[49] = hc_byte_perm (w[41], w[40], selector); + w[48] = hc_byte_perm (w[40], w[39], selector); + w[47] = hc_byte_perm (w[39], w[38], selector); + w[46] = hc_byte_perm (w[38], w[37], selector); + w[45] = hc_byte_perm (w[37], w[36], selector); + w[44] = hc_byte_perm (w[36], w[35], selector); + w[43] = hc_byte_perm (w[35], w[34], selector); + w[42] = hc_byte_perm (w[34], w[33], selector); + w[41] = hc_byte_perm (w[33], w[32], selector); + w[40] = hc_byte_perm (w[32], w[31], selector); + w[39] = hc_byte_perm (w[31], w[30], selector); + w[38] = hc_byte_perm (w[30], w[29], selector); + w[37] = hc_byte_perm (w[29], w[28], selector); + w[36] = hc_byte_perm (w[28], w[27], selector); + w[35] = hc_byte_perm (w[27], w[26], selector); + w[34] = hc_byte_perm (w[26], w[25], selector); + w[33] = hc_byte_perm (w[25], w[24], selector); + w[32] = hc_byte_perm (w[24], w[23], selector); + w[31] = hc_byte_perm (w[23], w[22], selector); + w[30] = hc_byte_perm (w[22], w[21], selector); + w[29] = hc_byte_perm (w[21], w[20], selector); + w[28] = hc_byte_perm (w[20], w[19], selector); + w[27] = hc_byte_perm (w[19], w[18], selector); + w[26] = hc_byte_perm (w[18], w[17], selector); + w[25] = hc_byte_perm (w[17], w[16], selector); + w[24] = hc_byte_perm (w[16], w[15], selector); + w[23] = hc_byte_perm (w[15], w[14], selector); + w[22] = hc_byte_perm (w[14], w[13], selector); + w[21] = hc_byte_perm (w[13], w[12], selector); + w[20] = hc_byte_perm (w[12], w[11], selector); + w[19] = hc_byte_perm (w[11], w[10], selector); + w[18] = hc_byte_perm (w[10], w[ 9], selector); + w[17] = hc_byte_perm (w[ 9], w[ 8], selector); + w[16] = hc_byte_perm (w[ 8], w[ 7], selector); + w[15] = hc_byte_perm (w[ 7], w[ 6], selector); + w[14] = hc_byte_perm (w[ 6], w[ 5], selector); + w[13] = hc_byte_perm (w[ 5], w[ 4], selector); + w[12] = hc_byte_perm (w[ 4], w[ 3], selector); + w[11] = hc_byte_perm (w[ 3], w[ 2], selector); + w[10] = hc_byte_perm (w[ 2], w[ 1], selector); + w[ 9] = hc_byte_perm (w[ 1], w[ 0], selector); + w[ 8] = hc_byte_perm (w[ 0], 0, selector); w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; @@ -25376,61 +25376,61 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; case 9: - w[63] = __byte_perm (w[54], w[53], selector); - w[62] = __byte_perm (w[53], w[52], selector); - w[61] = __byte_perm (w[52], w[51], selector); - w[60] = __byte_perm (w[51], w[50], selector); - w[59] = __byte_perm (w[50], w[49], selector); - w[58] = __byte_perm (w[49], w[48], selector); - w[57] = __byte_perm (w[48], w[47], selector); - w[56] = __byte_perm (w[47], w[46], selector); - w[55] = __byte_perm (w[46], w[45], selector); - w[54] = __byte_perm (w[45], w[44], selector); - w[53] = __byte_perm (w[44], w[43], selector); - w[52] = __byte_perm (w[43], w[42], selector); - w[51] = __byte_perm (w[42], w[41], selector); - w[50] = __byte_perm (w[41], w[40], selector); - w[49] = __byte_perm (w[40], w[39], selector); - w[48] = __byte_perm (w[39], w[38], selector); - w[47] = __byte_perm (w[38], w[37], selector); - w[46] = __byte_perm (w[37], w[36], selector); - w[45] = __byte_perm (w[36], w[35], selector); - w[44] = __byte_perm (w[35], w[34], selector); - w[43] = __byte_perm (w[34], w[33], selector); - w[42] = __byte_perm (w[33], w[32], selector); - w[41] = __byte_perm (w[32], w[31], selector); - w[40] = __byte_perm (w[31], w[30], selector); - w[39] = __byte_perm (w[30], w[29], selector); - w[38] = __byte_perm (w[29], w[28], selector); - w[37] = __byte_perm (w[28], w[27], selector); - w[36] = __byte_perm (w[27], w[26], selector); - w[35] = __byte_perm (w[26], w[25], selector); - w[34] = __byte_perm (w[25], w[24], selector); - w[33] = __byte_perm (w[24], w[23], selector); - w[32] = __byte_perm (w[23], w[22], selector); - w[31] = __byte_perm (w[22], w[21], selector); - w[30] = __byte_perm (w[21], w[20], selector); - w[29] = __byte_perm (w[20], w[19], selector); - w[28] = __byte_perm (w[19], w[18], selector); - w[27] = __byte_perm (w[18], w[17], selector); - w[26] = __byte_perm (w[17], w[16], selector); - w[25] = __byte_perm (w[16], w[15], selector); - w[24] = __byte_perm (w[15], w[14], selector); - w[23] = __byte_perm (w[14], w[13], selector); - w[22] = __byte_perm (w[13], w[12], selector); - w[21] = __byte_perm (w[12], w[11], selector); - w[20] = __byte_perm (w[11], w[10], selector); - w[19] = __byte_perm (w[10], w[ 9], selector); - w[18] = __byte_perm (w[ 9], w[ 8], selector); - w[17] = __byte_perm (w[ 8], w[ 7], selector); - w[16] = __byte_perm (w[ 7], w[ 6], selector); - w[15] = __byte_perm (w[ 6], w[ 5], selector); - w[14] = __byte_perm (w[ 5], w[ 4], selector); - w[13] = __byte_perm (w[ 4], w[ 3], selector); - w[12] = __byte_perm (w[ 3], w[ 2], selector); - w[11] = __byte_perm (w[ 2], w[ 1], selector); - w[10] = __byte_perm (w[ 1], w[ 0], selector); - w[ 9] = __byte_perm (w[ 0], 0, selector); + w[63] = hc_byte_perm (w[54], w[53], selector); + w[62] = hc_byte_perm (w[53], w[52], selector); + w[61] = hc_byte_perm (w[52], w[51], selector); + w[60] = hc_byte_perm (w[51], w[50], selector); + w[59] = hc_byte_perm (w[50], w[49], selector); + w[58] = hc_byte_perm (w[49], w[48], selector); + w[57] = hc_byte_perm (w[48], w[47], selector); + w[56] = hc_byte_perm (w[47], w[46], selector); + w[55] = hc_byte_perm (w[46], w[45], selector); + w[54] = hc_byte_perm (w[45], w[44], selector); + w[53] = hc_byte_perm (w[44], w[43], selector); + w[52] = hc_byte_perm (w[43], w[42], selector); + w[51] = hc_byte_perm (w[42], w[41], selector); + w[50] = hc_byte_perm (w[41], w[40], selector); + w[49] = hc_byte_perm (w[40], w[39], selector); + w[48] = hc_byte_perm (w[39], w[38], selector); + w[47] = hc_byte_perm (w[38], w[37], selector); + w[46] = hc_byte_perm (w[37], w[36], selector); + w[45] = hc_byte_perm (w[36], w[35], selector); + w[44] = hc_byte_perm (w[35], w[34], selector); + w[43] = hc_byte_perm (w[34], w[33], selector); + w[42] = hc_byte_perm (w[33], w[32], selector); + w[41] = hc_byte_perm (w[32], w[31], selector); + w[40] = hc_byte_perm (w[31], w[30], selector); + w[39] = hc_byte_perm (w[30], w[29], selector); + w[38] = hc_byte_perm (w[29], w[28], selector); + w[37] = hc_byte_perm (w[28], w[27], selector); + w[36] = hc_byte_perm (w[27], w[26], selector); + w[35] = hc_byte_perm (w[26], w[25], selector); + w[34] = hc_byte_perm (w[25], w[24], selector); + w[33] = hc_byte_perm (w[24], w[23], selector); + w[32] = hc_byte_perm (w[23], w[22], selector); + w[31] = hc_byte_perm (w[22], w[21], selector); + w[30] = hc_byte_perm (w[21], w[20], selector); + w[29] = hc_byte_perm (w[20], w[19], selector); + w[28] = hc_byte_perm (w[19], w[18], selector); + w[27] = hc_byte_perm (w[18], w[17], selector); + w[26] = hc_byte_perm (w[17], w[16], selector); + w[25] = hc_byte_perm (w[16], w[15], selector); + w[24] = hc_byte_perm (w[15], w[14], selector); + w[23] = hc_byte_perm (w[14], w[13], selector); + w[22] = hc_byte_perm (w[13], w[12], selector); + w[21] = hc_byte_perm (w[12], w[11], selector); + w[20] = hc_byte_perm (w[11], w[10], selector); + w[19] = hc_byte_perm (w[10], w[ 9], selector); + w[18] = hc_byte_perm (w[ 9], w[ 8], selector); + w[17] = hc_byte_perm (w[ 8], w[ 7], selector); + w[16] = hc_byte_perm (w[ 7], w[ 6], selector); + w[15] = hc_byte_perm (w[ 6], w[ 5], selector); + w[14] = hc_byte_perm (w[ 5], w[ 4], selector); + w[13] = hc_byte_perm (w[ 4], w[ 3], selector); + w[12] = hc_byte_perm (w[ 3], w[ 2], selector); + w[11] = hc_byte_perm (w[ 2], w[ 1], selector); + w[10] = hc_byte_perm (w[ 1], w[ 0], selector); + w[ 9] = hc_byte_perm (w[ 0], 0, selector); w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; @@ -25444,60 +25444,60 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; case 10: - w[63] = __byte_perm (w[53], w[52], selector); - w[62] = __byte_perm (w[52], w[51], selector); - w[61] = __byte_perm (w[51], w[50], selector); - w[60] = __byte_perm (w[50], w[49], selector); - w[59] = __byte_perm (w[49], w[48], selector); - w[58] = __byte_perm (w[48], w[47], selector); - w[57] = __byte_perm (w[47], w[46], selector); - w[56] = __byte_perm (w[46], w[45], selector); - w[55] = __byte_perm (w[45], w[44], selector); - w[54] = __byte_perm (w[44], w[43], selector); - w[53] = __byte_perm (w[43], w[42], selector); - w[52] = __byte_perm (w[42], w[41], selector); - w[51] = __byte_perm (w[41], w[40], selector); - w[50] = __byte_perm (w[40], w[39], selector); - w[49] = __byte_perm (w[39], w[38], selector); - w[48] = __byte_perm (w[38], w[37], selector); - w[47] = __byte_perm (w[37], w[36], selector); - w[46] = __byte_perm (w[36], w[35], selector); - w[45] = __byte_perm (w[35], w[34], selector); - w[44] = __byte_perm (w[34], w[33], selector); - w[43] = __byte_perm (w[33], w[32], selector); - w[42] = __byte_perm (w[32], w[31], selector); - w[41] = __byte_perm (w[31], w[30], selector); - w[40] = __byte_perm (w[30], w[29], selector); - w[39] = __byte_perm (w[29], w[28], selector); - w[38] = __byte_perm (w[28], w[27], selector); - w[37] = __byte_perm (w[27], w[26], selector); - w[36] = __byte_perm (w[26], w[25], selector); - w[35] = __byte_perm (w[25], w[24], selector); - w[34] = __byte_perm (w[24], w[23], selector); - w[33] = __byte_perm (w[23], w[22], selector); - w[32] = __byte_perm (w[22], w[21], selector); - w[31] = __byte_perm (w[21], w[20], selector); - w[30] = __byte_perm (w[20], w[19], selector); - w[29] = __byte_perm (w[19], w[18], selector); - w[28] = __byte_perm (w[18], w[17], selector); - w[27] = __byte_perm (w[17], w[16], selector); - w[26] = __byte_perm (w[16], w[15], selector); - w[25] = __byte_perm (w[15], w[14], selector); - w[24] = __byte_perm (w[14], w[13], selector); - w[23] = __byte_perm (w[13], w[12], selector); - w[22] = __byte_perm (w[12], w[11], selector); - w[21] = __byte_perm (w[11], w[10], selector); - w[20] = __byte_perm (w[10], w[ 9], selector); - w[19] = __byte_perm (w[ 9], w[ 8], selector); - w[18] = __byte_perm (w[ 8], w[ 7], selector); - w[17] = __byte_perm (w[ 7], w[ 6], selector); - w[16] = __byte_perm (w[ 6], w[ 5], selector); - w[15] = __byte_perm (w[ 5], w[ 4], selector); - w[14] = __byte_perm (w[ 4], w[ 3], selector); - w[13] = __byte_perm (w[ 3], w[ 2], selector); - w[12] = __byte_perm (w[ 2], w[ 1], selector); - w[11] = __byte_perm (w[ 1], w[ 0], selector); - w[10] = __byte_perm (w[ 0], 0, selector); + w[63] = hc_byte_perm (w[53], w[52], selector); + w[62] = hc_byte_perm (w[52], w[51], selector); + w[61] = hc_byte_perm (w[51], w[50], selector); + w[60] = hc_byte_perm (w[50], w[49], selector); + w[59] = hc_byte_perm (w[49], w[48], selector); + w[58] = hc_byte_perm (w[48], w[47], selector); + w[57] = hc_byte_perm (w[47], w[46], selector); + w[56] = hc_byte_perm (w[46], w[45], selector); + w[55] = hc_byte_perm (w[45], w[44], selector); + w[54] = hc_byte_perm (w[44], w[43], selector); + w[53] = hc_byte_perm (w[43], w[42], selector); + w[52] = hc_byte_perm (w[42], w[41], selector); + w[51] = hc_byte_perm (w[41], w[40], selector); + w[50] = hc_byte_perm (w[40], w[39], selector); + w[49] = hc_byte_perm (w[39], w[38], selector); + w[48] = hc_byte_perm (w[38], w[37], selector); + w[47] = hc_byte_perm (w[37], w[36], selector); + w[46] = hc_byte_perm (w[36], w[35], selector); + w[45] = hc_byte_perm (w[35], w[34], selector); + w[44] = hc_byte_perm (w[34], w[33], selector); + w[43] = hc_byte_perm (w[33], w[32], selector); + w[42] = hc_byte_perm (w[32], w[31], selector); + w[41] = hc_byte_perm (w[31], w[30], selector); + w[40] = hc_byte_perm (w[30], w[29], selector); + w[39] = hc_byte_perm (w[29], w[28], selector); + w[38] = hc_byte_perm (w[28], w[27], selector); + w[37] = hc_byte_perm (w[27], w[26], selector); + w[36] = hc_byte_perm (w[26], w[25], selector); + w[35] = hc_byte_perm (w[25], w[24], selector); + w[34] = hc_byte_perm (w[24], w[23], selector); + w[33] = hc_byte_perm (w[23], w[22], selector); + w[32] = hc_byte_perm (w[22], w[21], selector); + w[31] = hc_byte_perm (w[21], w[20], selector); + w[30] = hc_byte_perm (w[20], w[19], selector); + w[29] = hc_byte_perm (w[19], w[18], selector); + w[28] = hc_byte_perm (w[18], w[17], selector); + w[27] = hc_byte_perm (w[17], w[16], selector); + w[26] = hc_byte_perm (w[16], w[15], selector); + w[25] = hc_byte_perm (w[15], w[14], selector); + w[24] = hc_byte_perm (w[14], w[13], selector); + w[23] = hc_byte_perm (w[13], w[12], selector); + w[22] = hc_byte_perm (w[12], w[11], selector); + w[21] = hc_byte_perm (w[11], w[10], selector); + w[20] = hc_byte_perm (w[10], w[ 9], selector); + w[19] = hc_byte_perm (w[ 9], w[ 8], selector); + w[18] = hc_byte_perm (w[ 8], w[ 7], selector); + w[17] = hc_byte_perm (w[ 7], w[ 6], selector); + w[16] = hc_byte_perm (w[ 6], w[ 5], selector); + w[15] = hc_byte_perm (w[ 5], w[ 4], selector); + w[14] = hc_byte_perm (w[ 4], w[ 3], selector); + w[13] = hc_byte_perm (w[ 3], w[ 2], selector); + w[12] = hc_byte_perm (w[ 2], w[ 1], selector); + w[11] = hc_byte_perm (w[ 1], w[ 0], selector); + w[10] = hc_byte_perm (w[ 0], 0, selector); w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; @@ -25512,59 +25512,59 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; case 11: - w[63] = __byte_perm (w[52], w[51], selector); - w[62] = __byte_perm (w[51], w[50], selector); - w[61] = __byte_perm (w[50], w[49], selector); - w[60] = __byte_perm (w[49], w[48], selector); - w[59] = __byte_perm (w[48], w[47], selector); - w[58] = __byte_perm (w[47], w[46], selector); - w[57] = __byte_perm (w[46], w[45], selector); - w[56] = __byte_perm (w[45], w[44], selector); - w[55] = __byte_perm (w[44], w[43], selector); - w[54] = __byte_perm (w[43], w[42], selector); - w[53] = __byte_perm (w[42], w[41], selector); - w[52] = __byte_perm (w[41], w[40], selector); - w[51] = __byte_perm (w[40], w[39], selector); - w[50] = __byte_perm (w[39], w[38], selector); - w[49] = __byte_perm (w[38], w[37], selector); - w[48] = __byte_perm (w[37], w[36], selector); - w[47] = __byte_perm (w[36], w[35], selector); - w[46] = __byte_perm (w[35], w[34], selector); - w[45] = __byte_perm (w[34], w[33], selector); - w[44] = __byte_perm (w[33], w[32], selector); - w[43] = __byte_perm (w[32], w[31], selector); - w[42] = __byte_perm (w[31], w[30], selector); - w[41] = __byte_perm (w[30], w[29], selector); - w[40] = __byte_perm (w[29], w[28], selector); - w[39] = __byte_perm (w[28], w[27], selector); - w[38] = __byte_perm (w[27], w[26], selector); - w[37] = __byte_perm (w[26], w[25], selector); - w[36] = __byte_perm (w[25], w[24], selector); - w[35] = __byte_perm (w[24], w[23], selector); - w[34] = __byte_perm (w[23], w[22], selector); - w[33] = __byte_perm (w[22], w[21], selector); - w[32] = __byte_perm (w[21], w[20], selector); - w[31] = __byte_perm (w[20], w[19], selector); - w[30] = __byte_perm (w[19], w[18], selector); - w[29] = __byte_perm (w[18], w[17], selector); - w[28] = __byte_perm (w[17], w[16], selector); - w[27] = __byte_perm (w[16], w[15], selector); - w[26] = __byte_perm (w[15], w[14], selector); - w[25] = __byte_perm (w[14], w[13], selector); - w[24] = __byte_perm (w[13], w[12], selector); - w[23] = __byte_perm (w[12], w[11], selector); - w[22] = __byte_perm (w[11], w[10], selector); - w[21] = __byte_perm (w[10], w[ 9], selector); - w[20] = __byte_perm (w[ 9], w[ 8], selector); - w[19] = __byte_perm (w[ 8], w[ 7], selector); - w[18] = __byte_perm (w[ 7], w[ 6], selector); - w[17] = __byte_perm (w[ 6], w[ 5], selector); - w[16] = __byte_perm (w[ 5], w[ 4], selector); - w[15] = __byte_perm (w[ 4], w[ 3], selector); - w[14] = __byte_perm (w[ 3], w[ 2], selector); - w[13] = __byte_perm (w[ 2], w[ 1], selector); - w[12] = __byte_perm (w[ 1], w[ 0], selector); - w[11] = __byte_perm (w[ 0], 0, selector); + w[63] = hc_byte_perm (w[52], w[51], selector); + w[62] = hc_byte_perm (w[51], w[50], selector); + w[61] = hc_byte_perm (w[50], w[49], selector); + w[60] = hc_byte_perm (w[49], w[48], selector); + w[59] = hc_byte_perm (w[48], w[47], selector); + w[58] = hc_byte_perm (w[47], w[46], selector); + w[57] = hc_byte_perm (w[46], w[45], selector); + w[56] = hc_byte_perm (w[45], w[44], selector); + w[55] = hc_byte_perm (w[44], w[43], selector); + w[54] = hc_byte_perm (w[43], w[42], selector); + w[53] = hc_byte_perm (w[42], w[41], selector); + w[52] = hc_byte_perm (w[41], w[40], selector); + w[51] = hc_byte_perm (w[40], w[39], selector); + w[50] = hc_byte_perm (w[39], w[38], selector); + w[49] = hc_byte_perm (w[38], w[37], selector); + w[48] = hc_byte_perm (w[37], w[36], selector); + w[47] = hc_byte_perm (w[36], w[35], selector); + w[46] = hc_byte_perm (w[35], w[34], selector); + w[45] = hc_byte_perm (w[34], w[33], selector); + w[44] = hc_byte_perm (w[33], w[32], selector); + w[43] = hc_byte_perm (w[32], w[31], selector); + w[42] = hc_byte_perm (w[31], w[30], selector); + w[41] = hc_byte_perm (w[30], w[29], selector); + w[40] = hc_byte_perm (w[29], w[28], selector); + w[39] = hc_byte_perm (w[28], w[27], selector); + w[38] = hc_byte_perm (w[27], w[26], selector); + w[37] = hc_byte_perm (w[26], w[25], selector); + w[36] = hc_byte_perm (w[25], w[24], selector); + w[35] = hc_byte_perm (w[24], w[23], selector); + w[34] = hc_byte_perm (w[23], w[22], selector); + w[33] = hc_byte_perm (w[22], w[21], selector); + w[32] = hc_byte_perm (w[21], w[20], selector); + w[31] = hc_byte_perm (w[20], w[19], selector); + w[30] = hc_byte_perm (w[19], w[18], selector); + w[29] = hc_byte_perm (w[18], w[17], selector); + w[28] = hc_byte_perm (w[17], w[16], selector); + w[27] = hc_byte_perm (w[16], w[15], selector); + w[26] = hc_byte_perm (w[15], w[14], selector); + w[25] = hc_byte_perm (w[14], w[13], selector); + w[24] = hc_byte_perm (w[13], w[12], selector); + w[23] = hc_byte_perm (w[12], w[11], selector); + w[22] = hc_byte_perm (w[11], w[10], selector); + w[21] = hc_byte_perm (w[10], w[ 9], selector); + w[20] = hc_byte_perm (w[ 9], w[ 8], selector); + w[19] = hc_byte_perm (w[ 8], w[ 7], selector); + w[18] = hc_byte_perm (w[ 7], w[ 6], selector); + w[17] = hc_byte_perm (w[ 6], w[ 5], selector); + w[16] = hc_byte_perm (w[ 5], w[ 4], selector); + w[15] = hc_byte_perm (w[ 4], w[ 3], selector); + w[14] = hc_byte_perm (w[ 3], w[ 2], selector); + w[13] = hc_byte_perm (w[ 2], w[ 1], selector); + w[12] = hc_byte_perm (w[ 1], w[ 0], selector); + w[11] = hc_byte_perm (w[ 0], 0, selector); w[10] = 0; w[ 9] = 0; w[ 8] = 0; @@ -25580,58 +25580,58 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; case 12: - w[63] = __byte_perm (w[51], w[50], selector); - w[62] = __byte_perm (w[50], w[49], selector); - w[61] = __byte_perm (w[49], w[48], selector); - w[60] = __byte_perm (w[48], w[47], selector); - w[59] = __byte_perm (w[47], w[46], selector); - w[58] = __byte_perm (w[46], w[45], selector); - w[57] = __byte_perm (w[45], w[44], selector); - w[56] = __byte_perm (w[44], w[43], selector); - w[55] = __byte_perm (w[43], w[42], selector); - w[54] = __byte_perm (w[42], w[41], selector); - w[53] = __byte_perm (w[41], w[40], selector); - w[52] = __byte_perm (w[40], w[39], selector); - w[51] = __byte_perm (w[39], w[38], selector); - w[50] = __byte_perm (w[38], w[37], selector); - w[49] = __byte_perm (w[37], w[36], selector); - w[48] = __byte_perm (w[36], w[35], selector); - w[47] = __byte_perm (w[35], w[34], selector); - w[46] = __byte_perm (w[34], w[33], selector); - w[45] = __byte_perm (w[33], w[32], selector); - w[44] = __byte_perm (w[32], w[31], selector); - w[43] = __byte_perm (w[31], w[30], selector); - w[42] = __byte_perm (w[30], w[29], selector); - w[41] = __byte_perm (w[29], w[28], selector); - w[40] = __byte_perm (w[28], w[27], selector); - w[39] = __byte_perm (w[27], w[26], selector); - w[38] = __byte_perm (w[26], w[25], selector); - w[37] = __byte_perm (w[25], w[24], selector); - w[36] = __byte_perm (w[24], w[23], selector); - w[35] = __byte_perm (w[23], w[22], selector); - w[34] = __byte_perm (w[22], w[21], selector); - w[33] = __byte_perm (w[21], w[20], selector); - w[32] = __byte_perm (w[20], w[19], selector); - w[31] = __byte_perm (w[19], w[18], selector); - w[30] = __byte_perm (w[18], w[17], selector); - w[29] = __byte_perm (w[17], w[16], selector); - w[28] = __byte_perm (w[16], w[15], selector); - w[27] = __byte_perm (w[15], w[14], selector); - w[26] = __byte_perm (w[14], w[13], selector); - w[25] = __byte_perm (w[13], w[12], selector); - w[24] = __byte_perm (w[12], w[11], selector); - w[23] = __byte_perm (w[11], w[10], selector); - w[22] = __byte_perm (w[10], w[ 9], selector); - w[21] = __byte_perm (w[ 9], w[ 8], selector); - w[20] = __byte_perm (w[ 8], w[ 7], selector); - w[19] = __byte_perm (w[ 7], w[ 6], selector); - w[18] = __byte_perm (w[ 6], w[ 5], selector); - w[17] = __byte_perm (w[ 5], w[ 4], selector); - w[16] = __byte_perm (w[ 4], w[ 3], selector); - w[15] = __byte_perm (w[ 3], w[ 2], selector); - w[14] = __byte_perm (w[ 2], w[ 1], selector); - w[13] = __byte_perm (w[ 1], w[ 0], selector); - w[12] = __byte_perm (w[ 0], 0, selector); + w[63] = hc_byte_perm (w[51], w[50], selector); + w[62] = hc_byte_perm (w[50], w[49], selector); + w[61] = hc_byte_perm (w[49], w[48], selector); + w[60] = hc_byte_perm (w[48], w[47], selector); + w[59] = hc_byte_perm (w[47], w[46], selector); + w[58] = hc_byte_perm (w[46], w[45], selector); + w[57] = hc_byte_perm (w[45], w[44], selector); + w[56] = hc_byte_perm (w[44], w[43], selector); + w[55] = hc_byte_perm (w[43], w[42], selector); + w[54] = hc_byte_perm (w[42], w[41], selector); + w[53] = hc_byte_perm (w[41], w[40], selector); + w[52] = hc_byte_perm (w[40], w[39], selector); + w[51] = hc_byte_perm (w[39], w[38], selector); + w[50] = hc_byte_perm (w[38], w[37], selector); + w[49] = hc_byte_perm (w[37], w[36], selector); + w[48] = hc_byte_perm (w[36], w[35], selector); + w[47] = hc_byte_perm (w[35], w[34], selector); + w[46] = hc_byte_perm (w[34], w[33], selector); + w[45] = hc_byte_perm (w[33], w[32], selector); + w[44] = hc_byte_perm (w[32], w[31], selector); + w[43] = hc_byte_perm (w[31], w[30], selector); + w[42] = hc_byte_perm (w[30], w[29], selector); + w[41] = hc_byte_perm (w[29], w[28], selector); + w[40] = hc_byte_perm (w[28], w[27], selector); + w[39] = hc_byte_perm (w[27], w[26], selector); + w[38] = hc_byte_perm (w[26], w[25], selector); + w[37] = hc_byte_perm (w[25], w[24], selector); + w[36] = hc_byte_perm (w[24], w[23], selector); + w[35] = hc_byte_perm (w[23], w[22], selector); + w[34] = hc_byte_perm (w[22], w[21], selector); + w[33] = hc_byte_perm (w[21], w[20], selector); + w[32] = hc_byte_perm (w[20], w[19], selector); + w[31] = hc_byte_perm (w[19], w[18], selector); + w[30] = hc_byte_perm (w[18], w[17], selector); + w[29] = hc_byte_perm (w[17], w[16], selector); + w[28] = hc_byte_perm (w[16], w[15], selector); + w[27] = hc_byte_perm (w[15], w[14], selector); + w[26] = hc_byte_perm (w[14], w[13], selector); + w[25] = hc_byte_perm (w[13], w[12], selector); + w[24] = hc_byte_perm (w[12], w[11], selector); + w[23] = hc_byte_perm (w[11], w[10], selector); + w[22] = hc_byte_perm (w[10], w[ 9], selector); + w[21] = hc_byte_perm (w[ 9], w[ 8], selector); + w[20] = hc_byte_perm (w[ 8], w[ 7], selector); + w[19] = hc_byte_perm (w[ 7], w[ 6], selector); + w[18] = hc_byte_perm (w[ 6], w[ 5], selector); + w[17] = hc_byte_perm (w[ 5], w[ 4], selector); + w[16] = hc_byte_perm (w[ 4], w[ 3], selector); + w[15] = hc_byte_perm (w[ 3], w[ 2], selector); + w[14] = hc_byte_perm (w[ 2], w[ 1], selector); + w[13] = hc_byte_perm (w[ 1], w[ 0], selector); + w[12] = hc_byte_perm (w[ 0], 0, selector); w[11] = 0; w[10] = 0; w[ 9] = 0; @@ -25648,57 +25648,57 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; case 13: - w[63] = __byte_perm (w[50], w[49], selector); - w[62] = __byte_perm (w[49], w[48], selector); - w[61] = __byte_perm (w[48], w[47], selector); - w[60] = __byte_perm (w[47], w[46], selector); - w[59] = __byte_perm (w[46], w[45], selector); - w[58] = __byte_perm (w[45], w[44], selector); - w[57] = __byte_perm (w[44], w[43], selector); - w[56] = __byte_perm (w[43], w[42], selector); - w[55] = __byte_perm (w[42], w[41], selector); - w[54] = __byte_perm (w[41], w[40], selector); - w[53] = __byte_perm (w[40], w[39], selector); - w[52] = __byte_perm (w[39], w[38], selector); - w[51] = __byte_perm (w[38], w[37], selector); - w[50] = __byte_perm (w[37], w[36], selector); - w[49] = __byte_perm (w[36], w[35], selector); - w[48] = __byte_perm (w[35], w[34], selector); - w[47] = __byte_perm (w[34], w[33], selector); - w[46] = __byte_perm (w[33], w[32], selector); - w[45] = __byte_perm (w[32], w[31], selector); - w[44] = __byte_perm (w[31], w[30], selector); - w[43] = __byte_perm (w[30], w[29], selector); - w[42] = __byte_perm (w[29], w[28], selector); - w[41] = __byte_perm (w[28], w[27], selector); - w[40] = __byte_perm (w[27], w[26], selector); - w[39] = __byte_perm (w[26], w[25], selector); - w[38] = __byte_perm (w[25], w[24], selector); - w[37] = __byte_perm (w[24], w[23], selector); - w[36] = __byte_perm (w[23], w[22], selector); - w[35] = __byte_perm (w[22], w[21], selector); - w[34] = __byte_perm (w[21], w[20], selector); - w[33] = __byte_perm (w[20], w[19], selector); - w[32] = __byte_perm (w[19], w[18], selector); - w[31] = __byte_perm (w[18], w[17], selector); - w[30] = __byte_perm (w[17], w[16], selector); - w[29] = __byte_perm (w[16], w[15], selector); - w[28] = __byte_perm (w[15], w[14], selector); - w[27] = __byte_perm (w[14], w[13], selector); - w[26] = __byte_perm (w[13], w[12], selector); - w[25] = __byte_perm (w[12], w[11], selector); - w[24] = __byte_perm (w[11], w[10], selector); - w[23] = __byte_perm (w[10], w[ 9], selector); - w[22] = __byte_perm (w[ 9], w[ 8], selector); - w[21] = __byte_perm (w[ 8], w[ 7], selector); - w[20] = __byte_perm (w[ 7], w[ 6], selector); - w[19] = __byte_perm (w[ 6], w[ 5], selector); - w[18] = __byte_perm (w[ 5], w[ 4], selector); - w[17] = __byte_perm (w[ 4], w[ 3], selector); - w[16] = __byte_perm (w[ 3], w[ 2], selector); - w[15] = __byte_perm (w[ 2], w[ 1], selector); - w[14] = __byte_perm (w[ 1], w[ 0], selector); - w[13] = __byte_perm (w[ 0], 0, selector); + w[63] = hc_byte_perm (w[50], w[49], selector); + w[62] = hc_byte_perm (w[49], w[48], selector); + w[61] = hc_byte_perm (w[48], w[47], selector); + w[60] = hc_byte_perm (w[47], w[46], selector); + w[59] = hc_byte_perm (w[46], w[45], selector); + w[58] = hc_byte_perm (w[45], w[44], selector); + w[57] = hc_byte_perm (w[44], w[43], selector); + w[56] = hc_byte_perm (w[43], w[42], selector); + w[55] = hc_byte_perm (w[42], w[41], selector); + w[54] = hc_byte_perm (w[41], w[40], selector); + w[53] = hc_byte_perm (w[40], w[39], selector); + w[52] = hc_byte_perm (w[39], w[38], selector); + w[51] = hc_byte_perm (w[38], w[37], selector); + w[50] = hc_byte_perm (w[37], w[36], selector); + w[49] = hc_byte_perm (w[36], w[35], selector); + w[48] = hc_byte_perm (w[35], w[34], selector); + w[47] = hc_byte_perm (w[34], w[33], selector); + w[46] = hc_byte_perm (w[33], w[32], selector); + w[45] = hc_byte_perm (w[32], w[31], selector); + w[44] = hc_byte_perm (w[31], w[30], selector); + w[43] = hc_byte_perm (w[30], w[29], selector); + w[42] = hc_byte_perm (w[29], w[28], selector); + w[41] = hc_byte_perm (w[28], w[27], selector); + w[40] = hc_byte_perm (w[27], w[26], selector); + w[39] = hc_byte_perm (w[26], w[25], selector); + w[38] = hc_byte_perm (w[25], w[24], selector); + w[37] = hc_byte_perm (w[24], w[23], selector); + w[36] = hc_byte_perm (w[23], w[22], selector); + w[35] = hc_byte_perm (w[22], w[21], selector); + w[34] = hc_byte_perm (w[21], w[20], selector); + w[33] = hc_byte_perm (w[20], w[19], selector); + w[32] = hc_byte_perm (w[19], w[18], selector); + w[31] = hc_byte_perm (w[18], w[17], selector); + w[30] = hc_byte_perm (w[17], w[16], selector); + w[29] = hc_byte_perm (w[16], w[15], selector); + w[28] = hc_byte_perm (w[15], w[14], selector); + w[27] = hc_byte_perm (w[14], w[13], selector); + w[26] = hc_byte_perm (w[13], w[12], selector); + w[25] = hc_byte_perm (w[12], w[11], selector); + w[24] = hc_byte_perm (w[11], w[10], selector); + w[23] = hc_byte_perm (w[10], w[ 9], selector); + w[22] = hc_byte_perm (w[ 9], w[ 8], selector); + w[21] = hc_byte_perm (w[ 8], w[ 7], selector); + w[20] = hc_byte_perm (w[ 7], w[ 6], selector); + w[19] = hc_byte_perm (w[ 6], w[ 5], selector); + w[18] = hc_byte_perm (w[ 5], w[ 4], selector); + w[17] = hc_byte_perm (w[ 4], w[ 3], selector); + w[16] = hc_byte_perm (w[ 3], w[ 2], selector); + w[15] = hc_byte_perm (w[ 2], w[ 1], selector); + w[14] = hc_byte_perm (w[ 1], w[ 0], selector); + w[13] = hc_byte_perm (w[ 0], 0, selector); w[12] = 0; w[11] = 0; w[10] = 0; @@ -25716,56 +25716,56 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; case 14: - w[63] = __byte_perm (w[49], w[48], selector); - w[62] = __byte_perm (w[48], w[47], selector); - w[61] = __byte_perm (w[47], w[46], selector); - w[60] = __byte_perm (w[46], w[45], selector); - w[59] = __byte_perm (w[45], w[44], selector); - w[58] = __byte_perm (w[44], w[43], selector); - w[57] = __byte_perm (w[43], w[42], selector); - w[56] = __byte_perm (w[42], w[41], selector); - w[55] = __byte_perm (w[41], w[40], selector); - w[54] = __byte_perm (w[40], w[39], selector); - w[53] = __byte_perm (w[39], w[38], selector); - w[52] = __byte_perm (w[38], w[37], selector); - w[51] = __byte_perm (w[37], w[36], selector); - w[50] = __byte_perm (w[36], w[35], selector); - w[49] = __byte_perm (w[35], w[34], selector); - w[48] = __byte_perm (w[34], w[33], selector); - w[47] = __byte_perm (w[33], w[32], selector); - w[46] = __byte_perm (w[32], w[31], selector); - w[45] = __byte_perm (w[31], w[30], selector); - w[44] = __byte_perm (w[30], w[29], selector); - w[43] = __byte_perm (w[29], w[28], selector); - w[42] = __byte_perm (w[28], w[27], selector); - w[41] = __byte_perm (w[27], w[26], selector); - w[40] = __byte_perm (w[26], w[25], selector); - w[39] = __byte_perm (w[25], w[24], selector); - w[38] = __byte_perm (w[24], w[23], selector); - w[37] = __byte_perm (w[23], w[22], selector); - w[36] = __byte_perm (w[22], w[21], selector); - w[35] = __byte_perm (w[21], w[20], selector); - w[34] = __byte_perm (w[20], w[19], selector); - w[33] = __byte_perm (w[19], w[18], selector); - w[32] = __byte_perm (w[18], w[17], selector); - w[31] = __byte_perm (w[17], w[16], selector); - w[30] = __byte_perm (w[16], w[15], selector); - w[29] = __byte_perm (w[15], w[14], selector); - w[28] = __byte_perm (w[14], w[13], selector); - w[27] = __byte_perm (w[13], w[12], selector); - w[26] = __byte_perm (w[12], w[11], selector); - w[25] = __byte_perm (w[11], w[10], selector); - w[24] = __byte_perm (w[10], w[ 9], selector); - w[23] = __byte_perm (w[ 9], w[ 8], selector); - w[22] = __byte_perm (w[ 8], w[ 7], selector); - w[21] = __byte_perm (w[ 7], w[ 6], selector); - w[20] = __byte_perm (w[ 6], w[ 5], selector); - w[19] = __byte_perm (w[ 5], w[ 4], selector); - w[18] = __byte_perm (w[ 4], w[ 3], selector); - w[17] = __byte_perm (w[ 3], w[ 2], selector); - w[16] = __byte_perm (w[ 2], w[ 1], selector); - w[15] = __byte_perm (w[ 1], w[ 0], selector); - w[14] = __byte_perm (w[ 0], 0, selector); + w[63] = hc_byte_perm (w[49], w[48], selector); + w[62] = hc_byte_perm (w[48], w[47], selector); + w[61] = hc_byte_perm (w[47], w[46], selector); + w[60] = hc_byte_perm (w[46], w[45], selector); + w[59] = hc_byte_perm (w[45], w[44], selector); + w[58] = hc_byte_perm (w[44], w[43], selector); + w[57] = hc_byte_perm (w[43], w[42], selector); + w[56] = hc_byte_perm (w[42], w[41], selector); + w[55] = hc_byte_perm (w[41], w[40], selector); + w[54] = hc_byte_perm (w[40], w[39], selector); + w[53] = hc_byte_perm (w[39], w[38], selector); + w[52] = hc_byte_perm (w[38], w[37], selector); + w[51] = hc_byte_perm (w[37], w[36], selector); + w[50] = hc_byte_perm (w[36], w[35], selector); + w[49] = hc_byte_perm (w[35], w[34], selector); + w[48] = hc_byte_perm (w[34], w[33], selector); + w[47] = hc_byte_perm (w[33], w[32], selector); + w[46] = hc_byte_perm (w[32], w[31], selector); + w[45] = hc_byte_perm (w[31], w[30], selector); + w[44] = hc_byte_perm (w[30], w[29], selector); + w[43] = hc_byte_perm (w[29], w[28], selector); + w[42] = hc_byte_perm (w[28], w[27], selector); + w[41] = hc_byte_perm (w[27], w[26], selector); + w[40] = hc_byte_perm (w[26], w[25], selector); + w[39] = hc_byte_perm (w[25], w[24], selector); + w[38] = hc_byte_perm (w[24], w[23], selector); + w[37] = hc_byte_perm (w[23], w[22], selector); + w[36] = hc_byte_perm (w[22], w[21], selector); + w[35] = hc_byte_perm (w[21], w[20], selector); + w[34] = hc_byte_perm (w[20], w[19], selector); + w[33] = hc_byte_perm (w[19], w[18], selector); + w[32] = hc_byte_perm (w[18], w[17], selector); + w[31] = hc_byte_perm (w[17], w[16], selector); + w[30] = hc_byte_perm (w[16], w[15], selector); + w[29] = hc_byte_perm (w[15], w[14], selector); + w[28] = hc_byte_perm (w[14], w[13], selector); + w[27] = hc_byte_perm (w[13], w[12], selector); + w[26] = hc_byte_perm (w[12], w[11], selector); + w[25] = hc_byte_perm (w[11], w[10], selector); + w[24] = hc_byte_perm (w[10], w[ 9], selector); + w[23] = hc_byte_perm (w[ 9], w[ 8], selector); + w[22] = hc_byte_perm (w[ 8], w[ 7], selector); + w[21] = hc_byte_perm (w[ 7], w[ 6], selector); + w[20] = hc_byte_perm (w[ 6], w[ 5], selector); + w[19] = hc_byte_perm (w[ 5], w[ 4], selector); + w[18] = hc_byte_perm (w[ 4], w[ 3], selector); + w[17] = hc_byte_perm (w[ 3], w[ 2], selector); + w[16] = hc_byte_perm (w[ 2], w[ 1], selector); + w[15] = hc_byte_perm (w[ 1], w[ 0], selector); + w[14] = hc_byte_perm (w[ 0], 0, selector); w[13] = 0; w[12] = 0; w[11] = 0; @@ -25784,55 +25784,55 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; case 15: - w[63] = __byte_perm (w[48], w[47], selector); - w[62] = __byte_perm (w[47], w[46], selector); - w[61] = __byte_perm (w[46], w[45], selector); - w[60] = __byte_perm (w[45], w[44], selector); - w[59] = __byte_perm (w[44], w[43], selector); - w[58] = __byte_perm (w[43], w[42], selector); - w[57] = __byte_perm (w[42], w[41], selector); - w[56] = __byte_perm (w[41], w[40], selector); - w[55] = __byte_perm (w[40], w[39], selector); - w[54] = __byte_perm (w[39], w[38], selector); - w[53] = __byte_perm (w[38], w[37], selector); - w[52] = __byte_perm (w[37], w[36], selector); - w[51] = __byte_perm (w[36], w[35], selector); - w[50] = __byte_perm (w[35], w[34], selector); - w[49] = __byte_perm (w[34], w[33], selector); - w[48] = __byte_perm (w[33], w[32], selector); - w[47] = __byte_perm (w[32], w[31], selector); - w[46] = __byte_perm (w[31], w[30], selector); - w[45] = __byte_perm (w[30], w[29], selector); - w[44] = __byte_perm (w[29], w[28], selector); - w[43] = __byte_perm (w[28], w[27], selector); - w[42] = __byte_perm (w[27], w[26], selector); - w[41] = __byte_perm (w[26], w[25], selector); - w[40] = __byte_perm (w[25], w[24], selector); - w[39] = __byte_perm (w[24], w[23], selector); - w[38] = __byte_perm (w[23], w[22], selector); - w[37] = __byte_perm (w[22], w[21], selector); - w[36] = __byte_perm (w[21], w[20], selector); - w[35] = __byte_perm (w[20], w[19], selector); - w[34] = __byte_perm (w[19], w[18], selector); - w[33] = __byte_perm (w[18], w[17], selector); - w[32] = __byte_perm (w[17], w[16], selector); - w[31] = __byte_perm (w[16], w[15], selector); - w[30] = __byte_perm (w[15], w[14], selector); - w[29] = __byte_perm (w[14], w[13], selector); - w[28] = __byte_perm (w[13], w[12], selector); - w[27] = __byte_perm (w[12], w[11], selector); - w[26] = __byte_perm (w[11], w[10], selector); - w[25] = __byte_perm (w[10], w[ 9], selector); - w[24] = __byte_perm (w[ 9], w[ 8], selector); - w[23] = __byte_perm (w[ 8], w[ 7], selector); - w[22] = __byte_perm (w[ 7], w[ 6], selector); - w[21] = __byte_perm (w[ 6], w[ 5], selector); - w[20] = __byte_perm (w[ 5], w[ 4], selector); - w[19] = __byte_perm (w[ 4], w[ 3], selector); - w[18] = __byte_perm (w[ 3], w[ 2], selector); - w[17] = __byte_perm (w[ 2], w[ 1], selector); - w[16] = __byte_perm (w[ 1], w[ 0], selector); - w[15] = __byte_perm (w[ 0], 0, selector); + w[63] = hc_byte_perm (w[48], w[47], selector); + w[62] = hc_byte_perm (w[47], w[46], selector); + w[61] = hc_byte_perm (w[46], w[45], selector); + w[60] = hc_byte_perm (w[45], w[44], selector); + w[59] = hc_byte_perm (w[44], w[43], selector); + w[58] = hc_byte_perm (w[43], w[42], selector); + w[57] = hc_byte_perm (w[42], w[41], selector); + w[56] = hc_byte_perm (w[41], w[40], selector); + w[55] = hc_byte_perm (w[40], w[39], selector); + w[54] = hc_byte_perm (w[39], w[38], selector); + w[53] = hc_byte_perm (w[38], w[37], selector); + w[52] = hc_byte_perm (w[37], w[36], selector); + w[51] = hc_byte_perm (w[36], w[35], selector); + w[50] = hc_byte_perm (w[35], w[34], selector); + w[49] = hc_byte_perm (w[34], w[33], selector); + w[48] = hc_byte_perm (w[33], w[32], selector); + w[47] = hc_byte_perm (w[32], w[31], selector); + w[46] = hc_byte_perm (w[31], w[30], selector); + w[45] = hc_byte_perm (w[30], w[29], selector); + w[44] = hc_byte_perm (w[29], w[28], selector); + w[43] = hc_byte_perm (w[28], w[27], selector); + w[42] = hc_byte_perm (w[27], w[26], selector); + w[41] = hc_byte_perm (w[26], w[25], selector); + w[40] = hc_byte_perm (w[25], w[24], selector); + w[39] = hc_byte_perm (w[24], w[23], selector); + w[38] = hc_byte_perm (w[23], w[22], selector); + w[37] = hc_byte_perm (w[22], w[21], selector); + w[36] = hc_byte_perm (w[21], w[20], selector); + w[35] = hc_byte_perm (w[20], w[19], selector); + w[34] = hc_byte_perm (w[19], w[18], selector); + w[33] = hc_byte_perm (w[18], w[17], selector); + w[32] = hc_byte_perm (w[17], w[16], selector); + w[31] = hc_byte_perm (w[16], w[15], selector); + w[30] = hc_byte_perm (w[15], w[14], selector); + w[29] = hc_byte_perm (w[14], w[13], selector); + w[28] = hc_byte_perm (w[13], w[12], selector); + w[27] = hc_byte_perm (w[12], w[11], selector); + w[26] = hc_byte_perm (w[11], w[10], selector); + w[25] = hc_byte_perm (w[10], w[ 9], selector); + w[24] = hc_byte_perm (w[ 9], w[ 8], selector); + w[23] = hc_byte_perm (w[ 8], w[ 7], selector); + w[22] = hc_byte_perm (w[ 7], w[ 6], selector); + w[21] = hc_byte_perm (w[ 6], w[ 5], selector); + w[20] = hc_byte_perm (w[ 5], w[ 4], selector); + w[19] = hc_byte_perm (w[ 4], w[ 3], selector); + w[18] = hc_byte_perm (w[ 3], w[ 2], selector); + w[17] = hc_byte_perm (w[ 2], w[ 1], selector); + w[16] = hc_byte_perm (w[ 1], w[ 0], selector); + w[15] = hc_byte_perm (w[ 0], 0, selector); w[14] = 0; w[13] = 0; w[12] = 0; @@ -25852,54 +25852,54 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; case 16: - w[63] = __byte_perm (w[47], w[46], selector); - w[62] = __byte_perm (w[46], w[45], selector); - w[61] = __byte_perm (w[45], w[44], selector); - w[60] = __byte_perm (w[44], w[43], selector); - w[59] = __byte_perm (w[43], w[42], selector); - w[58] = __byte_perm (w[42], w[41], selector); - w[57] = __byte_perm (w[41], w[40], selector); - w[56] = __byte_perm (w[40], w[39], selector); - w[55] = __byte_perm (w[39], w[38], selector); - w[54] = __byte_perm (w[38], w[37], selector); - w[53] = __byte_perm (w[37], w[36], selector); - w[52] = __byte_perm (w[36], w[35], selector); - w[51] = __byte_perm (w[35], w[34], selector); - w[50] = __byte_perm (w[34], w[33], selector); - w[49] = __byte_perm (w[33], w[32], selector); - w[48] = __byte_perm (w[32], w[31], selector); - w[47] = __byte_perm (w[31], w[30], selector); - w[46] = __byte_perm (w[30], w[29], selector); - w[45] = __byte_perm (w[29], w[28], selector); - w[44] = __byte_perm (w[28], w[27], selector); - w[43] = __byte_perm (w[27], w[26], selector); - w[42] = __byte_perm (w[26], w[25], selector); - w[41] = __byte_perm (w[25], w[24], selector); - w[40] = __byte_perm (w[24], w[23], selector); - w[39] = __byte_perm (w[23], w[22], selector); - w[38] = __byte_perm (w[22], w[21], selector); - w[37] = __byte_perm (w[21], w[20], selector); - w[36] = __byte_perm (w[20], w[19], selector); - w[35] = __byte_perm (w[19], w[18], selector); - w[34] = __byte_perm (w[18], w[17], selector); - w[33] = __byte_perm (w[17], w[16], selector); - w[32] = __byte_perm (w[16], w[15], selector); - w[31] = __byte_perm (w[15], w[14], selector); - w[30] = __byte_perm (w[14], w[13], selector); - w[29] = __byte_perm (w[13], w[12], selector); - w[28] = __byte_perm (w[12], w[11], selector); - w[27] = __byte_perm (w[11], w[10], selector); - w[26] = __byte_perm (w[10], w[ 9], selector); - w[25] = __byte_perm (w[ 9], w[ 8], selector); - w[24] = __byte_perm (w[ 8], w[ 7], selector); - w[23] = __byte_perm (w[ 7], w[ 6], selector); - w[22] = __byte_perm (w[ 6], w[ 5], selector); - w[21] = __byte_perm (w[ 5], w[ 4], selector); - w[20] = __byte_perm (w[ 4], w[ 3], selector); - w[19] = __byte_perm (w[ 3], w[ 2], selector); - w[18] = __byte_perm (w[ 2], w[ 1], selector); - w[17] = __byte_perm (w[ 1], w[ 0], selector); - w[16] = __byte_perm (w[ 0], 0, selector); + w[63] = hc_byte_perm (w[47], w[46], selector); + w[62] = hc_byte_perm (w[46], w[45], selector); + w[61] = hc_byte_perm (w[45], w[44], selector); + w[60] = hc_byte_perm (w[44], w[43], selector); + w[59] = hc_byte_perm (w[43], w[42], selector); + w[58] = hc_byte_perm (w[42], w[41], selector); + w[57] = hc_byte_perm (w[41], w[40], selector); + w[56] = hc_byte_perm (w[40], w[39], selector); + w[55] = hc_byte_perm (w[39], w[38], selector); + w[54] = hc_byte_perm (w[38], w[37], selector); + w[53] = hc_byte_perm (w[37], w[36], selector); + w[52] = hc_byte_perm (w[36], w[35], selector); + w[51] = hc_byte_perm (w[35], w[34], selector); + w[50] = hc_byte_perm (w[34], w[33], selector); + w[49] = hc_byte_perm (w[33], w[32], selector); + w[48] = hc_byte_perm (w[32], w[31], selector); + w[47] = hc_byte_perm (w[31], w[30], selector); + w[46] = hc_byte_perm (w[30], w[29], selector); + w[45] = hc_byte_perm (w[29], w[28], selector); + w[44] = hc_byte_perm (w[28], w[27], selector); + w[43] = hc_byte_perm (w[27], w[26], selector); + w[42] = hc_byte_perm (w[26], w[25], selector); + w[41] = hc_byte_perm (w[25], w[24], selector); + w[40] = hc_byte_perm (w[24], w[23], selector); + w[39] = hc_byte_perm (w[23], w[22], selector); + w[38] = hc_byte_perm (w[22], w[21], selector); + w[37] = hc_byte_perm (w[21], w[20], selector); + w[36] = hc_byte_perm (w[20], w[19], selector); + w[35] = hc_byte_perm (w[19], w[18], selector); + w[34] = hc_byte_perm (w[18], w[17], selector); + w[33] = hc_byte_perm (w[17], w[16], selector); + w[32] = hc_byte_perm (w[16], w[15], selector); + w[31] = hc_byte_perm (w[15], w[14], selector); + w[30] = hc_byte_perm (w[14], w[13], selector); + w[29] = hc_byte_perm (w[13], w[12], selector); + w[28] = hc_byte_perm (w[12], w[11], selector); + w[27] = hc_byte_perm (w[11], w[10], selector); + w[26] = hc_byte_perm (w[10], w[ 9], selector); + w[25] = hc_byte_perm (w[ 9], w[ 8], selector); + w[24] = hc_byte_perm (w[ 8], w[ 7], selector); + w[23] = hc_byte_perm (w[ 7], w[ 6], selector); + w[22] = hc_byte_perm (w[ 6], w[ 5], selector); + w[21] = hc_byte_perm (w[ 5], w[ 4], selector); + w[20] = hc_byte_perm (w[ 4], w[ 3], selector); + w[19] = hc_byte_perm (w[ 3], w[ 2], selector); + w[18] = hc_byte_perm (w[ 2], w[ 1], selector); + w[17] = hc_byte_perm (w[ 1], w[ 0], selector); + w[16] = hc_byte_perm (w[ 0], 0, selector); w[15] = 0; w[14] = 0; w[13] = 0; @@ -25920,53 +25920,53 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; case 17: - w[63] = __byte_perm (w[46], w[45], selector); - w[62] = __byte_perm (w[45], w[44], selector); - w[61] = __byte_perm (w[44], w[43], selector); - w[60] = __byte_perm (w[43], w[42], selector); - w[59] = __byte_perm (w[42], w[41], selector); - w[58] = __byte_perm (w[41], w[40], selector); - w[57] = __byte_perm (w[40], w[39], selector); - w[56] = __byte_perm (w[39], w[38], selector); - w[55] = __byte_perm (w[38], w[37], selector); - w[54] = __byte_perm (w[37], w[36], selector); - w[53] = __byte_perm (w[36], w[35], selector); - w[52] = __byte_perm (w[35], w[34], selector); - w[51] = __byte_perm (w[34], w[33], selector); - w[50] = __byte_perm (w[33], w[32], selector); - w[49] = __byte_perm (w[32], w[31], selector); - w[48] = __byte_perm (w[31], w[30], selector); - w[47] = __byte_perm (w[30], w[29], selector); - w[46] = __byte_perm (w[29], w[28], selector); - w[45] = __byte_perm (w[28], w[27], selector); - w[44] = __byte_perm (w[27], w[26], selector); - w[43] = __byte_perm (w[26], w[25], selector); - w[42] = __byte_perm (w[25], w[24], selector); - w[41] = __byte_perm (w[24], w[23], selector); - w[40] = __byte_perm (w[23], w[22], selector); - w[39] = __byte_perm (w[22], w[21], selector); - w[38] = __byte_perm (w[21], w[20], selector); - w[37] = __byte_perm (w[20], w[19], selector); - w[36] = __byte_perm (w[19], w[18], selector); - w[35] = __byte_perm (w[18], w[17], selector); - w[34] = __byte_perm (w[17], w[16], selector); - w[33] = __byte_perm (w[16], w[15], selector); - w[32] = __byte_perm (w[15], w[14], selector); - w[31] = __byte_perm (w[14], w[13], selector); - w[30] = __byte_perm (w[13], w[12], selector); - w[29] = __byte_perm (w[12], w[11], selector); - w[28] = __byte_perm (w[11], w[10], selector); - w[27] = __byte_perm (w[10], w[ 9], selector); - w[26] = __byte_perm (w[ 9], w[ 8], selector); - w[25] = __byte_perm (w[ 8], w[ 7], selector); - w[24] = __byte_perm (w[ 7], w[ 6], selector); - w[23] = __byte_perm (w[ 6], w[ 5], selector); - w[22] = __byte_perm (w[ 5], w[ 4], selector); - w[21] = __byte_perm (w[ 4], w[ 3], selector); - w[20] = __byte_perm (w[ 3], w[ 2], selector); - w[19] = __byte_perm (w[ 2], w[ 1], selector); - w[18] = __byte_perm (w[ 1], w[ 0], selector); - w[17] = __byte_perm (w[ 0], 0, selector); + w[63] = hc_byte_perm (w[46], w[45], selector); + w[62] = hc_byte_perm (w[45], w[44], selector); + w[61] = hc_byte_perm (w[44], w[43], selector); + w[60] = hc_byte_perm (w[43], w[42], selector); + w[59] = hc_byte_perm (w[42], w[41], selector); + w[58] = hc_byte_perm (w[41], w[40], selector); + w[57] = hc_byte_perm (w[40], w[39], selector); + w[56] = hc_byte_perm (w[39], w[38], selector); + w[55] = hc_byte_perm (w[38], w[37], selector); + w[54] = hc_byte_perm (w[37], w[36], selector); + w[53] = hc_byte_perm (w[36], w[35], selector); + w[52] = hc_byte_perm (w[35], w[34], selector); + w[51] = hc_byte_perm (w[34], w[33], selector); + w[50] = hc_byte_perm (w[33], w[32], selector); + w[49] = hc_byte_perm (w[32], w[31], selector); + w[48] = hc_byte_perm (w[31], w[30], selector); + w[47] = hc_byte_perm (w[30], w[29], selector); + w[46] = hc_byte_perm (w[29], w[28], selector); + w[45] = hc_byte_perm (w[28], w[27], selector); + w[44] = hc_byte_perm (w[27], w[26], selector); + w[43] = hc_byte_perm (w[26], w[25], selector); + w[42] = hc_byte_perm (w[25], w[24], selector); + w[41] = hc_byte_perm (w[24], w[23], selector); + w[40] = hc_byte_perm (w[23], w[22], selector); + w[39] = hc_byte_perm (w[22], w[21], selector); + w[38] = hc_byte_perm (w[21], w[20], selector); + w[37] = hc_byte_perm (w[20], w[19], selector); + w[36] = hc_byte_perm (w[19], w[18], selector); + w[35] = hc_byte_perm (w[18], w[17], selector); + w[34] = hc_byte_perm (w[17], w[16], selector); + w[33] = hc_byte_perm (w[16], w[15], selector); + w[32] = hc_byte_perm (w[15], w[14], selector); + w[31] = hc_byte_perm (w[14], w[13], selector); + w[30] = hc_byte_perm (w[13], w[12], selector); + w[29] = hc_byte_perm (w[12], w[11], selector); + w[28] = hc_byte_perm (w[11], w[10], selector); + w[27] = hc_byte_perm (w[10], w[ 9], selector); + w[26] = hc_byte_perm (w[ 9], w[ 8], selector); + w[25] = hc_byte_perm (w[ 8], w[ 7], selector); + w[24] = hc_byte_perm (w[ 7], w[ 6], selector); + w[23] = hc_byte_perm (w[ 6], w[ 5], selector); + w[22] = hc_byte_perm (w[ 5], w[ 4], selector); + w[21] = hc_byte_perm (w[ 4], w[ 3], selector); + w[20] = hc_byte_perm (w[ 3], w[ 2], selector); + w[19] = hc_byte_perm (w[ 2], w[ 1], selector); + w[18] = hc_byte_perm (w[ 1], w[ 0], selector); + w[17] = hc_byte_perm (w[ 0], 0, selector); w[16] = 0; w[15] = 0; w[14] = 0; @@ -25988,52 +25988,52 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; case 18: - w[63] = __byte_perm (w[45], w[44], selector); - w[62] = __byte_perm (w[44], w[43], selector); - w[61] = __byte_perm (w[43], w[42], selector); - w[60] = __byte_perm (w[42], w[41], selector); - w[59] = __byte_perm (w[41], w[40], selector); - w[58] = __byte_perm (w[40], w[39], selector); - w[57] = __byte_perm (w[39], w[38], selector); - w[56] = __byte_perm (w[38], w[37], selector); - w[55] = __byte_perm (w[37], w[36], selector); - w[54] = __byte_perm (w[36], w[35], selector); - w[53] = __byte_perm (w[35], w[34], selector); - w[52] = __byte_perm (w[34], w[33], selector); - w[51] = __byte_perm (w[33], w[32], selector); - w[50] = __byte_perm (w[32], w[31], selector); - w[49] = __byte_perm (w[31], w[30], selector); - w[48] = __byte_perm (w[30], w[29], selector); - w[47] = __byte_perm (w[29], w[28], selector); - w[46] = __byte_perm (w[28], w[27], selector); - w[45] = __byte_perm (w[27], w[26], selector); - w[44] = __byte_perm (w[26], w[25], selector); - w[43] = __byte_perm (w[25], w[24], selector); - w[42] = __byte_perm (w[24], w[23], selector); - w[41] = __byte_perm (w[23], w[22], selector); - w[40] = __byte_perm (w[22], w[21], selector); - w[39] = __byte_perm (w[21], w[20], selector); - w[38] = __byte_perm (w[20], w[19], selector); - w[37] = __byte_perm (w[19], w[18], selector); - w[36] = __byte_perm (w[18], w[17], selector); - w[35] = __byte_perm (w[17], w[16], selector); - w[34] = __byte_perm (w[16], w[15], selector); - w[33] = __byte_perm (w[15], w[14], selector); - w[32] = __byte_perm (w[14], w[13], selector); - w[31] = __byte_perm (w[13], w[12], selector); - w[30] = __byte_perm (w[12], w[11], selector); - w[29] = __byte_perm (w[11], w[10], selector); - w[28] = __byte_perm (w[10], w[ 9], selector); - w[27] = __byte_perm (w[ 9], w[ 8], selector); - w[26] = __byte_perm (w[ 8], w[ 7], selector); - w[25] = __byte_perm (w[ 7], w[ 6], selector); - w[24] = __byte_perm (w[ 6], w[ 5], selector); - w[23] = __byte_perm (w[ 5], w[ 4], selector); - w[22] = __byte_perm (w[ 4], w[ 3], selector); - w[21] = __byte_perm (w[ 3], w[ 2], selector); - w[20] = __byte_perm (w[ 2], w[ 1], selector); - w[19] = __byte_perm (w[ 1], w[ 0], selector); - w[18] = __byte_perm (w[ 0], 0, selector); + w[63] = hc_byte_perm (w[45], w[44], selector); + w[62] = hc_byte_perm (w[44], w[43], selector); + w[61] = hc_byte_perm (w[43], w[42], selector); + w[60] = hc_byte_perm (w[42], w[41], selector); + w[59] = hc_byte_perm (w[41], w[40], selector); + w[58] = hc_byte_perm (w[40], w[39], selector); + w[57] = hc_byte_perm (w[39], w[38], selector); + w[56] = hc_byte_perm (w[38], w[37], selector); + w[55] = hc_byte_perm (w[37], w[36], selector); + w[54] = hc_byte_perm (w[36], w[35], selector); + w[53] = hc_byte_perm (w[35], w[34], selector); + w[52] = hc_byte_perm (w[34], w[33], selector); + w[51] = hc_byte_perm (w[33], w[32], selector); + w[50] = hc_byte_perm (w[32], w[31], selector); + w[49] = hc_byte_perm (w[31], w[30], selector); + w[48] = hc_byte_perm (w[30], w[29], selector); + w[47] = hc_byte_perm (w[29], w[28], selector); + w[46] = hc_byte_perm (w[28], w[27], selector); + w[45] = hc_byte_perm (w[27], w[26], selector); + w[44] = hc_byte_perm (w[26], w[25], selector); + w[43] = hc_byte_perm (w[25], w[24], selector); + w[42] = hc_byte_perm (w[24], w[23], selector); + w[41] = hc_byte_perm (w[23], w[22], selector); + w[40] = hc_byte_perm (w[22], w[21], selector); + w[39] = hc_byte_perm (w[21], w[20], selector); + w[38] = hc_byte_perm (w[20], w[19], selector); + w[37] = hc_byte_perm (w[19], w[18], selector); + w[36] = hc_byte_perm (w[18], w[17], selector); + w[35] = hc_byte_perm (w[17], w[16], selector); + w[34] = hc_byte_perm (w[16], w[15], selector); + w[33] = hc_byte_perm (w[15], w[14], selector); + w[32] = hc_byte_perm (w[14], w[13], selector); + w[31] = hc_byte_perm (w[13], w[12], selector); + w[30] = hc_byte_perm (w[12], w[11], selector); + w[29] = hc_byte_perm (w[11], w[10], selector); + w[28] = hc_byte_perm (w[10], w[ 9], selector); + w[27] = hc_byte_perm (w[ 9], w[ 8], selector); + w[26] = hc_byte_perm (w[ 8], w[ 7], selector); + w[25] = hc_byte_perm (w[ 7], w[ 6], selector); + w[24] = hc_byte_perm (w[ 6], w[ 5], selector); + w[23] = hc_byte_perm (w[ 5], w[ 4], selector); + w[22] = hc_byte_perm (w[ 4], w[ 3], selector); + w[21] = hc_byte_perm (w[ 3], w[ 2], selector); + w[20] = hc_byte_perm (w[ 2], w[ 1], selector); + w[19] = hc_byte_perm (w[ 1], w[ 0], selector); + w[18] = hc_byte_perm (w[ 0], 0, selector); w[17] = 0; w[16] = 0; w[15] = 0; @@ -26056,51 +26056,51 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; case 19: - w[63] = __byte_perm (w[44], w[43], selector); - w[62] = __byte_perm (w[43], w[42], selector); - w[61] = __byte_perm (w[42], w[41], selector); - w[60] = __byte_perm (w[41], w[40], selector); - w[59] = __byte_perm (w[40], w[39], selector); - w[58] = __byte_perm (w[39], w[38], selector); - w[57] = __byte_perm (w[38], w[37], selector); - w[56] = __byte_perm (w[37], w[36], selector); - w[55] = __byte_perm (w[36], w[35], selector); - w[54] = __byte_perm (w[35], w[34], selector); - w[53] = __byte_perm (w[34], w[33], selector); - w[52] = __byte_perm (w[33], w[32], selector); - w[51] = __byte_perm (w[32], w[31], selector); - w[50] = __byte_perm (w[31], w[30], selector); - w[49] = __byte_perm (w[30], w[29], selector); - w[48] = __byte_perm (w[29], w[28], selector); - w[47] = __byte_perm (w[28], w[27], selector); - w[46] = __byte_perm (w[27], w[26], selector); - w[45] = __byte_perm (w[26], w[25], selector); - w[44] = __byte_perm (w[25], w[24], selector); - w[43] = __byte_perm (w[24], w[23], selector); - w[42] = __byte_perm (w[23], w[22], selector); - w[41] = __byte_perm (w[22], w[21], selector); - w[40] = __byte_perm (w[21], w[20], selector); - w[39] = __byte_perm (w[20], w[19], selector); - w[38] = __byte_perm (w[19], w[18], selector); - w[37] = __byte_perm (w[18], w[17], selector); - w[36] = __byte_perm (w[17], w[16], selector); - w[35] = __byte_perm (w[16], w[15], selector); - w[34] = __byte_perm (w[15], w[14], selector); - w[33] = __byte_perm (w[14], w[13], selector); - w[32] = __byte_perm (w[13], w[12], selector); - w[31] = __byte_perm (w[12], w[11], selector); - w[30] = __byte_perm (w[11], w[10], selector); - w[29] = __byte_perm (w[10], w[ 9], selector); - w[28] = __byte_perm (w[ 9], w[ 8], selector); - w[27] = __byte_perm (w[ 8], w[ 7], selector); - w[26] = __byte_perm (w[ 7], w[ 6], selector); - w[25] = __byte_perm (w[ 6], w[ 5], selector); - w[24] = __byte_perm (w[ 5], w[ 4], selector); - w[23] = __byte_perm (w[ 4], w[ 3], selector); - w[22] = __byte_perm (w[ 3], w[ 2], selector); - w[21] = __byte_perm (w[ 2], w[ 1], selector); - w[20] = __byte_perm (w[ 1], w[ 0], selector); - w[19] = __byte_perm (w[ 0], 0, selector); + w[63] = hc_byte_perm (w[44], w[43], selector); + w[62] = hc_byte_perm (w[43], w[42], selector); + w[61] = hc_byte_perm (w[42], w[41], selector); + w[60] = hc_byte_perm (w[41], w[40], selector); + w[59] = hc_byte_perm (w[40], w[39], selector); + w[58] = hc_byte_perm (w[39], w[38], selector); + w[57] = hc_byte_perm (w[38], w[37], selector); + w[56] = hc_byte_perm (w[37], w[36], selector); + w[55] = hc_byte_perm (w[36], w[35], selector); + w[54] = hc_byte_perm (w[35], w[34], selector); + w[53] = hc_byte_perm (w[34], w[33], selector); + w[52] = hc_byte_perm (w[33], w[32], selector); + w[51] = hc_byte_perm (w[32], w[31], selector); + w[50] = hc_byte_perm (w[31], w[30], selector); + w[49] = hc_byte_perm (w[30], w[29], selector); + w[48] = hc_byte_perm (w[29], w[28], selector); + w[47] = hc_byte_perm (w[28], w[27], selector); + w[46] = hc_byte_perm (w[27], w[26], selector); + w[45] = hc_byte_perm (w[26], w[25], selector); + w[44] = hc_byte_perm (w[25], w[24], selector); + w[43] = hc_byte_perm (w[24], w[23], selector); + w[42] = hc_byte_perm (w[23], w[22], selector); + w[41] = hc_byte_perm (w[22], w[21], selector); + w[40] = hc_byte_perm (w[21], w[20], selector); + w[39] = hc_byte_perm (w[20], w[19], selector); + w[38] = hc_byte_perm (w[19], w[18], selector); + w[37] = hc_byte_perm (w[18], w[17], selector); + w[36] = hc_byte_perm (w[17], w[16], selector); + w[35] = hc_byte_perm (w[16], w[15], selector); + w[34] = hc_byte_perm (w[15], w[14], selector); + w[33] = hc_byte_perm (w[14], w[13], selector); + w[32] = hc_byte_perm (w[13], w[12], selector); + w[31] = hc_byte_perm (w[12], w[11], selector); + w[30] = hc_byte_perm (w[11], w[10], selector); + w[29] = hc_byte_perm (w[10], w[ 9], selector); + w[28] = hc_byte_perm (w[ 9], w[ 8], selector); + w[27] = hc_byte_perm (w[ 8], w[ 7], selector); + w[26] = hc_byte_perm (w[ 7], w[ 6], selector); + w[25] = hc_byte_perm (w[ 6], w[ 5], selector); + w[24] = hc_byte_perm (w[ 5], w[ 4], selector); + w[23] = hc_byte_perm (w[ 4], w[ 3], selector); + w[22] = hc_byte_perm (w[ 3], w[ 2], selector); + w[21] = hc_byte_perm (w[ 2], w[ 1], selector); + w[20] = hc_byte_perm (w[ 1], w[ 0], selector); + w[19] = hc_byte_perm (w[ 0], 0, selector); w[18] = 0; w[17] = 0; w[16] = 0; @@ -26124,50 +26124,50 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; case 20: - w[63] = __byte_perm (w[43], w[42], selector); - w[62] = __byte_perm (w[42], w[41], selector); - w[61] = __byte_perm (w[41], w[40], selector); - w[60] = __byte_perm (w[40], w[39], selector); - w[59] = __byte_perm (w[39], w[38], selector); - w[58] = __byte_perm (w[38], w[37], selector); - w[57] = __byte_perm (w[37], w[36], selector); - w[56] = __byte_perm (w[36], w[35], selector); - w[55] = __byte_perm (w[35], w[34], selector); - w[54] = __byte_perm (w[34], w[33], selector); - w[53] = __byte_perm (w[33], w[32], selector); - w[52] = __byte_perm (w[32], w[31], selector); - w[51] = __byte_perm (w[31], w[30], selector); - w[50] = __byte_perm (w[30], w[29], selector); - w[49] = __byte_perm (w[29], w[28], selector); - w[48] = __byte_perm (w[28], w[27], selector); - w[47] = __byte_perm (w[27], w[26], selector); - w[46] = __byte_perm (w[26], w[25], selector); - w[45] = __byte_perm (w[25], w[24], selector); - w[44] = __byte_perm (w[24], w[23], selector); - w[43] = __byte_perm (w[23], w[22], selector); - w[42] = __byte_perm (w[22], w[21], selector); - w[41] = __byte_perm (w[21], w[20], selector); - w[40] = __byte_perm (w[20], w[19], selector); - w[39] = __byte_perm (w[19], w[18], selector); - w[38] = __byte_perm (w[18], w[17], selector); - w[37] = __byte_perm (w[17], w[16], selector); - w[36] = __byte_perm (w[16], w[15], selector); - w[35] = __byte_perm (w[15], w[14], selector); - w[34] = __byte_perm (w[14], w[13], selector); - w[33] = __byte_perm (w[13], w[12], selector); - w[32] = __byte_perm (w[12], w[11], selector); - w[31] = __byte_perm (w[11], w[10], selector); - w[30] = __byte_perm (w[10], w[ 9], selector); - w[29] = __byte_perm (w[ 9], w[ 8], selector); - w[28] = __byte_perm (w[ 8], w[ 7], selector); - w[27] = __byte_perm (w[ 7], w[ 6], selector); - w[26] = __byte_perm (w[ 6], w[ 5], selector); - w[25] = __byte_perm (w[ 5], w[ 4], selector); - w[24] = __byte_perm (w[ 4], w[ 3], selector); - w[23] = __byte_perm (w[ 3], w[ 2], selector); - w[22] = __byte_perm (w[ 2], w[ 1], selector); - w[21] = __byte_perm (w[ 1], w[ 0], selector); - w[20] = __byte_perm (w[ 0], 0, selector); + w[63] = hc_byte_perm (w[43], w[42], selector); + w[62] = hc_byte_perm (w[42], w[41], selector); + w[61] = hc_byte_perm (w[41], w[40], selector); + w[60] = hc_byte_perm (w[40], w[39], selector); + w[59] = hc_byte_perm (w[39], w[38], selector); + w[58] = hc_byte_perm (w[38], w[37], selector); + w[57] = hc_byte_perm (w[37], w[36], selector); + w[56] = hc_byte_perm (w[36], w[35], selector); + w[55] = hc_byte_perm (w[35], w[34], selector); + w[54] = hc_byte_perm (w[34], w[33], selector); + w[53] = hc_byte_perm (w[33], w[32], selector); + w[52] = hc_byte_perm (w[32], w[31], selector); + w[51] = hc_byte_perm (w[31], w[30], selector); + w[50] = hc_byte_perm (w[30], w[29], selector); + w[49] = hc_byte_perm (w[29], w[28], selector); + w[48] = hc_byte_perm (w[28], w[27], selector); + w[47] = hc_byte_perm (w[27], w[26], selector); + w[46] = hc_byte_perm (w[26], w[25], selector); + w[45] = hc_byte_perm (w[25], w[24], selector); + w[44] = hc_byte_perm (w[24], w[23], selector); + w[43] = hc_byte_perm (w[23], w[22], selector); + w[42] = hc_byte_perm (w[22], w[21], selector); + w[41] = hc_byte_perm (w[21], w[20], selector); + w[40] = hc_byte_perm (w[20], w[19], selector); + w[39] = hc_byte_perm (w[19], w[18], selector); + w[38] = hc_byte_perm (w[18], w[17], selector); + w[37] = hc_byte_perm (w[17], w[16], selector); + w[36] = hc_byte_perm (w[16], w[15], selector); + w[35] = hc_byte_perm (w[15], w[14], selector); + w[34] = hc_byte_perm (w[14], w[13], selector); + w[33] = hc_byte_perm (w[13], w[12], selector); + w[32] = hc_byte_perm (w[12], w[11], selector); + w[31] = hc_byte_perm (w[11], w[10], selector); + w[30] = hc_byte_perm (w[10], w[ 9], selector); + w[29] = hc_byte_perm (w[ 9], w[ 8], selector); + w[28] = hc_byte_perm (w[ 8], w[ 7], selector); + w[27] = hc_byte_perm (w[ 7], w[ 6], selector); + w[26] = hc_byte_perm (w[ 6], w[ 5], selector); + w[25] = hc_byte_perm (w[ 5], w[ 4], selector); + w[24] = hc_byte_perm (w[ 4], w[ 3], selector); + w[23] = hc_byte_perm (w[ 3], w[ 2], selector); + w[22] = hc_byte_perm (w[ 2], w[ 1], selector); + w[21] = hc_byte_perm (w[ 1], w[ 0], selector); + w[20] = hc_byte_perm (w[ 0], 0, selector); w[19] = 0; w[18] = 0; w[17] = 0; @@ -26192,49 +26192,49 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; case 21: - w[63] = __byte_perm (w[42], w[41], selector); - w[62] = __byte_perm (w[41], w[40], selector); - w[61] = __byte_perm (w[40], w[39], selector); - w[60] = __byte_perm (w[39], w[38], selector); - w[59] = __byte_perm (w[38], w[37], selector); - w[58] = __byte_perm (w[37], w[36], selector); - w[57] = __byte_perm (w[36], w[35], selector); - w[56] = __byte_perm (w[35], w[34], selector); - w[55] = __byte_perm (w[34], w[33], selector); - w[54] = __byte_perm (w[33], w[32], selector); - w[53] = __byte_perm (w[32], w[31], selector); - w[52] = __byte_perm (w[31], w[30], selector); - w[51] = __byte_perm (w[30], w[29], selector); - w[50] = __byte_perm (w[29], w[28], selector); - w[49] = __byte_perm (w[28], w[27], selector); - w[48] = __byte_perm (w[27], w[26], selector); - w[47] = __byte_perm (w[26], w[25], selector); - w[46] = __byte_perm (w[25], w[24], selector); - w[45] = __byte_perm (w[24], w[23], selector); - w[44] = __byte_perm (w[23], w[22], selector); - w[43] = __byte_perm (w[22], w[21], selector); - w[42] = __byte_perm (w[21], w[20], selector); - w[41] = __byte_perm (w[20], w[19], selector); - w[40] = __byte_perm (w[19], w[18], selector); - w[39] = __byte_perm (w[18], w[17], selector); - w[38] = __byte_perm (w[17], w[16], selector); - w[37] = __byte_perm (w[16], w[15], selector); - w[36] = __byte_perm (w[15], w[14], selector); - w[35] = __byte_perm (w[14], w[13], selector); - w[34] = __byte_perm (w[13], w[12], selector); - w[33] = __byte_perm (w[12], w[11], selector); - w[32] = __byte_perm (w[11], w[10], selector); - w[31] = __byte_perm (w[10], w[ 9], selector); - w[30] = __byte_perm (w[ 9], w[ 8], selector); - w[29] = __byte_perm (w[ 8], w[ 7], selector); - w[28] = __byte_perm (w[ 7], w[ 6], selector); - w[27] = __byte_perm (w[ 6], w[ 5], selector); - w[26] = __byte_perm (w[ 5], w[ 4], selector); - w[25] = __byte_perm (w[ 4], w[ 3], selector); - w[24] = __byte_perm (w[ 3], w[ 2], selector); - w[23] = __byte_perm (w[ 2], w[ 1], selector); - w[22] = __byte_perm (w[ 1], w[ 0], selector); - w[21] = __byte_perm (w[ 0], 0, selector); + w[63] = hc_byte_perm (w[42], w[41], selector); + w[62] = hc_byte_perm (w[41], w[40], selector); + w[61] = hc_byte_perm (w[40], w[39], selector); + w[60] = hc_byte_perm (w[39], w[38], selector); + w[59] = hc_byte_perm (w[38], w[37], selector); + w[58] = hc_byte_perm (w[37], w[36], selector); + w[57] = hc_byte_perm (w[36], w[35], selector); + w[56] = hc_byte_perm (w[35], w[34], selector); + w[55] = hc_byte_perm (w[34], w[33], selector); + w[54] = hc_byte_perm (w[33], w[32], selector); + w[53] = hc_byte_perm (w[32], w[31], selector); + w[52] = hc_byte_perm (w[31], w[30], selector); + w[51] = hc_byte_perm (w[30], w[29], selector); + w[50] = hc_byte_perm (w[29], w[28], selector); + w[49] = hc_byte_perm (w[28], w[27], selector); + w[48] = hc_byte_perm (w[27], w[26], selector); + w[47] = hc_byte_perm (w[26], w[25], selector); + w[46] = hc_byte_perm (w[25], w[24], selector); + w[45] = hc_byte_perm (w[24], w[23], selector); + w[44] = hc_byte_perm (w[23], w[22], selector); + w[43] = hc_byte_perm (w[22], w[21], selector); + w[42] = hc_byte_perm (w[21], w[20], selector); + w[41] = hc_byte_perm (w[20], w[19], selector); + w[40] = hc_byte_perm (w[19], w[18], selector); + w[39] = hc_byte_perm (w[18], w[17], selector); + w[38] = hc_byte_perm (w[17], w[16], selector); + w[37] = hc_byte_perm (w[16], w[15], selector); + w[36] = hc_byte_perm (w[15], w[14], selector); + w[35] = hc_byte_perm (w[14], w[13], selector); + w[34] = hc_byte_perm (w[13], w[12], selector); + w[33] = hc_byte_perm (w[12], w[11], selector); + w[32] = hc_byte_perm (w[11], w[10], selector); + w[31] = hc_byte_perm (w[10], w[ 9], selector); + w[30] = hc_byte_perm (w[ 9], w[ 8], selector); + w[29] = hc_byte_perm (w[ 8], w[ 7], selector); + w[28] = hc_byte_perm (w[ 7], w[ 6], selector); + w[27] = hc_byte_perm (w[ 6], w[ 5], selector); + w[26] = hc_byte_perm (w[ 5], w[ 4], selector); + w[25] = hc_byte_perm (w[ 4], w[ 3], selector); + w[24] = hc_byte_perm (w[ 3], w[ 2], selector); + w[23] = hc_byte_perm (w[ 2], w[ 1], selector); + w[22] = hc_byte_perm (w[ 1], w[ 0], selector); + w[21] = hc_byte_perm (w[ 0], 0, selector); w[20] = 0; w[19] = 0; w[18] = 0; @@ -26260,48 +26260,48 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; case 22: - w[63] = __byte_perm (w[41], w[40], selector); - w[62] = __byte_perm (w[40], w[39], selector); - w[61] = __byte_perm (w[39], w[38], selector); - w[60] = __byte_perm (w[38], w[37], selector); - w[59] = __byte_perm (w[37], w[36], selector); - w[58] = __byte_perm (w[36], w[35], selector); - w[57] = __byte_perm (w[35], w[34], selector); - w[56] = __byte_perm (w[34], w[33], selector); - w[55] = __byte_perm (w[33], w[32], selector); - w[54] = __byte_perm (w[32], w[31], selector); - w[53] = __byte_perm (w[31], w[30], selector); - w[52] = __byte_perm (w[30], w[29], selector); - w[51] = __byte_perm (w[29], w[28], selector); - w[50] = __byte_perm (w[28], w[27], selector); - w[49] = __byte_perm (w[27], w[26], selector); - w[48] = __byte_perm (w[26], w[25], selector); - w[47] = __byte_perm (w[25], w[24], selector); - w[46] = __byte_perm (w[24], w[23], selector); - w[45] = __byte_perm (w[23], w[22], selector); - w[44] = __byte_perm (w[22], w[21], selector); - w[43] = __byte_perm (w[21], w[20], selector); - w[42] = __byte_perm (w[20], w[19], selector); - w[41] = __byte_perm (w[19], w[18], selector); - w[40] = __byte_perm (w[18], w[17], selector); - w[39] = __byte_perm (w[17], w[16], selector); - w[38] = __byte_perm (w[16], w[15], selector); - w[37] = __byte_perm (w[15], w[14], selector); - w[36] = __byte_perm (w[14], w[13], selector); - w[35] = __byte_perm (w[13], w[12], selector); - w[34] = __byte_perm (w[12], w[11], selector); - w[33] = __byte_perm (w[11], w[10], selector); - w[32] = __byte_perm (w[10], w[ 9], selector); - w[31] = __byte_perm (w[ 9], w[ 8], selector); - w[30] = __byte_perm (w[ 8], w[ 7], selector); - w[29] = __byte_perm (w[ 7], w[ 6], selector); - w[28] = __byte_perm (w[ 6], w[ 5], selector); - w[27] = __byte_perm (w[ 5], w[ 4], selector); - w[26] = __byte_perm (w[ 4], w[ 3], selector); - w[25] = __byte_perm (w[ 3], w[ 2], selector); - w[24] = __byte_perm (w[ 2], w[ 1], selector); - w[23] = __byte_perm (w[ 1], w[ 0], selector); - w[22] = __byte_perm (w[ 0], 0, selector); + w[63] = hc_byte_perm (w[41], w[40], selector); + w[62] = hc_byte_perm (w[40], w[39], selector); + w[61] = hc_byte_perm (w[39], w[38], selector); + w[60] = hc_byte_perm (w[38], w[37], selector); + w[59] = hc_byte_perm (w[37], w[36], selector); + w[58] = hc_byte_perm (w[36], w[35], selector); + w[57] = hc_byte_perm (w[35], w[34], selector); + w[56] = hc_byte_perm (w[34], w[33], selector); + w[55] = hc_byte_perm (w[33], w[32], selector); + w[54] = hc_byte_perm (w[32], w[31], selector); + w[53] = hc_byte_perm (w[31], w[30], selector); + w[52] = hc_byte_perm (w[30], w[29], selector); + w[51] = hc_byte_perm (w[29], w[28], selector); + w[50] = hc_byte_perm (w[28], w[27], selector); + w[49] = hc_byte_perm (w[27], w[26], selector); + w[48] = hc_byte_perm (w[26], w[25], selector); + w[47] = hc_byte_perm (w[25], w[24], selector); + w[46] = hc_byte_perm (w[24], w[23], selector); + w[45] = hc_byte_perm (w[23], w[22], selector); + w[44] = hc_byte_perm (w[22], w[21], selector); + w[43] = hc_byte_perm (w[21], w[20], selector); + w[42] = hc_byte_perm (w[20], w[19], selector); + w[41] = hc_byte_perm (w[19], w[18], selector); + w[40] = hc_byte_perm (w[18], w[17], selector); + w[39] = hc_byte_perm (w[17], w[16], selector); + w[38] = hc_byte_perm (w[16], w[15], selector); + w[37] = hc_byte_perm (w[15], w[14], selector); + w[36] = hc_byte_perm (w[14], w[13], selector); + w[35] = hc_byte_perm (w[13], w[12], selector); + w[34] = hc_byte_perm (w[12], w[11], selector); + w[33] = hc_byte_perm (w[11], w[10], selector); + w[32] = hc_byte_perm (w[10], w[ 9], selector); + w[31] = hc_byte_perm (w[ 9], w[ 8], selector); + w[30] = hc_byte_perm (w[ 8], w[ 7], selector); + w[29] = hc_byte_perm (w[ 7], w[ 6], selector); + w[28] = hc_byte_perm (w[ 6], w[ 5], selector); + w[27] = hc_byte_perm (w[ 5], w[ 4], selector); + w[26] = hc_byte_perm (w[ 4], w[ 3], selector); + w[25] = hc_byte_perm (w[ 3], w[ 2], selector); + w[24] = hc_byte_perm (w[ 2], w[ 1], selector); + w[23] = hc_byte_perm (w[ 1], w[ 0], selector); + w[22] = hc_byte_perm (w[ 0], 0, selector); w[21] = 0; w[20] = 0; w[19] = 0; @@ -26328,47 +26328,47 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; case 23: - w[63] = __byte_perm (w[40], w[39], selector); - w[62] = __byte_perm (w[39], w[38], selector); - w[61] = __byte_perm (w[38], w[37], selector); - w[60] = __byte_perm (w[37], w[36], selector); - w[59] = __byte_perm (w[36], w[35], selector); - w[58] = __byte_perm (w[35], w[34], selector); - w[57] = __byte_perm (w[34], w[33], selector); - w[56] = __byte_perm (w[33], w[32], selector); - w[55] = __byte_perm (w[32], w[31], selector); - w[54] = __byte_perm (w[31], w[30], selector); - w[53] = __byte_perm (w[30], w[29], selector); - w[52] = __byte_perm (w[29], w[28], selector); - w[51] = __byte_perm (w[28], w[27], selector); - w[50] = __byte_perm (w[27], w[26], selector); - w[49] = __byte_perm (w[26], w[25], selector); - w[48] = __byte_perm (w[25], w[24], selector); - w[47] = __byte_perm (w[24], w[23], selector); - w[46] = __byte_perm (w[23], w[22], selector); - w[45] = __byte_perm (w[22], w[21], selector); - w[44] = __byte_perm (w[21], w[20], selector); - w[43] = __byte_perm (w[20], w[19], selector); - w[42] = __byte_perm (w[19], w[18], selector); - w[41] = __byte_perm (w[18], w[17], selector); - w[40] = __byte_perm (w[17], w[16], selector); - w[39] = __byte_perm (w[16], w[15], selector); - w[38] = __byte_perm (w[15], w[14], selector); - w[37] = __byte_perm (w[14], w[13], selector); - w[36] = __byte_perm (w[13], w[12], selector); - w[35] = __byte_perm (w[12], w[11], selector); - w[34] = __byte_perm (w[11], w[10], selector); - w[33] = __byte_perm (w[10], w[ 9], selector); - w[32] = __byte_perm (w[ 9], w[ 8], selector); - w[31] = __byte_perm (w[ 8], w[ 7], selector); - w[30] = __byte_perm (w[ 7], w[ 6], selector); - w[29] = __byte_perm (w[ 6], w[ 5], selector); - w[28] = __byte_perm (w[ 5], w[ 4], selector); - w[27] = __byte_perm (w[ 4], w[ 3], selector); - w[26] = __byte_perm (w[ 3], w[ 2], selector); - w[25] = __byte_perm (w[ 2], w[ 1], selector); - w[24] = __byte_perm (w[ 1], w[ 0], selector); - w[23] = __byte_perm (w[ 0], 0, selector); + w[63] = hc_byte_perm (w[40], w[39], selector); + w[62] = hc_byte_perm (w[39], w[38], selector); + w[61] = hc_byte_perm (w[38], w[37], selector); + w[60] = hc_byte_perm (w[37], w[36], selector); + w[59] = hc_byte_perm (w[36], w[35], selector); + w[58] = hc_byte_perm (w[35], w[34], selector); + w[57] = hc_byte_perm (w[34], w[33], selector); + w[56] = hc_byte_perm (w[33], w[32], selector); + w[55] = hc_byte_perm (w[32], w[31], selector); + w[54] = hc_byte_perm (w[31], w[30], selector); + w[53] = hc_byte_perm (w[30], w[29], selector); + w[52] = hc_byte_perm (w[29], w[28], selector); + w[51] = hc_byte_perm (w[28], w[27], selector); + w[50] = hc_byte_perm (w[27], w[26], selector); + w[49] = hc_byte_perm (w[26], w[25], selector); + w[48] = hc_byte_perm (w[25], w[24], selector); + w[47] = hc_byte_perm (w[24], w[23], selector); + w[46] = hc_byte_perm (w[23], w[22], selector); + w[45] = hc_byte_perm (w[22], w[21], selector); + w[44] = hc_byte_perm (w[21], w[20], selector); + w[43] = hc_byte_perm (w[20], w[19], selector); + w[42] = hc_byte_perm (w[19], w[18], selector); + w[41] = hc_byte_perm (w[18], w[17], selector); + w[40] = hc_byte_perm (w[17], w[16], selector); + w[39] = hc_byte_perm (w[16], w[15], selector); + w[38] = hc_byte_perm (w[15], w[14], selector); + w[37] = hc_byte_perm (w[14], w[13], selector); + w[36] = hc_byte_perm (w[13], w[12], selector); + w[35] = hc_byte_perm (w[12], w[11], selector); + w[34] = hc_byte_perm (w[11], w[10], selector); + w[33] = hc_byte_perm (w[10], w[ 9], selector); + w[32] = hc_byte_perm (w[ 9], w[ 8], selector); + w[31] = hc_byte_perm (w[ 8], w[ 7], selector); + w[30] = hc_byte_perm (w[ 7], w[ 6], selector); + w[29] = hc_byte_perm (w[ 6], w[ 5], selector); + w[28] = hc_byte_perm (w[ 5], w[ 4], selector); + w[27] = hc_byte_perm (w[ 4], w[ 3], selector); + w[26] = hc_byte_perm (w[ 3], w[ 2], selector); + w[25] = hc_byte_perm (w[ 2], w[ 1], selector); + w[24] = hc_byte_perm (w[ 1], w[ 0], selector); + w[23] = hc_byte_perm (w[ 0], 0, selector); w[22] = 0; w[21] = 0; w[20] = 0; @@ -26396,46 +26396,46 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; case 24: - w[63] = __byte_perm (w[39], w[38], selector); - w[62] = __byte_perm (w[38], w[37], selector); - w[61] = __byte_perm (w[37], w[36], selector); - w[60] = __byte_perm (w[36], w[35], selector); - w[59] = __byte_perm (w[35], w[34], selector); - w[58] = __byte_perm (w[34], w[33], selector); - w[57] = __byte_perm (w[33], w[32], selector); - w[56] = __byte_perm (w[32], w[31], selector); - w[55] = __byte_perm (w[31], w[30], selector); - w[54] = __byte_perm (w[30], w[29], selector); - w[53] = __byte_perm (w[29], w[28], selector); - w[52] = __byte_perm (w[28], w[27], selector); - w[51] = __byte_perm (w[27], w[26], selector); - w[50] = __byte_perm (w[26], w[25], selector); - w[49] = __byte_perm (w[25], w[24], selector); - w[48] = __byte_perm (w[24], w[23], selector); - w[47] = __byte_perm (w[23], w[22], selector); - w[46] = __byte_perm (w[22], w[21], selector); - w[45] = __byte_perm (w[21], w[20], selector); - w[44] = __byte_perm (w[20], w[19], selector); - w[43] = __byte_perm (w[19], w[18], selector); - w[42] = __byte_perm (w[18], w[17], selector); - w[41] = __byte_perm (w[17], w[16], selector); - w[40] = __byte_perm (w[16], w[15], selector); - w[39] = __byte_perm (w[15], w[14], selector); - w[38] = __byte_perm (w[14], w[13], selector); - w[37] = __byte_perm (w[13], w[12], selector); - w[36] = __byte_perm (w[12], w[11], selector); - w[35] = __byte_perm (w[11], w[10], selector); - w[34] = __byte_perm (w[10], w[ 9], selector); - w[33] = __byte_perm (w[ 9], w[ 8], selector); - w[32] = __byte_perm (w[ 8], w[ 7], selector); - w[31] = __byte_perm (w[ 7], w[ 6], selector); - w[30] = __byte_perm (w[ 6], w[ 5], selector); - w[29] = __byte_perm (w[ 5], w[ 4], selector); - w[28] = __byte_perm (w[ 4], w[ 3], selector); - w[27] = __byte_perm (w[ 3], w[ 2], selector); - w[26] = __byte_perm (w[ 2], w[ 1], selector); - w[25] = __byte_perm (w[ 1], w[ 0], selector); - w[24] = __byte_perm (w[ 0], 0, selector); + w[63] = hc_byte_perm (w[39], w[38], selector); + w[62] = hc_byte_perm (w[38], w[37], selector); + w[61] = hc_byte_perm (w[37], w[36], selector); + w[60] = hc_byte_perm (w[36], w[35], selector); + w[59] = hc_byte_perm (w[35], w[34], selector); + w[58] = hc_byte_perm (w[34], w[33], selector); + w[57] = hc_byte_perm (w[33], w[32], selector); + w[56] = hc_byte_perm (w[32], w[31], selector); + w[55] = hc_byte_perm (w[31], w[30], selector); + w[54] = hc_byte_perm (w[30], w[29], selector); + w[53] = hc_byte_perm (w[29], w[28], selector); + w[52] = hc_byte_perm (w[28], w[27], selector); + w[51] = hc_byte_perm (w[27], w[26], selector); + w[50] = hc_byte_perm (w[26], w[25], selector); + w[49] = hc_byte_perm (w[25], w[24], selector); + w[48] = hc_byte_perm (w[24], w[23], selector); + w[47] = hc_byte_perm (w[23], w[22], selector); + w[46] = hc_byte_perm (w[22], w[21], selector); + w[45] = hc_byte_perm (w[21], w[20], selector); + w[44] = hc_byte_perm (w[20], w[19], selector); + w[43] = hc_byte_perm (w[19], w[18], selector); + w[42] = hc_byte_perm (w[18], w[17], selector); + w[41] = hc_byte_perm (w[17], w[16], selector); + w[40] = hc_byte_perm (w[16], w[15], selector); + w[39] = hc_byte_perm (w[15], w[14], selector); + w[38] = hc_byte_perm (w[14], w[13], selector); + w[37] = hc_byte_perm (w[13], w[12], selector); + w[36] = hc_byte_perm (w[12], w[11], selector); + w[35] = hc_byte_perm (w[11], w[10], selector); + w[34] = hc_byte_perm (w[10], w[ 9], selector); + w[33] = hc_byte_perm (w[ 9], w[ 8], selector); + w[32] = hc_byte_perm (w[ 8], w[ 7], selector); + w[31] = hc_byte_perm (w[ 7], w[ 6], selector); + w[30] = hc_byte_perm (w[ 6], w[ 5], selector); + w[29] = hc_byte_perm (w[ 5], w[ 4], selector); + w[28] = hc_byte_perm (w[ 4], w[ 3], selector); + w[27] = hc_byte_perm (w[ 3], w[ 2], selector); + w[26] = hc_byte_perm (w[ 2], w[ 1], selector); + w[25] = hc_byte_perm (w[ 1], w[ 0], selector); + w[24] = hc_byte_perm (w[ 0], 0, selector); w[23] = 0; w[22] = 0; w[21] = 0; @@ -26464,45 +26464,45 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; case 25: - w[63] = __byte_perm (w[38], w[37], selector); - w[62] = __byte_perm (w[37], w[36], selector); - w[61] = __byte_perm (w[36], w[35], selector); - w[60] = __byte_perm (w[35], w[34], selector); - w[59] = __byte_perm (w[34], w[33], selector); - w[58] = __byte_perm (w[33], w[32], selector); - w[57] = __byte_perm (w[32], w[31], selector); - w[56] = __byte_perm (w[31], w[30], selector); - w[55] = __byte_perm (w[30], w[29], selector); - w[54] = __byte_perm (w[29], w[28], selector); - w[53] = __byte_perm (w[28], w[27], selector); - w[52] = __byte_perm (w[27], w[26], selector); - w[51] = __byte_perm (w[26], w[25], selector); - w[50] = __byte_perm (w[25], w[24], selector); - w[49] = __byte_perm (w[24], w[23], selector); - w[48] = __byte_perm (w[23], w[22], selector); - w[47] = __byte_perm (w[22], w[21], selector); - w[46] = __byte_perm (w[21], w[20], selector); - w[45] = __byte_perm (w[20], w[19], selector); - w[44] = __byte_perm (w[19], w[18], selector); - w[43] = __byte_perm (w[18], w[17], selector); - w[42] = __byte_perm (w[17], w[16], selector); - w[41] = __byte_perm (w[16], w[15], selector); - w[40] = __byte_perm (w[15], w[14], selector); - w[39] = __byte_perm (w[14], w[13], selector); - w[38] = __byte_perm (w[13], w[12], selector); - w[37] = __byte_perm (w[12], w[11], selector); - w[36] = __byte_perm (w[11], w[10], selector); - w[35] = __byte_perm (w[10], w[ 9], selector); - w[34] = __byte_perm (w[ 9], w[ 8], selector); - w[33] = __byte_perm (w[ 8], w[ 7], selector); - w[32] = __byte_perm (w[ 7], w[ 6], selector); - w[31] = __byte_perm (w[ 6], w[ 5], selector); - w[30] = __byte_perm (w[ 5], w[ 4], selector); - w[29] = __byte_perm (w[ 4], w[ 3], selector); - w[28] = __byte_perm (w[ 3], w[ 2], selector); - w[27] = __byte_perm (w[ 2], w[ 1], selector); - w[26] = __byte_perm (w[ 1], w[ 0], selector); - w[25] = __byte_perm (w[ 0], 0, selector); + w[63] = hc_byte_perm (w[38], w[37], selector); + w[62] = hc_byte_perm (w[37], w[36], selector); + w[61] = hc_byte_perm (w[36], w[35], selector); + w[60] = hc_byte_perm (w[35], w[34], selector); + w[59] = hc_byte_perm (w[34], w[33], selector); + w[58] = hc_byte_perm (w[33], w[32], selector); + w[57] = hc_byte_perm (w[32], w[31], selector); + w[56] = hc_byte_perm (w[31], w[30], selector); + w[55] = hc_byte_perm (w[30], w[29], selector); + w[54] = hc_byte_perm (w[29], w[28], selector); + w[53] = hc_byte_perm (w[28], w[27], selector); + w[52] = hc_byte_perm (w[27], w[26], selector); + w[51] = hc_byte_perm (w[26], w[25], selector); + w[50] = hc_byte_perm (w[25], w[24], selector); + w[49] = hc_byte_perm (w[24], w[23], selector); + w[48] = hc_byte_perm (w[23], w[22], selector); + w[47] = hc_byte_perm (w[22], w[21], selector); + w[46] = hc_byte_perm (w[21], w[20], selector); + w[45] = hc_byte_perm (w[20], w[19], selector); + w[44] = hc_byte_perm (w[19], w[18], selector); + w[43] = hc_byte_perm (w[18], w[17], selector); + w[42] = hc_byte_perm (w[17], w[16], selector); + w[41] = hc_byte_perm (w[16], w[15], selector); + w[40] = hc_byte_perm (w[15], w[14], selector); + w[39] = hc_byte_perm (w[14], w[13], selector); + w[38] = hc_byte_perm (w[13], w[12], selector); + w[37] = hc_byte_perm (w[12], w[11], selector); + w[36] = hc_byte_perm (w[11], w[10], selector); + w[35] = hc_byte_perm (w[10], w[ 9], selector); + w[34] = hc_byte_perm (w[ 9], w[ 8], selector); + w[33] = hc_byte_perm (w[ 8], w[ 7], selector); + w[32] = hc_byte_perm (w[ 7], w[ 6], selector); + w[31] = hc_byte_perm (w[ 6], w[ 5], selector); + w[30] = hc_byte_perm (w[ 5], w[ 4], selector); + w[29] = hc_byte_perm (w[ 4], w[ 3], selector); + w[28] = hc_byte_perm (w[ 3], w[ 2], selector); + w[27] = hc_byte_perm (w[ 2], w[ 1], selector); + w[26] = hc_byte_perm (w[ 1], w[ 0], selector); + w[25] = hc_byte_perm (w[ 0], 0, selector); w[24] = 0; w[23] = 0; w[22] = 0; @@ -26532,44 +26532,44 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; case 26: - w[63] = __byte_perm (w[37], w[36], selector); - w[62] = __byte_perm (w[36], w[35], selector); - w[61] = __byte_perm (w[35], w[34], selector); - w[60] = __byte_perm (w[34], w[33], selector); - w[59] = __byte_perm (w[33], w[32], selector); - w[58] = __byte_perm (w[32], w[31], selector); - w[57] = __byte_perm (w[31], w[30], selector); - w[56] = __byte_perm (w[30], w[29], selector); - w[55] = __byte_perm (w[29], w[28], selector); - w[54] = __byte_perm (w[28], w[27], selector); - w[53] = __byte_perm (w[27], w[26], selector); - w[52] = __byte_perm (w[26], w[25], selector); - w[51] = __byte_perm (w[25], w[24], selector); - w[50] = __byte_perm (w[24], w[23], selector); - w[49] = __byte_perm (w[23], w[22], selector); - w[48] = __byte_perm (w[22], w[21], selector); - w[47] = __byte_perm (w[21], w[20], selector); - w[46] = __byte_perm (w[20], w[19], selector); - w[45] = __byte_perm (w[19], w[18], selector); - w[44] = __byte_perm (w[18], w[17], selector); - w[43] = __byte_perm (w[17], w[16], selector); - w[42] = __byte_perm (w[16], w[15], selector); - w[41] = __byte_perm (w[15], w[14], selector); - w[40] = __byte_perm (w[14], w[13], selector); - w[39] = __byte_perm (w[13], w[12], selector); - w[38] = __byte_perm (w[12], w[11], selector); - w[37] = __byte_perm (w[11], w[10], selector); - w[36] = __byte_perm (w[10], w[ 9], selector); - w[35] = __byte_perm (w[ 9], w[ 8], selector); - w[34] = __byte_perm (w[ 8], w[ 7], selector); - w[33] = __byte_perm (w[ 7], w[ 6], selector); - w[32] = __byte_perm (w[ 6], w[ 5], selector); - w[31] = __byte_perm (w[ 5], w[ 4], selector); - w[30] = __byte_perm (w[ 4], w[ 3], selector); - w[29] = __byte_perm (w[ 3], w[ 2], selector); - w[28] = __byte_perm (w[ 2], w[ 1], selector); - w[27] = __byte_perm (w[ 1], w[ 0], selector); - w[26] = __byte_perm (w[ 0], 0, selector); + w[63] = hc_byte_perm (w[37], w[36], selector); + w[62] = hc_byte_perm (w[36], w[35], selector); + w[61] = hc_byte_perm (w[35], w[34], selector); + w[60] = hc_byte_perm (w[34], w[33], selector); + w[59] = hc_byte_perm (w[33], w[32], selector); + w[58] = hc_byte_perm (w[32], w[31], selector); + w[57] = hc_byte_perm (w[31], w[30], selector); + w[56] = hc_byte_perm (w[30], w[29], selector); + w[55] = hc_byte_perm (w[29], w[28], selector); + w[54] = hc_byte_perm (w[28], w[27], selector); + w[53] = hc_byte_perm (w[27], w[26], selector); + w[52] = hc_byte_perm (w[26], w[25], selector); + w[51] = hc_byte_perm (w[25], w[24], selector); + w[50] = hc_byte_perm (w[24], w[23], selector); + w[49] = hc_byte_perm (w[23], w[22], selector); + w[48] = hc_byte_perm (w[22], w[21], selector); + w[47] = hc_byte_perm (w[21], w[20], selector); + w[46] = hc_byte_perm (w[20], w[19], selector); + w[45] = hc_byte_perm (w[19], w[18], selector); + w[44] = hc_byte_perm (w[18], w[17], selector); + w[43] = hc_byte_perm (w[17], w[16], selector); + w[42] = hc_byte_perm (w[16], w[15], selector); + w[41] = hc_byte_perm (w[15], w[14], selector); + w[40] = hc_byte_perm (w[14], w[13], selector); + w[39] = hc_byte_perm (w[13], w[12], selector); + w[38] = hc_byte_perm (w[12], w[11], selector); + w[37] = hc_byte_perm (w[11], w[10], selector); + w[36] = hc_byte_perm (w[10], w[ 9], selector); + w[35] = hc_byte_perm (w[ 9], w[ 8], selector); + w[34] = hc_byte_perm (w[ 8], w[ 7], selector); + w[33] = hc_byte_perm (w[ 7], w[ 6], selector); + w[32] = hc_byte_perm (w[ 6], w[ 5], selector); + w[31] = hc_byte_perm (w[ 5], w[ 4], selector); + w[30] = hc_byte_perm (w[ 4], w[ 3], selector); + w[29] = hc_byte_perm (w[ 3], w[ 2], selector); + w[28] = hc_byte_perm (w[ 2], w[ 1], selector); + w[27] = hc_byte_perm (w[ 1], w[ 0], selector); + w[26] = hc_byte_perm (w[ 0], 0, selector); w[25] = 0; w[24] = 0; w[23] = 0; @@ -26600,43 +26600,43 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; case 27: - w[63] = __byte_perm (w[36], w[35], selector); - w[62] = __byte_perm (w[35], w[34], selector); - w[61] = __byte_perm (w[34], w[33], selector); - w[60] = __byte_perm (w[33], w[32], selector); - w[59] = __byte_perm (w[32], w[31], selector); - w[58] = __byte_perm (w[31], w[30], selector); - w[57] = __byte_perm (w[30], w[29], selector); - w[56] = __byte_perm (w[29], w[28], selector); - w[55] = __byte_perm (w[28], w[27], selector); - w[54] = __byte_perm (w[27], w[26], selector); - w[53] = __byte_perm (w[26], w[25], selector); - w[52] = __byte_perm (w[25], w[24], selector); - w[51] = __byte_perm (w[24], w[23], selector); - w[50] = __byte_perm (w[23], w[22], selector); - w[49] = __byte_perm (w[22], w[21], selector); - w[48] = __byte_perm (w[21], w[20], selector); - w[47] = __byte_perm (w[20], w[19], selector); - w[46] = __byte_perm (w[19], w[18], selector); - w[45] = __byte_perm (w[18], w[17], selector); - w[44] = __byte_perm (w[17], w[16], selector); - w[43] = __byte_perm (w[16], w[15], selector); - w[42] = __byte_perm (w[15], w[14], selector); - w[41] = __byte_perm (w[14], w[13], selector); - w[40] = __byte_perm (w[13], w[12], selector); - w[39] = __byte_perm (w[12], w[11], selector); - w[38] = __byte_perm (w[11], w[10], selector); - w[37] = __byte_perm (w[10], w[ 9], selector); - w[36] = __byte_perm (w[ 9], w[ 8], selector); - w[35] = __byte_perm (w[ 8], w[ 7], selector); - w[34] = __byte_perm (w[ 7], w[ 6], selector); - w[33] = __byte_perm (w[ 6], w[ 5], selector); - w[32] = __byte_perm (w[ 5], w[ 4], selector); - w[31] = __byte_perm (w[ 4], w[ 3], selector); - w[30] = __byte_perm (w[ 3], w[ 2], selector); - w[29] = __byte_perm (w[ 2], w[ 1], selector); - w[28] = __byte_perm (w[ 1], w[ 0], selector); - w[27] = __byte_perm (w[ 0], 0, selector); + w[63] = hc_byte_perm (w[36], w[35], selector); + w[62] = hc_byte_perm (w[35], w[34], selector); + w[61] = hc_byte_perm (w[34], w[33], selector); + w[60] = hc_byte_perm (w[33], w[32], selector); + w[59] = hc_byte_perm (w[32], w[31], selector); + w[58] = hc_byte_perm (w[31], w[30], selector); + w[57] = hc_byte_perm (w[30], w[29], selector); + w[56] = hc_byte_perm (w[29], w[28], selector); + w[55] = hc_byte_perm (w[28], w[27], selector); + w[54] = hc_byte_perm (w[27], w[26], selector); + w[53] = hc_byte_perm (w[26], w[25], selector); + w[52] = hc_byte_perm (w[25], w[24], selector); + w[51] = hc_byte_perm (w[24], w[23], selector); + w[50] = hc_byte_perm (w[23], w[22], selector); + w[49] = hc_byte_perm (w[22], w[21], selector); + w[48] = hc_byte_perm (w[21], w[20], selector); + w[47] = hc_byte_perm (w[20], w[19], selector); + w[46] = hc_byte_perm (w[19], w[18], selector); + w[45] = hc_byte_perm (w[18], w[17], selector); + w[44] = hc_byte_perm (w[17], w[16], selector); + w[43] = hc_byte_perm (w[16], w[15], selector); + w[42] = hc_byte_perm (w[15], w[14], selector); + w[41] = hc_byte_perm (w[14], w[13], selector); + w[40] = hc_byte_perm (w[13], w[12], selector); + w[39] = hc_byte_perm (w[12], w[11], selector); + w[38] = hc_byte_perm (w[11], w[10], selector); + w[37] = hc_byte_perm (w[10], w[ 9], selector); + w[36] = hc_byte_perm (w[ 9], w[ 8], selector); + w[35] = hc_byte_perm (w[ 8], w[ 7], selector); + w[34] = hc_byte_perm (w[ 7], w[ 6], selector); + w[33] = hc_byte_perm (w[ 6], w[ 5], selector); + w[32] = hc_byte_perm (w[ 5], w[ 4], selector); + w[31] = hc_byte_perm (w[ 4], w[ 3], selector); + w[30] = hc_byte_perm (w[ 3], w[ 2], selector); + w[29] = hc_byte_perm (w[ 2], w[ 1], selector); + w[28] = hc_byte_perm (w[ 1], w[ 0], selector); + w[27] = hc_byte_perm (w[ 0], 0, selector); w[26] = 0; w[25] = 0; w[24] = 0; @@ -26668,42 +26668,42 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; case 28: - w[63] = __byte_perm (w[35], w[34], selector); - w[62] = __byte_perm (w[34], w[33], selector); - w[61] = __byte_perm (w[33], w[32], selector); - w[60] = __byte_perm (w[32], w[31], selector); - w[59] = __byte_perm (w[31], w[30], selector); - w[58] = __byte_perm (w[30], w[29], selector); - w[57] = __byte_perm (w[29], w[28], selector); - w[56] = __byte_perm (w[28], w[27], selector); - w[55] = __byte_perm (w[27], w[26], selector); - w[54] = __byte_perm (w[26], w[25], selector); - w[53] = __byte_perm (w[25], w[24], selector); - w[52] = __byte_perm (w[24], w[23], selector); - w[51] = __byte_perm (w[23], w[22], selector); - w[50] = __byte_perm (w[22], w[21], selector); - w[49] = __byte_perm (w[21], w[20], selector); - w[48] = __byte_perm (w[20], w[19], selector); - w[47] = __byte_perm (w[19], w[18], selector); - w[46] = __byte_perm (w[18], w[17], selector); - w[45] = __byte_perm (w[17], w[16], selector); - w[44] = __byte_perm (w[16], w[15], selector); - w[43] = __byte_perm (w[15], w[14], selector); - w[42] = __byte_perm (w[14], w[13], selector); - w[41] = __byte_perm (w[13], w[12], selector); - w[40] = __byte_perm (w[12], w[11], selector); - w[39] = __byte_perm (w[11], w[10], selector); - w[38] = __byte_perm (w[10], w[ 9], selector); - w[37] = __byte_perm (w[ 9], w[ 8], selector); - w[36] = __byte_perm (w[ 8], w[ 7], selector); - w[35] = __byte_perm (w[ 7], w[ 6], selector); - w[34] = __byte_perm (w[ 6], w[ 5], selector); - w[33] = __byte_perm (w[ 5], w[ 4], selector); - w[32] = __byte_perm (w[ 4], w[ 3], selector); - w[31] = __byte_perm (w[ 3], w[ 2], selector); - w[30] = __byte_perm (w[ 2], w[ 1], selector); - w[29] = __byte_perm (w[ 1], w[ 0], selector); - w[28] = __byte_perm (w[ 0], 0, selector); + w[63] = hc_byte_perm (w[35], w[34], selector); + w[62] = hc_byte_perm (w[34], w[33], selector); + w[61] = hc_byte_perm (w[33], w[32], selector); + w[60] = hc_byte_perm (w[32], w[31], selector); + w[59] = hc_byte_perm (w[31], w[30], selector); + w[58] = hc_byte_perm (w[30], w[29], selector); + w[57] = hc_byte_perm (w[29], w[28], selector); + w[56] = hc_byte_perm (w[28], w[27], selector); + w[55] = hc_byte_perm (w[27], w[26], selector); + w[54] = hc_byte_perm (w[26], w[25], selector); + w[53] = hc_byte_perm (w[25], w[24], selector); + w[52] = hc_byte_perm (w[24], w[23], selector); + w[51] = hc_byte_perm (w[23], w[22], selector); + w[50] = hc_byte_perm (w[22], w[21], selector); + w[49] = hc_byte_perm (w[21], w[20], selector); + w[48] = hc_byte_perm (w[20], w[19], selector); + w[47] = hc_byte_perm (w[19], w[18], selector); + w[46] = hc_byte_perm (w[18], w[17], selector); + w[45] = hc_byte_perm (w[17], w[16], selector); + w[44] = hc_byte_perm (w[16], w[15], selector); + w[43] = hc_byte_perm (w[15], w[14], selector); + w[42] = hc_byte_perm (w[14], w[13], selector); + w[41] = hc_byte_perm (w[13], w[12], selector); + w[40] = hc_byte_perm (w[12], w[11], selector); + w[39] = hc_byte_perm (w[11], w[10], selector); + w[38] = hc_byte_perm (w[10], w[ 9], selector); + w[37] = hc_byte_perm (w[ 9], w[ 8], selector); + w[36] = hc_byte_perm (w[ 8], w[ 7], selector); + w[35] = hc_byte_perm (w[ 7], w[ 6], selector); + w[34] = hc_byte_perm (w[ 6], w[ 5], selector); + w[33] = hc_byte_perm (w[ 5], w[ 4], selector); + w[32] = hc_byte_perm (w[ 4], w[ 3], selector); + w[31] = hc_byte_perm (w[ 3], w[ 2], selector); + w[30] = hc_byte_perm (w[ 2], w[ 1], selector); + w[29] = hc_byte_perm (w[ 1], w[ 0], selector); + w[28] = hc_byte_perm (w[ 0], 0, selector); w[27] = 0; w[26] = 0; w[25] = 0; @@ -26736,41 +26736,41 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; case 29: - w[63] = __byte_perm (w[34], w[33], selector); - w[62] = __byte_perm (w[33], w[32], selector); - w[61] = __byte_perm (w[32], w[31], selector); - w[60] = __byte_perm (w[31], w[30], selector); - w[59] = __byte_perm (w[30], w[29], selector); - w[58] = __byte_perm (w[29], w[28], selector); - w[57] = __byte_perm (w[28], w[27], selector); - w[56] = __byte_perm (w[27], w[26], selector); - w[55] = __byte_perm (w[26], w[25], selector); - w[54] = __byte_perm (w[25], w[24], selector); - w[53] = __byte_perm (w[24], w[23], selector); - w[52] = __byte_perm (w[23], w[22], selector); - w[51] = __byte_perm (w[22], w[21], selector); - w[50] = __byte_perm (w[21], w[20], selector); - w[49] = __byte_perm (w[20], w[19], selector); - w[48] = __byte_perm (w[19], w[18], selector); - w[47] = __byte_perm (w[18], w[17], selector); - w[46] = __byte_perm (w[17], w[16], selector); - w[45] = __byte_perm (w[16], w[15], selector); - w[44] = __byte_perm (w[15], w[14], selector); - w[43] = __byte_perm (w[14], w[13], selector); - w[42] = __byte_perm (w[13], w[12], selector); - w[41] = __byte_perm (w[12], w[11], selector); - w[40] = __byte_perm (w[11], w[10], selector); - w[39] = __byte_perm (w[10], w[ 9], selector); - w[38] = __byte_perm (w[ 9], w[ 8], selector); - w[37] = __byte_perm (w[ 8], w[ 7], selector); - w[36] = __byte_perm (w[ 7], w[ 6], selector); - w[35] = __byte_perm (w[ 6], w[ 5], selector); - w[34] = __byte_perm (w[ 5], w[ 4], selector); - w[33] = __byte_perm (w[ 4], w[ 3], selector); - w[32] = __byte_perm (w[ 3], w[ 2], selector); - w[31] = __byte_perm (w[ 2], w[ 1], selector); - w[30] = __byte_perm (w[ 1], w[ 0], selector); - w[29] = __byte_perm (w[ 0], 0, selector); + w[63] = hc_byte_perm (w[34], w[33], selector); + w[62] = hc_byte_perm (w[33], w[32], selector); + w[61] = hc_byte_perm (w[32], w[31], selector); + w[60] = hc_byte_perm (w[31], w[30], selector); + w[59] = hc_byte_perm (w[30], w[29], selector); + w[58] = hc_byte_perm (w[29], w[28], selector); + w[57] = hc_byte_perm (w[28], w[27], selector); + w[56] = hc_byte_perm (w[27], w[26], selector); + w[55] = hc_byte_perm (w[26], w[25], selector); + w[54] = hc_byte_perm (w[25], w[24], selector); + w[53] = hc_byte_perm (w[24], w[23], selector); + w[52] = hc_byte_perm (w[23], w[22], selector); + w[51] = hc_byte_perm (w[22], w[21], selector); + w[50] = hc_byte_perm (w[21], w[20], selector); + w[49] = hc_byte_perm (w[20], w[19], selector); + w[48] = hc_byte_perm (w[19], w[18], selector); + w[47] = hc_byte_perm (w[18], w[17], selector); + w[46] = hc_byte_perm (w[17], w[16], selector); + w[45] = hc_byte_perm (w[16], w[15], selector); + w[44] = hc_byte_perm (w[15], w[14], selector); + w[43] = hc_byte_perm (w[14], w[13], selector); + w[42] = hc_byte_perm (w[13], w[12], selector); + w[41] = hc_byte_perm (w[12], w[11], selector); + w[40] = hc_byte_perm (w[11], w[10], selector); + w[39] = hc_byte_perm (w[10], w[ 9], selector); + w[38] = hc_byte_perm (w[ 9], w[ 8], selector); + w[37] = hc_byte_perm (w[ 8], w[ 7], selector); + w[36] = hc_byte_perm (w[ 7], w[ 6], selector); + w[35] = hc_byte_perm (w[ 6], w[ 5], selector); + w[34] = hc_byte_perm (w[ 5], w[ 4], selector); + w[33] = hc_byte_perm (w[ 4], w[ 3], selector); + w[32] = hc_byte_perm (w[ 3], w[ 2], selector); + w[31] = hc_byte_perm (w[ 2], w[ 1], selector); + w[30] = hc_byte_perm (w[ 1], w[ 0], selector); + w[29] = hc_byte_perm (w[ 0], 0, selector); w[28] = 0; w[27] = 0; w[26] = 0; @@ -26804,40 +26804,40 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; case 30: - w[63] = __byte_perm (w[33], w[32], selector); - w[62] = __byte_perm (w[32], w[31], selector); - w[61] = __byte_perm (w[31], w[30], selector); - w[60] = __byte_perm (w[30], w[29], selector); - w[59] = __byte_perm (w[29], w[28], selector); - w[58] = __byte_perm (w[28], w[27], selector); - w[57] = __byte_perm (w[27], w[26], selector); - w[56] = __byte_perm (w[26], w[25], selector); - w[55] = __byte_perm (w[25], w[24], selector); - w[54] = __byte_perm (w[24], w[23], selector); - w[53] = __byte_perm (w[23], w[22], selector); - w[52] = __byte_perm (w[22], w[21], selector); - w[51] = __byte_perm (w[21], w[20], selector); - w[50] = __byte_perm (w[20], w[19], selector); - w[49] = __byte_perm (w[19], w[18], selector); - w[48] = __byte_perm (w[18], w[17], selector); - w[47] = __byte_perm (w[17], w[16], selector); - w[46] = __byte_perm (w[16], w[15], selector); - w[45] = __byte_perm (w[15], w[14], selector); - w[44] = __byte_perm (w[14], w[13], selector); - w[43] = __byte_perm (w[13], w[12], selector); - w[42] = __byte_perm (w[12], w[11], selector); - w[41] = __byte_perm (w[11], w[10], selector); - w[40] = __byte_perm (w[10], w[ 9], selector); - w[39] = __byte_perm (w[ 9], w[ 8], selector); - w[38] = __byte_perm (w[ 8], w[ 7], selector); - w[37] = __byte_perm (w[ 7], w[ 6], selector); - w[36] = __byte_perm (w[ 6], w[ 5], selector); - w[35] = __byte_perm (w[ 5], w[ 4], selector); - w[34] = __byte_perm (w[ 4], w[ 3], selector); - w[33] = __byte_perm (w[ 3], w[ 2], selector); - w[32] = __byte_perm (w[ 2], w[ 1], selector); - w[31] = __byte_perm (w[ 1], w[ 0], selector); - w[30] = __byte_perm (w[ 0], 0, selector); + w[63] = hc_byte_perm (w[33], w[32], selector); + w[62] = hc_byte_perm (w[32], w[31], selector); + w[61] = hc_byte_perm (w[31], w[30], selector); + w[60] = hc_byte_perm (w[30], w[29], selector); + w[59] = hc_byte_perm (w[29], w[28], selector); + w[58] = hc_byte_perm (w[28], w[27], selector); + w[57] = hc_byte_perm (w[27], w[26], selector); + w[56] = hc_byte_perm (w[26], w[25], selector); + w[55] = hc_byte_perm (w[25], w[24], selector); + w[54] = hc_byte_perm (w[24], w[23], selector); + w[53] = hc_byte_perm (w[23], w[22], selector); + w[52] = hc_byte_perm (w[22], w[21], selector); + w[51] = hc_byte_perm (w[21], w[20], selector); + w[50] = hc_byte_perm (w[20], w[19], selector); + w[49] = hc_byte_perm (w[19], w[18], selector); + w[48] = hc_byte_perm (w[18], w[17], selector); + w[47] = hc_byte_perm (w[17], w[16], selector); + w[46] = hc_byte_perm (w[16], w[15], selector); + w[45] = hc_byte_perm (w[15], w[14], selector); + w[44] = hc_byte_perm (w[14], w[13], selector); + w[43] = hc_byte_perm (w[13], w[12], selector); + w[42] = hc_byte_perm (w[12], w[11], selector); + w[41] = hc_byte_perm (w[11], w[10], selector); + w[40] = hc_byte_perm (w[10], w[ 9], selector); + w[39] = hc_byte_perm (w[ 9], w[ 8], selector); + w[38] = hc_byte_perm (w[ 8], w[ 7], selector); + w[37] = hc_byte_perm (w[ 7], w[ 6], selector); + w[36] = hc_byte_perm (w[ 6], w[ 5], selector); + w[35] = hc_byte_perm (w[ 5], w[ 4], selector); + w[34] = hc_byte_perm (w[ 4], w[ 3], selector); + w[33] = hc_byte_perm (w[ 3], w[ 2], selector); + w[32] = hc_byte_perm (w[ 2], w[ 1], selector); + w[31] = hc_byte_perm (w[ 1], w[ 0], selector); + w[30] = hc_byte_perm (w[ 0], 0, selector); w[29] = 0; w[28] = 0; w[27] = 0; @@ -26872,39 +26872,39 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; case 31: - w[63] = __byte_perm (w[32], w[31], selector); - w[62] = __byte_perm (w[31], w[30], selector); - w[61] = __byte_perm (w[30], w[29], selector); - w[60] = __byte_perm (w[29], w[28], selector); - w[59] = __byte_perm (w[28], w[27], selector); - w[58] = __byte_perm (w[27], w[26], selector); - w[57] = __byte_perm (w[26], w[25], selector); - w[56] = __byte_perm (w[25], w[24], selector); - w[55] = __byte_perm (w[24], w[23], selector); - w[54] = __byte_perm (w[23], w[22], selector); - w[53] = __byte_perm (w[22], w[21], selector); - w[52] = __byte_perm (w[21], w[20], selector); - w[51] = __byte_perm (w[20], w[19], selector); - w[50] = __byte_perm (w[19], w[18], selector); - w[49] = __byte_perm (w[18], w[17], selector); - w[48] = __byte_perm (w[17], w[16], selector); - w[47] = __byte_perm (w[16], w[15], selector); - w[46] = __byte_perm (w[15], w[14], selector); - w[45] = __byte_perm (w[14], w[13], selector); - w[44] = __byte_perm (w[13], w[12], selector); - w[43] = __byte_perm (w[12], w[11], selector); - w[42] = __byte_perm (w[11], w[10], selector); - w[41] = __byte_perm (w[10], w[ 9], selector); - w[40] = __byte_perm (w[ 9], w[ 8], selector); - w[39] = __byte_perm (w[ 8], w[ 7], selector); - w[38] = __byte_perm (w[ 7], w[ 6], selector); - w[37] = __byte_perm (w[ 6], w[ 5], selector); - w[36] = __byte_perm (w[ 5], w[ 4], selector); - w[35] = __byte_perm (w[ 4], w[ 3], selector); - w[34] = __byte_perm (w[ 3], w[ 2], selector); - w[33] = __byte_perm (w[ 2], w[ 1], selector); - w[32] = __byte_perm (w[ 1], w[ 0], selector); - w[31] = __byte_perm (w[ 0], 0, selector); + w[63] = hc_byte_perm (w[32], w[31], selector); + w[62] = hc_byte_perm (w[31], w[30], selector); + w[61] = hc_byte_perm (w[30], w[29], selector); + w[60] = hc_byte_perm (w[29], w[28], selector); + w[59] = hc_byte_perm (w[28], w[27], selector); + w[58] = hc_byte_perm (w[27], w[26], selector); + w[57] = hc_byte_perm (w[26], w[25], selector); + w[56] = hc_byte_perm (w[25], w[24], selector); + w[55] = hc_byte_perm (w[24], w[23], selector); + w[54] = hc_byte_perm (w[23], w[22], selector); + w[53] = hc_byte_perm (w[22], w[21], selector); + w[52] = hc_byte_perm (w[21], w[20], selector); + w[51] = hc_byte_perm (w[20], w[19], selector); + w[50] = hc_byte_perm (w[19], w[18], selector); + w[49] = hc_byte_perm (w[18], w[17], selector); + w[48] = hc_byte_perm (w[17], w[16], selector); + w[47] = hc_byte_perm (w[16], w[15], selector); + w[46] = hc_byte_perm (w[15], w[14], selector); + w[45] = hc_byte_perm (w[14], w[13], selector); + w[44] = hc_byte_perm (w[13], w[12], selector); + w[43] = hc_byte_perm (w[12], w[11], selector); + w[42] = hc_byte_perm (w[11], w[10], selector); + w[41] = hc_byte_perm (w[10], w[ 9], selector); + w[40] = hc_byte_perm (w[ 9], w[ 8], selector); + w[39] = hc_byte_perm (w[ 8], w[ 7], selector); + w[38] = hc_byte_perm (w[ 7], w[ 6], selector); + w[37] = hc_byte_perm (w[ 6], w[ 5], selector); + w[36] = hc_byte_perm (w[ 5], w[ 4], selector); + w[35] = hc_byte_perm (w[ 4], w[ 3], selector); + w[34] = hc_byte_perm (w[ 3], w[ 2], selector); + w[33] = hc_byte_perm (w[ 2], w[ 1], selector); + w[32] = hc_byte_perm (w[ 1], w[ 0], selector); + w[31] = hc_byte_perm (w[ 0], 0, selector); w[30] = 0; w[29] = 0; w[28] = 0; @@ -26940,38 +26940,38 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; case 32: - w[63] = __byte_perm (w[31], w[30], selector); - w[62] = __byte_perm (w[30], w[29], selector); - w[61] = __byte_perm (w[29], w[28], selector); - w[60] = __byte_perm (w[28], w[27], selector); - w[59] = __byte_perm (w[27], w[26], selector); - w[58] = __byte_perm (w[26], w[25], selector); - w[57] = __byte_perm (w[25], w[24], selector); - w[56] = __byte_perm (w[24], w[23], selector); - w[55] = __byte_perm (w[23], w[22], selector); - w[54] = __byte_perm (w[22], w[21], selector); - w[53] = __byte_perm (w[21], w[20], selector); - w[52] = __byte_perm (w[20], w[19], selector); - w[51] = __byte_perm (w[19], w[18], selector); - w[50] = __byte_perm (w[18], w[17], selector); - w[49] = __byte_perm (w[17], w[16], selector); - w[48] = __byte_perm (w[16], w[15], selector); - w[47] = __byte_perm (w[15], w[14], selector); - w[46] = __byte_perm (w[14], w[13], selector); - w[45] = __byte_perm (w[13], w[12], selector); - w[44] = __byte_perm (w[12], w[11], selector); - w[43] = __byte_perm (w[11], w[10], selector); - w[42] = __byte_perm (w[10], w[ 9], selector); - w[41] = __byte_perm (w[ 9], w[ 8], selector); - w[40] = __byte_perm (w[ 8], w[ 7], selector); - w[39] = __byte_perm (w[ 7], w[ 6], selector); - w[38] = __byte_perm (w[ 6], w[ 5], selector); - w[37] = __byte_perm (w[ 5], w[ 4], selector); - w[36] = __byte_perm (w[ 4], w[ 3], selector); - w[35] = __byte_perm (w[ 3], w[ 2], selector); - w[34] = __byte_perm (w[ 2], w[ 1], selector); - w[33] = __byte_perm (w[ 1], w[ 0], selector); - w[32] = __byte_perm (w[ 0], 0, selector); + w[63] = hc_byte_perm (w[31], w[30], selector); + w[62] = hc_byte_perm (w[30], w[29], selector); + w[61] = hc_byte_perm (w[29], w[28], selector); + w[60] = hc_byte_perm (w[28], w[27], selector); + w[59] = hc_byte_perm (w[27], w[26], selector); + w[58] = hc_byte_perm (w[26], w[25], selector); + w[57] = hc_byte_perm (w[25], w[24], selector); + w[56] = hc_byte_perm (w[24], w[23], selector); + w[55] = hc_byte_perm (w[23], w[22], selector); + w[54] = hc_byte_perm (w[22], w[21], selector); + w[53] = hc_byte_perm (w[21], w[20], selector); + w[52] = hc_byte_perm (w[20], w[19], selector); + w[51] = hc_byte_perm (w[19], w[18], selector); + w[50] = hc_byte_perm (w[18], w[17], selector); + w[49] = hc_byte_perm (w[17], w[16], selector); + w[48] = hc_byte_perm (w[16], w[15], selector); + w[47] = hc_byte_perm (w[15], w[14], selector); + w[46] = hc_byte_perm (w[14], w[13], selector); + w[45] = hc_byte_perm (w[13], w[12], selector); + w[44] = hc_byte_perm (w[12], w[11], selector); + w[43] = hc_byte_perm (w[11], w[10], selector); + w[42] = hc_byte_perm (w[10], w[ 9], selector); + w[41] = hc_byte_perm (w[ 9], w[ 8], selector); + w[40] = hc_byte_perm (w[ 8], w[ 7], selector); + w[39] = hc_byte_perm (w[ 7], w[ 6], selector); + w[38] = hc_byte_perm (w[ 6], w[ 5], selector); + w[37] = hc_byte_perm (w[ 5], w[ 4], selector); + w[36] = hc_byte_perm (w[ 4], w[ 3], selector); + w[35] = hc_byte_perm (w[ 3], w[ 2], selector); + w[34] = hc_byte_perm (w[ 2], w[ 1], selector); + w[33] = hc_byte_perm (w[ 1], w[ 0], selector); + w[32] = hc_byte_perm (w[ 0], 0, selector); w[31] = 0; w[30] = 0; w[29] = 0; @@ -27008,37 +27008,37 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; case 33: - w[63] = __byte_perm (w[30], w[29], selector); - w[62] = __byte_perm (w[29], w[28], selector); - w[61] = __byte_perm (w[28], w[27], selector); - w[60] = __byte_perm (w[27], w[26], selector); - w[59] = __byte_perm (w[26], w[25], selector); - w[58] = __byte_perm (w[25], w[24], selector); - w[57] = __byte_perm (w[24], w[23], selector); - w[56] = __byte_perm (w[23], w[22], selector); - w[55] = __byte_perm (w[22], w[21], selector); - w[54] = __byte_perm (w[21], w[20], selector); - w[53] = __byte_perm (w[20], w[19], selector); - w[52] = __byte_perm (w[19], w[18], selector); - w[51] = __byte_perm (w[18], w[17], selector); - w[50] = __byte_perm (w[17], w[16], selector); - w[49] = __byte_perm (w[16], w[15], selector); - w[48] = __byte_perm (w[15], w[14], selector); - w[47] = __byte_perm (w[14], w[13], selector); - w[46] = __byte_perm (w[13], w[12], selector); - w[45] = __byte_perm (w[12], w[11], selector); - w[44] = __byte_perm (w[11], w[10], selector); - w[43] = __byte_perm (w[10], w[ 9], selector); - w[42] = __byte_perm (w[ 9], w[ 8], selector); - w[41] = __byte_perm (w[ 8], w[ 7], selector); - w[40] = __byte_perm (w[ 7], w[ 6], selector); - w[39] = __byte_perm (w[ 6], w[ 5], selector); - w[38] = __byte_perm (w[ 5], w[ 4], selector); - w[37] = __byte_perm (w[ 4], w[ 3], selector); - w[36] = __byte_perm (w[ 3], w[ 2], selector); - w[35] = __byte_perm (w[ 2], w[ 1], selector); - w[34] = __byte_perm (w[ 1], w[ 0], selector); - w[33] = __byte_perm (w[ 0], 0, selector); + w[63] = hc_byte_perm (w[30], w[29], selector); + w[62] = hc_byte_perm (w[29], w[28], selector); + w[61] = hc_byte_perm (w[28], w[27], selector); + w[60] = hc_byte_perm (w[27], w[26], selector); + w[59] = hc_byte_perm (w[26], w[25], selector); + w[58] = hc_byte_perm (w[25], w[24], selector); + w[57] = hc_byte_perm (w[24], w[23], selector); + w[56] = hc_byte_perm (w[23], w[22], selector); + w[55] = hc_byte_perm (w[22], w[21], selector); + w[54] = hc_byte_perm (w[21], w[20], selector); + w[53] = hc_byte_perm (w[20], w[19], selector); + w[52] = hc_byte_perm (w[19], w[18], selector); + w[51] = hc_byte_perm (w[18], w[17], selector); + w[50] = hc_byte_perm (w[17], w[16], selector); + w[49] = hc_byte_perm (w[16], w[15], selector); + w[48] = hc_byte_perm (w[15], w[14], selector); + w[47] = hc_byte_perm (w[14], w[13], selector); + w[46] = hc_byte_perm (w[13], w[12], selector); + w[45] = hc_byte_perm (w[12], w[11], selector); + w[44] = hc_byte_perm (w[11], w[10], selector); + w[43] = hc_byte_perm (w[10], w[ 9], selector); + w[42] = hc_byte_perm (w[ 9], w[ 8], selector); + w[41] = hc_byte_perm (w[ 8], w[ 7], selector); + w[40] = hc_byte_perm (w[ 7], w[ 6], selector); + w[39] = hc_byte_perm (w[ 6], w[ 5], selector); + w[38] = hc_byte_perm (w[ 5], w[ 4], selector); + w[37] = hc_byte_perm (w[ 4], w[ 3], selector); + w[36] = hc_byte_perm (w[ 3], w[ 2], selector); + w[35] = hc_byte_perm (w[ 2], w[ 1], selector); + w[34] = hc_byte_perm (w[ 1], w[ 0], selector); + w[33] = hc_byte_perm (w[ 0], 0, selector); w[32] = 0; w[31] = 0; w[30] = 0; @@ -27076,36 +27076,36 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; case 34: - w[63] = __byte_perm (w[29], w[28], selector); - w[62] = __byte_perm (w[28], w[27], selector); - w[61] = __byte_perm (w[27], w[26], selector); - w[60] = __byte_perm (w[26], w[25], selector); - w[59] = __byte_perm (w[25], w[24], selector); - w[58] = __byte_perm (w[24], w[23], selector); - w[57] = __byte_perm (w[23], w[22], selector); - w[56] = __byte_perm (w[22], w[21], selector); - w[55] = __byte_perm (w[21], w[20], selector); - w[54] = __byte_perm (w[20], w[19], selector); - w[53] = __byte_perm (w[19], w[18], selector); - w[52] = __byte_perm (w[18], w[17], selector); - w[51] = __byte_perm (w[17], w[16], selector); - w[50] = __byte_perm (w[16], w[15], selector); - w[49] = __byte_perm (w[15], w[14], selector); - w[48] = __byte_perm (w[14], w[13], selector); - w[47] = __byte_perm (w[13], w[12], selector); - w[46] = __byte_perm (w[12], w[11], selector); - w[45] = __byte_perm (w[11], w[10], selector); - w[44] = __byte_perm (w[10], w[ 9], selector); - w[43] = __byte_perm (w[ 9], w[ 8], selector); - w[42] = __byte_perm (w[ 8], w[ 7], selector); - w[41] = __byte_perm (w[ 7], w[ 6], selector); - w[40] = __byte_perm (w[ 6], w[ 5], selector); - w[39] = __byte_perm (w[ 5], w[ 4], selector); - w[38] = __byte_perm (w[ 4], w[ 3], selector); - w[37] = __byte_perm (w[ 3], w[ 2], selector); - w[36] = __byte_perm (w[ 2], w[ 1], selector); - w[35] = __byte_perm (w[ 1], w[ 0], selector); - w[34] = __byte_perm (w[ 0], 0, selector); + w[63] = hc_byte_perm (w[29], w[28], selector); + w[62] = hc_byte_perm (w[28], w[27], selector); + w[61] = hc_byte_perm (w[27], w[26], selector); + w[60] = hc_byte_perm (w[26], w[25], selector); + w[59] = hc_byte_perm (w[25], w[24], selector); + w[58] = hc_byte_perm (w[24], w[23], selector); + w[57] = hc_byte_perm (w[23], w[22], selector); + w[56] = hc_byte_perm (w[22], w[21], selector); + w[55] = hc_byte_perm (w[21], w[20], selector); + w[54] = hc_byte_perm (w[20], w[19], selector); + w[53] = hc_byte_perm (w[19], w[18], selector); + w[52] = hc_byte_perm (w[18], w[17], selector); + w[51] = hc_byte_perm (w[17], w[16], selector); + w[50] = hc_byte_perm (w[16], w[15], selector); + w[49] = hc_byte_perm (w[15], w[14], selector); + w[48] = hc_byte_perm (w[14], w[13], selector); + w[47] = hc_byte_perm (w[13], w[12], selector); + w[46] = hc_byte_perm (w[12], w[11], selector); + w[45] = hc_byte_perm (w[11], w[10], selector); + w[44] = hc_byte_perm (w[10], w[ 9], selector); + w[43] = hc_byte_perm (w[ 9], w[ 8], selector); + w[42] = hc_byte_perm (w[ 8], w[ 7], selector); + w[41] = hc_byte_perm (w[ 7], w[ 6], selector); + w[40] = hc_byte_perm (w[ 6], w[ 5], selector); + w[39] = hc_byte_perm (w[ 5], w[ 4], selector); + w[38] = hc_byte_perm (w[ 4], w[ 3], selector); + w[37] = hc_byte_perm (w[ 3], w[ 2], selector); + w[36] = hc_byte_perm (w[ 2], w[ 1], selector); + w[35] = hc_byte_perm (w[ 1], w[ 0], selector); + w[34] = hc_byte_perm (w[ 0], 0, selector); w[33] = 0; w[32] = 0; w[31] = 0; @@ -27144,35 +27144,35 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; case 35: - w[63] = __byte_perm (w[28], w[27], selector); - w[62] = __byte_perm (w[27], w[26], selector); - w[61] = __byte_perm (w[26], w[25], selector); - w[60] = __byte_perm (w[25], w[24], selector); - w[59] = __byte_perm (w[24], w[23], selector); - w[58] = __byte_perm (w[23], w[22], selector); - w[57] = __byte_perm (w[22], w[21], selector); - w[56] = __byte_perm (w[21], w[20], selector); - w[55] = __byte_perm (w[20], w[19], selector); - w[54] = __byte_perm (w[19], w[18], selector); - w[53] = __byte_perm (w[18], w[17], selector); - w[52] = __byte_perm (w[17], w[16], selector); - w[51] = __byte_perm (w[16], w[15], selector); - w[50] = __byte_perm (w[15], w[14], selector); - w[49] = __byte_perm (w[14], w[13], selector); - w[48] = __byte_perm (w[13], w[12], selector); - w[47] = __byte_perm (w[12], w[11], selector); - w[46] = __byte_perm (w[11], w[10], selector); - w[45] = __byte_perm (w[10], w[ 9], selector); - w[44] = __byte_perm (w[ 9], w[ 8], selector); - w[43] = __byte_perm (w[ 8], w[ 7], selector); - w[42] = __byte_perm (w[ 7], w[ 6], selector); - w[41] = __byte_perm (w[ 6], w[ 5], selector); - w[40] = __byte_perm (w[ 5], w[ 4], selector); - w[39] = __byte_perm (w[ 4], w[ 3], selector); - w[38] = __byte_perm (w[ 3], w[ 2], selector); - w[37] = __byte_perm (w[ 2], w[ 1], selector); - w[36] = __byte_perm (w[ 1], w[ 0], selector); - w[35] = __byte_perm (w[ 0], 0, selector); + w[63] = hc_byte_perm (w[28], w[27], selector); + w[62] = hc_byte_perm (w[27], w[26], selector); + w[61] = hc_byte_perm (w[26], w[25], selector); + w[60] = hc_byte_perm (w[25], w[24], selector); + w[59] = hc_byte_perm (w[24], w[23], selector); + w[58] = hc_byte_perm (w[23], w[22], selector); + w[57] = hc_byte_perm (w[22], w[21], selector); + w[56] = hc_byte_perm (w[21], w[20], selector); + w[55] = hc_byte_perm (w[20], w[19], selector); + w[54] = hc_byte_perm (w[19], w[18], selector); + w[53] = hc_byte_perm (w[18], w[17], selector); + w[52] = hc_byte_perm (w[17], w[16], selector); + w[51] = hc_byte_perm (w[16], w[15], selector); + w[50] = hc_byte_perm (w[15], w[14], selector); + w[49] = hc_byte_perm (w[14], w[13], selector); + w[48] = hc_byte_perm (w[13], w[12], selector); + w[47] = hc_byte_perm (w[12], w[11], selector); + w[46] = hc_byte_perm (w[11], w[10], selector); + w[45] = hc_byte_perm (w[10], w[ 9], selector); + w[44] = hc_byte_perm (w[ 9], w[ 8], selector); + w[43] = hc_byte_perm (w[ 8], w[ 7], selector); + w[42] = hc_byte_perm (w[ 7], w[ 6], selector); + w[41] = hc_byte_perm (w[ 6], w[ 5], selector); + w[40] = hc_byte_perm (w[ 5], w[ 4], selector); + w[39] = hc_byte_perm (w[ 4], w[ 3], selector); + w[38] = hc_byte_perm (w[ 3], w[ 2], selector); + w[37] = hc_byte_perm (w[ 2], w[ 1], selector); + w[36] = hc_byte_perm (w[ 1], w[ 0], selector); + w[35] = hc_byte_perm (w[ 0], 0, selector); w[34] = 0; w[33] = 0; w[32] = 0; @@ -27212,34 +27212,34 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; case 36: - w[63] = __byte_perm (w[27], w[26], selector); - w[62] = __byte_perm (w[26], w[25], selector); - w[61] = __byte_perm (w[25], w[24], selector); - w[60] = __byte_perm (w[24], w[23], selector); - w[59] = __byte_perm (w[23], w[22], selector); - w[58] = __byte_perm (w[22], w[21], selector); - w[57] = __byte_perm (w[21], w[20], selector); - w[56] = __byte_perm (w[20], w[19], selector); - w[55] = __byte_perm (w[19], w[18], selector); - w[54] = __byte_perm (w[18], w[17], selector); - w[53] = __byte_perm (w[17], w[16], selector); - w[52] = __byte_perm (w[16], w[15], selector); - w[51] = __byte_perm (w[15], w[14], selector); - w[50] = __byte_perm (w[14], w[13], selector); - w[49] = __byte_perm (w[13], w[12], selector); - w[48] = __byte_perm (w[12], w[11], selector); - w[47] = __byte_perm (w[11], w[10], selector); - w[46] = __byte_perm (w[10], w[ 9], selector); - w[45] = __byte_perm (w[ 9], w[ 8], selector); - w[44] = __byte_perm (w[ 8], w[ 7], selector); - w[43] = __byte_perm (w[ 7], w[ 6], selector); - w[42] = __byte_perm (w[ 6], w[ 5], selector); - w[41] = __byte_perm (w[ 5], w[ 4], selector); - w[40] = __byte_perm (w[ 4], w[ 3], selector); - w[39] = __byte_perm (w[ 3], w[ 2], selector); - w[38] = __byte_perm (w[ 2], w[ 1], selector); - w[37] = __byte_perm (w[ 1], w[ 0], selector); - w[36] = __byte_perm (w[ 0], 0, selector); + w[63] = hc_byte_perm (w[27], w[26], selector); + w[62] = hc_byte_perm (w[26], w[25], selector); + w[61] = hc_byte_perm (w[25], w[24], selector); + w[60] = hc_byte_perm (w[24], w[23], selector); + w[59] = hc_byte_perm (w[23], w[22], selector); + w[58] = hc_byte_perm (w[22], w[21], selector); + w[57] = hc_byte_perm (w[21], w[20], selector); + w[56] = hc_byte_perm (w[20], w[19], selector); + w[55] = hc_byte_perm (w[19], w[18], selector); + w[54] = hc_byte_perm (w[18], w[17], selector); + w[53] = hc_byte_perm (w[17], w[16], selector); + w[52] = hc_byte_perm (w[16], w[15], selector); + w[51] = hc_byte_perm (w[15], w[14], selector); + w[50] = hc_byte_perm (w[14], w[13], selector); + w[49] = hc_byte_perm (w[13], w[12], selector); + w[48] = hc_byte_perm (w[12], w[11], selector); + w[47] = hc_byte_perm (w[11], w[10], selector); + w[46] = hc_byte_perm (w[10], w[ 9], selector); + w[45] = hc_byte_perm (w[ 9], w[ 8], selector); + w[44] = hc_byte_perm (w[ 8], w[ 7], selector); + w[43] = hc_byte_perm (w[ 7], w[ 6], selector); + w[42] = hc_byte_perm (w[ 6], w[ 5], selector); + w[41] = hc_byte_perm (w[ 5], w[ 4], selector); + w[40] = hc_byte_perm (w[ 4], w[ 3], selector); + w[39] = hc_byte_perm (w[ 3], w[ 2], selector); + w[38] = hc_byte_perm (w[ 2], w[ 1], selector); + w[37] = hc_byte_perm (w[ 1], w[ 0], selector); + w[36] = hc_byte_perm (w[ 0], 0, selector); w[35] = 0; w[34] = 0; w[33] = 0; @@ -27280,33 +27280,33 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; case 37: - w[63] = __byte_perm (w[26], w[25], selector); - w[62] = __byte_perm (w[25], w[24], selector); - w[61] = __byte_perm (w[24], w[23], selector); - w[60] = __byte_perm (w[23], w[22], selector); - w[59] = __byte_perm (w[22], w[21], selector); - w[58] = __byte_perm (w[21], w[20], selector); - w[57] = __byte_perm (w[20], w[19], selector); - w[56] = __byte_perm (w[19], w[18], selector); - w[55] = __byte_perm (w[18], w[17], selector); - w[54] = __byte_perm (w[17], w[16], selector); - w[53] = __byte_perm (w[16], w[15], selector); - w[52] = __byte_perm (w[15], w[14], selector); - w[51] = __byte_perm (w[14], w[13], selector); - w[50] = __byte_perm (w[13], w[12], selector); - w[49] = __byte_perm (w[12], w[11], selector); - w[48] = __byte_perm (w[11], w[10], selector); - w[47] = __byte_perm (w[10], w[ 9], selector); - w[46] = __byte_perm (w[ 9], w[ 8], selector); - w[45] = __byte_perm (w[ 8], w[ 7], selector); - w[44] = __byte_perm (w[ 7], w[ 6], selector); - w[43] = __byte_perm (w[ 6], w[ 5], selector); - w[42] = __byte_perm (w[ 5], w[ 4], selector); - w[41] = __byte_perm (w[ 4], w[ 3], selector); - w[40] = __byte_perm (w[ 3], w[ 2], selector); - w[39] = __byte_perm (w[ 2], w[ 1], selector); - w[38] = __byte_perm (w[ 1], w[ 0], selector); - w[37] = __byte_perm (w[ 0], 0, selector); + w[63] = hc_byte_perm (w[26], w[25], selector); + w[62] = hc_byte_perm (w[25], w[24], selector); + w[61] = hc_byte_perm (w[24], w[23], selector); + w[60] = hc_byte_perm (w[23], w[22], selector); + w[59] = hc_byte_perm (w[22], w[21], selector); + w[58] = hc_byte_perm (w[21], w[20], selector); + w[57] = hc_byte_perm (w[20], w[19], selector); + w[56] = hc_byte_perm (w[19], w[18], selector); + w[55] = hc_byte_perm (w[18], w[17], selector); + w[54] = hc_byte_perm (w[17], w[16], selector); + w[53] = hc_byte_perm (w[16], w[15], selector); + w[52] = hc_byte_perm (w[15], w[14], selector); + w[51] = hc_byte_perm (w[14], w[13], selector); + w[50] = hc_byte_perm (w[13], w[12], selector); + w[49] = hc_byte_perm (w[12], w[11], selector); + w[48] = hc_byte_perm (w[11], w[10], selector); + w[47] = hc_byte_perm (w[10], w[ 9], selector); + w[46] = hc_byte_perm (w[ 9], w[ 8], selector); + w[45] = hc_byte_perm (w[ 8], w[ 7], selector); + w[44] = hc_byte_perm (w[ 7], w[ 6], selector); + w[43] = hc_byte_perm (w[ 6], w[ 5], selector); + w[42] = hc_byte_perm (w[ 5], w[ 4], selector); + w[41] = hc_byte_perm (w[ 4], w[ 3], selector); + w[40] = hc_byte_perm (w[ 3], w[ 2], selector); + w[39] = hc_byte_perm (w[ 2], w[ 1], selector); + w[38] = hc_byte_perm (w[ 1], w[ 0], selector); + w[37] = hc_byte_perm (w[ 0], 0, selector); w[36] = 0; w[35] = 0; w[34] = 0; @@ -27348,32 +27348,32 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; case 38: - w[63] = __byte_perm (w[25], w[24], selector); - w[62] = __byte_perm (w[24], w[23], selector); - w[61] = __byte_perm (w[23], w[22], selector); - w[60] = __byte_perm (w[22], w[21], selector); - w[59] = __byte_perm (w[21], w[20], selector); - w[58] = __byte_perm (w[20], w[19], selector); - w[57] = __byte_perm (w[19], w[18], selector); - w[56] = __byte_perm (w[18], w[17], selector); - w[55] = __byte_perm (w[17], w[16], selector); - w[54] = __byte_perm (w[16], w[15], selector); - w[53] = __byte_perm (w[15], w[14], selector); - w[52] = __byte_perm (w[14], w[13], selector); - w[51] = __byte_perm (w[13], w[12], selector); - w[50] = __byte_perm (w[12], w[11], selector); - w[49] = __byte_perm (w[11], w[10], selector); - w[48] = __byte_perm (w[10], w[ 9], selector); - w[47] = __byte_perm (w[ 9], w[ 8], selector); - w[46] = __byte_perm (w[ 8], w[ 7], selector); - w[45] = __byte_perm (w[ 7], w[ 6], selector); - w[44] = __byte_perm (w[ 6], w[ 5], selector); - w[43] = __byte_perm (w[ 5], w[ 4], selector); - w[42] = __byte_perm (w[ 4], w[ 3], selector); - w[41] = __byte_perm (w[ 3], w[ 2], selector); - w[40] = __byte_perm (w[ 2], w[ 1], selector); - w[39] = __byte_perm (w[ 1], w[ 0], selector); - w[38] = __byte_perm (w[ 0], 0, selector); + w[63] = hc_byte_perm (w[25], w[24], selector); + w[62] = hc_byte_perm (w[24], w[23], selector); + w[61] = hc_byte_perm (w[23], w[22], selector); + w[60] = hc_byte_perm (w[22], w[21], selector); + w[59] = hc_byte_perm (w[21], w[20], selector); + w[58] = hc_byte_perm (w[20], w[19], selector); + w[57] = hc_byte_perm (w[19], w[18], selector); + w[56] = hc_byte_perm (w[18], w[17], selector); + w[55] = hc_byte_perm (w[17], w[16], selector); + w[54] = hc_byte_perm (w[16], w[15], selector); + w[53] = hc_byte_perm (w[15], w[14], selector); + w[52] = hc_byte_perm (w[14], w[13], selector); + w[51] = hc_byte_perm (w[13], w[12], selector); + w[50] = hc_byte_perm (w[12], w[11], selector); + w[49] = hc_byte_perm (w[11], w[10], selector); + w[48] = hc_byte_perm (w[10], w[ 9], selector); + w[47] = hc_byte_perm (w[ 9], w[ 8], selector); + w[46] = hc_byte_perm (w[ 8], w[ 7], selector); + w[45] = hc_byte_perm (w[ 7], w[ 6], selector); + w[44] = hc_byte_perm (w[ 6], w[ 5], selector); + w[43] = hc_byte_perm (w[ 5], w[ 4], selector); + w[42] = hc_byte_perm (w[ 4], w[ 3], selector); + w[41] = hc_byte_perm (w[ 3], w[ 2], selector); + w[40] = hc_byte_perm (w[ 2], w[ 1], selector); + w[39] = hc_byte_perm (w[ 1], w[ 0], selector); + w[38] = hc_byte_perm (w[ 0], 0, selector); w[37] = 0; w[36] = 0; w[35] = 0; @@ -27416,31 +27416,31 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; case 39: - w[63] = __byte_perm (w[24], w[23], selector); - w[62] = __byte_perm (w[23], w[22], selector); - w[61] = __byte_perm (w[22], w[21], selector); - w[60] = __byte_perm (w[21], w[20], selector); - w[59] = __byte_perm (w[20], w[19], selector); - w[58] = __byte_perm (w[19], w[18], selector); - w[57] = __byte_perm (w[18], w[17], selector); - w[56] = __byte_perm (w[17], w[16], selector); - w[55] = __byte_perm (w[16], w[15], selector); - w[54] = __byte_perm (w[15], w[14], selector); - w[53] = __byte_perm (w[14], w[13], selector); - w[52] = __byte_perm (w[13], w[12], selector); - w[51] = __byte_perm (w[12], w[11], selector); - w[50] = __byte_perm (w[11], w[10], selector); - w[49] = __byte_perm (w[10], w[ 9], selector); - w[48] = __byte_perm (w[ 9], w[ 8], selector); - w[47] = __byte_perm (w[ 8], w[ 7], selector); - w[46] = __byte_perm (w[ 7], w[ 6], selector); - w[45] = __byte_perm (w[ 6], w[ 5], selector); - w[44] = __byte_perm (w[ 5], w[ 4], selector); - w[43] = __byte_perm (w[ 4], w[ 3], selector); - w[42] = __byte_perm (w[ 3], w[ 2], selector); - w[41] = __byte_perm (w[ 2], w[ 1], selector); - w[40] = __byte_perm (w[ 1], w[ 0], selector); - w[39] = __byte_perm (w[ 0], 0, selector); + w[63] = hc_byte_perm (w[24], w[23], selector); + w[62] = hc_byte_perm (w[23], w[22], selector); + w[61] = hc_byte_perm (w[22], w[21], selector); + w[60] = hc_byte_perm (w[21], w[20], selector); + w[59] = hc_byte_perm (w[20], w[19], selector); + w[58] = hc_byte_perm (w[19], w[18], selector); + w[57] = hc_byte_perm (w[18], w[17], selector); + w[56] = hc_byte_perm (w[17], w[16], selector); + w[55] = hc_byte_perm (w[16], w[15], selector); + w[54] = hc_byte_perm (w[15], w[14], selector); + w[53] = hc_byte_perm (w[14], w[13], selector); + w[52] = hc_byte_perm (w[13], w[12], selector); + w[51] = hc_byte_perm (w[12], w[11], selector); + w[50] = hc_byte_perm (w[11], w[10], selector); + w[49] = hc_byte_perm (w[10], w[ 9], selector); + w[48] = hc_byte_perm (w[ 9], w[ 8], selector); + w[47] = hc_byte_perm (w[ 8], w[ 7], selector); + w[46] = hc_byte_perm (w[ 7], w[ 6], selector); + w[45] = hc_byte_perm (w[ 6], w[ 5], selector); + w[44] = hc_byte_perm (w[ 5], w[ 4], selector); + w[43] = hc_byte_perm (w[ 4], w[ 3], selector); + w[42] = hc_byte_perm (w[ 3], w[ 2], selector); + w[41] = hc_byte_perm (w[ 2], w[ 1], selector); + w[40] = hc_byte_perm (w[ 1], w[ 0], selector); + w[39] = hc_byte_perm (w[ 0], 0, selector); w[38] = 0; w[37] = 0; w[36] = 0; @@ -27484,30 +27484,30 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; case 40: - w[63] = __byte_perm (w[23], w[22], selector); - w[62] = __byte_perm (w[22], w[21], selector); - w[61] = __byte_perm (w[21], w[20], selector); - w[60] = __byte_perm (w[20], w[19], selector); - w[59] = __byte_perm (w[19], w[18], selector); - w[58] = __byte_perm (w[18], w[17], selector); - w[57] = __byte_perm (w[17], w[16], selector); - w[56] = __byte_perm (w[16], w[15], selector); - w[55] = __byte_perm (w[15], w[14], selector); - w[54] = __byte_perm (w[14], w[13], selector); - w[53] = __byte_perm (w[13], w[12], selector); - w[52] = __byte_perm (w[12], w[11], selector); - w[51] = __byte_perm (w[11], w[10], selector); - w[50] = __byte_perm (w[10], w[ 9], selector); - w[49] = __byte_perm (w[ 9], w[ 8], selector); - w[48] = __byte_perm (w[ 8], w[ 7], selector); - w[47] = __byte_perm (w[ 7], w[ 6], selector); - w[46] = __byte_perm (w[ 6], w[ 5], selector); - w[45] = __byte_perm (w[ 5], w[ 4], selector); - w[44] = __byte_perm (w[ 4], w[ 3], selector); - w[43] = __byte_perm (w[ 3], w[ 2], selector); - w[42] = __byte_perm (w[ 2], w[ 1], selector); - w[41] = __byte_perm (w[ 1], w[ 0], selector); - w[40] = __byte_perm (w[ 0], 0, selector); + w[63] = hc_byte_perm (w[23], w[22], selector); + w[62] = hc_byte_perm (w[22], w[21], selector); + w[61] = hc_byte_perm (w[21], w[20], selector); + w[60] = hc_byte_perm (w[20], w[19], selector); + w[59] = hc_byte_perm (w[19], w[18], selector); + w[58] = hc_byte_perm (w[18], w[17], selector); + w[57] = hc_byte_perm (w[17], w[16], selector); + w[56] = hc_byte_perm (w[16], w[15], selector); + w[55] = hc_byte_perm (w[15], w[14], selector); + w[54] = hc_byte_perm (w[14], w[13], selector); + w[53] = hc_byte_perm (w[13], w[12], selector); + w[52] = hc_byte_perm (w[12], w[11], selector); + w[51] = hc_byte_perm (w[11], w[10], selector); + w[50] = hc_byte_perm (w[10], w[ 9], selector); + w[49] = hc_byte_perm (w[ 9], w[ 8], selector); + w[48] = hc_byte_perm (w[ 8], w[ 7], selector); + w[47] = hc_byte_perm (w[ 7], w[ 6], selector); + w[46] = hc_byte_perm (w[ 6], w[ 5], selector); + w[45] = hc_byte_perm (w[ 5], w[ 4], selector); + w[44] = hc_byte_perm (w[ 4], w[ 3], selector); + w[43] = hc_byte_perm (w[ 3], w[ 2], selector); + w[42] = hc_byte_perm (w[ 2], w[ 1], selector); + w[41] = hc_byte_perm (w[ 1], w[ 0], selector); + w[40] = hc_byte_perm (w[ 0], 0, selector); w[39] = 0; w[38] = 0; w[37] = 0; @@ -27552,29 +27552,29 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; case 41: - w[63] = __byte_perm (w[22], w[21], selector); - w[62] = __byte_perm (w[21], w[20], selector); - w[61] = __byte_perm (w[20], w[19], selector); - w[60] = __byte_perm (w[19], w[18], selector); - w[59] = __byte_perm (w[18], w[17], selector); - w[58] = __byte_perm (w[17], w[16], selector); - w[57] = __byte_perm (w[16], w[15], selector); - w[56] = __byte_perm (w[15], w[14], selector); - w[55] = __byte_perm (w[14], w[13], selector); - w[54] = __byte_perm (w[13], w[12], selector); - w[53] = __byte_perm (w[12], w[11], selector); - w[52] = __byte_perm (w[11], w[10], selector); - w[51] = __byte_perm (w[10], w[ 9], selector); - w[50] = __byte_perm (w[ 9], w[ 8], selector); - w[49] = __byte_perm (w[ 8], w[ 7], selector); - w[48] = __byte_perm (w[ 7], w[ 6], selector); - w[47] = __byte_perm (w[ 6], w[ 5], selector); - w[46] = __byte_perm (w[ 5], w[ 4], selector); - w[45] = __byte_perm (w[ 4], w[ 3], selector); - w[44] = __byte_perm (w[ 3], w[ 2], selector); - w[43] = __byte_perm (w[ 2], w[ 1], selector); - w[42] = __byte_perm (w[ 1], w[ 0], selector); - w[41] = __byte_perm (w[ 0], 0, selector); + w[63] = hc_byte_perm (w[22], w[21], selector); + w[62] = hc_byte_perm (w[21], w[20], selector); + w[61] = hc_byte_perm (w[20], w[19], selector); + w[60] = hc_byte_perm (w[19], w[18], selector); + w[59] = hc_byte_perm (w[18], w[17], selector); + w[58] = hc_byte_perm (w[17], w[16], selector); + w[57] = hc_byte_perm (w[16], w[15], selector); + w[56] = hc_byte_perm (w[15], w[14], selector); + w[55] = hc_byte_perm (w[14], w[13], selector); + w[54] = hc_byte_perm (w[13], w[12], selector); + w[53] = hc_byte_perm (w[12], w[11], selector); + w[52] = hc_byte_perm (w[11], w[10], selector); + w[51] = hc_byte_perm (w[10], w[ 9], selector); + w[50] = hc_byte_perm (w[ 9], w[ 8], selector); + w[49] = hc_byte_perm (w[ 8], w[ 7], selector); + w[48] = hc_byte_perm (w[ 7], w[ 6], selector); + w[47] = hc_byte_perm (w[ 6], w[ 5], selector); + w[46] = hc_byte_perm (w[ 5], w[ 4], selector); + w[45] = hc_byte_perm (w[ 4], w[ 3], selector); + w[44] = hc_byte_perm (w[ 3], w[ 2], selector); + w[43] = hc_byte_perm (w[ 2], w[ 1], selector); + w[42] = hc_byte_perm (w[ 1], w[ 0], selector); + w[41] = hc_byte_perm (w[ 0], 0, selector); w[40] = 0; w[39] = 0; w[38] = 0; @@ -27620,28 +27620,28 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; case 42: - w[63] = __byte_perm (w[21], w[20], selector); - w[62] = __byte_perm (w[20], w[19], selector); - w[61] = __byte_perm (w[19], w[18], selector); - w[60] = __byte_perm (w[18], w[17], selector); - w[59] = __byte_perm (w[17], w[16], selector); - w[58] = __byte_perm (w[16], w[15], selector); - w[57] = __byte_perm (w[15], w[14], selector); - w[56] = __byte_perm (w[14], w[13], selector); - w[55] = __byte_perm (w[13], w[12], selector); - w[54] = __byte_perm (w[12], w[11], selector); - w[53] = __byte_perm (w[11], w[10], selector); - w[52] = __byte_perm (w[10], w[ 9], selector); - w[51] = __byte_perm (w[ 9], w[ 8], selector); - w[50] = __byte_perm (w[ 8], w[ 7], selector); - w[49] = __byte_perm (w[ 7], w[ 6], selector); - w[48] = __byte_perm (w[ 6], w[ 5], selector); - w[47] = __byte_perm (w[ 5], w[ 4], selector); - w[46] = __byte_perm (w[ 4], w[ 3], selector); - w[45] = __byte_perm (w[ 3], w[ 2], selector); - w[44] = __byte_perm (w[ 2], w[ 1], selector); - w[43] = __byte_perm (w[ 1], w[ 0], selector); - w[42] = __byte_perm (w[ 0], 0, selector); + w[63] = hc_byte_perm (w[21], w[20], selector); + w[62] = hc_byte_perm (w[20], w[19], selector); + w[61] = hc_byte_perm (w[19], w[18], selector); + w[60] = hc_byte_perm (w[18], w[17], selector); + w[59] = hc_byte_perm (w[17], w[16], selector); + w[58] = hc_byte_perm (w[16], w[15], selector); + w[57] = hc_byte_perm (w[15], w[14], selector); + w[56] = hc_byte_perm (w[14], w[13], selector); + w[55] = hc_byte_perm (w[13], w[12], selector); + w[54] = hc_byte_perm (w[12], w[11], selector); + w[53] = hc_byte_perm (w[11], w[10], selector); + w[52] = hc_byte_perm (w[10], w[ 9], selector); + w[51] = hc_byte_perm (w[ 9], w[ 8], selector); + w[50] = hc_byte_perm (w[ 8], w[ 7], selector); + w[49] = hc_byte_perm (w[ 7], w[ 6], selector); + w[48] = hc_byte_perm (w[ 6], w[ 5], selector); + w[47] = hc_byte_perm (w[ 5], w[ 4], selector); + w[46] = hc_byte_perm (w[ 4], w[ 3], selector); + w[45] = hc_byte_perm (w[ 3], w[ 2], selector); + w[44] = hc_byte_perm (w[ 2], w[ 1], selector); + w[43] = hc_byte_perm (w[ 1], w[ 0], selector); + w[42] = hc_byte_perm (w[ 0], 0, selector); w[41] = 0; w[40] = 0; w[39] = 0; @@ -27688,27 +27688,27 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; case 43: - w[63] = __byte_perm (w[20], w[19], selector); - w[62] = __byte_perm (w[19], w[18], selector); - w[61] = __byte_perm (w[18], w[17], selector); - w[60] = __byte_perm (w[17], w[16], selector); - w[59] = __byte_perm (w[16], w[15], selector); - w[58] = __byte_perm (w[15], w[14], selector); - w[57] = __byte_perm (w[14], w[13], selector); - w[56] = __byte_perm (w[13], w[12], selector); - w[55] = __byte_perm (w[12], w[11], selector); - w[54] = __byte_perm (w[11], w[10], selector); - w[53] = __byte_perm (w[10], w[ 9], selector); - w[52] = __byte_perm (w[ 9], w[ 8], selector); - w[51] = __byte_perm (w[ 8], w[ 7], selector); - w[50] = __byte_perm (w[ 7], w[ 6], selector); - w[49] = __byte_perm (w[ 6], w[ 5], selector); - w[48] = __byte_perm (w[ 5], w[ 4], selector); - w[47] = __byte_perm (w[ 4], w[ 3], selector); - w[46] = __byte_perm (w[ 3], w[ 2], selector); - w[45] = __byte_perm (w[ 2], w[ 1], selector); - w[44] = __byte_perm (w[ 1], w[ 0], selector); - w[43] = __byte_perm (w[ 0], 0, selector); + w[63] = hc_byte_perm (w[20], w[19], selector); + w[62] = hc_byte_perm (w[19], w[18], selector); + w[61] = hc_byte_perm (w[18], w[17], selector); + w[60] = hc_byte_perm (w[17], w[16], selector); + w[59] = hc_byte_perm (w[16], w[15], selector); + w[58] = hc_byte_perm (w[15], w[14], selector); + w[57] = hc_byte_perm (w[14], w[13], selector); + w[56] = hc_byte_perm (w[13], w[12], selector); + w[55] = hc_byte_perm (w[12], w[11], selector); + w[54] = hc_byte_perm (w[11], w[10], selector); + w[53] = hc_byte_perm (w[10], w[ 9], selector); + w[52] = hc_byte_perm (w[ 9], w[ 8], selector); + w[51] = hc_byte_perm (w[ 8], w[ 7], selector); + w[50] = hc_byte_perm (w[ 7], w[ 6], selector); + w[49] = hc_byte_perm (w[ 6], w[ 5], selector); + w[48] = hc_byte_perm (w[ 5], w[ 4], selector); + w[47] = hc_byte_perm (w[ 4], w[ 3], selector); + w[46] = hc_byte_perm (w[ 3], w[ 2], selector); + w[45] = hc_byte_perm (w[ 2], w[ 1], selector); + w[44] = hc_byte_perm (w[ 1], w[ 0], selector); + w[43] = hc_byte_perm (w[ 0], 0, selector); w[42] = 0; w[41] = 0; w[40] = 0; @@ -27756,26 +27756,26 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; case 44: - w[63] = __byte_perm (w[19], w[18], selector); - w[62] = __byte_perm (w[18], w[17], selector); - w[61] = __byte_perm (w[17], w[16], selector); - w[60] = __byte_perm (w[16], w[15], selector); - w[59] = __byte_perm (w[15], w[14], selector); - w[58] = __byte_perm (w[14], w[13], selector); - w[57] = __byte_perm (w[13], w[12], selector); - w[56] = __byte_perm (w[12], w[11], selector); - w[55] = __byte_perm (w[11], w[10], selector); - w[54] = __byte_perm (w[10], w[ 9], selector); - w[53] = __byte_perm (w[ 9], w[ 8], selector); - w[52] = __byte_perm (w[ 8], w[ 7], selector); - w[51] = __byte_perm (w[ 7], w[ 6], selector); - w[50] = __byte_perm (w[ 6], w[ 5], selector); - w[49] = __byte_perm (w[ 5], w[ 4], selector); - w[48] = __byte_perm (w[ 4], w[ 3], selector); - w[47] = __byte_perm (w[ 3], w[ 2], selector); - w[46] = __byte_perm (w[ 2], w[ 1], selector); - w[45] = __byte_perm (w[ 1], w[ 0], selector); - w[44] = __byte_perm (w[ 0], 0, selector); + w[63] = hc_byte_perm (w[19], w[18], selector); + w[62] = hc_byte_perm (w[18], w[17], selector); + w[61] = hc_byte_perm (w[17], w[16], selector); + w[60] = hc_byte_perm (w[16], w[15], selector); + w[59] = hc_byte_perm (w[15], w[14], selector); + w[58] = hc_byte_perm (w[14], w[13], selector); + w[57] = hc_byte_perm (w[13], w[12], selector); + w[56] = hc_byte_perm (w[12], w[11], selector); + w[55] = hc_byte_perm (w[11], w[10], selector); + w[54] = hc_byte_perm (w[10], w[ 9], selector); + w[53] = hc_byte_perm (w[ 9], w[ 8], selector); + w[52] = hc_byte_perm (w[ 8], w[ 7], selector); + w[51] = hc_byte_perm (w[ 7], w[ 6], selector); + w[50] = hc_byte_perm (w[ 6], w[ 5], selector); + w[49] = hc_byte_perm (w[ 5], w[ 4], selector); + w[48] = hc_byte_perm (w[ 4], w[ 3], selector); + w[47] = hc_byte_perm (w[ 3], w[ 2], selector); + w[46] = hc_byte_perm (w[ 2], w[ 1], selector); + w[45] = hc_byte_perm (w[ 1], w[ 0], selector); + w[44] = hc_byte_perm (w[ 0], 0, selector); w[43] = 0; w[42] = 0; w[41] = 0; @@ -27824,25 +27824,25 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; case 45: - w[63] = __byte_perm (w[18], w[17], selector); - w[62] = __byte_perm (w[17], w[16], selector); - w[61] = __byte_perm (w[16], w[15], selector); - w[60] = __byte_perm (w[15], w[14], selector); - w[59] = __byte_perm (w[14], w[13], selector); - w[58] = __byte_perm (w[13], w[12], selector); - w[57] = __byte_perm (w[12], w[11], selector); - w[56] = __byte_perm (w[11], w[10], selector); - w[55] = __byte_perm (w[10], w[ 9], selector); - w[54] = __byte_perm (w[ 9], w[ 8], selector); - w[53] = __byte_perm (w[ 8], w[ 7], selector); - w[52] = __byte_perm (w[ 7], w[ 6], selector); - w[51] = __byte_perm (w[ 6], w[ 5], selector); - w[50] = __byte_perm (w[ 5], w[ 4], selector); - w[49] = __byte_perm (w[ 4], w[ 3], selector); - w[48] = __byte_perm (w[ 3], w[ 2], selector); - w[47] = __byte_perm (w[ 2], w[ 1], selector); - w[46] = __byte_perm (w[ 1], w[ 0], selector); - w[45] = __byte_perm (w[ 0], 0, selector); + w[63] = hc_byte_perm (w[18], w[17], selector); + w[62] = hc_byte_perm (w[17], w[16], selector); + w[61] = hc_byte_perm (w[16], w[15], selector); + w[60] = hc_byte_perm (w[15], w[14], selector); + w[59] = hc_byte_perm (w[14], w[13], selector); + w[58] = hc_byte_perm (w[13], w[12], selector); + w[57] = hc_byte_perm (w[12], w[11], selector); + w[56] = hc_byte_perm (w[11], w[10], selector); + w[55] = hc_byte_perm (w[10], w[ 9], selector); + w[54] = hc_byte_perm (w[ 9], w[ 8], selector); + w[53] = hc_byte_perm (w[ 8], w[ 7], selector); + w[52] = hc_byte_perm (w[ 7], w[ 6], selector); + w[51] = hc_byte_perm (w[ 6], w[ 5], selector); + w[50] = hc_byte_perm (w[ 5], w[ 4], selector); + w[49] = hc_byte_perm (w[ 4], w[ 3], selector); + w[48] = hc_byte_perm (w[ 3], w[ 2], selector); + w[47] = hc_byte_perm (w[ 2], w[ 1], selector); + w[46] = hc_byte_perm (w[ 1], w[ 0], selector); + w[45] = hc_byte_perm (w[ 0], 0, selector); w[44] = 0; w[43] = 0; w[42] = 0; @@ -27892,24 +27892,24 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; case 46: - w[63] = __byte_perm (w[17], w[16], selector); - w[62] = __byte_perm (w[16], w[15], selector); - w[61] = __byte_perm (w[15], w[14], selector); - w[60] = __byte_perm (w[14], w[13], selector); - w[59] = __byte_perm (w[13], w[12], selector); - w[58] = __byte_perm (w[12], w[11], selector); - w[57] = __byte_perm (w[11], w[10], selector); - w[56] = __byte_perm (w[10], w[ 9], selector); - w[55] = __byte_perm (w[ 9], w[ 8], selector); - w[54] = __byte_perm (w[ 8], w[ 7], selector); - w[53] = __byte_perm (w[ 7], w[ 6], selector); - w[52] = __byte_perm (w[ 6], w[ 5], selector); - w[51] = __byte_perm (w[ 5], w[ 4], selector); - w[50] = __byte_perm (w[ 4], w[ 3], selector); - w[49] = __byte_perm (w[ 3], w[ 2], selector); - w[48] = __byte_perm (w[ 2], w[ 1], selector); - w[47] = __byte_perm (w[ 1], w[ 0], selector); - w[46] = __byte_perm (w[ 0], 0, selector); + w[63] = hc_byte_perm (w[17], w[16], selector); + w[62] = hc_byte_perm (w[16], w[15], selector); + w[61] = hc_byte_perm (w[15], w[14], selector); + w[60] = hc_byte_perm (w[14], w[13], selector); + w[59] = hc_byte_perm (w[13], w[12], selector); + w[58] = hc_byte_perm (w[12], w[11], selector); + w[57] = hc_byte_perm (w[11], w[10], selector); + w[56] = hc_byte_perm (w[10], w[ 9], selector); + w[55] = hc_byte_perm (w[ 9], w[ 8], selector); + w[54] = hc_byte_perm (w[ 8], w[ 7], selector); + w[53] = hc_byte_perm (w[ 7], w[ 6], selector); + w[52] = hc_byte_perm (w[ 6], w[ 5], selector); + w[51] = hc_byte_perm (w[ 5], w[ 4], selector); + w[50] = hc_byte_perm (w[ 4], w[ 3], selector); + w[49] = hc_byte_perm (w[ 3], w[ 2], selector); + w[48] = hc_byte_perm (w[ 2], w[ 1], selector); + w[47] = hc_byte_perm (w[ 1], w[ 0], selector); + w[46] = hc_byte_perm (w[ 0], 0, selector); w[45] = 0; w[44] = 0; w[43] = 0; @@ -27960,23 +27960,23 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; case 47: - w[63] = __byte_perm (w[16], w[15], selector); - w[62] = __byte_perm (w[15], w[14], selector); - w[61] = __byte_perm (w[14], w[13], selector); - w[60] = __byte_perm (w[13], w[12], selector); - w[59] = __byte_perm (w[12], w[11], selector); - w[58] = __byte_perm (w[11], w[10], selector); - w[57] = __byte_perm (w[10], w[ 9], selector); - w[56] = __byte_perm (w[ 9], w[ 8], selector); - w[55] = __byte_perm (w[ 8], w[ 7], selector); - w[54] = __byte_perm (w[ 7], w[ 6], selector); - w[53] = __byte_perm (w[ 6], w[ 5], selector); - w[52] = __byte_perm (w[ 5], w[ 4], selector); - w[51] = __byte_perm (w[ 4], w[ 3], selector); - w[50] = __byte_perm (w[ 3], w[ 2], selector); - w[49] = __byte_perm (w[ 2], w[ 1], selector); - w[48] = __byte_perm (w[ 1], w[ 0], selector); - w[47] = __byte_perm (w[ 0], 0, selector); + w[63] = hc_byte_perm (w[16], w[15], selector); + w[62] = hc_byte_perm (w[15], w[14], selector); + w[61] = hc_byte_perm (w[14], w[13], selector); + w[60] = hc_byte_perm (w[13], w[12], selector); + w[59] = hc_byte_perm (w[12], w[11], selector); + w[58] = hc_byte_perm (w[11], w[10], selector); + w[57] = hc_byte_perm (w[10], w[ 9], selector); + w[56] = hc_byte_perm (w[ 9], w[ 8], selector); + w[55] = hc_byte_perm (w[ 8], w[ 7], selector); + w[54] = hc_byte_perm (w[ 7], w[ 6], selector); + w[53] = hc_byte_perm (w[ 6], w[ 5], selector); + w[52] = hc_byte_perm (w[ 5], w[ 4], selector); + w[51] = hc_byte_perm (w[ 4], w[ 3], selector); + w[50] = hc_byte_perm (w[ 3], w[ 2], selector); + w[49] = hc_byte_perm (w[ 2], w[ 1], selector); + w[48] = hc_byte_perm (w[ 1], w[ 0], selector); + w[47] = hc_byte_perm (w[ 0], 0, selector); w[46] = 0; w[45] = 0; w[44] = 0; @@ -28028,22 +28028,22 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; case 48: - w[63] = __byte_perm (w[15], w[14], selector); - w[62] = __byte_perm (w[14], w[13], selector); - w[61] = __byte_perm (w[13], w[12], selector); - w[60] = __byte_perm (w[12], w[11], selector); - w[59] = __byte_perm (w[11], w[10], selector); - w[58] = __byte_perm (w[10], w[ 9], selector); - w[57] = __byte_perm (w[ 9], w[ 8], selector); - w[56] = __byte_perm (w[ 8], w[ 7], selector); - w[55] = __byte_perm (w[ 7], w[ 6], selector); - w[54] = __byte_perm (w[ 6], w[ 5], selector); - w[53] = __byte_perm (w[ 5], w[ 4], selector); - w[52] = __byte_perm (w[ 4], w[ 3], selector); - w[51] = __byte_perm (w[ 3], w[ 2], selector); - w[50] = __byte_perm (w[ 2], w[ 1], selector); - w[49] = __byte_perm (w[ 1], w[ 0], selector); - w[48] = __byte_perm (w[ 0], 0, selector); + w[63] = hc_byte_perm (w[15], w[14], selector); + w[62] = hc_byte_perm (w[14], w[13], selector); + w[61] = hc_byte_perm (w[13], w[12], selector); + w[60] = hc_byte_perm (w[12], w[11], selector); + w[59] = hc_byte_perm (w[11], w[10], selector); + w[58] = hc_byte_perm (w[10], w[ 9], selector); + w[57] = hc_byte_perm (w[ 9], w[ 8], selector); + w[56] = hc_byte_perm (w[ 8], w[ 7], selector); + w[55] = hc_byte_perm (w[ 7], w[ 6], selector); + w[54] = hc_byte_perm (w[ 6], w[ 5], selector); + w[53] = hc_byte_perm (w[ 5], w[ 4], selector); + w[52] = hc_byte_perm (w[ 4], w[ 3], selector); + w[51] = hc_byte_perm (w[ 3], w[ 2], selector); + w[50] = hc_byte_perm (w[ 2], w[ 1], selector); + w[49] = hc_byte_perm (w[ 1], w[ 0], selector); + w[48] = hc_byte_perm (w[ 0], 0, selector); w[47] = 0; w[46] = 0; w[45] = 0; @@ -28096,21 +28096,21 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; case 49: - w[63] = __byte_perm (w[14], w[13], selector); - w[62] = __byte_perm (w[13], w[12], selector); - w[61] = __byte_perm (w[12], w[11], selector); - w[60] = __byte_perm (w[11], w[10], selector); - w[59] = __byte_perm (w[10], w[ 9], selector); - w[58] = __byte_perm (w[ 9], w[ 8], selector); - w[57] = __byte_perm (w[ 8], w[ 7], selector); - w[56] = __byte_perm (w[ 7], w[ 6], selector); - w[55] = __byte_perm (w[ 6], w[ 5], selector); - w[54] = __byte_perm (w[ 5], w[ 4], selector); - w[53] = __byte_perm (w[ 4], w[ 3], selector); - w[52] = __byte_perm (w[ 3], w[ 2], selector); - w[51] = __byte_perm (w[ 2], w[ 1], selector); - w[50] = __byte_perm (w[ 1], w[ 0], selector); - w[49] = __byte_perm (w[ 0], 0, selector); + w[63] = hc_byte_perm (w[14], w[13], selector); + w[62] = hc_byte_perm (w[13], w[12], selector); + w[61] = hc_byte_perm (w[12], w[11], selector); + w[60] = hc_byte_perm (w[11], w[10], selector); + w[59] = hc_byte_perm (w[10], w[ 9], selector); + w[58] = hc_byte_perm (w[ 9], w[ 8], selector); + w[57] = hc_byte_perm (w[ 8], w[ 7], selector); + w[56] = hc_byte_perm (w[ 7], w[ 6], selector); + w[55] = hc_byte_perm (w[ 6], w[ 5], selector); + w[54] = hc_byte_perm (w[ 5], w[ 4], selector); + w[53] = hc_byte_perm (w[ 4], w[ 3], selector); + w[52] = hc_byte_perm (w[ 3], w[ 2], selector); + w[51] = hc_byte_perm (w[ 2], w[ 1], selector); + w[50] = hc_byte_perm (w[ 1], w[ 0], selector); + w[49] = hc_byte_perm (w[ 0], 0, selector); w[48] = 0; w[47] = 0; w[46] = 0; @@ -28164,20 +28164,20 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; case 50: - w[63] = __byte_perm (w[13], w[12], selector); - w[62] = __byte_perm (w[12], w[11], selector); - w[61] = __byte_perm (w[11], w[10], selector); - w[60] = __byte_perm (w[10], w[ 9], selector); - w[59] = __byte_perm (w[ 9], w[ 8], selector); - w[58] = __byte_perm (w[ 8], w[ 7], selector); - w[57] = __byte_perm (w[ 7], w[ 6], selector); - w[56] = __byte_perm (w[ 6], w[ 5], selector); - w[55] = __byte_perm (w[ 5], w[ 4], selector); - w[54] = __byte_perm (w[ 4], w[ 3], selector); - w[53] = __byte_perm (w[ 3], w[ 2], selector); - w[52] = __byte_perm (w[ 2], w[ 1], selector); - w[51] = __byte_perm (w[ 1], w[ 0], selector); - w[50] = __byte_perm (w[ 0], 0, selector); + w[63] = hc_byte_perm (w[13], w[12], selector); + w[62] = hc_byte_perm (w[12], w[11], selector); + w[61] = hc_byte_perm (w[11], w[10], selector); + w[60] = hc_byte_perm (w[10], w[ 9], selector); + w[59] = hc_byte_perm (w[ 9], w[ 8], selector); + w[58] = hc_byte_perm (w[ 8], w[ 7], selector); + w[57] = hc_byte_perm (w[ 7], w[ 6], selector); + w[56] = hc_byte_perm (w[ 6], w[ 5], selector); + w[55] = hc_byte_perm (w[ 5], w[ 4], selector); + w[54] = hc_byte_perm (w[ 4], w[ 3], selector); + w[53] = hc_byte_perm (w[ 3], w[ 2], selector); + w[52] = hc_byte_perm (w[ 2], w[ 1], selector); + w[51] = hc_byte_perm (w[ 1], w[ 0], selector); + w[50] = hc_byte_perm (w[ 0], 0, selector); w[49] = 0; w[48] = 0; w[47] = 0; @@ -28232,19 +28232,19 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; case 51: - w[63] = __byte_perm (w[12], w[11], selector); - w[62] = __byte_perm (w[11], w[10], selector); - w[61] = __byte_perm (w[10], w[ 9], selector); - w[60] = __byte_perm (w[ 9], w[ 8], selector); - w[59] = __byte_perm (w[ 8], w[ 7], selector); - w[58] = __byte_perm (w[ 7], w[ 6], selector); - w[57] = __byte_perm (w[ 6], w[ 5], selector); - w[56] = __byte_perm (w[ 5], w[ 4], selector); - w[55] = __byte_perm (w[ 4], w[ 3], selector); - w[54] = __byte_perm (w[ 3], w[ 2], selector); - w[53] = __byte_perm (w[ 2], w[ 1], selector); - w[52] = __byte_perm (w[ 1], w[ 0], selector); - w[51] = __byte_perm (w[ 0], 0, selector); + w[63] = hc_byte_perm (w[12], w[11], selector); + w[62] = hc_byte_perm (w[11], w[10], selector); + w[61] = hc_byte_perm (w[10], w[ 9], selector); + w[60] = hc_byte_perm (w[ 9], w[ 8], selector); + w[59] = hc_byte_perm (w[ 8], w[ 7], selector); + w[58] = hc_byte_perm (w[ 7], w[ 6], selector); + w[57] = hc_byte_perm (w[ 6], w[ 5], selector); + w[56] = hc_byte_perm (w[ 5], w[ 4], selector); + w[55] = hc_byte_perm (w[ 4], w[ 3], selector); + w[54] = hc_byte_perm (w[ 3], w[ 2], selector); + w[53] = hc_byte_perm (w[ 2], w[ 1], selector); + w[52] = hc_byte_perm (w[ 1], w[ 0], selector); + w[51] = hc_byte_perm (w[ 0], 0, selector); w[50] = 0; w[49] = 0; w[48] = 0; @@ -28300,18 +28300,18 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; case 52: - w[63] = __byte_perm (w[11], w[10], selector); - w[62] = __byte_perm (w[10], w[ 9], selector); - w[61] = __byte_perm (w[ 9], w[ 8], selector); - w[60] = __byte_perm (w[ 8], w[ 7], selector); - w[59] = __byte_perm (w[ 7], w[ 6], selector); - w[58] = __byte_perm (w[ 6], w[ 5], selector); - w[57] = __byte_perm (w[ 5], w[ 4], selector); - w[56] = __byte_perm (w[ 4], w[ 3], selector); - w[55] = __byte_perm (w[ 3], w[ 2], selector); - w[54] = __byte_perm (w[ 2], w[ 1], selector); - w[53] = __byte_perm (w[ 1], w[ 0], selector); - w[52] = __byte_perm (w[ 0], 0, selector); + w[63] = hc_byte_perm (w[11], w[10], selector); + w[62] = hc_byte_perm (w[10], w[ 9], selector); + w[61] = hc_byte_perm (w[ 9], w[ 8], selector); + w[60] = hc_byte_perm (w[ 8], w[ 7], selector); + w[59] = hc_byte_perm (w[ 7], w[ 6], selector); + w[58] = hc_byte_perm (w[ 6], w[ 5], selector); + w[57] = hc_byte_perm (w[ 5], w[ 4], selector); + w[56] = hc_byte_perm (w[ 4], w[ 3], selector); + w[55] = hc_byte_perm (w[ 3], w[ 2], selector); + w[54] = hc_byte_perm (w[ 2], w[ 1], selector); + w[53] = hc_byte_perm (w[ 1], w[ 0], selector); + w[52] = hc_byte_perm (w[ 0], 0, selector); w[51] = 0; w[50] = 0; w[49] = 0; @@ -28368,17 +28368,17 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; case 53: - w[63] = __byte_perm (w[10], w[ 9], selector); - w[62] = __byte_perm (w[ 9], w[ 8], selector); - w[61] = __byte_perm (w[ 8], w[ 7], selector); - w[60] = __byte_perm (w[ 7], w[ 6], selector); - w[59] = __byte_perm (w[ 6], w[ 5], selector); - w[58] = __byte_perm (w[ 5], w[ 4], selector); - w[57] = __byte_perm (w[ 4], w[ 3], selector); - w[56] = __byte_perm (w[ 3], w[ 2], selector); - w[55] = __byte_perm (w[ 2], w[ 1], selector); - w[54] = __byte_perm (w[ 1], w[ 0], selector); - w[53] = __byte_perm (w[ 0], 0, selector); + w[63] = hc_byte_perm (w[10], w[ 9], selector); + w[62] = hc_byte_perm (w[ 9], w[ 8], selector); + w[61] = hc_byte_perm (w[ 8], w[ 7], selector); + w[60] = hc_byte_perm (w[ 7], w[ 6], selector); + w[59] = hc_byte_perm (w[ 6], w[ 5], selector); + w[58] = hc_byte_perm (w[ 5], w[ 4], selector); + w[57] = hc_byte_perm (w[ 4], w[ 3], selector); + w[56] = hc_byte_perm (w[ 3], w[ 2], selector); + w[55] = hc_byte_perm (w[ 2], w[ 1], selector); + w[54] = hc_byte_perm (w[ 1], w[ 0], selector); + w[53] = hc_byte_perm (w[ 0], 0, selector); w[52] = 0; w[51] = 0; w[50] = 0; @@ -28436,16 +28436,16 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; case 54: - w[63] = __byte_perm (w[ 9], w[ 8], selector); - w[62] = __byte_perm (w[ 8], w[ 7], selector); - w[61] = __byte_perm (w[ 7], w[ 6], selector); - w[60] = __byte_perm (w[ 6], w[ 5], selector); - w[59] = __byte_perm (w[ 5], w[ 4], selector); - w[58] = __byte_perm (w[ 4], w[ 3], selector); - w[57] = __byte_perm (w[ 3], w[ 2], selector); - w[56] = __byte_perm (w[ 2], w[ 1], selector); - w[55] = __byte_perm (w[ 1], w[ 0], selector); - w[54] = __byte_perm (w[ 0], 0, selector); + w[63] = hc_byte_perm (w[ 9], w[ 8], selector); + w[62] = hc_byte_perm (w[ 8], w[ 7], selector); + w[61] = hc_byte_perm (w[ 7], w[ 6], selector); + w[60] = hc_byte_perm (w[ 6], w[ 5], selector); + w[59] = hc_byte_perm (w[ 5], w[ 4], selector); + w[58] = hc_byte_perm (w[ 4], w[ 3], selector); + w[57] = hc_byte_perm (w[ 3], w[ 2], selector); + w[56] = hc_byte_perm (w[ 2], w[ 1], selector); + w[55] = hc_byte_perm (w[ 1], w[ 0], selector); + w[54] = hc_byte_perm (w[ 0], 0, selector); w[53] = 0; w[52] = 0; w[51] = 0; @@ -28504,15 +28504,15 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; case 55: - w[63] = __byte_perm (w[ 8], w[ 7], selector); - w[62] = __byte_perm (w[ 7], w[ 6], selector); - w[61] = __byte_perm (w[ 6], w[ 5], selector); - w[60] = __byte_perm (w[ 5], w[ 4], selector); - w[59] = __byte_perm (w[ 4], w[ 3], selector); - w[58] = __byte_perm (w[ 3], w[ 2], selector); - w[57] = __byte_perm (w[ 2], w[ 1], selector); - w[56] = __byte_perm (w[ 1], w[ 0], selector); - w[55] = __byte_perm (w[ 0], 0, selector); + w[63] = hc_byte_perm (w[ 8], w[ 7], selector); + w[62] = hc_byte_perm (w[ 7], w[ 6], selector); + w[61] = hc_byte_perm (w[ 6], w[ 5], selector); + w[60] = hc_byte_perm (w[ 5], w[ 4], selector); + w[59] = hc_byte_perm (w[ 4], w[ 3], selector); + w[58] = hc_byte_perm (w[ 3], w[ 2], selector); + w[57] = hc_byte_perm (w[ 2], w[ 1], selector); + w[56] = hc_byte_perm (w[ 1], w[ 0], selector); + w[55] = hc_byte_perm (w[ 0], 0, selector); w[54] = 0; w[53] = 0; w[52] = 0; @@ -28572,14 +28572,14 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; case 56: - w[63] = __byte_perm (w[ 7], w[ 6], selector); - w[62] = __byte_perm (w[ 6], w[ 5], selector); - w[61] = __byte_perm (w[ 5], w[ 4], selector); - w[60] = __byte_perm (w[ 4], w[ 3], selector); - w[59] = __byte_perm (w[ 3], w[ 2], selector); - w[58] = __byte_perm (w[ 2], w[ 1], selector); - w[57] = __byte_perm (w[ 1], w[ 0], selector); - w[56] = __byte_perm (w[ 0], 0, selector); + w[63] = hc_byte_perm (w[ 7], w[ 6], selector); + w[62] = hc_byte_perm (w[ 6], w[ 5], selector); + w[61] = hc_byte_perm (w[ 5], w[ 4], selector); + w[60] = hc_byte_perm (w[ 4], w[ 3], selector); + w[59] = hc_byte_perm (w[ 3], w[ 2], selector); + w[58] = hc_byte_perm (w[ 2], w[ 1], selector); + w[57] = hc_byte_perm (w[ 1], w[ 0], selector); + w[56] = hc_byte_perm (w[ 0], 0, selector); w[55] = 0; w[54] = 0; w[53] = 0; @@ -28640,13 +28640,13 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; case 57: - w[63] = __byte_perm (w[ 6], w[ 5], selector); - w[62] = __byte_perm (w[ 5], w[ 4], selector); - w[61] = __byte_perm (w[ 4], w[ 3], selector); - w[60] = __byte_perm (w[ 3], w[ 2], selector); - w[59] = __byte_perm (w[ 2], w[ 1], selector); - w[58] = __byte_perm (w[ 1], w[ 0], selector); - w[57] = __byte_perm (w[ 0], 0, selector); + w[63] = hc_byte_perm (w[ 6], w[ 5], selector); + w[62] = hc_byte_perm (w[ 5], w[ 4], selector); + w[61] = hc_byte_perm (w[ 4], w[ 3], selector); + w[60] = hc_byte_perm (w[ 3], w[ 2], selector); + w[59] = hc_byte_perm (w[ 2], w[ 1], selector); + w[58] = hc_byte_perm (w[ 1], w[ 0], selector); + w[57] = hc_byte_perm (w[ 0], 0, selector); w[56] = 0; w[55] = 0; w[54] = 0; @@ -28708,12 +28708,12 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; case 58: - w[63] = __byte_perm (w[ 5], w[ 4], selector); - w[62] = __byte_perm (w[ 4], w[ 3], selector); - w[61] = __byte_perm (w[ 3], w[ 2], selector); - w[60] = __byte_perm (w[ 2], w[ 1], selector); - w[59] = __byte_perm (w[ 1], w[ 0], selector); - w[58] = __byte_perm (w[ 0], 0, selector); + w[63] = hc_byte_perm (w[ 5], w[ 4], selector); + w[62] = hc_byte_perm (w[ 4], w[ 3], selector); + w[61] = hc_byte_perm (w[ 3], w[ 2], selector); + w[60] = hc_byte_perm (w[ 2], w[ 1], selector); + w[59] = hc_byte_perm (w[ 1], w[ 0], selector); + w[58] = hc_byte_perm (w[ 0], 0, selector); w[57] = 0; w[56] = 0; w[55] = 0; @@ -28776,11 +28776,11 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; case 59: - w[63] = __byte_perm (w[ 4], w[ 3], selector); - w[62] = __byte_perm (w[ 3], w[ 2], selector); - w[61] = __byte_perm (w[ 2], w[ 1], selector); - w[60] = __byte_perm (w[ 1], w[ 0], selector); - w[59] = __byte_perm (w[ 0], 0, selector); + w[63] = hc_byte_perm (w[ 4], w[ 3], selector); + w[62] = hc_byte_perm (w[ 3], w[ 2], selector); + w[61] = hc_byte_perm (w[ 2], w[ 1], selector); + w[60] = hc_byte_perm (w[ 1], w[ 0], selector); + w[59] = hc_byte_perm (w[ 0], 0, selector); w[58] = 0; w[57] = 0; w[56] = 0; @@ -28844,10 +28844,10 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; case 60: - w[63] = __byte_perm (w[ 3], w[ 2], selector); - w[62] = __byte_perm (w[ 2], w[ 1], selector); - w[61] = __byte_perm (w[ 1], w[ 0], selector); - w[60] = __byte_perm (w[ 0], 0, selector); + w[63] = hc_byte_perm (w[ 3], w[ 2], selector); + w[62] = hc_byte_perm (w[ 2], w[ 1], selector); + w[61] = hc_byte_perm (w[ 1], w[ 0], selector); + w[60] = hc_byte_perm (w[ 0], 0, selector); w[59] = 0; w[58] = 0; w[57] = 0; @@ -28912,9 +28912,9 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; case 61: - w[63] = __byte_perm (w[ 2], w[ 1], selector); - w[62] = __byte_perm (w[ 1], w[ 0], selector); - w[61] = __byte_perm (w[ 0], 0, selector); + w[63] = hc_byte_perm (w[ 2], w[ 1], selector); + w[62] = hc_byte_perm (w[ 1], w[ 0], selector); + w[61] = hc_byte_perm (w[ 0], 0, selector); w[60] = 0; w[59] = 0; w[58] = 0; @@ -28980,8 +28980,8 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; case 62: - w[63] = __byte_perm (w[ 1], w[ 0], selector); - w[62] = __byte_perm (w[ 0], 0, selector); + w[63] = hc_byte_perm (w[ 1], w[ 0], selector); + w[62] = hc_byte_perm (w[ 0], 0, selector); w[61] = 0; w[60] = 0; w[59] = 0; @@ -29048,7 +29048,7 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; case 63: - w[63] = __byte_perm (w[ 0], 0, selector); + w[63] = hc_byte_perm (w[ 0], 0, selector); w[62] = 0; w[61] = 0; w[60] = 0; @@ -31068,25 +31068,25 @@ DECLSPEC void make_utf16be_S (const u32 *in, u32 *out1, u32 *out2) { #if defined IS_NV - out2[3] = __byte_perm_S (in[3], 0, 0x3727); - out2[2] = __byte_perm_S (in[3], 0, 0x1707); - out2[1] = __byte_perm_S (in[2], 0, 0x3727); - out2[0] = __byte_perm_S (in[2], 0, 0x1707); - out1[3] = __byte_perm_S (in[1], 0, 0x3727); - out1[2] = __byte_perm_S (in[1], 0, 0x1707); - out1[1] = __byte_perm_S (in[0], 0, 0x3727); - out1[0] = __byte_perm_S (in[0], 0, 0x1707); + out2[3] = hc_byte_perm_S (in[3], 0, 0x3727); + out2[2] = hc_byte_perm_S (in[3], 0, 0x1707); + out2[1] = hc_byte_perm_S (in[2], 0, 0x3727); + out2[0] = hc_byte_perm_S (in[2], 0, 0x1707); + out1[3] = hc_byte_perm_S (in[1], 0, 0x3727); + out1[2] = hc_byte_perm_S (in[1], 0, 0x1707); + out1[1] = hc_byte_perm_S (in[0], 0, 0x3727); + out1[0] = hc_byte_perm_S (in[0], 0, 0x1707); #elif defined IS_AMD && AMD_GCN >= 3 - out2[3] = __byte_perm_S (in[3], 0, 0x03070207); - out2[2] = __byte_perm_S (in[3], 0, 0x01070007); - out2[1] = __byte_perm_S (in[2], 0, 0x03070207); - out2[0] = __byte_perm_S (in[2], 0, 0x01070007); - out1[3] = __byte_perm_S (in[1], 0, 0x03070207); - out1[2] = __byte_perm_S (in[1], 0, 0x01070007); - out1[1] = __byte_perm_S (in[0], 0, 0x03070207); - out1[0] = __byte_perm_S (in[0], 0, 0x01070007); + out2[3] = hc_byte_perm_S (in[3], 0, 0x03070207); + out2[2] = hc_byte_perm_S (in[3], 0, 0x01070007); + out2[1] = hc_byte_perm_S (in[2], 0, 0x03070207); + out2[0] = hc_byte_perm_S (in[2], 0, 0x01070007); + out1[3] = hc_byte_perm_S (in[1], 0, 0x03070207); + out1[2] = hc_byte_perm_S (in[1], 0, 0x01070007); + out1[1] = hc_byte_perm_S (in[0], 0, 0x03070207); + out1[0] = hc_byte_perm_S (in[0], 0, 0x01070007); #else @@ -31106,25 +31106,25 @@ DECLSPEC void make_utf16le_S (const u32 *in, u32 *out1, u32 *out2) { #if defined IS_NV - out2[3] = __byte_perm_S (in[3], 0, 0x7372); - out2[2] = __byte_perm_S (in[3], 0, 0x7170); - out2[1] = __byte_perm_S (in[2], 0, 0x7372); - out2[0] = __byte_perm_S (in[2], 0, 0x7170); - out1[3] = __byte_perm_S (in[1], 0, 0x7372); - out1[2] = __byte_perm_S (in[1], 0, 0x7170); - out1[1] = __byte_perm_S (in[0], 0, 0x7372); - out1[0] = __byte_perm_S (in[0], 0, 0x7170); + out2[3] = hc_byte_perm_S (in[3], 0, 0x7372); + out2[2] = hc_byte_perm_S (in[3], 0, 0x7170); + out2[1] = hc_byte_perm_S (in[2], 0, 0x7372); + out2[0] = hc_byte_perm_S (in[2], 0, 0x7170); + out1[3] = hc_byte_perm_S (in[1], 0, 0x7372); + out1[2] = hc_byte_perm_S (in[1], 0, 0x7170); + out1[1] = hc_byte_perm_S (in[0], 0, 0x7372); + out1[0] = hc_byte_perm_S (in[0], 0, 0x7170); #elif defined IS_AMD && AMD_GCN >= 3 - out2[3] = __byte_perm_S (in[3], 0, 0x07030702); - out2[2] = __byte_perm_S (in[3], 0, 0x07010700); - out2[1] = __byte_perm_S (in[2], 0, 0x07030702); - out2[0] = __byte_perm_S (in[2], 0, 0x07010700); - out1[3] = __byte_perm_S (in[1], 0, 0x07030702); - out1[2] = __byte_perm_S (in[1], 0, 0x07010700); - out1[1] = __byte_perm_S (in[0], 0, 0x07030702); - out1[0] = __byte_perm_S (in[0], 0, 0x07010700); + out2[3] = hc_byte_perm_S (in[3], 0, 0x07030702); + out2[2] = hc_byte_perm_S (in[3], 0, 0x07010700); + out2[1] = hc_byte_perm_S (in[2], 0, 0x07030702); + out2[0] = hc_byte_perm_S (in[2], 0, 0x07010700); + out1[3] = hc_byte_perm_S (in[1], 0, 0x07030702); + out1[2] = hc_byte_perm_S (in[1], 0, 0x07010700); + out1[1] = hc_byte_perm_S (in[0], 0, 0x07030702); + out1[0] = hc_byte_perm_S (in[0], 0, 0x07010700); #else @@ -31144,17 +31144,17 @@ DECLSPEC void undo_utf16be_S (const u32 *in1, const u32 *in2, u32 *out) { #if defined IS_NV - out[0] = __byte_perm_S (in1[0], in1[1], 0x4602); - out[1] = __byte_perm_S (in1[2], in1[3], 0x4602); - out[2] = __byte_perm_S (in2[0], in2[1], 0x4602); - out[3] = __byte_perm_S (in2[2], in2[3], 0x4602); + out[0] = hc_byte_perm_S (in1[0], in1[1], 0x4602); + out[1] = hc_byte_perm_S (in1[2], in1[3], 0x4602); + out[2] = hc_byte_perm_S (in2[0], in2[1], 0x4602); + out[3] = hc_byte_perm_S (in2[2], in2[3], 0x4602); #elif defined IS_AMD && AMD_GCN >= 3 - out[0] = __byte_perm_S (in1[0], in1[1], 0x04060002); - out[1] = __byte_perm_S (in1[2], in1[3], 0x04060002); - out[2] = __byte_perm_S (in2[0], in2[1], 0x04060002); - out[3] = __byte_perm_S (in2[2], in2[3], 0x04060002); + out[0] = hc_byte_perm_S (in1[0], in1[1], 0x04060002); + out[1] = hc_byte_perm_S (in1[2], in1[3], 0x04060002); + out[2] = hc_byte_perm_S (in2[0], in2[1], 0x04060002); + out[3] = hc_byte_perm_S (in2[2], in2[3], 0x04060002); #else @@ -31174,17 +31174,17 @@ DECLSPEC void undo_utf16le_S (const u32 *in1, const u32 *in2, u32 *out) { #if defined IS_NV - out[0] = __byte_perm_S (in1[0], in1[1], 0x6420); - out[1] = __byte_perm_S (in1[2], in1[3], 0x6420); - out[2] = __byte_perm_S (in2[0], in2[1], 0x6420); - out[3] = __byte_perm_S (in2[2], in2[3], 0x6420); + out[0] = hc_byte_perm_S (in1[0], in1[1], 0x6420); + out[1] = hc_byte_perm_S (in1[2], in1[3], 0x6420); + out[2] = hc_byte_perm_S (in2[0], in2[1], 0x6420); + out[3] = hc_byte_perm_S (in2[2], in2[3], 0x6420); #elif defined IS_AMD && AMD_GCN >= 3 - out[0] = __byte_perm_S (in1[0], in1[1], 0x06040200); - out[1] = __byte_perm_S (in1[2], in1[3], 0x06040200); - out[2] = __byte_perm_S (in2[0], in2[1], 0x06040200); - out[3] = __byte_perm_S (in2[2], in2[3], 0x06040200); + out[0] = hc_byte_perm_S (in1[0], in1[1], 0x06040200); + out[1] = hc_byte_perm_S (in1[2], in1[3], 0x06040200); + out[2] = hc_byte_perm_S (in2[0], in2[1], 0x06040200); + out[3] = hc_byte_perm_S (in2[2], in2[3], 0x06040200); #else @@ -31229,79 +31229,79 @@ DECLSPEC void switch_buffer_by_offset_le_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, switch (offset_switch) { case 0: - w3[3] = amd_bytealign_S (w3[2], w3[3], offset); - w3[2] = amd_bytealign_S (w3[1], w3[2], offset); - w3[1] = amd_bytealign_S (w3[0], w3[1], offset); - w3[0] = amd_bytealign_S (w2[3], w3[0], offset); - w2[3] = amd_bytealign_S (w2[2], w2[3], offset); - w2[2] = amd_bytealign_S (w2[1], w2[2], offset); - w2[1] = amd_bytealign_S (w2[0], w2[1], offset); - w2[0] = amd_bytealign_S (w1[3], w2[0], offset); - w1[3] = amd_bytealign_S (w1[2], w1[3], offset); - w1[2] = amd_bytealign_S (w1[1], w1[2], offset); - w1[1] = amd_bytealign_S (w1[0], w1[1], offset); - w1[0] = amd_bytealign_S (w0[3], w1[0], offset); - w0[3] = amd_bytealign_S (w0[2], w0[3], offset); - w0[2] = amd_bytealign_S (w0[1], w0[2], offset); - w0[1] = amd_bytealign_S (w0[0], w0[1], offset); - w0[0] = amd_bytealign_S ( 0, w0[0], offset); + w3[3] = hc_bytealign_S (w3[2], w3[3], offset); + w3[2] = hc_bytealign_S (w3[1], w3[2], offset); + w3[1] = hc_bytealign_S (w3[0], w3[1], offset); + w3[0] = hc_bytealign_S (w2[3], w3[0], offset); + w2[3] = hc_bytealign_S (w2[2], w2[3], offset); + w2[2] = hc_bytealign_S (w2[1], w2[2], offset); + w2[1] = hc_bytealign_S (w2[0], w2[1], offset); + w2[0] = hc_bytealign_S (w1[3], w2[0], offset); + w1[3] = hc_bytealign_S (w1[2], w1[3], offset); + w1[2] = hc_bytealign_S (w1[1], w1[2], offset); + w1[1] = hc_bytealign_S (w1[0], w1[1], offset); + w1[0] = hc_bytealign_S (w0[3], w1[0], offset); + w0[3] = hc_bytealign_S (w0[2], w0[3], offset); + w0[2] = hc_bytealign_S (w0[1], w0[2], offset); + w0[1] = hc_bytealign_S (w0[0], w0[1], offset); + w0[0] = hc_bytealign_S ( 0, w0[0], offset); break; case 1: - w3[3] = amd_bytealign_S (w3[1], w3[2], offset); - w3[2] = amd_bytealign_S (w3[0], w3[1], offset); - w3[1] = amd_bytealign_S (w2[3], w3[0], offset); - w3[0] = amd_bytealign_S (w2[2], w2[3], offset); - w2[3] = amd_bytealign_S (w2[1], w2[2], offset); - w2[2] = amd_bytealign_S (w2[0], w2[1], offset); - w2[1] = amd_bytealign_S (w1[3], w2[0], offset); - w2[0] = amd_bytealign_S (w1[2], w1[3], offset); - w1[3] = amd_bytealign_S (w1[1], w1[2], offset); - w1[2] = amd_bytealign_S (w1[0], w1[1], offset); - w1[1] = amd_bytealign_S (w0[3], w1[0], offset); - w1[0] = amd_bytealign_S (w0[2], w0[3], offset); - w0[3] = amd_bytealign_S (w0[1], w0[2], offset); - w0[2] = amd_bytealign_S (w0[0], w0[1], offset); - w0[1] = amd_bytealign_S ( 0, w0[0], offset); + w3[3] = hc_bytealign_S (w3[1], w3[2], offset); + w3[2] = hc_bytealign_S (w3[0], w3[1], offset); + w3[1] = hc_bytealign_S (w2[3], w3[0], offset); + w3[0] = hc_bytealign_S (w2[2], w2[3], offset); + w2[3] = hc_bytealign_S (w2[1], w2[2], offset); + w2[2] = hc_bytealign_S (w2[0], w2[1], offset); + w2[1] = hc_bytealign_S (w1[3], w2[0], offset); + w2[0] = hc_bytealign_S (w1[2], w1[3], offset); + w1[3] = hc_bytealign_S (w1[1], w1[2], offset); + w1[2] = hc_bytealign_S (w1[0], w1[1], offset); + w1[1] = hc_bytealign_S (w0[3], w1[0], offset); + w1[0] = hc_bytealign_S (w0[2], w0[3], offset); + w0[3] = hc_bytealign_S (w0[1], w0[2], offset); + w0[2] = hc_bytealign_S (w0[0], w0[1], offset); + w0[1] = hc_bytealign_S ( 0, w0[0], offset); w0[0] = 0; break; case 2: - w3[3] = amd_bytealign_S (w3[0], w3[1], offset); - w3[2] = amd_bytealign_S (w2[3], w3[0], offset); - w3[1] = amd_bytealign_S (w2[2], w2[3], offset); - w3[0] = amd_bytealign_S (w2[1], w2[2], offset); - w2[3] = amd_bytealign_S (w2[0], w2[1], offset); - w2[2] = amd_bytealign_S (w1[3], w2[0], offset); - w2[1] = amd_bytealign_S (w1[2], w1[3], offset); - w2[0] = amd_bytealign_S (w1[1], w1[2], offset); - w1[3] = amd_bytealign_S (w1[0], w1[1], offset); - w1[2] = amd_bytealign_S (w0[3], w1[0], offset); - w1[1] = amd_bytealign_S (w0[2], w0[3], offset); - w1[0] = amd_bytealign_S (w0[1], w0[2], offset); - w0[3] = amd_bytealign_S (w0[0], w0[1], offset); - w0[2] = amd_bytealign_S ( 0, w0[0], offset); + w3[3] = hc_bytealign_S (w3[0], w3[1], offset); + w3[2] = hc_bytealign_S (w2[3], w3[0], offset); + w3[1] = hc_bytealign_S (w2[2], w2[3], offset); + w3[0] = hc_bytealign_S (w2[1], w2[2], offset); + w2[3] = hc_bytealign_S (w2[0], w2[1], offset); + w2[2] = hc_bytealign_S (w1[3], w2[0], offset); + w2[1] = hc_bytealign_S (w1[2], w1[3], offset); + w2[0] = hc_bytealign_S (w1[1], w1[2], offset); + w1[3] = hc_bytealign_S (w1[0], w1[1], offset); + w1[2] = hc_bytealign_S (w0[3], w1[0], offset); + w1[1] = hc_bytealign_S (w0[2], w0[3], offset); + w1[0] = hc_bytealign_S (w0[1], w0[2], offset); + w0[3] = hc_bytealign_S (w0[0], w0[1], offset); + w0[2] = hc_bytealign_S ( 0, w0[0], offset); w0[1] = 0; w0[0] = 0; break; case 3: - w3[3] = amd_bytealign_S (w2[3], w3[0], offset); - w3[2] = amd_bytealign_S (w2[2], w2[3], offset); - w3[1] = amd_bytealign_S (w2[1], w2[2], offset); - w3[0] = amd_bytealign_S (w2[0], w2[1], offset); - w2[3] = amd_bytealign_S (w1[3], w2[0], offset); - w2[2] = amd_bytealign_S (w1[2], w1[3], offset); - w2[1] = amd_bytealign_S (w1[1], w1[2], offset); - w2[0] = amd_bytealign_S (w1[0], w1[1], offset); - w1[3] = amd_bytealign_S (w0[3], w1[0], offset); - w1[2] = amd_bytealign_S (w0[2], w0[3], offset); - w1[1] = amd_bytealign_S (w0[1], w0[2], offset); - w1[0] = amd_bytealign_S (w0[0], w0[1], offset); - w0[3] = amd_bytealign_S ( 0, w0[0], offset); + w3[3] = hc_bytealign_S (w2[3], w3[0], offset); + w3[2] = hc_bytealign_S (w2[2], w2[3], offset); + w3[1] = hc_bytealign_S (w2[1], w2[2], offset); + w3[0] = hc_bytealign_S (w2[0], w2[1], offset); + w2[3] = hc_bytealign_S (w1[3], w2[0], offset); + w2[2] = hc_bytealign_S (w1[2], w1[3], offset); + w2[1] = hc_bytealign_S (w1[1], w1[2], offset); + w2[0] = hc_bytealign_S (w1[0], w1[1], offset); + w1[3] = hc_bytealign_S (w0[3], w1[0], offset); + w1[2] = hc_bytealign_S (w0[2], w0[3], offset); + w1[1] = hc_bytealign_S (w0[1], w0[2], offset); + w1[0] = hc_bytealign_S (w0[0], w0[1], offset); + w0[3] = hc_bytealign_S ( 0, w0[0], offset); w0[2] = 0; w0[1] = 0; w0[0] = 0; @@ -31309,18 +31309,18 @@ DECLSPEC void switch_buffer_by_offset_le_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, break; case 4: - w3[3] = amd_bytealign_S (w2[2], w2[3], offset); - w3[2] = amd_bytealign_S (w2[1], w2[2], offset); - w3[1] = amd_bytealign_S (w2[0], w2[1], offset); - w3[0] = amd_bytealign_S (w1[3], w2[0], offset); - w2[3] = amd_bytealign_S (w1[2], w1[3], offset); - w2[2] = amd_bytealign_S (w1[1], w1[2], offset); - w2[1] = amd_bytealign_S (w1[0], w1[1], offset); - w2[0] = amd_bytealign_S (w0[3], w1[0], offset); - w1[3] = amd_bytealign_S (w0[2], w0[3], offset); - w1[2] = amd_bytealign_S (w0[1], w0[2], offset); - w1[1] = amd_bytealign_S (w0[0], w0[1], offset); - w1[0] = amd_bytealign_S ( 0, w0[0], offset); + w3[3] = hc_bytealign_S (w2[2], w2[3], offset); + w3[2] = hc_bytealign_S (w2[1], w2[2], offset); + w3[1] = hc_bytealign_S (w2[0], w2[1], offset); + w3[0] = hc_bytealign_S (w1[3], w2[0], offset); + w2[3] = hc_bytealign_S (w1[2], w1[3], offset); + w2[2] = hc_bytealign_S (w1[1], w1[2], offset); + w2[1] = hc_bytealign_S (w1[0], w1[1], offset); + w2[0] = hc_bytealign_S (w0[3], w1[0], offset); + w1[3] = hc_bytealign_S (w0[2], w0[3], offset); + w1[2] = hc_bytealign_S (w0[1], w0[2], offset); + w1[1] = hc_bytealign_S (w0[0], w0[1], offset); + w1[0] = hc_bytealign_S ( 0, w0[0], offset); w0[3] = 0; w0[2] = 0; w0[1] = 0; @@ -31329,17 +31329,17 @@ DECLSPEC void switch_buffer_by_offset_le_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, break; case 5: - w3[3] = amd_bytealign_S (w2[1], w2[2], offset); - w3[2] = amd_bytealign_S (w2[0], w2[1], offset); - w3[1] = amd_bytealign_S (w1[3], w2[0], offset); - w3[0] = amd_bytealign_S (w1[2], w1[3], offset); - w2[3] = amd_bytealign_S (w1[1], w1[2], offset); - w2[2] = amd_bytealign_S (w1[0], w1[1], offset); - w2[1] = amd_bytealign_S (w0[3], w1[0], offset); - w2[0] = amd_bytealign_S (w0[2], w0[3], offset); - w1[3] = amd_bytealign_S (w0[1], w0[2], offset); - w1[2] = amd_bytealign_S (w0[0], w0[1], offset); - w1[1] = amd_bytealign_S ( 0, w0[0], offset); + w3[3] = hc_bytealign_S (w2[1], w2[2], offset); + w3[2] = hc_bytealign_S (w2[0], w2[1], offset); + w3[1] = hc_bytealign_S (w1[3], w2[0], offset); + w3[0] = hc_bytealign_S (w1[2], w1[3], offset); + w2[3] = hc_bytealign_S (w1[1], w1[2], offset); + w2[2] = hc_bytealign_S (w1[0], w1[1], offset); + w2[1] = hc_bytealign_S (w0[3], w1[0], offset); + w2[0] = hc_bytealign_S (w0[2], w0[3], offset); + w1[3] = hc_bytealign_S (w0[1], w0[2], offset); + w1[2] = hc_bytealign_S (w0[0], w0[1], offset); + w1[1] = hc_bytealign_S ( 0, w0[0], offset); w1[0] = 0; w0[3] = 0; w0[2] = 0; @@ -31349,16 +31349,16 @@ DECLSPEC void switch_buffer_by_offset_le_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, break; case 6: - w3[3] = amd_bytealign_S (w2[0], w2[1], offset); - w3[2] = amd_bytealign_S (w1[3], w2[0], offset); - w3[1] = amd_bytealign_S (w1[2], w1[3], offset); - w3[0] = amd_bytealign_S (w1[1], w1[2], offset); - w2[3] = amd_bytealign_S (w1[0], w1[1], offset); - w2[2] = amd_bytealign_S (w0[3], w1[0], offset); - w2[1] = amd_bytealign_S (w0[2], w0[3], offset); - w2[0] = amd_bytealign_S (w0[1], w0[2], offset); - w1[3] = amd_bytealign_S (w0[0], w0[1], offset); - w1[2] = amd_bytealign_S ( 0, w0[0], offset); + w3[3] = hc_bytealign_S (w2[0], w2[1], offset); + w3[2] = hc_bytealign_S (w1[3], w2[0], offset); + w3[1] = hc_bytealign_S (w1[2], w1[3], offset); + w3[0] = hc_bytealign_S (w1[1], w1[2], offset); + w2[3] = hc_bytealign_S (w1[0], w1[1], offset); + w2[2] = hc_bytealign_S (w0[3], w1[0], offset); + w2[1] = hc_bytealign_S (w0[2], w0[3], offset); + w2[0] = hc_bytealign_S (w0[1], w0[2], offset); + w1[3] = hc_bytealign_S (w0[0], w0[1], offset); + w1[2] = hc_bytealign_S ( 0, w0[0], offset); w1[1] = 0; w1[0] = 0; w0[3] = 0; @@ -31369,15 +31369,15 @@ DECLSPEC void switch_buffer_by_offset_le_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, break; case 7: - w3[3] = amd_bytealign_S (w1[3], w2[0], offset); - w3[2] = amd_bytealign_S (w1[2], w1[3], offset); - w3[1] = amd_bytealign_S (w1[1], w1[2], offset); - w3[0] = amd_bytealign_S (w1[0], w1[1], offset); - w2[3] = amd_bytealign_S (w0[3], w1[0], offset); - w2[2] = amd_bytealign_S (w0[2], w0[3], offset); - w2[1] = amd_bytealign_S (w0[1], w0[2], offset); - w2[0] = amd_bytealign_S (w0[0], w0[1], offset); - w1[3] = amd_bytealign_S ( 0, w0[0], offset); + w3[3] = hc_bytealign_S (w1[3], w2[0], offset); + w3[2] = hc_bytealign_S (w1[2], w1[3], offset); + w3[1] = hc_bytealign_S (w1[1], w1[2], offset); + w3[0] = hc_bytealign_S (w1[0], w1[1], offset); + w2[3] = hc_bytealign_S (w0[3], w1[0], offset); + w2[2] = hc_bytealign_S (w0[2], w0[3], offset); + w2[1] = hc_bytealign_S (w0[1], w0[2], offset); + w2[0] = hc_bytealign_S (w0[0], w0[1], offset); + w1[3] = hc_bytealign_S ( 0, w0[0], offset); w1[2] = 0; w1[1] = 0; w1[0] = 0; @@ -31389,14 +31389,14 @@ DECLSPEC void switch_buffer_by_offset_le_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, break; case 8: - w3[3] = amd_bytealign_S (w1[2], w1[3], offset); - w3[2] = amd_bytealign_S (w1[1], w1[2], offset); - w3[1] = amd_bytealign_S (w1[0], w1[1], offset); - w3[0] = amd_bytealign_S (w0[3], w1[0], offset); - w2[3] = amd_bytealign_S (w0[2], w0[3], offset); - w2[2] = amd_bytealign_S (w0[1], w0[2], offset); - w2[1] = amd_bytealign_S (w0[0], w0[1], offset); - w2[0] = amd_bytealign_S ( 0, w0[0], offset); + w3[3] = hc_bytealign_S (w1[2], w1[3], offset); + w3[2] = hc_bytealign_S (w1[1], w1[2], offset); + w3[1] = hc_bytealign_S (w1[0], w1[1], offset); + w3[0] = hc_bytealign_S (w0[3], w1[0], offset); + w2[3] = hc_bytealign_S (w0[2], w0[3], offset); + w2[2] = hc_bytealign_S (w0[1], w0[2], offset); + w2[1] = hc_bytealign_S (w0[0], w0[1], offset); + w2[0] = hc_bytealign_S ( 0, w0[0], offset); w1[3] = 0; w1[2] = 0; w1[1] = 0; @@ -31409,13 +31409,13 @@ DECLSPEC void switch_buffer_by_offset_le_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, break; case 9: - w3[3] = amd_bytealign_S (w1[1], w1[2], offset); - w3[2] = amd_bytealign_S (w1[0], w1[1], offset); - w3[1] = amd_bytealign_S (w0[3], w1[0], offset); - w3[0] = amd_bytealign_S (w0[2], w0[3], offset); - w2[3] = amd_bytealign_S (w0[1], w0[2], offset); - w2[2] = amd_bytealign_S (w0[0], w0[1], offset); - w2[1] = amd_bytealign_S ( 0, w0[0], offset); + w3[3] = hc_bytealign_S (w1[1], w1[2], offset); + w3[2] = hc_bytealign_S (w1[0], w1[1], offset); + w3[1] = hc_bytealign_S (w0[3], w1[0], offset); + w3[0] = hc_bytealign_S (w0[2], w0[3], offset); + w2[3] = hc_bytealign_S (w0[1], w0[2], offset); + w2[2] = hc_bytealign_S (w0[0], w0[1], offset); + w2[1] = hc_bytealign_S ( 0, w0[0], offset); w2[0] = 0; w1[3] = 0; w1[2] = 0; @@ -31429,12 +31429,12 @@ DECLSPEC void switch_buffer_by_offset_le_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, break; case 10: - w3[3] = amd_bytealign_S (w1[0], w1[1], offset); - w3[2] = amd_bytealign_S (w0[3], w1[0], offset); - w3[1] = amd_bytealign_S (w0[2], w0[3], offset); - w3[0] = amd_bytealign_S (w0[1], w0[2], offset); - w2[3] = amd_bytealign_S (w0[0], w0[1], offset); - w2[2] = amd_bytealign_S ( 0, w0[0], offset); + w3[3] = hc_bytealign_S (w1[0], w1[1], offset); + w3[2] = hc_bytealign_S (w0[3], w1[0], offset); + w3[1] = hc_bytealign_S (w0[2], w0[3], offset); + w3[0] = hc_bytealign_S (w0[1], w0[2], offset); + w2[3] = hc_bytealign_S (w0[0], w0[1], offset); + w2[2] = hc_bytealign_S ( 0, w0[0], offset); w2[1] = 0; w2[0] = 0; w1[3] = 0; @@ -31449,11 +31449,11 @@ DECLSPEC void switch_buffer_by_offset_le_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, break; case 11: - w3[3] = amd_bytealign_S (w0[3], w1[0], offset); - w3[2] = amd_bytealign_S (w0[2], w0[3], offset); - w3[1] = amd_bytealign_S (w0[1], w0[2], offset); - w3[0] = amd_bytealign_S (w0[0], w0[1], offset); - w2[3] = amd_bytealign_S ( 0, w0[0], offset); + w3[3] = hc_bytealign_S (w0[3], w1[0], offset); + w3[2] = hc_bytealign_S (w0[2], w0[3], offset); + w3[1] = hc_bytealign_S (w0[1], w0[2], offset); + w3[0] = hc_bytealign_S (w0[0], w0[1], offset); + w2[3] = hc_bytealign_S ( 0, w0[0], offset); w2[2] = 0; w2[1] = 0; w2[0] = 0; @@ -31469,10 +31469,10 @@ DECLSPEC void switch_buffer_by_offset_le_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, break; case 12: - w3[3] = amd_bytealign_S (w0[2], w0[3], offset); - w3[2] = amd_bytealign_S (w0[1], w0[2], offset); - w3[1] = amd_bytealign_S (w0[0], w0[1], offset); - w3[0] = amd_bytealign_S ( 0, w0[0], offset); + w3[3] = hc_bytealign_S (w0[2], w0[3], offset); + w3[2] = hc_bytealign_S (w0[1], w0[2], offset); + w3[1] = hc_bytealign_S (w0[0], w0[1], offset); + w3[0] = hc_bytealign_S ( 0, w0[0], offset); w2[3] = 0; w2[2] = 0; w2[1] = 0; @@ -31489,9 +31489,9 @@ DECLSPEC void switch_buffer_by_offset_le_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, break; case 13: - w3[3] = amd_bytealign_S (w0[1], w0[2], offset); - w3[2] = amd_bytealign_S (w0[0], w0[1], offset); - w3[1] = amd_bytealign_S ( 0, w0[0], offset); + w3[3] = hc_bytealign_S (w0[1], w0[2], offset); + w3[2] = hc_bytealign_S (w0[0], w0[1], offset); + w3[1] = hc_bytealign_S ( 0, w0[0], offset); w3[0] = 0; w2[3] = 0; w2[2] = 0; @@ -31509,8 +31509,8 @@ DECLSPEC void switch_buffer_by_offset_le_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, break; case 14: - w3[3] = amd_bytealign_S (w0[0], w0[1], offset); - w3[2] = amd_bytealign_S ( 0, w0[0], offset); + w3[3] = hc_bytealign_S (w0[0], w0[1], offset); + w3[2] = hc_bytealign_S ( 0, w0[0], offset); w3[1] = 0; w3[0] = 0; w2[3] = 0; @@ -31529,7 +31529,7 @@ DECLSPEC void switch_buffer_by_offset_le_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, break; case 15: - w3[3] = amd_bytealign_S ( 0, w0[0], offset); + w3[3] = hc_bytealign_S ( 0, w0[0], offset); w3[2] = 0; w3[1] = 0; w3[0] = 0; @@ -31580,79 +31580,79 @@ DECLSPEC void switch_buffer_by_offset_le_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, switch (offset_switch) { case 0: - w3[3] = __byte_perm_S (w3[2], w3[3], selector); - w3[2] = __byte_perm_S (w3[1], w3[2], selector); - w3[1] = __byte_perm_S (w3[0], w3[1], selector); - w3[0] = __byte_perm_S (w2[3], w3[0], selector); - w2[3] = __byte_perm_S (w2[2], w2[3], selector); - w2[2] = __byte_perm_S (w2[1], w2[2], selector); - w2[1] = __byte_perm_S (w2[0], w2[1], selector); - w2[0] = __byte_perm_S (w1[3], w2[0], selector); - w1[3] = __byte_perm_S (w1[2], w1[3], selector); - w1[2] = __byte_perm_S (w1[1], w1[2], selector); - w1[1] = __byte_perm_S (w1[0], w1[1], selector); - w1[0] = __byte_perm_S (w0[3], w1[0], selector); - w0[3] = __byte_perm_S (w0[2], w0[3], selector); - w0[2] = __byte_perm_S (w0[1], w0[2], selector); - w0[1] = __byte_perm_S (w0[0], w0[1], selector); - w0[0] = __byte_perm_S ( 0, w0[0], selector); + w3[3] = hc_byte_perm_S (w3[2], w3[3], selector); + w3[2] = hc_byte_perm_S (w3[1], w3[2], selector); + w3[1] = hc_byte_perm_S (w3[0], w3[1], selector); + w3[0] = hc_byte_perm_S (w2[3], w3[0], selector); + w2[3] = hc_byte_perm_S (w2[2], w2[3], selector); + w2[2] = hc_byte_perm_S (w2[1], w2[2], selector); + w2[1] = hc_byte_perm_S (w2[0], w2[1], selector); + w2[0] = hc_byte_perm_S (w1[3], w2[0], selector); + w1[3] = hc_byte_perm_S (w1[2], w1[3], selector); + w1[2] = hc_byte_perm_S (w1[1], w1[2], selector); + w1[1] = hc_byte_perm_S (w1[0], w1[1], selector); + w1[0] = hc_byte_perm_S (w0[3], w1[0], selector); + w0[3] = hc_byte_perm_S (w0[2], w0[3], selector); + w0[2] = hc_byte_perm_S (w0[1], w0[2], selector); + w0[1] = hc_byte_perm_S (w0[0], w0[1], selector); + w0[0] = hc_byte_perm_S ( 0, w0[0], selector); break; case 1: - w3[3] = __byte_perm_S (w3[1], w3[2], selector); - w3[2] = __byte_perm_S (w3[0], w3[1], selector); - w3[1] = __byte_perm_S (w2[3], w3[0], selector); - w3[0] = __byte_perm_S (w2[2], w2[3], selector); - w2[3] = __byte_perm_S (w2[1], w2[2], selector); - w2[2] = __byte_perm_S (w2[0], w2[1], selector); - w2[1] = __byte_perm_S (w1[3], w2[0], selector); - w2[0] = __byte_perm_S (w1[2], w1[3], selector); - w1[3] = __byte_perm_S (w1[1], w1[2], selector); - w1[2] = __byte_perm_S (w1[0], w1[1], selector); - w1[1] = __byte_perm_S (w0[3], w1[0], selector); - w1[0] = __byte_perm_S (w0[2], w0[3], selector); - w0[3] = __byte_perm_S (w0[1], w0[2], selector); - w0[2] = __byte_perm_S (w0[0], w0[1], selector); - w0[1] = __byte_perm_S ( 0, w0[0], selector); + w3[3] = hc_byte_perm_S (w3[1], w3[2], selector); + w3[2] = hc_byte_perm_S (w3[0], w3[1], selector); + w3[1] = hc_byte_perm_S (w2[3], w3[0], selector); + w3[0] = hc_byte_perm_S (w2[2], w2[3], selector); + w2[3] = hc_byte_perm_S (w2[1], w2[2], selector); + w2[2] = hc_byte_perm_S (w2[0], w2[1], selector); + w2[1] = hc_byte_perm_S (w1[3], w2[0], selector); + w2[0] = hc_byte_perm_S (w1[2], w1[3], selector); + w1[3] = hc_byte_perm_S (w1[1], w1[2], selector); + w1[2] = hc_byte_perm_S (w1[0], w1[1], selector); + w1[1] = hc_byte_perm_S (w0[3], w1[0], selector); + w1[0] = hc_byte_perm_S (w0[2], w0[3], selector); + w0[3] = hc_byte_perm_S (w0[1], w0[2], selector); + w0[2] = hc_byte_perm_S (w0[0], w0[1], selector); + w0[1] = hc_byte_perm_S ( 0, w0[0], selector); w0[0] = 0; break; case 2: - w3[3] = __byte_perm_S (w3[0], w3[1], selector); - w3[2] = __byte_perm_S (w2[3], w3[0], selector); - w3[1] = __byte_perm_S (w2[2], w2[3], selector); - w3[0] = __byte_perm_S (w2[1], w2[2], selector); - w2[3] = __byte_perm_S (w2[0], w2[1], selector); - w2[2] = __byte_perm_S (w1[3], w2[0], selector); - w2[1] = __byte_perm_S (w1[2], w1[3], selector); - w2[0] = __byte_perm_S (w1[1], w1[2], selector); - w1[3] = __byte_perm_S (w1[0], w1[1], selector); - w1[2] = __byte_perm_S (w0[3], w1[0], selector); - w1[1] = __byte_perm_S (w0[2], w0[3], selector); - w1[0] = __byte_perm_S (w0[1], w0[2], selector); - w0[3] = __byte_perm_S (w0[0], w0[1], selector); - w0[2] = __byte_perm_S ( 0, w0[0], selector); + w3[3] = hc_byte_perm_S (w3[0], w3[1], selector); + w3[2] = hc_byte_perm_S (w2[3], w3[0], selector); + w3[1] = hc_byte_perm_S (w2[2], w2[3], selector); + w3[0] = hc_byte_perm_S (w2[1], w2[2], selector); + w2[3] = hc_byte_perm_S (w2[0], w2[1], selector); + w2[2] = hc_byte_perm_S (w1[3], w2[0], selector); + w2[1] = hc_byte_perm_S (w1[2], w1[3], selector); + w2[0] = hc_byte_perm_S (w1[1], w1[2], selector); + w1[3] = hc_byte_perm_S (w1[0], w1[1], selector); + w1[2] = hc_byte_perm_S (w0[3], w1[0], selector); + w1[1] = hc_byte_perm_S (w0[2], w0[3], selector); + w1[0] = hc_byte_perm_S (w0[1], w0[2], selector); + w0[3] = hc_byte_perm_S (w0[0], w0[1], selector); + w0[2] = hc_byte_perm_S ( 0, w0[0], selector); w0[1] = 0; w0[0] = 0; break; case 3: - w3[3] = __byte_perm_S (w2[3], w3[0], selector); - w3[2] = __byte_perm_S (w2[2], w2[3], selector); - w3[1] = __byte_perm_S (w2[1], w2[2], selector); - w3[0] = __byte_perm_S (w2[0], w2[1], selector); - w2[3] = __byte_perm_S (w1[3], w2[0], selector); - w2[2] = __byte_perm_S (w1[2], w1[3], selector); - w2[1] = __byte_perm_S (w1[1], w1[2], selector); - w2[0] = __byte_perm_S (w1[0], w1[1], selector); - w1[3] = __byte_perm_S (w0[3], w1[0], selector); - w1[2] = __byte_perm_S (w0[2], w0[3], selector); - w1[1] = __byte_perm_S (w0[1], w0[2], selector); - w1[0] = __byte_perm_S (w0[0], w0[1], selector); - w0[3] = __byte_perm_S ( 0, w0[0], selector); + w3[3] = hc_byte_perm_S (w2[3], w3[0], selector); + w3[2] = hc_byte_perm_S (w2[2], w2[3], selector); + w3[1] = hc_byte_perm_S (w2[1], w2[2], selector); + w3[0] = hc_byte_perm_S (w2[0], w2[1], selector); + w2[3] = hc_byte_perm_S (w1[3], w2[0], selector); + w2[2] = hc_byte_perm_S (w1[2], w1[3], selector); + w2[1] = hc_byte_perm_S (w1[1], w1[2], selector); + w2[0] = hc_byte_perm_S (w1[0], w1[1], selector); + w1[3] = hc_byte_perm_S (w0[3], w1[0], selector); + w1[2] = hc_byte_perm_S (w0[2], w0[3], selector); + w1[1] = hc_byte_perm_S (w0[1], w0[2], selector); + w1[0] = hc_byte_perm_S (w0[0], w0[1], selector); + w0[3] = hc_byte_perm_S ( 0, w0[0], selector); w0[2] = 0; w0[1] = 0; w0[0] = 0; @@ -31660,18 +31660,18 @@ DECLSPEC void switch_buffer_by_offset_le_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, break; case 4: - w3[3] = __byte_perm_S (w2[2], w2[3], selector); - w3[2] = __byte_perm_S (w2[1], w2[2], selector); - w3[1] = __byte_perm_S (w2[0], w2[1], selector); - w3[0] = __byte_perm_S (w1[3], w2[0], selector); - w2[3] = __byte_perm_S (w1[2], w1[3], selector); - w2[2] = __byte_perm_S (w1[1], w1[2], selector); - w2[1] = __byte_perm_S (w1[0], w1[1], selector); - w2[0] = __byte_perm_S (w0[3], w1[0], selector); - w1[3] = __byte_perm_S (w0[2], w0[3], selector); - w1[2] = __byte_perm_S (w0[1], w0[2], selector); - w1[1] = __byte_perm_S (w0[0], w0[1], selector); - w1[0] = __byte_perm_S ( 0, w0[0], selector); + w3[3] = hc_byte_perm_S (w2[2], w2[3], selector); + w3[2] = hc_byte_perm_S (w2[1], w2[2], selector); + w3[1] = hc_byte_perm_S (w2[0], w2[1], selector); + w3[0] = hc_byte_perm_S (w1[3], w2[0], selector); + w2[3] = hc_byte_perm_S (w1[2], w1[3], selector); + w2[2] = hc_byte_perm_S (w1[1], w1[2], selector); + w2[1] = hc_byte_perm_S (w1[0], w1[1], selector); + w2[0] = hc_byte_perm_S (w0[3], w1[0], selector); + w1[3] = hc_byte_perm_S (w0[2], w0[3], selector); + w1[2] = hc_byte_perm_S (w0[1], w0[2], selector); + w1[1] = hc_byte_perm_S (w0[0], w0[1], selector); + w1[0] = hc_byte_perm_S ( 0, w0[0], selector); w0[3] = 0; w0[2] = 0; w0[1] = 0; @@ -31680,17 +31680,17 @@ DECLSPEC void switch_buffer_by_offset_le_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, break; case 5: - w3[3] = __byte_perm_S (w2[1], w2[2], selector); - w3[2] = __byte_perm_S (w2[0], w2[1], selector); - w3[1] = __byte_perm_S (w1[3], w2[0], selector); - w3[0] = __byte_perm_S (w1[2], w1[3], selector); - w2[3] = __byte_perm_S (w1[1], w1[2], selector); - w2[2] = __byte_perm_S (w1[0], w1[1], selector); - w2[1] = __byte_perm_S (w0[3], w1[0], selector); - w2[0] = __byte_perm_S (w0[2], w0[3], selector); - w1[3] = __byte_perm_S (w0[1], w0[2], selector); - w1[2] = __byte_perm_S (w0[0], w0[1], selector); - w1[1] = __byte_perm_S ( 0, w0[0], selector); + w3[3] = hc_byte_perm_S (w2[1], w2[2], selector); + w3[2] = hc_byte_perm_S (w2[0], w2[1], selector); + w3[1] = hc_byte_perm_S (w1[3], w2[0], selector); + w3[0] = hc_byte_perm_S (w1[2], w1[3], selector); + w2[3] = hc_byte_perm_S (w1[1], w1[2], selector); + w2[2] = hc_byte_perm_S (w1[0], w1[1], selector); + w2[1] = hc_byte_perm_S (w0[3], w1[0], selector); + w2[0] = hc_byte_perm_S (w0[2], w0[3], selector); + w1[3] = hc_byte_perm_S (w0[1], w0[2], selector); + w1[2] = hc_byte_perm_S (w0[0], w0[1], selector); + w1[1] = hc_byte_perm_S ( 0, w0[0], selector); w1[0] = 0; w0[3] = 0; w0[2] = 0; @@ -31700,16 +31700,16 @@ DECLSPEC void switch_buffer_by_offset_le_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, break; case 6: - w3[3] = __byte_perm_S (w2[0], w2[1], selector); - w3[2] = __byte_perm_S (w1[3], w2[0], selector); - w3[1] = __byte_perm_S (w1[2], w1[3], selector); - w3[0] = __byte_perm_S (w1[1], w1[2], selector); - w2[3] = __byte_perm_S (w1[0], w1[1], selector); - w2[2] = __byte_perm_S (w0[3], w1[0], selector); - w2[1] = __byte_perm_S (w0[2], w0[3], selector); - w2[0] = __byte_perm_S (w0[1], w0[2], selector); - w1[3] = __byte_perm_S (w0[0], w0[1], selector); - w1[2] = __byte_perm_S ( 0, w0[0], selector); + w3[3] = hc_byte_perm_S (w2[0], w2[1], selector); + w3[2] = hc_byte_perm_S (w1[3], w2[0], selector); + w3[1] = hc_byte_perm_S (w1[2], w1[3], selector); + w3[0] = hc_byte_perm_S (w1[1], w1[2], selector); + w2[3] = hc_byte_perm_S (w1[0], w1[1], selector); + w2[2] = hc_byte_perm_S (w0[3], w1[0], selector); + w2[1] = hc_byte_perm_S (w0[2], w0[3], selector); + w2[0] = hc_byte_perm_S (w0[1], w0[2], selector); + w1[3] = hc_byte_perm_S (w0[0], w0[1], selector); + w1[2] = hc_byte_perm_S ( 0, w0[0], selector); w1[1] = 0; w1[0] = 0; w0[3] = 0; @@ -31720,15 +31720,15 @@ DECLSPEC void switch_buffer_by_offset_le_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, break; case 7: - w3[3] = __byte_perm_S (w1[3], w2[0], selector); - w3[2] = __byte_perm_S (w1[2], w1[3], selector); - w3[1] = __byte_perm_S (w1[1], w1[2], selector); - w3[0] = __byte_perm_S (w1[0], w1[1], selector); - w2[3] = __byte_perm_S (w0[3], w1[0], selector); - w2[2] = __byte_perm_S (w0[2], w0[3], selector); - w2[1] = __byte_perm_S (w0[1], w0[2], selector); - w2[0] = __byte_perm_S (w0[0], w0[1], selector); - w1[3] = __byte_perm_S ( 0, w0[0], selector); + w3[3] = hc_byte_perm_S (w1[3], w2[0], selector); + w3[2] = hc_byte_perm_S (w1[2], w1[3], selector); + w3[1] = hc_byte_perm_S (w1[1], w1[2], selector); + w3[0] = hc_byte_perm_S (w1[0], w1[1], selector); + w2[3] = hc_byte_perm_S (w0[3], w1[0], selector); + w2[2] = hc_byte_perm_S (w0[2], w0[3], selector); + w2[1] = hc_byte_perm_S (w0[1], w0[2], selector); + w2[0] = hc_byte_perm_S (w0[0], w0[1], selector); + w1[3] = hc_byte_perm_S ( 0, w0[0], selector); w1[2] = 0; w1[1] = 0; w1[0] = 0; @@ -31740,14 +31740,14 @@ DECLSPEC void switch_buffer_by_offset_le_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, break; case 8: - w3[3] = __byte_perm_S (w1[2], w1[3], selector); - w3[2] = __byte_perm_S (w1[1], w1[2], selector); - w3[1] = __byte_perm_S (w1[0], w1[1], selector); - w3[0] = __byte_perm_S (w0[3], w1[0], selector); - w2[3] = __byte_perm_S (w0[2], w0[3], selector); - w2[2] = __byte_perm_S (w0[1], w0[2], selector); - w2[1] = __byte_perm_S (w0[0], w0[1], selector); - w2[0] = __byte_perm_S ( 0, w0[0], selector); + w3[3] = hc_byte_perm_S (w1[2], w1[3], selector); + w3[2] = hc_byte_perm_S (w1[1], w1[2], selector); + w3[1] = hc_byte_perm_S (w1[0], w1[1], selector); + w3[0] = hc_byte_perm_S (w0[3], w1[0], selector); + w2[3] = hc_byte_perm_S (w0[2], w0[3], selector); + w2[2] = hc_byte_perm_S (w0[1], w0[2], selector); + w2[1] = hc_byte_perm_S (w0[0], w0[1], selector); + w2[0] = hc_byte_perm_S ( 0, w0[0], selector); w1[3] = 0; w1[2] = 0; w1[1] = 0; @@ -31760,13 +31760,13 @@ DECLSPEC void switch_buffer_by_offset_le_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, break; case 9: - w3[3] = __byte_perm_S (w1[1], w1[2], selector); - w3[2] = __byte_perm_S (w1[0], w1[1], selector); - w3[1] = __byte_perm_S (w0[3], w1[0], selector); - w3[0] = __byte_perm_S (w0[2], w0[3], selector); - w2[3] = __byte_perm_S (w0[1], w0[2], selector); - w2[2] = __byte_perm_S (w0[0], w0[1], selector); - w2[1] = __byte_perm_S ( 0, w0[0], selector); + w3[3] = hc_byte_perm_S (w1[1], w1[2], selector); + w3[2] = hc_byte_perm_S (w1[0], w1[1], selector); + w3[1] = hc_byte_perm_S (w0[3], w1[0], selector); + w3[0] = hc_byte_perm_S (w0[2], w0[3], selector); + w2[3] = hc_byte_perm_S (w0[1], w0[2], selector); + w2[2] = hc_byte_perm_S (w0[0], w0[1], selector); + w2[1] = hc_byte_perm_S ( 0, w0[0], selector); w2[0] = 0; w1[3] = 0; w1[2] = 0; @@ -31780,12 +31780,12 @@ DECLSPEC void switch_buffer_by_offset_le_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, break; case 10: - w3[3] = __byte_perm_S (w1[0], w1[1], selector); - w3[2] = __byte_perm_S (w0[3], w1[0], selector); - w3[1] = __byte_perm_S (w0[2], w0[3], selector); - w3[0] = __byte_perm_S (w0[1], w0[2], selector); - w2[3] = __byte_perm_S (w0[0], w0[1], selector); - w2[2] = __byte_perm_S ( 0, w0[0], selector); + w3[3] = hc_byte_perm_S (w1[0], w1[1], selector); + w3[2] = hc_byte_perm_S (w0[3], w1[0], selector); + w3[1] = hc_byte_perm_S (w0[2], w0[3], selector); + w3[0] = hc_byte_perm_S (w0[1], w0[2], selector); + w2[3] = hc_byte_perm_S (w0[0], w0[1], selector); + w2[2] = hc_byte_perm_S ( 0, w0[0], selector); w2[1] = 0; w2[0] = 0; w1[3] = 0; @@ -31800,11 +31800,11 @@ DECLSPEC void switch_buffer_by_offset_le_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, break; case 11: - w3[3] = __byte_perm_S (w0[3], w1[0], selector); - w3[2] = __byte_perm_S (w0[2], w0[3], selector); - w3[1] = __byte_perm_S (w0[1], w0[2], selector); - w3[0] = __byte_perm_S (w0[0], w0[1], selector); - w2[3] = __byte_perm_S ( 0, w0[0], selector); + w3[3] = hc_byte_perm_S (w0[3], w1[0], selector); + w3[2] = hc_byte_perm_S (w0[2], w0[3], selector); + w3[1] = hc_byte_perm_S (w0[1], w0[2], selector); + w3[0] = hc_byte_perm_S (w0[0], w0[1], selector); + w2[3] = hc_byte_perm_S ( 0, w0[0], selector); w2[2] = 0; w2[1] = 0; w2[0] = 0; @@ -31820,10 +31820,10 @@ DECLSPEC void switch_buffer_by_offset_le_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, break; case 12: - w3[3] = __byte_perm_S (w0[2], w0[3], selector); - w3[2] = __byte_perm_S (w0[1], w0[2], selector); - w3[1] = __byte_perm_S (w0[0], w0[1], selector); - w3[0] = __byte_perm_S ( 0, w0[0], selector); + w3[3] = hc_byte_perm_S (w0[2], w0[3], selector); + w3[2] = hc_byte_perm_S (w0[1], w0[2], selector); + w3[1] = hc_byte_perm_S (w0[0], w0[1], selector); + w3[0] = hc_byte_perm_S ( 0, w0[0], selector); w2[3] = 0; w2[2] = 0; w2[1] = 0; @@ -31840,9 +31840,9 @@ DECLSPEC void switch_buffer_by_offset_le_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, break; case 13: - w3[3] = __byte_perm_S (w0[1], w0[2], selector); - w3[2] = __byte_perm_S (w0[0], w0[1], selector); - w3[1] = __byte_perm_S ( 0, w0[0], selector); + w3[3] = hc_byte_perm_S (w0[1], w0[2], selector); + w3[2] = hc_byte_perm_S (w0[0], w0[1], selector); + w3[1] = hc_byte_perm_S ( 0, w0[0], selector); w3[0] = 0; w2[3] = 0; w2[2] = 0; @@ -31860,8 +31860,8 @@ DECLSPEC void switch_buffer_by_offset_le_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, break; case 14: - w3[3] = __byte_perm_S (w0[0], w0[1], selector); - w3[2] = __byte_perm_S ( 0, w0[0], selector); + w3[3] = hc_byte_perm_S (w0[0], w0[1], selector); + w3[2] = hc_byte_perm_S ( 0, w0[0], selector); w3[1] = 0; w3[0] = 0; w2[3] = 0; @@ -31880,7 +31880,7 @@ DECLSPEC void switch_buffer_by_offset_le_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, break; case 15: - w3[3] = __byte_perm_S ( 0, w0[0], selector); + w3[3] = hc_byte_perm_S ( 0, w0[0], selector); w3[2] = 0; w3[1] = 0; w3[0] = 0; @@ -31931,89 +31931,89 @@ DECLSPEC void switch_buffer_by_offset_carry_le_S (u32 *w0, u32 *w1, u32 *w2, u32 switch (offset_switch) { case 0: - c0[0] = amd_bytealign_S (w3[3], 0, offset); - w3[3] = amd_bytealign_S (w3[2], w3[3], offset); - w3[2] = amd_bytealign_S (w3[1], w3[2], offset); - w3[1] = amd_bytealign_S (w3[0], w3[1], offset); - w3[0] = amd_bytealign_S (w2[3], w3[0], offset); - w2[3] = amd_bytealign_S (w2[2], w2[3], offset); - w2[2] = amd_bytealign_S (w2[1], w2[2], offset); - w2[1] = amd_bytealign_S (w2[0], w2[1], offset); - w2[0] = amd_bytealign_S (w1[3], w2[0], offset); - w1[3] = amd_bytealign_S (w1[2], w1[3], offset); - w1[2] = amd_bytealign_S (w1[1], w1[2], offset); - w1[1] = amd_bytealign_S (w1[0], w1[1], offset); - w1[0] = amd_bytealign_S (w0[3], w1[0], offset); - w0[3] = amd_bytealign_S (w0[2], w0[3], offset); - w0[2] = amd_bytealign_S (w0[1], w0[2], offset); - w0[1] = amd_bytealign_S (w0[0], w0[1], offset); - w0[0] = amd_bytealign_S ( 0, w0[0], offset); + c0[0] = hc_bytealign_S (w3[3], 0, offset); + w3[3] = hc_bytealign_S (w3[2], w3[3], offset); + w3[2] = hc_bytealign_S (w3[1], w3[2], offset); + w3[1] = hc_bytealign_S (w3[0], w3[1], offset); + w3[0] = hc_bytealign_S (w2[3], w3[0], offset); + w2[3] = hc_bytealign_S (w2[2], w2[3], offset); + w2[2] = hc_bytealign_S (w2[1], w2[2], offset); + w2[1] = hc_bytealign_S (w2[0], w2[1], offset); + w2[0] = hc_bytealign_S (w1[3], w2[0], offset); + w1[3] = hc_bytealign_S (w1[2], w1[3], offset); + w1[2] = hc_bytealign_S (w1[1], w1[2], offset); + w1[1] = hc_bytealign_S (w1[0], w1[1], offset); + w1[0] = hc_bytealign_S (w0[3], w1[0], offset); + w0[3] = hc_bytealign_S (w0[2], w0[3], offset); + w0[2] = hc_bytealign_S (w0[1], w0[2], offset); + w0[1] = hc_bytealign_S (w0[0], w0[1], offset); + w0[0] = hc_bytealign_S ( 0, w0[0], offset); break; case 1: - c0[1] = amd_bytealign_S (w3[3], 0, offset); - c0[0] = amd_bytealign_S (w3[2], w3[3], offset); - w3[3] = amd_bytealign_S (w3[1], w3[2], offset); - w3[2] = amd_bytealign_S (w3[0], w3[1], offset); - w3[1] = amd_bytealign_S (w2[3], w3[0], offset); - w3[0] = amd_bytealign_S (w2[2], w2[3], offset); - w2[3] = amd_bytealign_S (w2[1], w2[2], offset); - w2[2] = amd_bytealign_S (w2[0], w2[1], offset); - w2[1] = amd_bytealign_S (w1[3], w2[0], offset); - w2[0] = amd_bytealign_S (w1[2], w1[3], offset); - w1[3] = amd_bytealign_S (w1[1], w1[2], offset); - w1[2] = amd_bytealign_S (w1[0], w1[1], offset); - w1[1] = amd_bytealign_S (w0[3], w1[0], offset); - w1[0] = amd_bytealign_S (w0[2], w0[3], offset); - w0[3] = amd_bytealign_S (w0[1], w0[2], offset); - w0[2] = amd_bytealign_S (w0[0], w0[1], offset); - w0[1] = amd_bytealign_S ( 0, w0[0], offset); + c0[1] = hc_bytealign_S (w3[3], 0, offset); + c0[0] = hc_bytealign_S (w3[2], w3[3], offset); + w3[3] = hc_bytealign_S (w3[1], w3[2], offset); + w3[2] = hc_bytealign_S (w3[0], w3[1], offset); + w3[1] = hc_bytealign_S (w2[3], w3[0], offset); + w3[0] = hc_bytealign_S (w2[2], w2[3], offset); + w2[3] = hc_bytealign_S (w2[1], w2[2], offset); + w2[2] = hc_bytealign_S (w2[0], w2[1], offset); + w2[1] = hc_bytealign_S (w1[3], w2[0], offset); + w2[0] = hc_bytealign_S (w1[2], w1[3], offset); + w1[3] = hc_bytealign_S (w1[1], w1[2], offset); + w1[2] = hc_bytealign_S (w1[0], w1[1], offset); + w1[1] = hc_bytealign_S (w0[3], w1[0], offset); + w1[0] = hc_bytealign_S (w0[2], w0[3], offset); + w0[3] = hc_bytealign_S (w0[1], w0[2], offset); + w0[2] = hc_bytealign_S (w0[0], w0[1], offset); + w0[1] = hc_bytealign_S ( 0, w0[0], offset); w0[0] = 0; break; case 2: - c0[2] = amd_bytealign_S (w3[3], 0, offset); - c0[1] = amd_bytealign_S (w3[2], w3[3], offset); - c0[0] = amd_bytealign_S (w3[1], w3[2], offset); - w3[3] = amd_bytealign_S (w3[0], w3[1], offset); - w3[2] = amd_bytealign_S (w2[3], w3[0], offset); - w3[1] = amd_bytealign_S (w2[2], w2[3], offset); - w3[0] = amd_bytealign_S (w2[1], w2[2], offset); - w2[3] = amd_bytealign_S (w2[0], w2[1], offset); - w2[2] = amd_bytealign_S (w1[3], w2[0], offset); - w2[1] = amd_bytealign_S (w1[2], w1[3], offset); - w2[0] = amd_bytealign_S (w1[1], w1[2], offset); - w1[3] = amd_bytealign_S (w1[0], w1[1], offset); - w1[2] = amd_bytealign_S (w0[3], w1[0], offset); - w1[1] = amd_bytealign_S (w0[2], w0[3], offset); - w1[0] = amd_bytealign_S (w0[1], w0[2], offset); - w0[3] = amd_bytealign_S (w0[0], w0[1], offset); - w0[2] = amd_bytealign_S ( 0, w0[0], offset); + c0[2] = hc_bytealign_S (w3[3], 0, offset); + c0[1] = hc_bytealign_S (w3[2], w3[3], offset); + c0[0] = hc_bytealign_S (w3[1], w3[2], offset); + w3[3] = hc_bytealign_S (w3[0], w3[1], offset); + w3[2] = hc_bytealign_S (w2[3], w3[0], offset); + w3[1] = hc_bytealign_S (w2[2], w2[3], offset); + w3[0] = hc_bytealign_S (w2[1], w2[2], offset); + w2[3] = hc_bytealign_S (w2[0], w2[1], offset); + w2[2] = hc_bytealign_S (w1[3], w2[0], offset); + w2[1] = hc_bytealign_S (w1[2], w1[3], offset); + w2[0] = hc_bytealign_S (w1[1], w1[2], offset); + w1[3] = hc_bytealign_S (w1[0], w1[1], offset); + w1[2] = hc_bytealign_S (w0[3], w1[0], offset); + w1[1] = hc_bytealign_S (w0[2], w0[3], offset); + w1[0] = hc_bytealign_S (w0[1], w0[2], offset); + w0[3] = hc_bytealign_S (w0[0], w0[1], offset); + w0[2] = hc_bytealign_S ( 0, w0[0], offset); w0[1] = 0; w0[0] = 0; break; case 3: - c0[3] = amd_bytealign_S (w3[3], 0, offset); - c0[2] = amd_bytealign_S (w3[2], w3[3], offset); - c0[1] = amd_bytealign_S (w3[1], w3[2], offset); - c0[0] = amd_bytealign_S (w3[0], w3[1], offset); - w3[3] = amd_bytealign_S (w2[3], w3[0], offset); - w3[2] = amd_bytealign_S (w2[2], w2[3], offset); - w3[1] = amd_bytealign_S (w2[1], w2[2], offset); - w3[0] = amd_bytealign_S (w2[0], w2[1], offset); - w2[3] = amd_bytealign_S (w1[3], w2[0], offset); - w2[2] = amd_bytealign_S (w1[2], w1[3], offset); - w2[1] = amd_bytealign_S (w1[1], w1[2], offset); - w2[0] = amd_bytealign_S (w1[0], w1[1], offset); - w1[3] = amd_bytealign_S (w0[3], w1[0], offset); - w1[2] = amd_bytealign_S (w0[2], w0[3], offset); - w1[1] = amd_bytealign_S (w0[1], w0[2], offset); - w1[0] = amd_bytealign_S (w0[0], w0[1], offset); - w0[3] = amd_bytealign_S ( 0, w0[0], offset); + c0[3] = hc_bytealign_S (w3[3], 0, offset); + c0[2] = hc_bytealign_S (w3[2], w3[3], offset); + c0[1] = hc_bytealign_S (w3[1], w3[2], offset); + c0[0] = hc_bytealign_S (w3[0], w3[1], offset); + w3[3] = hc_bytealign_S (w2[3], w3[0], offset); + w3[2] = hc_bytealign_S (w2[2], w2[3], offset); + w3[1] = hc_bytealign_S (w2[1], w2[2], offset); + w3[0] = hc_bytealign_S (w2[0], w2[1], offset); + w2[3] = hc_bytealign_S (w1[3], w2[0], offset); + w2[2] = hc_bytealign_S (w1[2], w1[3], offset); + w2[1] = hc_bytealign_S (w1[1], w1[2], offset); + w2[0] = hc_bytealign_S (w1[0], w1[1], offset); + w1[3] = hc_bytealign_S (w0[3], w1[0], offset); + w1[2] = hc_bytealign_S (w0[2], w0[3], offset); + w1[1] = hc_bytealign_S (w0[1], w0[2], offset); + w1[0] = hc_bytealign_S (w0[0], w0[1], offset); + w0[3] = hc_bytealign_S ( 0, w0[0], offset); w0[2] = 0; w0[1] = 0; w0[0] = 0; @@ -32021,23 +32021,23 @@ DECLSPEC void switch_buffer_by_offset_carry_le_S (u32 *w0, u32 *w1, u32 *w2, u32 break; case 4: - c1[0] = amd_bytealign_S (w3[3], 0, offset); - c0[3] = amd_bytealign_S (w3[2], w3[3], offset); - c0[2] = amd_bytealign_S (w3[1], w3[2], offset); - c0[1] = amd_bytealign_S (w3[0], w3[1], offset); - c0[0] = amd_bytealign_S (w2[3], w3[0], offset); - w3[3] = amd_bytealign_S (w2[2], w2[3], offset); - w3[2] = amd_bytealign_S (w2[1], w2[2], offset); - w3[1] = amd_bytealign_S (w2[0], w2[1], offset); - w3[0] = amd_bytealign_S (w1[3], w2[0], offset); - w2[3] = amd_bytealign_S (w1[2], w1[3], offset); - w2[2] = amd_bytealign_S (w1[1], w1[2], offset); - w2[1] = amd_bytealign_S (w1[0], w1[1], offset); - w2[0] = amd_bytealign_S (w0[3], w1[0], offset); - w1[3] = amd_bytealign_S (w0[2], w0[3], offset); - w1[2] = amd_bytealign_S (w0[1], w0[2], offset); - w1[1] = amd_bytealign_S (w0[0], w0[1], offset); - w1[0] = amd_bytealign_S ( 0, w0[0], offset); + c1[0] = hc_bytealign_S (w3[3], 0, offset); + c0[3] = hc_bytealign_S (w3[2], w3[3], offset); + c0[2] = hc_bytealign_S (w3[1], w3[2], offset); + c0[1] = hc_bytealign_S (w3[0], w3[1], offset); + c0[0] = hc_bytealign_S (w2[3], w3[0], offset); + w3[3] = hc_bytealign_S (w2[2], w2[3], offset); + w3[2] = hc_bytealign_S (w2[1], w2[2], offset); + w3[1] = hc_bytealign_S (w2[0], w2[1], offset); + w3[0] = hc_bytealign_S (w1[3], w2[0], offset); + w2[3] = hc_bytealign_S (w1[2], w1[3], offset); + w2[2] = hc_bytealign_S (w1[1], w1[2], offset); + w2[1] = hc_bytealign_S (w1[0], w1[1], offset); + w2[0] = hc_bytealign_S (w0[3], w1[0], offset); + w1[3] = hc_bytealign_S (w0[2], w0[3], offset); + w1[2] = hc_bytealign_S (w0[1], w0[2], offset); + w1[1] = hc_bytealign_S (w0[0], w0[1], offset); + w1[0] = hc_bytealign_S ( 0, w0[0], offset); w0[3] = 0; w0[2] = 0; w0[1] = 0; @@ -32046,23 +32046,23 @@ DECLSPEC void switch_buffer_by_offset_carry_le_S (u32 *w0, u32 *w1, u32 *w2, u32 break; case 5: - c1[1] = amd_bytealign_S (w3[3], 0, offset); - c1[0] = amd_bytealign_S (w3[2], w3[3], offset); - c0[3] = amd_bytealign_S (w3[1], w3[2], offset); - c0[2] = amd_bytealign_S (w3[0], w3[1], offset); - c0[1] = amd_bytealign_S (w2[3], w3[0], offset); - c0[0] = amd_bytealign_S (w2[2], w2[3], offset); - w3[3] = amd_bytealign_S (w2[1], w2[2], offset); - w3[2] = amd_bytealign_S (w2[0], w2[1], offset); - w3[1] = amd_bytealign_S (w1[3], w2[0], offset); - w3[0] = amd_bytealign_S (w1[2], w1[3], offset); - w2[3] = amd_bytealign_S (w1[1], w1[2], offset); - w2[2] = amd_bytealign_S (w1[0], w1[1], offset); - w2[1] = amd_bytealign_S (w0[3], w1[0], offset); - w2[0] = amd_bytealign_S (w0[2], w0[3], offset); - w1[3] = amd_bytealign_S (w0[1], w0[2], offset); - w1[2] = amd_bytealign_S (w0[0], w0[1], offset); - w1[1] = amd_bytealign_S ( 0, w0[0], offset); + c1[1] = hc_bytealign_S (w3[3], 0, offset); + c1[0] = hc_bytealign_S (w3[2], w3[3], offset); + c0[3] = hc_bytealign_S (w3[1], w3[2], offset); + c0[2] = hc_bytealign_S (w3[0], w3[1], offset); + c0[1] = hc_bytealign_S (w2[3], w3[0], offset); + c0[0] = hc_bytealign_S (w2[2], w2[3], offset); + w3[3] = hc_bytealign_S (w2[1], w2[2], offset); + w3[2] = hc_bytealign_S (w2[0], w2[1], offset); + w3[1] = hc_bytealign_S (w1[3], w2[0], offset); + w3[0] = hc_bytealign_S (w1[2], w1[3], offset); + w2[3] = hc_bytealign_S (w1[1], w1[2], offset); + w2[2] = hc_bytealign_S (w1[0], w1[1], offset); + w2[1] = hc_bytealign_S (w0[3], w1[0], offset); + w2[0] = hc_bytealign_S (w0[2], w0[3], offset); + w1[3] = hc_bytealign_S (w0[1], w0[2], offset); + w1[2] = hc_bytealign_S (w0[0], w0[1], offset); + w1[1] = hc_bytealign_S ( 0, w0[0], offset); w1[0] = 0; w0[3] = 0; w0[2] = 0; @@ -32072,23 +32072,23 @@ DECLSPEC void switch_buffer_by_offset_carry_le_S (u32 *w0, u32 *w1, u32 *w2, u32 break; case 6: - c1[2] = amd_bytealign_S (w3[3], 0, offset); - c1[1] = amd_bytealign_S (w3[2], w3[3], offset); - c1[0] = amd_bytealign_S (w3[1], w3[2], offset); - c0[3] = amd_bytealign_S (w3[0], w3[1], offset); - c0[2] = amd_bytealign_S (w2[3], w3[0], offset); - c0[1] = amd_bytealign_S (w2[2], w2[3], offset); - c0[0] = amd_bytealign_S (w2[1], w2[2], offset); - w3[3] = amd_bytealign_S (w2[0], w2[1], offset); - w3[2] = amd_bytealign_S (w1[3], w2[0], offset); - w3[1] = amd_bytealign_S (w1[2], w1[3], offset); - w3[0] = amd_bytealign_S (w1[1], w1[2], offset); - w2[3] = amd_bytealign_S (w1[0], w1[1], offset); - w2[2] = amd_bytealign_S (w0[3], w1[0], offset); - w2[1] = amd_bytealign_S (w0[2], w0[3], offset); - w2[0] = amd_bytealign_S (w0[1], w0[2], offset); - w1[3] = amd_bytealign_S (w0[0], w0[1], offset); - w1[2] = amd_bytealign_S ( 0, w0[0], offset); + c1[2] = hc_bytealign_S (w3[3], 0, offset); + c1[1] = hc_bytealign_S (w3[2], w3[3], offset); + c1[0] = hc_bytealign_S (w3[1], w3[2], offset); + c0[3] = hc_bytealign_S (w3[0], w3[1], offset); + c0[2] = hc_bytealign_S (w2[3], w3[0], offset); + c0[1] = hc_bytealign_S (w2[2], w2[3], offset); + c0[0] = hc_bytealign_S (w2[1], w2[2], offset); + w3[3] = hc_bytealign_S (w2[0], w2[1], offset); + w3[2] = hc_bytealign_S (w1[3], w2[0], offset); + w3[1] = hc_bytealign_S (w1[2], w1[3], offset); + w3[0] = hc_bytealign_S (w1[1], w1[2], offset); + w2[3] = hc_bytealign_S (w1[0], w1[1], offset); + w2[2] = hc_bytealign_S (w0[3], w1[0], offset); + w2[1] = hc_bytealign_S (w0[2], w0[3], offset); + w2[0] = hc_bytealign_S (w0[1], w0[2], offset); + w1[3] = hc_bytealign_S (w0[0], w0[1], offset); + w1[2] = hc_bytealign_S ( 0, w0[0], offset); w1[1] = 0; w1[0] = 0; w0[3] = 0; @@ -32099,23 +32099,23 @@ DECLSPEC void switch_buffer_by_offset_carry_le_S (u32 *w0, u32 *w1, u32 *w2, u32 break; case 7: - c1[3] = amd_bytealign_S (w3[3], 0, offset); - c1[2] = amd_bytealign_S (w3[2], w3[3], offset); - c1[1] = amd_bytealign_S (w3[1], w3[2], offset); - c1[0] = amd_bytealign_S (w3[0], w3[1], offset); - c0[3] = amd_bytealign_S (w2[3], w3[0], offset); - c0[2] = amd_bytealign_S (w2[2], w2[3], offset); - c0[1] = amd_bytealign_S (w2[1], w2[2], offset); - c0[0] = amd_bytealign_S (w2[0], w2[1], offset); - w3[3] = amd_bytealign_S (w1[3], w2[0], offset); - w3[2] = amd_bytealign_S (w1[2], w1[3], offset); - w3[1] = amd_bytealign_S (w1[1], w1[2], offset); - w3[0] = amd_bytealign_S (w1[0], w1[1], offset); - w2[3] = amd_bytealign_S (w0[3], w1[0], offset); - w2[2] = amd_bytealign_S (w0[2], w0[3], offset); - w2[1] = amd_bytealign_S (w0[1], w0[2], offset); - w2[0] = amd_bytealign_S (w0[0], w0[1], offset); - w1[3] = amd_bytealign_S ( 0, w0[0], offset); + c1[3] = hc_bytealign_S (w3[3], 0, offset); + c1[2] = hc_bytealign_S (w3[2], w3[3], offset); + c1[1] = hc_bytealign_S (w3[1], w3[2], offset); + c1[0] = hc_bytealign_S (w3[0], w3[1], offset); + c0[3] = hc_bytealign_S (w2[3], w3[0], offset); + c0[2] = hc_bytealign_S (w2[2], w2[3], offset); + c0[1] = hc_bytealign_S (w2[1], w2[2], offset); + c0[0] = hc_bytealign_S (w2[0], w2[1], offset); + w3[3] = hc_bytealign_S (w1[3], w2[0], offset); + w3[2] = hc_bytealign_S (w1[2], w1[3], offset); + w3[1] = hc_bytealign_S (w1[1], w1[2], offset); + w3[0] = hc_bytealign_S (w1[0], w1[1], offset); + w2[3] = hc_bytealign_S (w0[3], w1[0], offset); + w2[2] = hc_bytealign_S (w0[2], w0[3], offset); + w2[1] = hc_bytealign_S (w0[1], w0[2], offset); + w2[0] = hc_bytealign_S (w0[0], w0[1], offset); + w1[3] = hc_bytealign_S ( 0, w0[0], offset); w1[2] = 0; w1[1] = 0; w1[0] = 0; @@ -32127,23 +32127,23 @@ DECLSPEC void switch_buffer_by_offset_carry_le_S (u32 *w0, u32 *w1, u32 *w2, u32 break; case 8: - c2[0] = amd_bytealign_S (w3[3], 0, offset); - c1[3] = amd_bytealign_S (w3[2], w3[3], offset); - c1[2] = amd_bytealign_S (w3[1], w3[2], offset); - c1[1] = amd_bytealign_S (w3[0], w3[1], offset); - c1[0] = amd_bytealign_S (w2[3], w3[0], offset); - c0[3] = amd_bytealign_S (w2[2], w2[3], offset); - c0[2] = amd_bytealign_S (w2[1], w2[2], offset); - c0[1] = amd_bytealign_S (w2[0], w2[1], offset); - c0[0] = amd_bytealign_S (w1[3], w2[0], offset); - w3[3] = amd_bytealign_S (w1[2], w1[3], offset); - w3[2] = amd_bytealign_S (w1[1], w1[2], offset); - w3[1] = amd_bytealign_S (w1[0], w1[1], offset); - w3[0] = amd_bytealign_S (w0[3], w1[0], offset); - w2[3] = amd_bytealign_S (w0[2], w0[3], offset); - w2[2] = amd_bytealign_S (w0[1], w0[2], offset); - w2[1] = amd_bytealign_S (w0[0], w0[1], offset); - w2[0] = amd_bytealign_S ( 0, w0[0], offset); + c2[0] = hc_bytealign_S (w3[3], 0, offset); + c1[3] = hc_bytealign_S (w3[2], w3[3], offset); + c1[2] = hc_bytealign_S (w3[1], w3[2], offset); + c1[1] = hc_bytealign_S (w3[0], w3[1], offset); + c1[0] = hc_bytealign_S (w2[3], w3[0], offset); + c0[3] = hc_bytealign_S (w2[2], w2[3], offset); + c0[2] = hc_bytealign_S (w2[1], w2[2], offset); + c0[1] = hc_bytealign_S (w2[0], w2[1], offset); + c0[0] = hc_bytealign_S (w1[3], w2[0], offset); + w3[3] = hc_bytealign_S (w1[2], w1[3], offset); + w3[2] = hc_bytealign_S (w1[1], w1[2], offset); + w3[1] = hc_bytealign_S (w1[0], w1[1], offset); + w3[0] = hc_bytealign_S (w0[3], w1[0], offset); + w2[3] = hc_bytealign_S (w0[2], w0[3], offset); + w2[2] = hc_bytealign_S (w0[1], w0[2], offset); + w2[1] = hc_bytealign_S (w0[0], w0[1], offset); + w2[0] = hc_bytealign_S ( 0, w0[0], offset); w1[3] = 0; w1[2] = 0; w1[1] = 0; @@ -32156,23 +32156,23 @@ DECLSPEC void switch_buffer_by_offset_carry_le_S (u32 *w0, u32 *w1, u32 *w2, u32 break; case 9: - c2[1] = amd_bytealign_S (w3[3], 0, offset); - c2[0] = amd_bytealign_S (w3[2], w3[3], offset); - c1[3] = amd_bytealign_S (w3[1], w3[2], offset); - c1[2] = amd_bytealign_S (w3[0], w3[1], offset); - c1[1] = amd_bytealign_S (w2[3], w3[0], offset); - c1[0] = amd_bytealign_S (w2[2], w2[3], offset); - c0[3] = amd_bytealign_S (w2[1], w2[2], offset); - c0[2] = amd_bytealign_S (w2[0], w2[1], offset); - c0[1] = amd_bytealign_S (w1[3], w2[0], offset); - c0[0] = amd_bytealign_S (w1[2], w1[3], offset); - w3[3] = amd_bytealign_S (w1[1], w1[2], offset); - w3[2] = amd_bytealign_S (w1[0], w1[1], offset); - w3[1] = amd_bytealign_S (w0[3], w1[0], offset); - w3[0] = amd_bytealign_S (w0[2], w0[3], offset); - w2[3] = amd_bytealign_S (w0[1], w0[2], offset); - w2[2] = amd_bytealign_S (w0[0], w0[1], offset); - w2[1] = amd_bytealign_S ( 0, w0[0], offset); + c2[1] = hc_bytealign_S (w3[3], 0, offset); + c2[0] = hc_bytealign_S (w3[2], w3[3], offset); + c1[3] = hc_bytealign_S (w3[1], w3[2], offset); + c1[2] = hc_bytealign_S (w3[0], w3[1], offset); + c1[1] = hc_bytealign_S (w2[3], w3[0], offset); + c1[0] = hc_bytealign_S (w2[2], w2[3], offset); + c0[3] = hc_bytealign_S (w2[1], w2[2], offset); + c0[2] = hc_bytealign_S (w2[0], w2[1], offset); + c0[1] = hc_bytealign_S (w1[3], w2[0], offset); + c0[0] = hc_bytealign_S (w1[2], w1[3], offset); + w3[3] = hc_bytealign_S (w1[1], w1[2], offset); + w3[2] = hc_bytealign_S (w1[0], w1[1], offset); + w3[1] = hc_bytealign_S (w0[3], w1[0], offset); + w3[0] = hc_bytealign_S (w0[2], w0[3], offset); + w2[3] = hc_bytealign_S (w0[1], w0[2], offset); + w2[2] = hc_bytealign_S (w0[0], w0[1], offset); + w2[1] = hc_bytealign_S ( 0, w0[0], offset); w2[0] = 0; w1[3] = 0; w1[2] = 0; @@ -32186,23 +32186,23 @@ DECLSPEC void switch_buffer_by_offset_carry_le_S (u32 *w0, u32 *w1, u32 *w2, u32 break; case 10: - c2[2] = amd_bytealign_S (w3[3], 0, offset); - c2[1] = amd_bytealign_S (w3[2], w3[3], offset); - c2[0] = amd_bytealign_S (w3[1], w3[2], offset); - c1[3] = amd_bytealign_S (w3[0], w3[1], offset); - c1[2] = amd_bytealign_S (w2[3], w3[0], offset); - c1[1] = amd_bytealign_S (w2[2], w2[3], offset); - c1[0] = amd_bytealign_S (w2[1], w2[2], offset); - c0[3] = amd_bytealign_S (w2[0], w2[1], offset); - c0[2] = amd_bytealign_S (w1[3], w2[0], offset); - c0[1] = amd_bytealign_S (w1[2], w1[3], offset); - c0[0] = amd_bytealign_S (w1[1], w1[2], offset); - w3[3] = amd_bytealign_S (w1[0], w1[1], offset); - w3[2] = amd_bytealign_S (w0[3], w1[0], offset); - w3[1] = amd_bytealign_S (w0[2], w0[3], offset); - w3[0] = amd_bytealign_S (w0[1], w0[2], offset); - w2[3] = amd_bytealign_S (w0[0], w0[1], offset); - w2[2] = amd_bytealign_S ( 0, w0[0], offset); + c2[2] = hc_bytealign_S (w3[3], 0, offset); + c2[1] = hc_bytealign_S (w3[2], w3[3], offset); + c2[0] = hc_bytealign_S (w3[1], w3[2], offset); + c1[3] = hc_bytealign_S (w3[0], w3[1], offset); + c1[2] = hc_bytealign_S (w2[3], w3[0], offset); + c1[1] = hc_bytealign_S (w2[2], w2[3], offset); + c1[0] = hc_bytealign_S (w2[1], w2[2], offset); + c0[3] = hc_bytealign_S (w2[0], w2[1], offset); + c0[2] = hc_bytealign_S (w1[3], w2[0], offset); + c0[1] = hc_bytealign_S (w1[2], w1[3], offset); + c0[0] = hc_bytealign_S (w1[1], w1[2], offset); + w3[3] = hc_bytealign_S (w1[0], w1[1], offset); + w3[2] = hc_bytealign_S (w0[3], w1[0], offset); + w3[1] = hc_bytealign_S (w0[2], w0[3], offset); + w3[0] = hc_bytealign_S (w0[1], w0[2], offset); + w2[3] = hc_bytealign_S (w0[0], w0[1], offset); + w2[2] = hc_bytealign_S ( 0, w0[0], offset); w2[1] = 0; w2[0] = 0; w1[3] = 0; @@ -32217,23 +32217,23 @@ DECLSPEC void switch_buffer_by_offset_carry_le_S (u32 *w0, u32 *w1, u32 *w2, u32 break; case 11: - c2[3] = amd_bytealign_S (w3[3], 0, offset); - c2[2] = amd_bytealign_S (w3[2], w3[3], offset); - c2[1] = amd_bytealign_S (w3[1], w3[2], offset); - c2[0] = amd_bytealign_S (w3[0], w3[1], offset); - c1[3] = amd_bytealign_S (w2[3], w3[0], offset); - c1[2] = amd_bytealign_S (w2[2], w2[3], offset); - c1[1] = amd_bytealign_S (w2[1], w2[2], offset); - c1[0] = amd_bytealign_S (w2[0], w2[1], offset); - c0[3] = amd_bytealign_S (w1[3], w2[0], offset); - c0[2] = amd_bytealign_S (w1[2], w1[3], offset); - c0[1] = amd_bytealign_S (w1[1], w1[2], offset); - c0[0] = amd_bytealign_S (w1[0], w1[1], offset); - w3[3] = amd_bytealign_S (w0[3], w1[0], offset); - w3[2] = amd_bytealign_S (w0[2], w0[3], offset); - w3[1] = amd_bytealign_S (w0[1], w0[2], offset); - w3[0] = amd_bytealign_S (w0[0], w0[1], offset); - w2[3] = amd_bytealign_S ( 0, w0[0], offset); + c2[3] = hc_bytealign_S (w3[3], 0, offset); + c2[2] = hc_bytealign_S (w3[2], w3[3], offset); + c2[1] = hc_bytealign_S (w3[1], w3[2], offset); + c2[0] = hc_bytealign_S (w3[0], w3[1], offset); + c1[3] = hc_bytealign_S (w2[3], w3[0], offset); + c1[2] = hc_bytealign_S (w2[2], w2[3], offset); + c1[1] = hc_bytealign_S (w2[1], w2[2], offset); + c1[0] = hc_bytealign_S (w2[0], w2[1], offset); + c0[3] = hc_bytealign_S (w1[3], w2[0], offset); + c0[2] = hc_bytealign_S (w1[2], w1[3], offset); + c0[1] = hc_bytealign_S (w1[1], w1[2], offset); + c0[0] = hc_bytealign_S (w1[0], w1[1], offset); + w3[3] = hc_bytealign_S (w0[3], w1[0], offset); + w3[2] = hc_bytealign_S (w0[2], w0[3], offset); + w3[1] = hc_bytealign_S (w0[1], w0[2], offset); + w3[0] = hc_bytealign_S (w0[0], w0[1], offset); + w2[3] = hc_bytealign_S ( 0, w0[0], offset); w2[2] = 0; w2[1] = 0; w2[0] = 0; @@ -32249,23 +32249,23 @@ DECLSPEC void switch_buffer_by_offset_carry_le_S (u32 *w0, u32 *w1, u32 *w2, u32 break; case 12: - c3[0] = amd_bytealign_S (w3[3], 0, offset); - c2[3] = amd_bytealign_S (w3[2], w3[3], offset); - c2[2] = amd_bytealign_S (w3[1], w3[2], offset); - c2[1] = amd_bytealign_S (w3[0], w3[1], offset); - c2[0] = amd_bytealign_S (w2[3], w3[0], offset); - c1[3] = amd_bytealign_S (w2[2], w2[3], offset); - c1[2] = amd_bytealign_S (w2[1], w2[2], offset); - c1[1] = amd_bytealign_S (w2[0], w2[1], offset); - c1[0] = amd_bytealign_S (w1[3], w2[0], offset); - c0[3] = amd_bytealign_S (w1[2], w1[3], offset); - c0[2] = amd_bytealign_S (w1[1], w1[2], offset); - c0[1] = amd_bytealign_S (w1[0], w1[1], offset); - c0[0] = amd_bytealign_S (w0[3], w1[0], offset); - w3[3] = amd_bytealign_S (w0[2], w0[3], offset); - w3[2] = amd_bytealign_S (w0[1], w0[2], offset); - w3[1] = amd_bytealign_S (w0[0], w0[1], offset); - w3[0] = amd_bytealign_S ( 0, w0[0], offset); + c3[0] = hc_bytealign_S (w3[3], 0, offset); + c2[3] = hc_bytealign_S (w3[2], w3[3], offset); + c2[2] = hc_bytealign_S (w3[1], w3[2], offset); + c2[1] = hc_bytealign_S (w3[0], w3[1], offset); + c2[0] = hc_bytealign_S (w2[3], w3[0], offset); + c1[3] = hc_bytealign_S (w2[2], w2[3], offset); + c1[2] = hc_bytealign_S (w2[1], w2[2], offset); + c1[1] = hc_bytealign_S (w2[0], w2[1], offset); + c1[0] = hc_bytealign_S (w1[3], w2[0], offset); + c0[3] = hc_bytealign_S (w1[2], w1[3], offset); + c0[2] = hc_bytealign_S (w1[1], w1[2], offset); + c0[1] = hc_bytealign_S (w1[0], w1[1], offset); + c0[0] = hc_bytealign_S (w0[3], w1[0], offset); + w3[3] = hc_bytealign_S (w0[2], w0[3], offset); + w3[2] = hc_bytealign_S (w0[1], w0[2], offset); + w3[1] = hc_bytealign_S (w0[0], w0[1], offset); + w3[0] = hc_bytealign_S ( 0, w0[0], offset); w2[3] = 0; w2[2] = 0; w2[1] = 0; @@ -32282,23 +32282,23 @@ DECLSPEC void switch_buffer_by_offset_carry_le_S (u32 *w0, u32 *w1, u32 *w2, u32 break; case 13: - c3[1] = amd_bytealign_S (w3[3], 0, offset); - c3[0] = amd_bytealign_S (w3[2], w3[3], offset); - c2[3] = amd_bytealign_S (w3[1], w3[2], offset); - c2[2] = amd_bytealign_S (w3[0], w3[1], offset); - c2[1] = amd_bytealign_S (w2[3], w3[0], offset); - c2[0] = amd_bytealign_S (w2[2], w2[3], offset); - c1[3] = amd_bytealign_S (w2[1], w2[2], offset); - c1[2] = amd_bytealign_S (w2[0], w2[1], offset); - c1[1] = amd_bytealign_S (w1[3], w2[0], offset); - c1[0] = amd_bytealign_S (w1[2], w1[3], offset); - c0[3] = amd_bytealign_S (w1[1], w1[2], offset); - c0[2] = amd_bytealign_S (w1[0], w1[1], offset); - c0[1] = amd_bytealign_S (w0[3], w1[0], offset); - c0[0] = amd_bytealign_S (w0[2], w0[3], offset); - w3[3] = amd_bytealign_S (w0[1], w0[2], offset); - w3[2] = amd_bytealign_S (w0[0], w0[1], offset); - w3[1] = amd_bytealign_S ( 0, w0[0], offset); + c3[1] = hc_bytealign_S (w3[3], 0, offset); + c3[0] = hc_bytealign_S (w3[2], w3[3], offset); + c2[3] = hc_bytealign_S (w3[1], w3[2], offset); + c2[2] = hc_bytealign_S (w3[0], w3[1], offset); + c2[1] = hc_bytealign_S (w2[3], w3[0], offset); + c2[0] = hc_bytealign_S (w2[2], w2[3], offset); + c1[3] = hc_bytealign_S (w2[1], w2[2], offset); + c1[2] = hc_bytealign_S (w2[0], w2[1], offset); + c1[1] = hc_bytealign_S (w1[3], w2[0], offset); + c1[0] = hc_bytealign_S (w1[2], w1[3], offset); + c0[3] = hc_bytealign_S (w1[1], w1[2], offset); + c0[2] = hc_bytealign_S (w1[0], w1[1], offset); + c0[1] = hc_bytealign_S (w0[3], w1[0], offset); + c0[0] = hc_bytealign_S (w0[2], w0[3], offset); + w3[3] = hc_bytealign_S (w0[1], w0[2], offset); + w3[2] = hc_bytealign_S (w0[0], w0[1], offset); + w3[1] = hc_bytealign_S ( 0, w0[0], offset); w3[0] = 0; w2[3] = 0; w2[2] = 0; @@ -32316,23 +32316,23 @@ DECLSPEC void switch_buffer_by_offset_carry_le_S (u32 *w0, u32 *w1, u32 *w2, u32 break; case 14: - c3[2] = amd_bytealign_S (w3[3], 0, offset); - c3[1] = amd_bytealign_S (w3[2], w3[3], offset); - c3[0] = amd_bytealign_S (w3[1], w3[2], offset); - c2[3] = amd_bytealign_S (w3[0], w3[1], offset); - c2[2] = amd_bytealign_S (w2[3], w3[0], offset); - c2[1] = amd_bytealign_S (w2[2], w2[3], offset); - c2[0] = amd_bytealign_S (w2[1], w2[2], offset); - c1[3] = amd_bytealign_S (w2[0], w2[1], offset); - c1[2] = amd_bytealign_S (w1[3], w2[0], offset); - c1[1] = amd_bytealign_S (w1[2], w1[3], offset); - c1[0] = amd_bytealign_S (w1[1], w1[2], offset); - c0[3] = amd_bytealign_S (w1[0], w1[1], offset); - c0[2] = amd_bytealign_S (w0[3], w1[0], offset); - c0[1] = amd_bytealign_S (w0[2], w0[3], offset); - c0[0] = amd_bytealign_S (w0[1], w0[2], offset); - w3[3] = amd_bytealign_S (w0[0], w0[1], offset); - w3[2] = amd_bytealign_S ( 0, w0[0], offset); + c3[2] = hc_bytealign_S (w3[3], 0, offset); + c3[1] = hc_bytealign_S (w3[2], w3[3], offset); + c3[0] = hc_bytealign_S (w3[1], w3[2], offset); + c2[3] = hc_bytealign_S (w3[0], w3[1], offset); + c2[2] = hc_bytealign_S (w2[3], w3[0], offset); + c2[1] = hc_bytealign_S (w2[2], w2[3], offset); + c2[0] = hc_bytealign_S (w2[1], w2[2], offset); + c1[3] = hc_bytealign_S (w2[0], w2[1], offset); + c1[2] = hc_bytealign_S (w1[3], w2[0], offset); + c1[1] = hc_bytealign_S (w1[2], w1[3], offset); + c1[0] = hc_bytealign_S (w1[1], w1[2], offset); + c0[3] = hc_bytealign_S (w1[0], w1[1], offset); + c0[2] = hc_bytealign_S (w0[3], w1[0], offset); + c0[1] = hc_bytealign_S (w0[2], w0[3], offset); + c0[0] = hc_bytealign_S (w0[1], w0[2], offset); + w3[3] = hc_bytealign_S (w0[0], w0[1], offset); + w3[2] = hc_bytealign_S ( 0, w0[0], offset); w3[1] = 0; w3[0] = 0; w2[3] = 0; @@ -32351,23 +32351,23 @@ DECLSPEC void switch_buffer_by_offset_carry_le_S (u32 *w0, u32 *w1, u32 *w2, u32 break; case 15: - c3[3] = amd_bytealign_S (w3[3], 0, offset); - c3[2] = amd_bytealign_S (w3[2], w3[3], offset); - c3[1] = amd_bytealign_S (w3[1], w3[2], offset); - c3[0] = amd_bytealign_S (w3[0], w3[1], offset); - c2[3] = amd_bytealign_S (w2[3], w3[0], offset); - c2[2] = amd_bytealign_S (w2[2], w2[3], offset); - c2[1] = amd_bytealign_S (w2[1], w2[2], offset); - c2[0] = amd_bytealign_S (w2[0], w2[1], offset); - c1[3] = amd_bytealign_S (w1[3], w2[0], offset); - c1[2] = amd_bytealign_S (w1[2], w1[3], offset); - c1[1] = amd_bytealign_S (w1[1], w1[2], offset); - c1[0] = amd_bytealign_S (w1[0], w1[1], offset); - c0[3] = amd_bytealign_S (w0[3], w1[0], offset); - c0[2] = amd_bytealign_S (w0[2], w0[3], offset); - c0[1] = amd_bytealign_S (w0[1], w0[2], offset); - c0[0] = amd_bytealign_S (w0[0], w0[1], offset); - w3[3] = amd_bytealign_S ( 0, w0[0], offset); + c3[3] = hc_bytealign_S (w3[3], 0, offset); + c3[2] = hc_bytealign_S (w3[2], w3[3], offset); + c3[1] = hc_bytealign_S (w3[1], w3[2], offset); + c3[0] = hc_bytealign_S (w3[0], w3[1], offset); + c2[3] = hc_bytealign_S (w2[3], w3[0], offset); + c2[2] = hc_bytealign_S (w2[2], w2[3], offset); + c2[1] = hc_bytealign_S (w2[1], w2[2], offset); + c2[0] = hc_bytealign_S (w2[0], w2[1], offset); + c1[3] = hc_bytealign_S (w1[3], w2[0], offset); + c1[2] = hc_bytealign_S (w1[2], w1[3], offset); + c1[1] = hc_bytealign_S (w1[1], w1[2], offset); + c1[0] = hc_bytealign_S (w1[0], w1[1], offset); + c0[3] = hc_bytealign_S (w0[3], w1[0], offset); + c0[2] = hc_bytealign_S (w0[2], w0[3], offset); + c0[1] = hc_bytealign_S (w0[1], w0[2], offset); + c0[0] = hc_bytealign_S (w0[0], w0[1], offset); + w3[3] = hc_bytealign_S ( 0, w0[0], offset); w3[2] = 0; w3[1] = 0; w3[0] = 0; @@ -32426,23 +32426,23 @@ DECLSPEC void switch_buffer_by_offset_carry_le_S (u32 *w0, u32 *w1, u32 *w2, u32 switch (offset_switch) { case 0: - c0[0] = amd_bytealign_S ( 0, w3[3], offset_minus_4); - w3[3] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); - w3[2] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); - w3[1] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); - w3[0] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); - w2[3] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); - w2[2] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); - w2[1] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - w2[0] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - w1[3] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w1[2] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w1[1] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w1[0] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w0[3] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w0[2] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w0[1] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w0[0] = amd_bytealign_S (w0[0], 0, offset_minus_4); + c0[0] = hc_bytealign_S ( 0, w3[3], offset_minus_4); + w3[3] = hc_bytealign_S (w3[3], w3[2], offset_minus_4); + w3[2] = hc_bytealign_S (w3[2], w3[1], offset_minus_4); + w3[1] = hc_bytealign_S (w3[1], w3[0], offset_minus_4); + w3[0] = hc_bytealign_S (w3[0], w2[3], offset_minus_4); + w2[3] = hc_bytealign_S (w2[3], w2[2], offset_minus_4); + w2[2] = hc_bytealign_S (w2[2], w2[1], offset_minus_4); + w2[1] = hc_bytealign_S (w2[1], w2[0], offset_minus_4); + w2[0] = hc_bytealign_S (w2[0], w1[3], offset_minus_4); + w1[3] = hc_bytealign_S (w1[3], w1[2], offset_minus_4); + w1[2] = hc_bytealign_S (w1[2], w1[1], offset_minus_4); + w1[1] = hc_bytealign_S (w1[1], w1[0], offset_minus_4); + w1[0] = hc_bytealign_S (w1[0], w0[3], offset_minus_4); + w0[3] = hc_bytealign_S (w0[3], w0[2], offset_minus_4); + w0[2] = hc_bytealign_S (w0[2], w0[1], offset_minus_4); + w0[1] = hc_bytealign_S (w0[1], w0[0], offset_minus_4); + w0[0] = hc_bytealign_S (w0[0], 0, offset_minus_4); if (offset_mod_4 == 0) { @@ -32468,23 +32468,23 @@ DECLSPEC void switch_buffer_by_offset_carry_le_S (u32 *w0, u32 *w1, u32 *w2, u32 break; case 1: - c0[1] = amd_bytealign_S ( 0, w3[3], offset_minus_4); - c0[0] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); - w3[3] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); - w3[2] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); - w3[1] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); - w3[0] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); - w2[3] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); - w2[2] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - w2[1] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - w2[0] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w1[3] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w1[2] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w1[1] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w1[0] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w0[3] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w0[2] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w0[1] = amd_bytealign_S (w0[0], 0, offset_minus_4); + c0[1] = hc_bytealign_S ( 0, w3[3], offset_minus_4); + c0[0] = hc_bytealign_S (w3[3], w3[2], offset_minus_4); + w3[3] = hc_bytealign_S (w3[2], w3[1], offset_minus_4); + w3[2] = hc_bytealign_S (w3[1], w3[0], offset_minus_4); + w3[1] = hc_bytealign_S (w3[0], w2[3], offset_minus_4); + w3[0] = hc_bytealign_S (w2[3], w2[2], offset_minus_4); + w2[3] = hc_bytealign_S (w2[2], w2[1], offset_minus_4); + w2[2] = hc_bytealign_S (w2[1], w2[0], offset_minus_4); + w2[1] = hc_bytealign_S (w2[0], w1[3], offset_minus_4); + w2[0] = hc_bytealign_S (w1[3], w1[2], offset_minus_4); + w1[3] = hc_bytealign_S (w1[2], w1[1], offset_minus_4); + w1[2] = hc_bytealign_S (w1[1], w1[0], offset_minus_4); + w1[1] = hc_bytealign_S (w1[0], w0[3], offset_minus_4); + w1[0] = hc_bytealign_S (w0[3], w0[2], offset_minus_4); + w0[3] = hc_bytealign_S (w0[2], w0[1], offset_minus_4); + w0[2] = hc_bytealign_S (w0[1], w0[0], offset_minus_4); + w0[1] = hc_bytealign_S (w0[0], 0, offset_minus_4); w0[0] = 0; if (offset_mod_4 == 0) @@ -32511,23 +32511,23 @@ DECLSPEC void switch_buffer_by_offset_carry_le_S (u32 *w0, u32 *w1, u32 *w2, u32 break; case 2: - c0[2] = amd_bytealign_S ( 0, w3[3], offset_minus_4); - c0[1] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); - c0[0] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); - w3[3] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); - w3[2] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); - w3[1] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); - w3[0] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); - w2[3] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - w2[2] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - w2[1] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w2[0] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w1[3] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w1[2] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w1[1] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w1[0] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w0[3] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w0[2] = amd_bytealign_S (w0[0], 0, offset_minus_4); + c0[2] = hc_bytealign_S ( 0, w3[3], offset_minus_4); + c0[1] = hc_bytealign_S (w3[3], w3[2], offset_minus_4); + c0[0] = hc_bytealign_S (w3[2], w3[1], offset_minus_4); + w3[3] = hc_bytealign_S (w3[1], w3[0], offset_minus_4); + w3[2] = hc_bytealign_S (w3[0], w2[3], offset_minus_4); + w3[1] = hc_bytealign_S (w2[3], w2[2], offset_minus_4); + w3[0] = hc_bytealign_S (w2[2], w2[1], offset_minus_4); + w2[3] = hc_bytealign_S (w2[1], w2[0], offset_minus_4); + w2[2] = hc_bytealign_S (w2[0], w1[3], offset_minus_4); + w2[1] = hc_bytealign_S (w1[3], w1[2], offset_minus_4); + w2[0] = hc_bytealign_S (w1[2], w1[1], offset_minus_4); + w1[3] = hc_bytealign_S (w1[1], w1[0], offset_minus_4); + w1[2] = hc_bytealign_S (w1[0], w0[3], offset_minus_4); + w1[1] = hc_bytealign_S (w0[3], w0[2], offset_minus_4); + w1[0] = hc_bytealign_S (w0[2], w0[1], offset_minus_4); + w0[3] = hc_bytealign_S (w0[1], w0[0], offset_minus_4); + w0[2] = hc_bytealign_S (w0[0], 0, offset_minus_4); w0[1] = 0; w0[0] = 0; @@ -32555,23 +32555,23 @@ DECLSPEC void switch_buffer_by_offset_carry_le_S (u32 *w0, u32 *w1, u32 *w2, u32 break; case 3: - c0[3] = amd_bytealign_S ( 0, w3[3], offset_minus_4); - c0[2] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); - c0[1] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); - c0[0] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); - w3[3] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); - w3[2] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); - w3[1] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); - w3[0] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - w2[3] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - w2[2] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w2[1] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w2[0] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w1[3] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w1[2] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w1[1] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w1[0] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w0[3] = amd_bytealign_S (w0[0], 0, offset_minus_4); + c0[3] = hc_bytealign_S ( 0, w3[3], offset_minus_4); + c0[2] = hc_bytealign_S (w3[3], w3[2], offset_minus_4); + c0[1] = hc_bytealign_S (w3[2], w3[1], offset_minus_4); + c0[0] = hc_bytealign_S (w3[1], w3[0], offset_minus_4); + w3[3] = hc_bytealign_S (w3[0], w2[3], offset_minus_4); + w3[2] = hc_bytealign_S (w2[3], w2[2], offset_minus_4); + w3[1] = hc_bytealign_S (w2[2], w2[1], offset_minus_4); + w3[0] = hc_bytealign_S (w2[1], w2[0], offset_minus_4); + w2[3] = hc_bytealign_S (w2[0], w1[3], offset_minus_4); + w2[2] = hc_bytealign_S (w1[3], w1[2], offset_minus_4); + w2[1] = hc_bytealign_S (w1[2], w1[1], offset_minus_4); + w2[0] = hc_bytealign_S (w1[1], w1[0], offset_minus_4); + w1[3] = hc_bytealign_S (w1[0], w0[3], offset_minus_4); + w1[2] = hc_bytealign_S (w0[3], w0[2], offset_minus_4); + w1[1] = hc_bytealign_S (w0[2], w0[1], offset_minus_4); + w1[0] = hc_bytealign_S (w0[1], w0[0], offset_minus_4); + w0[3] = hc_bytealign_S (w0[0], 0, offset_minus_4); w0[2] = 0; w0[1] = 0; w0[0] = 0; @@ -32600,23 +32600,23 @@ DECLSPEC void switch_buffer_by_offset_carry_le_S (u32 *w0, u32 *w1, u32 *w2, u32 break; case 4: - c1[0] = amd_bytealign_S ( 0, w3[3], offset_minus_4); - c0[3] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); - c0[2] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); - c0[1] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); - c0[0] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); - w3[3] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); - w3[2] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); - w3[1] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - w3[0] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - w2[3] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w2[2] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w2[1] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w2[0] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w1[3] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w1[2] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w1[1] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w1[0] = amd_bytealign_S (w0[0], 0, offset_minus_4); + c1[0] = hc_bytealign_S ( 0, w3[3], offset_minus_4); + c0[3] = hc_bytealign_S (w3[3], w3[2], offset_minus_4); + c0[2] = hc_bytealign_S (w3[2], w3[1], offset_minus_4); + c0[1] = hc_bytealign_S (w3[1], w3[0], offset_minus_4); + c0[0] = hc_bytealign_S (w3[0], w2[3], offset_minus_4); + w3[3] = hc_bytealign_S (w2[3], w2[2], offset_minus_4); + w3[2] = hc_bytealign_S (w2[2], w2[1], offset_minus_4); + w3[1] = hc_bytealign_S (w2[1], w2[0], offset_minus_4); + w3[0] = hc_bytealign_S (w2[0], w1[3], offset_minus_4); + w2[3] = hc_bytealign_S (w1[3], w1[2], offset_minus_4); + w2[2] = hc_bytealign_S (w1[2], w1[1], offset_minus_4); + w2[1] = hc_bytealign_S (w1[1], w1[0], offset_minus_4); + w2[0] = hc_bytealign_S (w1[0], w0[3], offset_minus_4); + w1[3] = hc_bytealign_S (w0[3], w0[2], offset_minus_4); + w1[2] = hc_bytealign_S (w0[2], w0[1], offset_minus_4); + w1[1] = hc_bytealign_S (w0[1], w0[0], offset_minus_4); + w1[0] = hc_bytealign_S (w0[0], 0, offset_minus_4); w0[3] = 0; w0[2] = 0; w0[1] = 0; @@ -32646,23 +32646,23 @@ DECLSPEC void switch_buffer_by_offset_carry_le_S (u32 *w0, u32 *w1, u32 *w2, u32 break; case 5: - c1[1] = amd_bytealign_S ( 0, w3[3], offset_minus_4); - c1[0] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); - c0[3] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); - c0[2] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); - c0[1] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); - c0[0] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); - w3[3] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); - w3[2] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - w3[1] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - w3[0] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w2[3] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w2[2] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w2[1] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w2[0] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w1[3] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w1[2] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w1[1] = amd_bytealign_S (w0[0], 0, offset_minus_4); + c1[1] = hc_bytealign_S ( 0, w3[3], offset_minus_4); + c1[0] = hc_bytealign_S (w3[3], w3[2], offset_minus_4); + c0[3] = hc_bytealign_S (w3[2], w3[1], offset_minus_4); + c0[2] = hc_bytealign_S (w3[1], w3[0], offset_minus_4); + c0[1] = hc_bytealign_S (w3[0], w2[3], offset_minus_4); + c0[0] = hc_bytealign_S (w2[3], w2[2], offset_minus_4); + w3[3] = hc_bytealign_S (w2[2], w2[1], offset_minus_4); + w3[2] = hc_bytealign_S (w2[1], w2[0], offset_minus_4); + w3[1] = hc_bytealign_S (w2[0], w1[3], offset_minus_4); + w3[0] = hc_bytealign_S (w1[3], w1[2], offset_minus_4); + w2[3] = hc_bytealign_S (w1[2], w1[1], offset_minus_4); + w2[2] = hc_bytealign_S (w1[1], w1[0], offset_minus_4); + w2[1] = hc_bytealign_S (w1[0], w0[3], offset_minus_4); + w2[0] = hc_bytealign_S (w0[3], w0[2], offset_minus_4); + w1[3] = hc_bytealign_S (w0[2], w0[1], offset_minus_4); + w1[2] = hc_bytealign_S (w0[1], w0[0], offset_minus_4); + w1[1] = hc_bytealign_S (w0[0], 0, offset_minus_4); w1[0] = 0; w0[3] = 0; w0[2] = 0; @@ -32693,23 +32693,23 @@ DECLSPEC void switch_buffer_by_offset_carry_le_S (u32 *w0, u32 *w1, u32 *w2, u32 break; case 6: - c1[2] = amd_bytealign_S ( 0, w3[3], offset_minus_4); - c1[1] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); - c1[0] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); - c0[3] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); - c0[2] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); - c0[1] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); - c0[0] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); - w3[3] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - w3[2] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - w3[1] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w3[0] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w2[3] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w2[2] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w2[1] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w2[0] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w1[3] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w1[2] = amd_bytealign_S (w0[0], 0, offset_minus_4); + c1[2] = hc_bytealign_S ( 0, w3[3], offset_minus_4); + c1[1] = hc_bytealign_S (w3[3], w3[2], offset_minus_4); + c1[0] = hc_bytealign_S (w3[2], w3[1], offset_minus_4); + c0[3] = hc_bytealign_S (w3[1], w3[0], offset_minus_4); + c0[2] = hc_bytealign_S (w3[0], w2[3], offset_minus_4); + c0[1] = hc_bytealign_S (w2[3], w2[2], offset_minus_4); + c0[0] = hc_bytealign_S (w2[2], w2[1], offset_minus_4); + w3[3] = hc_bytealign_S (w2[1], w2[0], offset_minus_4); + w3[2] = hc_bytealign_S (w2[0], w1[3], offset_minus_4); + w3[1] = hc_bytealign_S (w1[3], w1[2], offset_minus_4); + w3[0] = hc_bytealign_S (w1[2], w1[1], offset_minus_4); + w2[3] = hc_bytealign_S (w1[1], w1[0], offset_minus_4); + w2[2] = hc_bytealign_S (w1[0], w0[3], offset_minus_4); + w2[1] = hc_bytealign_S (w0[3], w0[2], offset_minus_4); + w2[0] = hc_bytealign_S (w0[2], w0[1], offset_minus_4); + w1[3] = hc_bytealign_S (w0[1], w0[0], offset_minus_4); + w1[2] = hc_bytealign_S (w0[0], 0, offset_minus_4); w1[1] = 0; w1[0] = 0; w0[3] = 0; @@ -32741,23 +32741,23 @@ DECLSPEC void switch_buffer_by_offset_carry_le_S (u32 *w0, u32 *w1, u32 *w2, u32 break; case 7: - c1[3] = amd_bytealign_S ( 0, w3[3], offset_minus_4); - c1[2] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); - c1[1] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); - c1[0] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); - c0[3] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); - c0[2] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); - c0[1] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); - c0[0] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - w3[3] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - w3[2] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w3[1] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w3[0] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w2[3] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w2[2] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w2[1] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w2[0] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w1[3] = amd_bytealign_S (w0[0], 0, offset_minus_4); + c1[3] = hc_bytealign_S ( 0, w3[3], offset_minus_4); + c1[2] = hc_bytealign_S (w3[3], w3[2], offset_minus_4); + c1[1] = hc_bytealign_S (w3[2], w3[1], offset_minus_4); + c1[0] = hc_bytealign_S (w3[1], w3[0], offset_minus_4); + c0[3] = hc_bytealign_S (w3[0], w2[3], offset_minus_4); + c0[2] = hc_bytealign_S (w2[3], w2[2], offset_minus_4); + c0[1] = hc_bytealign_S (w2[2], w2[1], offset_minus_4); + c0[0] = hc_bytealign_S (w2[1], w2[0], offset_minus_4); + w3[3] = hc_bytealign_S (w2[0], w1[3], offset_minus_4); + w3[2] = hc_bytealign_S (w1[3], w1[2], offset_minus_4); + w3[1] = hc_bytealign_S (w1[2], w1[1], offset_minus_4); + w3[0] = hc_bytealign_S (w1[1], w1[0], offset_minus_4); + w2[3] = hc_bytealign_S (w1[0], w0[3], offset_minus_4); + w2[2] = hc_bytealign_S (w0[3], w0[2], offset_minus_4); + w2[1] = hc_bytealign_S (w0[2], w0[1], offset_minus_4); + w2[0] = hc_bytealign_S (w0[1], w0[0], offset_minus_4); + w1[3] = hc_bytealign_S (w0[0], 0, offset_minus_4); w1[2] = 0; w1[1] = 0; w1[0] = 0; @@ -32790,23 +32790,23 @@ DECLSPEC void switch_buffer_by_offset_carry_le_S (u32 *w0, u32 *w1, u32 *w2, u32 break; case 8: - c2[0] = amd_bytealign_S ( 0, w3[3], offset_minus_4); - c1[3] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); - c1[2] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); - c1[1] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); - c1[0] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); - c0[3] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); - c0[2] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); - c0[1] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - c0[0] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - w3[3] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w3[2] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w3[1] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w3[0] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w2[3] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w2[2] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w2[1] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w2[0] = amd_bytealign_S (w0[0], 0, offset_minus_4); + c2[0] = hc_bytealign_S ( 0, w3[3], offset_minus_4); + c1[3] = hc_bytealign_S (w3[3], w3[2], offset_minus_4); + c1[2] = hc_bytealign_S (w3[2], w3[1], offset_minus_4); + c1[1] = hc_bytealign_S (w3[1], w3[0], offset_minus_4); + c1[0] = hc_bytealign_S (w3[0], w2[3], offset_minus_4); + c0[3] = hc_bytealign_S (w2[3], w2[2], offset_minus_4); + c0[2] = hc_bytealign_S (w2[2], w2[1], offset_minus_4); + c0[1] = hc_bytealign_S (w2[1], w2[0], offset_minus_4); + c0[0] = hc_bytealign_S (w2[0], w1[3], offset_minus_4); + w3[3] = hc_bytealign_S (w1[3], w1[2], offset_minus_4); + w3[2] = hc_bytealign_S (w1[2], w1[1], offset_minus_4); + w3[1] = hc_bytealign_S (w1[1], w1[0], offset_minus_4); + w3[0] = hc_bytealign_S (w1[0], w0[3], offset_minus_4); + w2[3] = hc_bytealign_S (w0[3], w0[2], offset_minus_4); + w2[2] = hc_bytealign_S (w0[2], w0[1], offset_minus_4); + w2[1] = hc_bytealign_S (w0[1], w0[0], offset_minus_4); + w2[0] = hc_bytealign_S (w0[0], 0, offset_minus_4); w1[3] = 0; w1[2] = 0; w1[1] = 0; @@ -32840,23 +32840,23 @@ DECLSPEC void switch_buffer_by_offset_carry_le_S (u32 *w0, u32 *w1, u32 *w2, u32 break; case 9: - c2[1] = amd_bytealign_S ( 0, w3[3], offset_minus_4); - c2[0] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); - c1[3] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); - c1[2] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); - c1[1] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); - c1[0] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); - c0[3] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); - c0[2] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - c0[1] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - c0[0] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w3[3] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w3[2] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w3[1] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w3[0] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w2[3] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w2[2] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w2[1] = amd_bytealign_S (w0[0], 0, offset_minus_4); + c2[1] = hc_bytealign_S ( 0, w3[3], offset_minus_4); + c2[0] = hc_bytealign_S (w3[3], w3[2], offset_minus_4); + c1[3] = hc_bytealign_S (w3[2], w3[1], offset_minus_4); + c1[2] = hc_bytealign_S (w3[1], w3[0], offset_minus_4); + c1[1] = hc_bytealign_S (w3[0], w2[3], offset_minus_4); + c1[0] = hc_bytealign_S (w2[3], w2[2], offset_minus_4); + c0[3] = hc_bytealign_S (w2[2], w2[1], offset_minus_4); + c0[2] = hc_bytealign_S (w2[1], w2[0], offset_minus_4); + c0[1] = hc_bytealign_S (w2[0], w1[3], offset_minus_4); + c0[0] = hc_bytealign_S (w1[3], w1[2], offset_minus_4); + w3[3] = hc_bytealign_S (w1[2], w1[1], offset_minus_4); + w3[2] = hc_bytealign_S (w1[1], w1[0], offset_minus_4); + w3[1] = hc_bytealign_S (w1[0], w0[3], offset_minus_4); + w3[0] = hc_bytealign_S (w0[3], w0[2], offset_minus_4); + w2[3] = hc_bytealign_S (w0[2], w0[1], offset_minus_4); + w2[2] = hc_bytealign_S (w0[1], w0[0], offset_minus_4); + w2[1] = hc_bytealign_S (w0[0], 0, offset_minus_4); w2[0] = 0; w1[3] = 0; w1[2] = 0; @@ -32891,23 +32891,23 @@ DECLSPEC void switch_buffer_by_offset_carry_le_S (u32 *w0, u32 *w1, u32 *w2, u32 break; case 10: - c2[2] = amd_bytealign_S ( 0, w3[3], offset_minus_4); - c2[1] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); - c2[0] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); - c1[3] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); - c1[2] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); - c1[1] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); - c1[0] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); - c0[3] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - c0[2] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - c0[1] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - c0[0] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w3[3] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w3[2] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w3[1] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w3[0] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w2[3] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w2[2] = amd_bytealign_S (w0[0], 0, offset_minus_4); + c2[2] = hc_bytealign_S ( 0, w3[3], offset_minus_4); + c2[1] = hc_bytealign_S (w3[3], w3[2], offset_minus_4); + c2[0] = hc_bytealign_S (w3[2], w3[1], offset_minus_4); + c1[3] = hc_bytealign_S (w3[1], w3[0], offset_minus_4); + c1[2] = hc_bytealign_S (w3[0], w2[3], offset_minus_4); + c1[1] = hc_bytealign_S (w2[3], w2[2], offset_minus_4); + c1[0] = hc_bytealign_S (w2[2], w2[1], offset_minus_4); + c0[3] = hc_bytealign_S (w2[1], w2[0], offset_minus_4); + c0[2] = hc_bytealign_S (w2[0], w1[3], offset_minus_4); + c0[1] = hc_bytealign_S (w1[3], w1[2], offset_minus_4); + c0[0] = hc_bytealign_S (w1[2], w1[1], offset_minus_4); + w3[3] = hc_bytealign_S (w1[1], w1[0], offset_minus_4); + w3[2] = hc_bytealign_S (w1[0], w0[3], offset_minus_4); + w3[1] = hc_bytealign_S (w0[3], w0[2], offset_minus_4); + w3[0] = hc_bytealign_S (w0[2], w0[1], offset_minus_4); + w2[3] = hc_bytealign_S (w0[1], w0[0], offset_minus_4); + w2[2] = hc_bytealign_S (w0[0], 0, offset_minus_4); w2[1] = 0; w2[0] = 0; w1[3] = 0; @@ -32943,23 +32943,23 @@ DECLSPEC void switch_buffer_by_offset_carry_le_S (u32 *w0, u32 *w1, u32 *w2, u32 break; case 11: - c2[3] = amd_bytealign_S ( 0, w3[3], offset_minus_4); - c2[2] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); - c2[1] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); - c2[0] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); - c1[3] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); - c1[2] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); - c1[1] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); - c1[0] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - c0[3] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - c0[2] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - c0[1] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - c0[0] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w3[3] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w3[2] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w3[1] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w3[0] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w2[3] = amd_bytealign_S (w0[0], 0, offset_minus_4); + c2[3] = hc_bytealign_S ( 0, w3[3], offset_minus_4); + c2[2] = hc_bytealign_S (w3[3], w3[2], offset_minus_4); + c2[1] = hc_bytealign_S (w3[2], w3[1], offset_minus_4); + c2[0] = hc_bytealign_S (w3[1], w3[0], offset_minus_4); + c1[3] = hc_bytealign_S (w3[0], w2[3], offset_minus_4); + c1[2] = hc_bytealign_S (w2[3], w2[2], offset_minus_4); + c1[1] = hc_bytealign_S (w2[2], w2[1], offset_minus_4); + c1[0] = hc_bytealign_S (w2[1], w2[0], offset_minus_4); + c0[3] = hc_bytealign_S (w2[0], w1[3], offset_minus_4); + c0[2] = hc_bytealign_S (w1[3], w1[2], offset_minus_4); + c0[1] = hc_bytealign_S (w1[2], w1[1], offset_minus_4); + c0[0] = hc_bytealign_S (w1[1], w1[0], offset_minus_4); + w3[3] = hc_bytealign_S (w1[0], w0[3], offset_minus_4); + w3[2] = hc_bytealign_S (w0[3], w0[2], offset_minus_4); + w3[1] = hc_bytealign_S (w0[2], w0[1], offset_minus_4); + w3[0] = hc_bytealign_S (w0[1], w0[0], offset_minus_4); + w2[3] = hc_bytealign_S (w0[0], 0, offset_minus_4); w2[2] = 0; w2[1] = 0; w2[0] = 0; @@ -32996,23 +32996,23 @@ DECLSPEC void switch_buffer_by_offset_carry_le_S (u32 *w0, u32 *w1, u32 *w2, u32 break; case 12: - c3[0] = amd_bytealign_S ( 0, w3[3], offset_minus_4); - c2[3] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); - c2[2] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); - c2[1] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); - c2[0] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); - c1[3] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); - c1[2] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); - c1[1] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - c1[0] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - c0[3] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - c0[2] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - c0[1] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - c0[0] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w3[3] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w3[2] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w3[1] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w3[0] = amd_bytealign_S (w0[0], 0, offset_minus_4); + c3[0] = hc_bytealign_S ( 0, w3[3], offset_minus_4); + c2[3] = hc_bytealign_S (w3[3], w3[2], offset_minus_4); + c2[2] = hc_bytealign_S (w3[2], w3[1], offset_minus_4); + c2[1] = hc_bytealign_S (w3[1], w3[0], offset_minus_4); + c2[0] = hc_bytealign_S (w3[0], w2[3], offset_minus_4); + c1[3] = hc_bytealign_S (w2[3], w2[2], offset_minus_4); + c1[2] = hc_bytealign_S (w2[2], w2[1], offset_minus_4); + c1[1] = hc_bytealign_S (w2[1], w2[0], offset_minus_4); + c1[0] = hc_bytealign_S (w2[0], w1[3], offset_minus_4); + c0[3] = hc_bytealign_S (w1[3], w1[2], offset_minus_4); + c0[2] = hc_bytealign_S (w1[2], w1[1], offset_minus_4); + c0[1] = hc_bytealign_S (w1[1], w1[0], offset_minus_4); + c0[0] = hc_bytealign_S (w1[0], w0[3], offset_minus_4); + w3[3] = hc_bytealign_S (w0[3], w0[2], offset_minus_4); + w3[2] = hc_bytealign_S (w0[2], w0[1], offset_minus_4); + w3[1] = hc_bytealign_S (w0[1], w0[0], offset_minus_4); + w3[0] = hc_bytealign_S (w0[0], 0, offset_minus_4); w2[3] = 0; w2[2] = 0; w2[1] = 0; @@ -33050,23 +33050,23 @@ DECLSPEC void switch_buffer_by_offset_carry_le_S (u32 *w0, u32 *w1, u32 *w2, u32 break; case 13: - c3[1] = amd_bytealign_S ( 0, w3[3], offset_minus_4); - c3[0] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); - c2[3] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); - c2[2] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); - c2[1] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); - c2[0] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); - c1[3] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); - c1[2] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - c1[1] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - c1[0] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - c0[3] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - c0[2] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - c0[1] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - c0[0] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w3[3] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w3[2] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w3[1] = amd_bytealign_S (w0[0], 0, offset_minus_4); + c3[1] = hc_bytealign_S ( 0, w3[3], offset_minus_4); + c3[0] = hc_bytealign_S (w3[3], w3[2], offset_minus_4); + c2[3] = hc_bytealign_S (w3[2], w3[1], offset_minus_4); + c2[2] = hc_bytealign_S (w3[1], w3[0], offset_minus_4); + c2[1] = hc_bytealign_S (w3[0], w2[3], offset_minus_4); + c2[0] = hc_bytealign_S (w2[3], w2[2], offset_minus_4); + c1[3] = hc_bytealign_S (w2[2], w2[1], offset_minus_4); + c1[2] = hc_bytealign_S (w2[1], w2[0], offset_minus_4); + c1[1] = hc_bytealign_S (w2[0], w1[3], offset_minus_4); + c1[0] = hc_bytealign_S (w1[3], w1[2], offset_minus_4); + c0[3] = hc_bytealign_S (w1[2], w1[1], offset_minus_4); + c0[2] = hc_bytealign_S (w1[1], w1[0], offset_minus_4); + c0[1] = hc_bytealign_S (w1[0], w0[3], offset_minus_4); + c0[0] = hc_bytealign_S (w0[3], w0[2], offset_minus_4); + w3[3] = hc_bytealign_S (w0[2], w0[1], offset_minus_4); + w3[2] = hc_bytealign_S (w0[1], w0[0], offset_minus_4); + w3[1] = hc_bytealign_S (w0[0], 0, offset_minus_4); w3[0] = 0; w2[3] = 0; w2[2] = 0; @@ -33105,23 +33105,23 @@ DECLSPEC void switch_buffer_by_offset_carry_le_S (u32 *w0, u32 *w1, u32 *w2, u32 break; case 14: - c3[2] = amd_bytealign_S ( 0, w3[3], offset_minus_4); - c3[1] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); - c3[0] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); - c2[3] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); - c2[2] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); - c2[1] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); - c2[0] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); - c1[3] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - c1[2] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - c1[1] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - c1[0] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - c0[3] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - c0[2] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - c0[1] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - c0[0] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w3[3] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w3[2] = amd_bytealign_S (w0[0], 0, offset_minus_4); + c3[2] = hc_bytealign_S ( 0, w3[3], offset_minus_4); + c3[1] = hc_bytealign_S (w3[3], w3[2], offset_minus_4); + c3[0] = hc_bytealign_S (w3[2], w3[1], offset_minus_4); + c2[3] = hc_bytealign_S (w3[1], w3[0], offset_minus_4); + c2[2] = hc_bytealign_S (w3[0], w2[3], offset_minus_4); + c2[1] = hc_bytealign_S (w2[3], w2[2], offset_minus_4); + c2[0] = hc_bytealign_S (w2[2], w2[1], offset_minus_4); + c1[3] = hc_bytealign_S (w2[1], w2[0], offset_minus_4); + c1[2] = hc_bytealign_S (w2[0], w1[3], offset_minus_4); + c1[1] = hc_bytealign_S (w1[3], w1[2], offset_minus_4); + c1[0] = hc_bytealign_S (w1[2], w1[1], offset_minus_4); + c0[3] = hc_bytealign_S (w1[1], w1[0], offset_minus_4); + c0[2] = hc_bytealign_S (w1[0], w0[3], offset_minus_4); + c0[1] = hc_bytealign_S (w0[3], w0[2], offset_minus_4); + c0[0] = hc_bytealign_S (w0[2], w0[1], offset_minus_4); + w3[3] = hc_bytealign_S (w0[1], w0[0], offset_minus_4); + w3[2] = hc_bytealign_S (w0[0], 0, offset_minus_4); w3[1] = 0; w3[0] = 0; w2[3] = 0; @@ -33161,23 +33161,23 @@ DECLSPEC void switch_buffer_by_offset_carry_le_S (u32 *w0, u32 *w1, u32 *w2, u32 break; case 15: - c3[3] = amd_bytealign_S ( 0, w3[3], offset_minus_4); - c3[2] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); - c3[1] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); - c3[0] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); - c2[3] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); - c2[2] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); - c2[1] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); - c2[0] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - c1[3] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - c1[2] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - c1[1] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - c1[0] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - c0[3] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - c0[2] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - c0[1] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - c0[0] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w3[3] = amd_bytealign_S (w0[0], 0, offset_minus_4); + c3[3] = hc_bytealign_S ( 0, w3[3], offset_minus_4); + c3[2] = hc_bytealign_S (w3[3], w3[2], offset_minus_4); + c3[1] = hc_bytealign_S (w3[2], w3[1], offset_minus_4); + c3[0] = hc_bytealign_S (w3[1], w3[0], offset_minus_4); + c2[3] = hc_bytealign_S (w3[0], w2[3], offset_minus_4); + c2[2] = hc_bytealign_S (w2[3], w2[2], offset_minus_4); + c2[1] = hc_bytealign_S (w2[2], w2[1], offset_minus_4); + c2[0] = hc_bytealign_S (w2[1], w2[0], offset_minus_4); + c1[3] = hc_bytealign_S (w2[0], w1[3], offset_minus_4); + c1[2] = hc_bytealign_S (w1[3], w1[2], offset_minus_4); + c1[1] = hc_bytealign_S (w1[2], w1[1], offset_minus_4); + c1[0] = hc_bytealign_S (w1[1], w1[0], offset_minus_4); + c0[3] = hc_bytealign_S (w1[0], w0[3], offset_minus_4); + c0[2] = hc_bytealign_S (w0[3], w0[2], offset_minus_4); + c0[1] = hc_bytealign_S (w0[2], w0[1], offset_minus_4); + c0[0] = hc_bytealign_S (w0[1], w0[0], offset_minus_4); + w3[3] = hc_bytealign_S (w0[0], 0, offset_minus_4); w3[2] = 0; w3[1] = 0; w3[0] = 0; @@ -33228,79 +33228,79 @@ DECLSPEC void switch_buffer_by_offset_be_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, switch (offset_switch) { case 0: - w3[3] = amd_bytealign_S (w3[2], w3[3], offset); - w3[2] = amd_bytealign_S (w3[1], w3[2], offset); - w3[1] = amd_bytealign_S (w3[0], w3[1], offset); - w3[0] = amd_bytealign_S (w2[3], w3[0], offset); - w2[3] = amd_bytealign_S (w2[2], w2[3], offset); - w2[2] = amd_bytealign_S (w2[1], w2[2], offset); - w2[1] = amd_bytealign_S (w2[0], w2[1], offset); - w2[0] = amd_bytealign_S (w1[3], w2[0], offset); - w1[3] = amd_bytealign_S (w1[2], w1[3], offset); - w1[2] = amd_bytealign_S (w1[1], w1[2], offset); - w1[1] = amd_bytealign_S (w1[0], w1[1], offset); - w1[0] = amd_bytealign_S (w0[3], w1[0], offset); - w0[3] = amd_bytealign_S (w0[2], w0[3], offset); - w0[2] = amd_bytealign_S (w0[1], w0[2], offset); - w0[1] = amd_bytealign_S (w0[0], w0[1], offset); - w0[0] = amd_bytealign_S ( 0, w0[0], offset); + w3[3] = hc_bytealign_S (w3[2], w3[3], offset); + w3[2] = hc_bytealign_S (w3[1], w3[2], offset); + w3[1] = hc_bytealign_S (w3[0], w3[1], offset); + w3[0] = hc_bytealign_S (w2[3], w3[0], offset); + w2[3] = hc_bytealign_S (w2[2], w2[3], offset); + w2[2] = hc_bytealign_S (w2[1], w2[2], offset); + w2[1] = hc_bytealign_S (w2[0], w2[1], offset); + w2[0] = hc_bytealign_S (w1[3], w2[0], offset); + w1[3] = hc_bytealign_S (w1[2], w1[3], offset); + w1[2] = hc_bytealign_S (w1[1], w1[2], offset); + w1[1] = hc_bytealign_S (w1[0], w1[1], offset); + w1[0] = hc_bytealign_S (w0[3], w1[0], offset); + w0[3] = hc_bytealign_S (w0[2], w0[3], offset); + w0[2] = hc_bytealign_S (w0[1], w0[2], offset); + w0[1] = hc_bytealign_S (w0[0], w0[1], offset); + w0[0] = hc_bytealign_S ( 0, w0[0], offset); break; case 1: - w3[3] = amd_bytealign_S (w3[1], w3[2], offset); - w3[2] = amd_bytealign_S (w3[0], w3[1], offset); - w3[1] = amd_bytealign_S (w2[3], w3[0], offset); - w3[0] = amd_bytealign_S (w2[2], w2[3], offset); - w2[3] = amd_bytealign_S (w2[1], w2[2], offset); - w2[2] = amd_bytealign_S (w2[0], w2[1], offset); - w2[1] = amd_bytealign_S (w1[3], w2[0], offset); - w2[0] = amd_bytealign_S (w1[2], w1[3], offset); - w1[3] = amd_bytealign_S (w1[1], w1[2], offset); - w1[2] = amd_bytealign_S (w1[0], w1[1], offset); - w1[1] = amd_bytealign_S (w0[3], w1[0], offset); - w1[0] = amd_bytealign_S (w0[2], w0[3], offset); - w0[3] = amd_bytealign_S (w0[1], w0[2], offset); - w0[2] = amd_bytealign_S (w0[0], w0[1], offset); - w0[1] = amd_bytealign_S ( 0, w0[0], offset); + w3[3] = hc_bytealign_S (w3[1], w3[2], offset); + w3[2] = hc_bytealign_S (w3[0], w3[1], offset); + w3[1] = hc_bytealign_S (w2[3], w3[0], offset); + w3[0] = hc_bytealign_S (w2[2], w2[3], offset); + w2[3] = hc_bytealign_S (w2[1], w2[2], offset); + w2[2] = hc_bytealign_S (w2[0], w2[1], offset); + w2[1] = hc_bytealign_S (w1[3], w2[0], offset); + w2[0] = hc_bytealign_S (w1[2], w1[3], offset); + w1[3] = hc_bytealign_S (w1[1], w1[2], offset); + w1[2] = hc_bytealign_S (w1[0], w1[1], offset); + w1[1] = hc_bytealign_S (w0[3], w1[0], offset); + w1[0] = hc_bytealign_S (w0[2], w0[3], offset); + w0[3] = hc_bytealign_S (w0[1], w0[2], offset); + w0[2] = hc_bytealign_S (w0[0], w0[1], offset); + w0[1] = hc_bytealign_S ( 0, w0[0], offset); w0[0] = 0; break; case 2: - w3[3] = amd_bytealign_S (w3[0], w3[1], offset); - w3[2] = amd_bytealign_S (w2[3], w3[0], offset); - w3[1] = amd_bytealign_S (w2[2], w2[3], offset); - w3[0] = amd_bytealign_S (w2[1], w2[2], offset); - w2[3] = amd_bytealign_S (w2[0], w2[1], offset); - w2[2] = amd_bytealign_S (w1[3], w2[0], offset); - w2[1] = amd_bytealign_S (w1[2], w1[3], offset); - w2[0] = amd_bytealign_S (w1[1], w1[2], offset); - w1[3] = amd_bytealign_S (w1[0], w1[1], offset); - w1[2] = amd_bytealign_S (w0[3], w1[0], offset); - w1[1] = amd_bytealign_S (w0[2], w0[3], offset); - w1[0] = amd_bytealign_S (w0[1], w0[2], offset); - w0[3] = amd_bytealign_S (w0[0], w0[1], offset); - w0[2] = amd_bytealign_S ( 0, w0[0], offset); + w3[3] = hc_bytealign_S (w3[0], w3[1], offset); + w3[2] = hc_bytealign_S (w2[3], w3[0], offset); + w3[1] = hc_bytealign_S (w2[2], w2[3], offset); + w3[0] = hc_bytealign_S (w2[1], w2[2], offset); + w2[3] = hc_bytealign_S (w2[0], w2[1], offset); + w2[2] = hc_bytealign_S (w1[3], w2[0], offset); + w2[1] = hc_bytealign_S (w1[2], w1[3], offset); + w2[0] = hc_bytealign_S (w1[1], w1[2], offset); + w1[3] = hc_bytealign_S (w1[0], w1[1], offset); + w1[2] = hc_bytealign_S (w0[3], w1[0], offset); + w1[1] = hc_bytealign_S (w0[2], w0[3], offset); + w1[0] = hc_bytealign_S (w0[1], w0[2], offset); + w0[3] = hc_bytealign_S (w0[0], w0[1], offset); + w0[2] = hc_bytealign_S ( 0, w0[0], offset); w0[1] = 0; w0[0] = 0; break; case 3: - w3[3] = amd_bytealign_S (w2[3], w3[0], offset); - w3[2] = amd_bytealign_S (w2[2], w2[3], offset); - w3[1] = amd_bytealign_S (w2[1], w2[2], offset); - w3[0] = amd_bytealign_S (w2[0], w2[1], offset); - w2[3] = amd_bytealign_S (w1[3], w2[0], offset); - w2[2] = amd_bytealign_S (w1[2], w1[3], offset); - w2[1] = amd_bytealign_S (w1[1], w1[2], offset); - w2[0] = amd_bytealign_S (w1[0], w1[1], offset); - w1[3] = amd_bytealign_S (w0[3], w1[0], offset); - w1[2] = amd_bytealign_S (w0[2], w0[3], offset); - w1[1] = amd_bytealign_S (w0[1], w0[2], offset); - w1[0] = amd_bytealign_S (w0[0], w0[1], offset); - w0[3] = amd_bytealign_S ( 0, w0[0], offset); + w3[3] = hc_bytealign_S (w2[3], w3[0], offset); + w3[2] = hc_bytealign_S (w2[2], w2[3], offset); + w3[1] = hc_bytealign_S (w2[1], w2[2], offset); + w3[0] = hc_bytealign_S (w2[0], w2[1], offset); + w2[3] = hc_bytealign_S (w1[3], w2[0], offset); + w2[2] = hc_bytealign_S (w1[2], w1[3], offset); + w2[1] = hc_bytealign_S (w1[1], w1[2], offset); + w2[0] = hc_bytealign_S (w1[0], w1[1], offset); + w1[3] = hc_bytealign_S (w0[3], w1[0], offset); + w1[2] = hc_bytealign_S (w0[2], w0[3], offset); + w1[1] = hc_bytealign_S (w0[1], w0[2], offset); + w1[0] = hc_bytealign_S (w0[0], w0[1], offset); + w0[3] = hc_bytealign_S ( 0, w0[0], offset); w0[2] = 0; w0[1] = 0; w0[0] = 0; @@ -33308,18 +33308,18 @@ DECLSPEC void switch_buffer_by_offset_be_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, break; case 4: - w3[3] = amd_bytealign_S (w2[2], w2[3], offset); - w3[2] = amd_bytealign_S (w2[1], w2[2], offset); - w3[1] = amd_bytealign_S (w2[0], w2[1], offset); - w3[0] = amd_bytealign_S (w1[3], w2[0], offset); - w2[3] = amd_bytealign_S (w1[2], w1[3], offset); - w2[2] = amd_bytealign_S (w1[1], w1[2], offset); - w2[1] = amd_bytealign_S (w1[0], w1[1], offset); - w2[0] = amd_bytealign_S (w0[3], w1[0], offset); - w1[3] = amd_bytealign_S (w0[2], w0[3], offset); - w1[2] = amd_bytealign_S (w0[1], w0[2], offset); - w1[1] = amd_bytealign_S (w0[0], w0[1], offset); - w1[0] = amd_bytealign_S ( 0, w0[0], offset); + w3[3] = hc_bytealign_S (w2[2], w2[3], offset); + w3[2] = hc_bytealign_S (w2[1], w2[2], offset); + w3[1] = hc_bytealign_S (w2[0], w2[1], offset); + w3[0] = hc_bytealign_S (w1[3], w2[0], offset); + w2[3] = hc_bytealign_S (w1[2], w1[3], offset); + w2[2] = hc_bytealign_S (w1[1], w1[2], offset); + w2[1] = hc_bytealign_S (w1[0], w1[1], offset); + w2[0] = hc_bytealign_S (w0[3], w1[0], offset); + w1[3] = hc_bytealign_S (w0[2], w0[3], offset); + w1[2] = hc_bytealign_S (w0[1], w0[2], offset); + w1[1] = hc_bytealign_S (w0[0], w0[1], offset); + w1[0] = hc_bytealign_S ( 0, w0[0], offset); w0[3] = 0; w0[2] = 0; w0[1] = 0; @@ -33328,17 +33328,17 @@ DECLSPEC void switch_buffer_by_offset_be_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, break; case 5: - w3[3] = amd_bytealign_S (w2[1], w2[2], offset); - w3[2] = amd_bytealign_S (w2[0], w2[1], offset); - w3[1] = amd_bytealign_S (w1[3], w2[0], offset); - w3[0] = amd_bytealign_S (w1[2], w1[3], offset); - w2[3] = amd_bytealign_S (w1[1], w1[2], offset); - w2[2] = amd_bytealign_S (w1[0], w1[1], offset); - w2[1] = amd_bytealign_S (w0[3], w1[0], offset); - w2[0] = amd_bytealign_S (w0[2], w0[3], offset); - w1[3] = amd_bytealign_S (w0[1], w0[2], offset); - w1[2] = amd_bytealign_S (w0[0], w0[1], offset); - w1[1] = amd_bytealign_S ( 0, w0[0], offset); + w3[3] = hc_bytealign_S (w2[1], w2[2], offset); + w3[2] = hc_bytealign_S (w2[0], w2[1], offset); + w3[1] = hc_bytealign_S (w1[3], w2[0], offset); + w3[0] = hc_bytealign_S (w1[2], w1[3], offset); + w2[3] = hc_bytealign_S (w1[1], w1[2], offset); + w2[2] = hc_bytealign_S (w1[0], w1[1], offset); + w2[1] = hc_bytealign_S (w0[3], w1[0], offset); + w2[0] = hc_bytealign_S (w0[2], w0[3], offset); + w1[3] = hc_bytealign_S (w0[1], w0[2], offset); + w1[2] = hc_bytealign_S (w0[0], w0[1], offset); + w1[1] = hc_bytealign_S ( 0, w0[0], offset); w1[0] = 0; w0[3] = 0; w0[2] = 0; @@ -33348,16 +33348,16 @@ DECLSPEC void switch_buffer_by_offset_be_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, break; case 6: - w3[3] = amd_bytealign_S (w2[0], w2[1], offset); - w3[2] = amd_bytealign_S (w1[3], w2[0], offset); - w3[1] = amd_bytealign_S (w1[2], w1[3], offset); - w3[0] = amd_bytealign_S (w1[1], w1[2], offset); - w2[3] = amd_bytealign_S (w1[0], w1[1], offset); - w2[2] = amd_bytealign_S (w0[3], w1[0], offset); - w2[1] = amd_bytealign_S (w0[2], w0[3], offset); - w2[0] = amd_bytealign_S (w0[1], w0[2], offset); - w1[3] = amd_bytealign_S (w0[0], w0[1], offset); - w1[2] = amd_bytealign_S ( 0, w0[0], offset); + w3[3] = hc_bytealign_S (w2[0], w2[1], offset); + w3[2] = hc_bytealign_S (w1[3], w2[0], offset); + w3[1] = hc_bytealign_S (w1[2], w1[3], offset); + w3[0] = hc_bytealign_S (w1[1], w1[2], offset); + w2[3] = hc_bytealign_S (w1[0], w1[1], offset); + w2[2] = hc_bytealign_S (w0[3], w1[0], offset); + w2[1] = hc_bytealign_S (w0[2], w0[3], offset); + w2[0] = hc_bytealign_S (w0[1], w0[2], offset); + w1[3] = hc_bytealign_S (w0[0], w0[1], offset); + w1[2] = hc_bytealign_S ( 0, w0[0], offset); w1[1] = 0; w1[0] = 0; w0[3] = 0; @@ -33368,15 +33368,15 @@ DECLSPEC void switch_buffer_by_offset_be_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, break; case 7: - w3[3] = amd_bytealign_S (w1[3], w2[0], offset); - w3[2] = amd_bytealign_S (w1[2], w1[3], offset); - w3[1] = amd_bytealign_S (w1[1], w1[2], offset); - w3[0] = amd_bytealign_S (w1[0], w1[1], offset); - w2[3] = amd_bytealign_S (w0[3], w1[0], offset); - w2[2] = amd_bytealign_S (w0[2], w0[3], offset); - w2[1] = amd_bytealign_S (w0[1], w0[2], offset); - w2[0] = amd_bytealign_S (w0[0], w0[1], offset); - w1[3] = amd_bytealign_S ( 0, w0[0], offset); + w3[3] = hc_bytealign_S (w1[3], w2[0], offset); + w3[2] = hc_bytealign_S (w1[2], w1[3], offset); + w3[1] = hc_bytealign_S (w1[1], w1[2], offset); + w3[0] = hc_bytealign_S (w1[0], w1[1], offset); + w2[3] = hc_bytealign_S (w0[3], w1[0], offset); + w2[2] = hc_bytealign_S (w0[2], w0[3], offset); + w2[1] = hc_bytealign_S (w0[1], w0[2], offset); + w2[0] = hc_bytealign_S (w0[0], w0[1], offset); + w1[3] = hc_bytealign_S ( 0, w0[0], offset); w1[2] = 0; w1[1] = 0; w1[0] = 0; @@ -33388,14 +33388,14 @@ DECLSPEC void switch_buffer_by_offset_be_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, break; case 8: - w3[3] = amd_bytealign_S (w1[2], w1[3], offset); - w3[2] = amd_bytealign_S (w1[1], w1[2], offset); - w3[1] = amd_bytealign_S (w1[0], w1[1], offset); - w3[0] = amd_bytealign_S (w0[3], w1[0], offset); - w2[3] = amd_bytealign_S (w0[2], w0[3], offset); - w2[2] = amd_bytealign_S (w0[1], w0[2], offset); - w2[1] = amd_bytealign_S (w0[0], w0[1], offset); - w2[0] = amd_bytealign_S ( 0, w0[0], offset); + w3[3] = hc_bytealign_S (w1[2], w1[3], offset); + w3[2] = hc_bytealign_S (w1[1], w1[2], offset); + w3[1] = hc_bytealign_S (w1[0], w1[1], offset); + w3[0] = hc_bytealign_S (w0[3], w1[0], offset); + w2[3] = hc_bytealign_S (w0[2], w0[3], offset); + w2[2] = hc_bytealign_S (w0[1], w0[2], offset); + w2[1] = hc_bytealign_S (w0[0], w0[1], offset); + w2[0] = hc_bytealign_S ( 0, w0[0], offset); w1[3] = 0; w1[2] = 0; w1[1] = 0; @@ -33408,13 +33408,13 @@ DECLSPEC void switch_buffer_by_offset_be_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, break; case 9: - w3[3] = amd_bytealign_S (w1[1], w1[2], offset); - w3[2] = amd_bytealign_S (w1[0], w1[1], offset); - w3[1] = amd_bytealign_S (w0[3], w1[0], offset); - w3[0] = amd_bytealign_S (w0[2], w0[3], offset); - w2[3] = amd_bytealign_S (w0[1], w0[2], offset); - w2[2] = amd_bytealign_S (w0[0], w0[1], offset); - w2[1] = amd_bytealign_S ( 0, w0[0], offset); + w3[3] = hc_bytealign_S (w1[1], w1[2], offset); + w3[2] = hc_bytealign_S (w1[0], w1[1], offset); + w3[1] = hc_bytealign_S (w0[3], w1[0], offset); + w3[0] = hc_bytealign_S (w0[2], w0[3], offset); + w2[3] = hc_bytealign_S (w0[1], w0[2], offset); + w2[2] = hc_bytealign_S (w0[0], w0[1], offset); + w2[1] = hc_bytealign_S ( 0, w0[0], offset); w2[0] = 0; w1[3] = 0; w1[2] = 0; @@ -33428,12 +33428,12 @@ DECLSPEC void switch_buffer_by_offset_be_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, break; case 10: - w3[3] = amd_bytealign_S (w1[0], w1[1], offset); - w3[2] = amd_bytealign_S (w0[3], w1[0], offset); - w3[1] = amd_bytealign_S (w0[2], w0[3], offset); - w3[0] = amd_bytealign_S (w0[1], w0[2], offset); - w2[3] = amd_bytealign_S (w0[0], w0[1], offset); - w2[2] = amd_bytealign_S ( 0, w0[0], offset); + w3[3] = hc_bytealign_S (w1[0], w1[1], offset); + w3[2] = hc_bytealign_S (w0[3], w1[0], offset); + w3[1] = hc_bytealign_S (w0[2], w0[3], offset); + w3[0] = hc_bytealign_S (w0[1], w0[2], offset); + w2[3] = hc_bytealign_S (w0[0], w0[1], offset); + w2[2] = hc_bytealign_S ( 0, w0[0], offset); w2[1] = 0; w2[0] = 0; w1[3] = 0; @@ -33448,11 +33448,11 @@ DECLSPEC void switch_buffer_by_offset_be_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, break; case 11: - w3[3] = amd_bytealign_S (w0[3], w1[0], offset); - w3[2] = amd_bytealign_S (w0[2], w0[3], offset); - w3[1] = amd_bytealign_S (w0[1], w0[2], offset); - w3[0] = amd_bytealign_S (w0[0], w0[1], offset); - w2[3] = amd_bytealign_S ( 0, w0[0], offset); + w3[3] = hc_bytealign_S (w0[3], w1[0], offset); + w3[2] = hc_bytealign_S (w0[2], w0[3], offset); + w3[1] = hc_bytealign_S (w0[1], w0[2], offset); + w3[0] = hc_bytealign_S (w0[0], w0[1], offset); + w2[3] = hc_bytealign_S ( 0, w0[0], offset); w2[2] = 0; w2[1] = 0; w2[0] = 0; @@ -33468,10 +33468,10 @@ DECLSPEC void switch_buffer_by_offset_be_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, break; case 12: - w3[3] = amd_bytealign_S (w0[2], w0[3], offset); - w3[2] = amd_bytealign_S (w0[1], w0[2], offset); - w3[1] = amd_bytealign_S (w0[0], w0[1], offset); - w3[0] = amd_bytealign_S ( 0, w0[0], offset); + w3[3] = hc_bytealign_S (w0[2], w0[3], offset); + w3[2] = hc_bytealign_S (w0[1], w0[2], offset); + w3[1] = hc_bytealign_S (w0[0], w0[1], offset); + w3[0] = hc_bytealign_S ( 0, w0[0], offset); w2[3] = 0; w2[2] = 0; w2[1] = 0; @@ -33488,9 +33488,9 @@ DECLSPEC void switch_buffer_by_offset_be_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, break; case 13: - w3[3] = amd_bytealign_S (w0[1], w0[2], offset); - w3[2] = amd_bytealign_S (w0[0], w0[1], offset); - w3[1] = amd_bytealign_S ( 0, w0[0], offset); + w3[3] = hc_bytealign_S (w0[1], w0[2], offset); + w3[2] = hc_bytealign_S (w0[0], w0[1], offset); + w3[1] = hc_bytealign_S ( 0, w0[0], offset); w3[0] = 0; w2[3] = 0; w2[2] = 0; @@ -33508,8 +33508,8 @@ DECLSPEC void switch_buffer_by_offset_be_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, break; case 14: - w3[3] = amd_bytealign_S (w0[0], w0[1], offset); - w3[2] = amd_bytealign_S ( 0, w0[0], offset); + w3[3] = hc_bytealign_S (w0[0], w0[1], offset); + w3[2] = hc_bytealign_S ( 0, w0[0], offset); w3[1] = 0; w3[0] = 0; w2[3] = 0; @@ -33528,7 +33528,7 @@ DECLSPEC void switch_buffer_by_offset_be_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, break; case 15: - w3[3] = amd_bytealign_S ( 0, w0[0], offset); + w3[3] = hc_bytealign_S ( 0, w0[0], offset); w3[2] = 0; w3[1] = 0; w3[0] = 0; @@ -33562,79 +33562,79 @@ DECLSPEC void switch_buffer_by_offset_be_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, switch (offset_switch) { case 0: - w3[3] = __byte_perm_S (w3[3], w3[2], selector); - w3[2] = __byte_perm_S (w3[2], w3[1], selector); - w3[1] = __byte_perm_S (w3[1], w3[0], selector); - w3[0] = __byte_perm_S (w3[0], w2[3], selector); - w2[3] = __byte_perm_S (w2[3], w2[2], selector); - w2[2] = __byte_perm_S (w2[2], w2[1], selector); - w2[1] = __byte_perm_S (w2[1], w2[0], selector); - w2[0] = __byte_perm_S (w2[0], w1[3], selector); - w1[3] = __byte_perm_S (w1[3], w1[2], selector); - w1[2] = __byte_perm_S (w1[2], w1[1], selector); - w1[1] = __byte_perm_S (w1[1], w1[0], selector); - w1[0] = __byte_perm_S (w1[0], w0[3], selector); - w0[3] = __byte_perm_S (w0[3], w0[2], selector); - w0[2] = __byte_perm_S (w0[2], w0[1], selector); - w0[1] = __byte_perm_S (w0[1], w0[0], selector); - w0[0] = __byte_perm_S (w0[0], 0, selector); + w3[3] = hc_byte_perm_S (w3[3], w3[2], selector); + w3[2] = hc_byte_perm_S (w3[2], w3[1], selector); + w3[1] = hc_byte_perm_S (w3[1], w3[0], selector); + w3[0] = hc_byte_perm_S (w3[0], w2[3], selector); + w2[3] = hc_byte_perm_S (w2[3], w2[2], selector); + w2[2] = hc_byte_perm_S (w2[2], w2[1], selector); + w2[1] = hc_byte_perm_S (w2[1], w2[0], selector); + w2[0] = hc_byte_perm_S (w2[0], w1[3], selector); + w1[3] = hc_byte_perm_S (w1[3], w1[2], selector); + w1[2] = hc_byte_perm_S (w1[2], w1[1], selector); + w1[1] = hc_byte_perm_S (w1[1], w1[0], selector); + w1[0] = hc_byte_perm_S (w1[0], w0[3], selector); + w0[3] = hc_byte_perm_S (w0[3], w0[2], selector); + w0[2] = hc_byte_perm_S (w0[2], w0[1], selector); + w0[1] = hc_byte_perm_S (w0[1], w0[0], selector); + w0[0] = hc_byte_perm_S (w0[0], 0, selector); break; case 1: - w3[3] = __byte_perm_S (w3[2], w3[1], selector); - w3[2] = __byte_perm_S (w3[1], w3[0], selector); - w3[1] = __byte_perm_S (w3[0], w2[3], selector); - w3[0] = __byte_perm_S (w2[3], w2[2], selector); - w2[3] = __byte_perm_S (w2[2], w2[1], selector); - w2[2] = __byte_perm_S (w2[1], w2[0], selector); - w2[1] = __byte_perm_S (w2[0], w1[3], selector); - w2[0] = __byte_perm_S (w1[3], w1[2], selector); - w1[3] = __byte_perm_S (w1[2], w1[1], selector); - w1[2] = __byte_perm_S (w1[1], w1[0], selector); - w1[1] = __byte_perm_S (w1[0], w0[3], selector); - w1[0] = __byte_perm_S (w0[3], w0[2], selector); - w0[3] = __byte_perm_S (w0[2], w0[1], selector); - w0[2] = __byte_perm_S (w0[1], w0[0], selector); - w0[1] = __byte_perm_S (w0[0], 0, selector); + w3[3] = hc_byte_perm_S (w3[2], w3[1], selector); + w3[2] = hc_byte_perm_S (w3[1], w3[0], selector); + w3[1] = hc_byte_perm_S (w3[0], w2[3], selector); + w3[0] = hc_byte_perm_S (w2[3], w2[2], selector); + w2[3] = hc_byte_perm_S (w2[2], w2[1], selector); + w2[2] = hc_byte_perm_S (w2[1], w2[0], selector); + w2[1] = hc_byte_perm_S (w2[0], w1[3], selector); + w2[0] = hc_byte_perm_S (w1[3], w1[2], selector); + w1[3] = hc_byte_perm_S (w1[2], w1[1], selector); + w1[2] = hc_byte_perm_S (w1[1], w1[0], selector); + w1[1] = hc_byte_perm_S (w1[0], w0[3], selector); + w1[0] = hc_byte_perm_S (w0[3], w0[2], selector); + w0[3] = hc_byte_perm_S (w0[2], w0[1], selector); + w0[2] = hc_byte_perm_S (w0[1], w0[0], selector); + w0[1] = hc_byte_perm_S (w0[0], 0, selector); w0[0] = 0; break; case 2: - w3[3] = __byte_perm_S (w3[1], w3[0], selector); - w3[2] = __byte_perm_S (w3[0], w2[3], selector); - w3[1] = __byte_perm_S (w2[3], w2[2], selector); - w3[0] = __byte_perm_S (w2[2], w2[1], selector); - w2[3] = __byte_perm_S (w2[1], w2[0], selector); - w2[2] = __byte_perm_S (w2[0], w1[3], selector); - w2[1] = __byte_perm_S (w1[3], w1[2], selector); - w2[0] = __byte_perm_S (w1[2], w1[1], selector); - w1[3] = __byte_perm_S (w1[1], w1[0], selector); - w1[2] = __byte_perm_S (w1[0], w0[3], selector); - w1[1] = __byte_perm_S (w0[3], w0[2], selector); - w1[0] = __byte_perm_S (w0[2], w0[1], selector); - w0[3] = __byte_perm_S (w0[1], w0[0], selector); - w0[2] = __byte_perm_S (w0[0], 0, selector); + w3[3] = hc_byte_perm_S (w3[1], w3[0], selector); + w3[2] = hc_byte_perm_S (w3[0], w2[3], selector); + w3[1] = hc_byte_perm_S (w2[3], w2[2], selector); + w3[0] = hc_byte_perm_S (w2[2], w2[1], selector); + w2[3] = hc_byte_perm_S (w2[1], w2[0], selector); + w2[2] = hc_byte_perm_S (w2[0], w1[3], selector); + w2[1] = hc_byte_perm_S (w1[3], w1[2], selector); + w2[0] = hc_byte_perm_S (w1[2], w1[1], selector); + w1[3] = hc_byte_perm_S (w1[1], w1[0], selector); + w1[2] = hc_byte_perm_S (w1[0], w0[3], selector); + w1[1] = hc_byte_perm_S (w0[3], w0[2], selector); + w1[0] = hc_byte_perm_S (w0[2], w0[1], selector); + w0[3] = hc_byte_perm_S (w0[1], w0[0], selector); + w0[2] = hc_byte_perm_S (w0[0], 0, selector); w0[1] = 0; w0[0] = 0; break; case 3: - w3[3] = __byte_perm_S (w3[0], w2[3], selector); - w3[2] = __byte_perm_S (w2[3], w2[2], selector); - w3[1] = __byte_perm_S (w2[2], w2[1], selector); - w3[0] = __byte_perm_S (w2[1], w2[0], selector); - w2[3] = __byte_perm_S (w2[0], w1[3], selector); - w2[2] = __byte_perm_S (w1[3], w1[2], selector); - w2[1] = __byte_perm_S (w1[2], w1[1], selector); - w2[0] = __byte_perm_S (w1[1], w1[0], selector); - w1[3] = __byte_perm_S (w1[0], w0[3], selector); - w1[2] = __byte_perm_S (w0[3], w0[2], selector); - w1[1] = __byte_perm_S (w0[2], w0[1], selector); - w1[0] = __byte_perm_S (w0[1], w0[0], selector); - w0[3] = __byte_perm_S (w0[0], 0, selector); + w3[3] = hc_byte_perm_S (w3[0], w2[3], selector); + w3[2] = hc_byte_perm_S (w2[3], w2[2], selector); + w3[1] = hc_byte_perm_S (w2[2], w2[1], selector); + w3[0] = hc_byte_perm_S (w2[1], w2[0], selector); + w2[3] = hc_byte_perm_S (w2[0], w1[3], selector); + w2[2] = hc_byte_perm_S (w1[3], w1[2], selector); + w2[1] = hc_byte_perm_S (w1[2], w1[1], selector); + w2[0] = hc_byte_perm_S (w1[1], w1[0], selector); + w1[3] = hc_byte_perm_S (w1[0], w0[3], selector); + w1[2] = hc_byte_perm_S (w0[3], w0[2], selector); + w1[1] = hc_byte_perm_S (w0[2], w0[1], selector); + w1[0] = hc_byte_perm_S (w0[1], w0[0], selector); + w0[3] = hc_byte_perm_S (w0[0], 0, selector); w0[2] = 0; w0[1] = 0; w0[0] = 0; @@ -33642,18 +33642,18 @@ DECLSPEC void switch_buffer_by_offset_be_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, break; case 4: - w3[3] = __byte_perm_S (w2[3], w2[2], selector); - w3[2] = __byte_perm_S (w2[2], w2[1], selector); - w3[1] = __byte_perm_S (w2[1], w2[0], selector); - w3[0] = __byte_perm_S (w2[0], w1[3], selector); - w2[3] = __byte_perm_S (w1[3], w1[2], selector); - w2[2] = __byte_perm_S (w1[2], w1[1], selector); - w2[1] = __byte_perm_S (w1[1], w1[0], selector); - w2[0] = __byte_perm_S (w1[0], w0[3], selector); - w1[3] = __byte_perm_S (w0[3], w0[2], selector); - w1[2] = __byte_perm_S (w0[2], w0[1], selector); - w1[1] = __byte_perm_S (w0[1], w0[0], selector); - w1[0] = __byte_perm_S (w0[0], 0, selector); + w3[3] = hc_byte_perm_S (w2[3], w2[2], selector); + w3[2] = hc_byte_perm_S (w2[2], w2[1], selector); + w3[1] = hc_byte_perm_S (w2[1], w2[0], selector); + w3[0] = hc_byte_perm_S (w2[0], w1[3], selector); + w2[3] = hc_byte_perm_S (w1[3], w1[2], selector); + w2[2] = hc_byte_perm_S (w1[2], w1[1], selector); + w2[1] = hc_byte_perm_S (w1[1], w1[0], selector); + w2[0] = hc_byte_perm_S (w1[0], w0[3], selector); + w1[3] = hc_byte_perm_S (w0[3], w0[2], selector); + w1[2] = hc_byte_perm_S (w0[2], w0[1], selector); + w1[1] = hc_byte_perm_S (w0[1], w0[0], selector); + w1[0] = hc_byte_perm_S (w0[0], 0, selector); w0[3] = 0; w0[2] = 0; w0[1] = 0; @@ -33662,17 +33662,17 @@ DECLSPEC void switch_buffer_by_offset_be_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, break; case 5: - w3[3] = __byte_perm_S (w2[2], w2[1], selector); - w3[2] = __byte_perm_S (w2[1], w2[0], selector); - w3[1] = __byte_perm_S (w2[0], w1[3], selector); - w3[0] = __byte_perm_S (w1[3], w1[2], selector); - w2[3] = __byte_perm_S (w1[2], w1[1], selector); - w2[2] = __byte_perm_S (w1[1], w1[0], selector); - w2[1] = __byte_perm_S (w1[0], w0[3], selector); - w2[0] = __byte_perm_S (w0[3], w0[2], selector); - w1[3] = __byte_perm_S (w0[2], w0[1], selector); - w1[2] = __byte_perm_S (w0[1], w0[0], selector); - w1[1] = __byte_perm_S (w0[0], 0, selector); + w3[3] = hc_byte_perm_S (w2[2], w2[1], selector); + w3[2] = hc_byte_perm_S (w2[1], w2[0], selector); + w3[1] = hc_byte_perm_S (w2[0], w1[3], selector); + w3[0] = hc_byte_perm_S (w1[3], w1[2], selector); + w2[3] = hc_byte_perm_S (w1[2], w1[1], selector); + w2[2] = hc_byte_perm_S (w1[1], w1[0], selector); + w2[1] = hc_byte_perm_S (w1[0], w0[3], selector); + w2[0] = hc_byte_perm_S (w0[3], w0[2], selector); + w1[3] = hc_byte_perm_S (w0[2], w0[1], selector); + w1[2] = hc_byte_perm_S (w0[1], w0[0], selector); + w1[1] = hc_byte_perm_S (w0[0], 0, selector); w1[0] = 0; w0[3] = 0; w0[2] = 0; @@ -33682,16 +33682,16 @@ DECLSPEC void switch_buffer_by_offset_be_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, break; case 6: - w3[3] = __byte_perm_S (w2[1], w2[0], selector); - w3[2] = __byte_perm_S (w2[0], w1[3], selector); - w3[1] = __byte_perm_S (w1[3], w1[2], selector); - w3[0] = __byte_perm_S (w1[2], w1[1], selector); - w2[3] = __byte_perm_S (w1[1], w1[0], selector); - w2[2] = __byte_perm_S (w1[0], w0[3], selector); - w2[1] = __byte_perm_S (w0[3], w0[2], selector); - w2[0] = __byte_perm_S (w0[2], w0[1], selector); - w1[3] = __byte_perm_S (w0[1], w0[0], selector); - w1[2] = __byte_perm_S (w0[0], 0, selector); + w3[3] = hc_byte_perm_S (w2[1], w2[0], selector); + w3[2] = hc_byte_perm_S (w2[0], w1[3], selector); + w3[1] = hc_byte_perm_S (w1[3], w1[2], selector); + w3[0] = hc_byte_perm_S (w1[2], w1[1], selector); + w2[3] = hc_byte_perm_S (w1[1], w1[0], selector); + w2[2] = hc_byte_perm_S (w1[0], w0[3], selector); + w2[1] = hc_byte_perm_S (w0[3], w0[2], selector); + w2[0] = hc_byte_perm_S (w0[2], w0[1], selector); + w1[3] = hc_byte_perm_S (w0[1], w0[0], selector); + w1[2] = hc_byte_perm_S (w0[0], 0, selector); w1[1] = 0; w1[0] = 0; w0[3] = 0; @@ -33702,15 +33702,15 @@ DECLSPEC void switch_buffer_by_offset_be_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, break; case 7: - w3[3] = __byte_perm_S (w2[0], w1[3], selector); - w3[2] = __byte_perm_S (w1[3], w1[2], selector); - w3[1] = __byte_perm_S (w1[2], w1[1], selector); - w3[0] = __byte_perm_S (w1[1], w1[0], selector); - w2[3] = __byte_perm_S (w1[0], w0[3], selector); - w2[2] = __byte_perm_S (w0[3], w0[2], selector); - w2[1] = __byte_perm_S (w0[2], w0[1], selector); - w2[0] = __byte_perm_S (w0[1], w0[0], selector); - w1[3] = __byte_perm_S (w0[0], 0, selector); + w3[3] = hc_byte_perm_S (w2[0], w1[3], selector); + w3[2] = hc_byte_perm_S (w1[3], w1[2], selector); + w3[1] = hc_byte_perm_S (w1[2], w1[1], selector); + w3[0] = hc_byte_perm_S (w1[1], w1[0], selector); + w2[3] = hc_byte_perm_S (w1[0], w0[3], selector); + w2[2] = hc_byte_perm_S (w0[3], w0[2], selector); + w2[1] = hc_byte_perm_S (w0[2], w0[1], selector); + w2[0] = hc_byte_perm_S (w0[1], w0[0], selector); + w1[3] = hc_byte_perm_S (w0[0], 0, selector); w1[2] = 0; w1[1] = 0; w1[0] = 0; @@ -33722,14 +33722,14 @@ DECLSPEC void switch_buffer_by_offset_be_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, break; case 8: - w3[3] = __byte_perm_S (w1[3], w1[2], selector); - w3[2] = __byte_perm_S (w1[2], w1[1], selector); - w3[1] = __byte_perm_S (w1[1], w1[0], selector); - w3[0] = __byte_perm_S (w1[0], w0[3], selector); - w2[3] = __byte_perm_S (w0[3], w0[2], selector); - w2[2] = __byte_perm_S (w0[2], w0[1], selector); - w2[1] = __byte_perm_S (w0[1], w0[0], selector); - w2[0] = __byte_perm_S (w0[0], 0, selector); + w3[3] = hc_byte_perm_S (w1[3], w1[2], selector); + w3[2] = hc_byte_perm_S (w1[2], w1[1], selector); + w3[1] = hc_byte_perm_S (w1[1], w1[0], selector); + w3[0] = hc_byte_perm_S (w1[0], w0[3], selector); + w2[3] = hc_byte_perm_S (w0[3], w0[2], selector); + w2[2] = hc_byte_perm_S (w0[2], w0[1], selector); + w2[1] = hc_byte_perm_S (w0[1], w0[0], selector); + w2[0] = hc_byte_perm_S (w0[0], 0, selector); w1[3] = 0; w1[2] = 0; w1[1] = 0; @@ -33742,13 +33742,13 @@ DECLSPEC void switch_buffer_by_offset_be_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, break; case 9: - w3[3] = __byte_perm_S (w1[2], w1[1], selector); - w3[2] = __byte_perm_S (w1[1], w1[0], selector); - w3[1] = __byte_perm_S (w1[0], w0[3], selector); - w3[0] = __byte_perm_S (w0[3], w0[2], selector); - w2[3] = __byte_perm_S (w0[2], w0[1], selector); - w2[2] = __byte_perm_S (w0[1], w0[0], selector); - w2[1] = __byte_perm_S (w0[0], 0, selector); + w3[3] = hc_byte_perm_S (w1[2], w1[1], selector); + w3[2] = hc_byte_perm_S (w1[1], w1[0], selector); + w3[1] = hc_byte_perm_S (w1[0], w0[3], selector); + w3[0] = hc_byte_perm_S (w0[3], w0[2], selector); + w2[3] = hc_byte_perm_S (w0[2], w0[1], selector); + w2[2] = hc_byte_perm_S (w0[1], w0[0], selector); + w2[1] = hc_byte_perm_S (w0[0], 0, selector); w2[0] = 0; w1[3] = 0; w1[2] = 0; @@ -33762,12 +33762,12 @@ DECLSPEC void switch_buffer_by_offset_be_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, break; case 10: - w3[3] = __byte_perm_S (w1[1], w1[0], selector); - w3[2] = __byte_perm_S (w1[0], w0[3], selector); - w3[1] = __byte_perm_S (w0[3], w0[2], selector); - w3[0] = __byte_perm_S (w0[2], w0[1], selector); - w2[3] = __byte_perm_S (w0[1], w0[0], selector); - w2[2] = __byte_perm_S (w0[0], 0, selector); + w3[3] = hc_byte_perm_S (w1[1], w1[0], selector); + w3[2] = hc_byte_perm_S (w1[0], w0[3], selector); + w3[1] = hc_byte_perm_S (w0[3], w0[2], selector); + w3[0] = hc_byte_perm_S (w0[2], w0[1], selector); + w2[3] = hc_byte_perm_S (w0[1], w0[0], selector); + w2[2] = hc_byte_perm_S (w0[0], 0, selector); w2[1] = 0; w2[0] = 0; w1[3] = 0; @@ -33782,11 +33782,11 @@ DECLSPEC void switch_buffer_by_offset_be_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, break; case 11: - w3[3] = __byte_perm_S (w1[0], w0[3], selector); - w3[2] = __byte_perm_S (w0[3], w0[2], selector); - w3[1] = __byte_perm_S (w0[2], w0[1], selector); - w3[0] = __byte_perm_S (w0[1], w0[0], selector); - w2[3] = __byte_perm_S (w0[0], 0, selector); + w3[3] = hc_byte_perm_S (w1[0], w0[3], selector); + w3[2] = hc_byte_perm_S (w0[3], w0[2], selector); + w3[1] = hc_byte_perm_S (w0[2], w0[1], selector); + w3[0] = hc_byte_perm_S (w0[1], w0[0], selector); + w2[3] = hc_byte_perm_S (w0[0], 0, selector); w2[2] = 0; w2[1] = 0; w2[0] = 0; @@ -33802,10 +33802,10 @@ DECLSPEC void switch_buffer_by_offset_be_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, break; case 12: - w3[3] = __byte_perm_S (w0[3], w0[2], selector); - w3[2] = __byte_perm_S (w0[2], w0[1], selector); - w3[1] = __byte_perm_S (w0[1], w0[0], selector); - w3[0] = __byte_perm_S (w0[0], 0, selector); + w3[3] = hc_byte_perm_S (w0[3], w0[2], selector); + w3[2] = hc_byte_perm_S (w0[2], w0[1], selector); + w3[1] = hc_byte_perm_S (w0[1], w0[0], selector); + w3[0] = hc_byte_perm_S (w0[0], 0, selector); w2[3] = 0; w2[2] = 0; w2[1] = 0; @@ -33822,9 +33822,9 @@ DECLSPEC void switch_buffer_by_offset_be_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, break; case 13: - w3[3] = __byte_perm_S (w0[2], w0[1], selector); - w3[2] = __byte_perm_S (w0[1], w0[0], selector); - w3[1] = __byte_perm_S (w0[0], 0, selector); + w3[3] = hc_byte_perm_S (w0[2], w0[1], selector); + w3[2] = hc_byte_perm_S (w0[1], w0[0], selector); + w3[1] = hc_byte_perm_S (w0[0], 0, selector); w3[0] = 0; w2[3] = 0; w2[2] = 0; @@ -33842,8 +33842,8 @@ DECLSPEC void switch_buffer_by_offset_be_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, break; case 14: - w3[3] = __byte_perm_S (w0[1], w0[0], selector); - w3[2] = __byte_perm_S (w0[0], 0, selector); + w3[3] = hc_byte_perm_S (w0[1], w0[0], selector); + w3[2] = hc_byte_perm_S (w0[0], 0, selector); w3[1] = 0; w3[0] = 0; w2[3] = 0; @@ -33862,7 +33862,7 @@ DECLSPEC void switch_buffer_by_offset_be_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, break; case 15: - w3[3] = __byte_perm_S (w0[0], 0, selector); + w3[3] = hc_byte_perm_S (w0[0], 0, selector); w3[2] = 0; w3[1] = 0; w3[0] = 0; @@ -33892,89 +33892,89 @@ DECLSPEC void switch_buffer_by_offset_carry_be_S (u32 *w0, u32 *w1, u32 *w2, u32 switch (offset_switch) { case 0: - c0[0] = amd_bytealign_S (w3[3], 0, offset); - w3[3] = amd_bytealign_S (w3[2], w3[3], offset); - w3[2] = amd_bytealign_S (w3[1], w3[2], offset); - w3[1] = amd_bytealign_S (w3[0], w3[1], offset); - w3[0] = amd_bytealign_S (w2[3], w3[0], offset); - w2[3] = amd_bytealign_S (w2[2], w2[3], offset); - w2[2] = amd_bytealign_S (w2[1], w2[2], offset); - w2[1] = amd_bytealign_S (w2[0], w2[1], offset); - w2[0] = amd_bytealign_S (w1[3], w2[0], offset); - w1[3] = amd_bytealign_S (w1[2], w1[3], offset); - w1[2] = amd_bytealign_S (w1[1], w1[2], offset); - w1[1] = amd_bytealign_S (w1[0], w1[1], offset); - w1[0] = amd_bytealign_S (w0[3], w1[0], offset); - w0[3] = amd_bytealign_S (w0[2], w0[3], offset); - w0[2] = amd_bytealign_S (w0[1], w0[2], offset); - w0[1] = amd_bytealign_S (w0[0], w0[1], offset); - w0[0] = amd_bytealign_S ( 0, w0[0], offset); + c0[0] = hc_bytealign_S (w3[3], 0, offset); + w3[3] = hc_bytealign_S (w3[2], w3[3], offset); + w3[2] = hc_bytealign_S (w3[1], w3[2], offset); + w3[1] = hc_bytealign_S (w3[0], w3[1], offset); + w3[0] = hc_bytealign_S (w2[3], w3[0], offset); + w2[3] = hc_bytealign_S (w2[2], w2[3], offset); + w2[2] = hc_bytealign_S (w2[1], w2[2], offset); + w2[1] = hc_bytealign_S (w2[0], w2[1], offset); + w2[0] = hc_bytealign_S (w1[3], w2[0], offset); + w1[3] = hc_bytealign_S (w1[2], w1[3], offset); + w1[2] = hc_bytealign_S (w1[1], w1[2], offset); + w1[1] = hc_bytealign_S (w1[0], w1[1], offset); + w1[0] = hc_bytealign_S (w0[3], w1[0], offset); + w0[3] = hc_bytealign_S (w0[2], w0[3], offset); + w0[2] = hc_bytealign_S (w0[1], w0[2], offset); + w0[1] = hc_bytealign_S (w0[0], w0[1], offset); + w0[0] = hc_bytealign_S ( 0, w0[0], offset); break; case 1: - c0[1] = amd_bytealign_S (w3[3], 0, offset); - c0[0] = amd_bytealign_S (w3[2], w3[3], offset); - w3[3] = amd_bytealign_S (w3[1], w3[2], offset); - w3[2] = amd_bytealign_S (w3[0], w3[1], offset); - w3[1] = amd_bytealign_S (w2[3], w3[0], offset); - w3[0] = amd_bytealign_S (w2[2], w2[3], offset); - w2[3] = amd_bytealign_S (w2[1], w2[2], offset); - w2[2] = amd_bytealign_S (w2[0], w2[1], offset); - w2[1] = amd_bytealign_S (w1[3], w2[0], offset); - w2[0] = amd_bytealign_S (w1[2], w1[3], offset); - w1[3] = amd_bytealign_S (w1[1], w1[2], offset); - w1[2] = amd_bytealign_S (w1[0], w1[1], offset); - w1[1] = amd_bytealign_S (w0[3], w1[0], offset); - w1[0] = amd_bytealign_S (w0[2], w0[3], offset); - w0[3] = amd_bytealign_S (w0[1], w0[2], offset); - w0[2] = amd_bytealign_S (w0[0], w0[1], offset); - w0[1] = amd_bytealign_S ( 0, w0[0], offset); + c0[1] = hc_bytealign_S (w3[3], 0, offset); + c0[0] = hc_bytealign_S (w3[2], w3[3], offset); + w3[3] = hc_bytealign_S (w3[1], w3[2], offset); + w3[2] = hc_bytealign_S (w3[0], w3[1], offset); + w3[1] = hc_bytealign_S (w2[3], w3[0], offset); + w3[0] = hc_bytealign_S (w2[2], w2[3], offset); + w2[3] = hc_bytealign_S (w2[1], w2[2], offset); + w2[2] = hc_bytealign_S (w2[0], w2[1], offset); + w2[1] = hc_bytealign_S (w1[3], w2[0], offset); + w2[0] = hc_bytealign_S (w1[2], w1[3], offset); + w1[3] = hc_bytealign_S (w1[1], w1[2], offset); + w1[2] = hc_bytealign_S (w1[0], w1[1], offset); + w1[1] = hc_bytealign_S (w0[3], w1[0], offset); + w1[0] = hc_bytealign_S (w0[2], w0[3], offset); + w0[3] = hc_bytealign_S (w0[1], w0[2], offset); + w0[2] = hc_bytealign_S (w0[0], w0[1], offset); + w0[1] = hc_bytealign_S ( 0, w0[0], offset); w0[0] = 0; break; case 2: - c0[2] = amd_bytealign_S (w3[3], 0, offset); - c0[1] = amd_bytealign_S (w3[2], w3[3], offset); - c0[0] = amd_bytealign_S (w3[1], w3[2], offset); - w3[3] = amd_bytealign_S (w3[0], w3[1], offset); - w3[2] = amd_bytealign_S (w2[3], w3[0], offset); - w3[1] = amd_bytealign_S (w2[2], w2[3], offset); - w3[0] = amd_bytealign_S (w2[1], w2[2], offset); - w2[3] = amd_bytealign_S (w2[0], w2[1], offset); - w2[2] = amd_bytealign_S (w1[3], w2[0], offset); - w2[1] = amd_bytealign_S (w1[2], w1[3], offset); - w2[0] = amd_bytealign_S (w1[1], w1[2], offset); - w1[3] = amd_bytealign_S (w1[0], w1[1], offset); - w1[2] = amd_bytealign_S (w0[3], w1[0], offset); - w1[1] = amd_bytealign_S (w0[2], w0[3], offset); - w1[0] = amd_bytealign_S (w0[1], w0[2], offset); - w0[3] = amd_bytealign_S (w0[0], w0[1], offset); - w0[2] = amd_bytealign_S ( 0, w0[0], offset); + c0[2] = hc_bytealign_S (w3[3], 0, offset); + c0[1] = hc_bytealign_S (w3[2], w3[3], offset); + c0[0] = hc_bytealign_S (w3[1], w3[2], offset); + w3[3] = hc_bytealign_S (w3[0], w3[1], offset); + w3[2] = hc_bytealign_S (w2[3], w3[0], offset); + w3[1] = hc_bytealign_S (w2[2], w2[3], offset); + w3[0] = hc_bytealign_S (w2[1], w2[2], offset); + w2[3] = hc_bytealign_S (w2[0], w2[1], offset); + w2[2] = hc_bytealign_S (w1[3], w2[0], offset); + w2[1] = hc_bytealign_S (w1[2], w1[3], offset); + w2[0] = hc_bytealign_S (w1[1], w1[2], offset); + w1[3] = hc_bytealign_S (w1[0], w1[1], offset); + w1[2] = hc_bytealign_S (w0[3], w1[0], offset); + w1[1] = hc_bytealign_S (w0[2], w0[3], offset); + w1[0] = hc_bytealign_S (w0[1], w0[2], offset); + w0[3] = hc_bytealign_S (w0[0], w0[1], offset); + w0[2] = hc_bytealign_S ( 0, w0[0], offset); w0[1] = 0; w0[0] = 0; break; case 3: - c0[3] = amd_bytealign_S (w3[3], 0, offset); - c0[2] = amd_bytealign_S (w3[2], w3[3], offset); - c0[1] = amd_bytealign_S (w3[1], w3[2], offset); - c0[0] = amd_bytealign_S (w3[0], w3[1], offset); - w3[3] = amd_bytealign_S (w2[3], w3[0], offset); - w3[2] = amd_bytealign_S (w2[2], w2[3], offset); - w3[1] = amd_bytealign_S (w2[1], w2[2], offset); - w3[0] = amd_bytealign_S (w2[0], w2[1], offset); - w2[3] = amd_bytealign_S (w1[3], w2[0], offset); - w2[2] = amd_bytealign_S (w1[2], w1[3], offset); - w2[1] = amd_bytealign_S (w1[1], w1[2], offset); - w2[0] = amd_bytealign_S (w1[0], w1[1], offset); - w1[3] = amd_bytealign_S (w0[3], w1[0], offset); - w1[2] = amd_bytealign_S (w0[2], w0[3], offset); - w1[1] = amd_bytealign_S (w0[1], w0[2], offset); - w1[0] = amd_bytealign_S (w0[0], w0[1], offset); - w0[3] = amd_bytealign_S ( 0, w0[0], offset); + c0[3] = hc_bytealign_S (w3[3], 0, offset); + c0[2] = hc_bytealign_S (w3[2], w3[3], offset); + c0[1] = hc_bytealign_S (w3[1], w3[2], offset); + c0[0] = hc_bytealign_S (w3[0], w3[1], offset); + w3[3] = hc_bytealign_S (w2[3], w3[0], offset); + w3[2] = hc_bytealign_S (w2[2], w2[3], offset); + w3[1] = hc_bytealign_S (w2[1], w2[2], offset); + w3[0] = hc_bytealign_S (w2[0], w2[1], offset); + w2[3] = hc_bytealign_S (w1[3], w2[0], offset); + w2[2] = hc_bytealign_S (w1[2], w1[3], offset); + w2[1] = hc_bytealign_S (w1[1], w1[2], offset); + w2[0] = hc_bytealign_S (w1[0], w1[1], offset); + w1[3] = hc_bytealign_S (w0[3], w1[0], offset); + w1[2] = hc_bytealign_S (w0[2], w0[3], offset); + w1[1] = hc_bytealign_S (w0[1], w0[2], offset); + w1[0] = hc_bytealign_S (w0[0], w0[1], offset); + w0[3] = hc_bytealign_S ( 0, w0[0], offset); w0[2] = 0; w0[1] = 0; w0[0] = 0; @@ -33982,23 +33982,23 @@ DECLSPEC void switch_buffer_by_offset_carry_be_S (u32 *w0, u32 *w1, u32 *w2, u32 break; case 4: - c1[0] = amd_bytealign_S (w3[3], 0, offset); - c0[3] = amd_bytealign_S (w3[2], w3[3], offset); - c0[2] = amd_bytealign_S (w3[1], w3[2], offset); - c0[1] = amd_bytealign_S (w3[0], w3[1], offset); - c0[0] = amd_bytealign_S (w2[3], w3[0], offset); - w3[3] = amd_bytealign_S (w2[2], w2[3], offset); - w3[2] = amd_bytealign_S (w2[1], w2[2], offset); - w3[1] = amd_bytealign_S (w2[0], w2[1], offset); - w3[0] = amd_bytealign_S (w1[3], w2[0], offset); - w2[3] = amd_bytealign_S (w1[2], w1[3], offset); - w2[2] = amd_bytealign_S (w1[1], w1[2], offset); - w2[1] = amd_bytealign_S (w1[0], w1[1], offset); - w2[0] = amd_bytealign_S (w0[3], w1[0], offset); - w1[3] = amd_bytealign_S (w0[2], w0[3], offset); - w1[2] = amd_bytealign_S (w0[1], w0[2], offset); - w1[1] = amd_bytealign_S (w0[0], w0[1], offset); - w1[0] = amd_bytealign_S ( 0, w0[0], offset); + c1[0] = hc_bytealign_S (w3[3], 0, offset); + c0[3] = hc_bytealign_S (w3[2], w3[3], offset); + c0[2] = hc_bytealign_S (w3[1], w3[2], offset); + c0[1] = hc_bytealign_S (w3[0], w3[1], offset); + c0[0] = hc_bytealign_S (w2[3], w3[0], offset); + w3[3] = hc_bytealign_S (w2[2], w2[3], offset); + w3[2] = hc_bytealign_S (w2[1], w2[2], offset); + w3[1] = hc_bytealign_S (w2[0], w2[1], offset); + w3[0] = hc_bytealign_S (w1[3], w2[0], offset); + w2[3] = hc_bytealign_S (w1[2], w1[3], offset); + w2[2] = hc_bytealign_S (w1[1], w1[2], offset); + w2[1] = hc_bytealign_S (w1[0], w1[1], offset); + w2[0] = hc_bytealign_S (w0[3], w1[0], offset); + w1[3] = hc_bytealign_S (w0[2], w0[3], offset); + w1[2] = hc_bytealign_S (w0[1], w0[2], offset); + w1[1] = hc_bytealign_S (w0[0], w0[1], offset); + w1[0] = hc_bytealign_S ( 0, w0[0], offset); w0[3] = 0; w0[2] = 0; w0[1] = 0; @@ -34007,23 +34007,23 @@ DECLSPEC void switch_buffer_by_offset_carry_be_S (u32 *w0, u32 *w1, u32 *w2, u32 break; case 5: - c1[1] = amd_bytealign_S (w3[3], 0, offset); - c1[0] = amd_bytealign_S (w3[2], w3[3], offset); - c0[3] = amd_bytealign_S (w3[1], w3[2], offset); - c0[2] = amd_bytealign_S (w3[0], w3[1], offset); - c0[1] = amd_bytealign_S (w2[3], w3[0], offset); - c0[0] = amd_bytealign_S (w2[2], w2[3], offset); - w3[3] = amd_bytealign_S (w2[1], w2[2], offset); - w3[2] = amd_bytealign_S (w2[0], w2[1], offset); - w3[1] = amd_bytealign_S (w1[3], w2[0], offset); - w3[0] = amd_bytealign_S (w1[2], w1[3], offset); - w2[3] = amd_bytealign_S (w1[1], w1[2], offset); - w2[2] = amd_bytealign_S (w1[0], w1[1], offset); - w2[1] = amd_bytealign_S (w0[3], w1[0], offset); - w2[0] = amd_bytealign_S (w0[2], w0[3], offset); - w1[3] = amd_bytealign_S (w0[1], w0[2], offset); - w1[2] = amd_bytealign_S (w0[0], w0[1], offset); - w1[1] = amd_bytealign_S ( 0, w0[0], offset); + c1[1] = hc_bytealign_S (w3[3], 0, offset); + c1[0] = hc_bytealign_S (w3[2], w3[3], offset); + c0[3] = hc_bytealign_S (w3[1], w3[2], offset); + c0[2] = hc_bytealign_S (w3[0], w3[1], offset); + c0[1] = hc_bytealign_S (w2[3], w3[0], offset); + c0[0] = hc_bytealign_S (w2[2], w2[3], offset); + w3[3] = hc_bytealign_S (w2[1], w2[2], offset); + w3[2] = hc_bytealign_S (w2[0], w2[1], offset); + w3[1] = hc_bytealign_S (w1[3], w2[0], offset); + w3[0] = hc_bytealign_S (w1[2], w1[3], offset); + w2[3] = hc_bytealign_S (w1[1], w1[2], offset); + w2[2] = hc_bytealign_S (w1[0], w1[1], offset); + w2[1] = hc_bytealign_S (w0[3], w1[0], offset); + w2[0] = hc_bytealign_S (w0[2], w0[3], offset); + w1[3] = hc_bytealign_S (w0[1], w0[2], offset); + w1[2] = hc_bytealign_S (w0[0], w0[1], offset); + w1[1] = hc_bytealign_S ( 0, w0[0], offset); w1[0] = 0; w0[3] = 0; w0[2] = 0; @@ -34033,23 +34033,23 @@ DECLSPEC void switch_buffer_by_offset_carry_be_S (u32 *w0, u32 *w1, u32 *w2, u32 break; case 6: - c1[2] = amd_bytealign_S (w3[3], 0, offset); - c1[1] = amd_bytealign_S (w3[2], w3[3], offset); - c1[0] = amd_bytealign_S (w3[1], w3[2], offset); - c0[3] = amd_bytealign_S (w3[0], w3[1], offset); - c0[2] = amd_bytealign_S (w2[3], w3[0], offset); - c0[1] = amd_bytealign_S (w2[2], w2[3], offset); - c0[0] = amd_bytealign_S (w2[1], w2[2], offset); - w3[3] = amd_bytealign_S (w2[0], w2[1], offset); - w3[2] = amd_bytealign_S (w1[3], w2[0], offset); - w3[1] = amd_bytealign_S (w1[2], w1[3], offset); - w3[0] = amd_bytealign_S (w1[1], w1[2], offset); - w2[3] = amd_bytealign_S (w1[0], w1[1], offset); - w2[2] = amd_bytealign_S (w0[3], w1[0], offset); - w2[1] = amd_bytealign_S (w0[2], w0[3], offset); - w2[0] = amd_bytealign_S (w0[1], w0[2], offset); - w1[3] = amd_bytealign_S (w0[0], w0[1], offset); - w1[2] = amd_bytealign_S ( 0, w0[0], offset); + c1[2] = hc_bytealign_S (w3[3], 0, offset); + c1[1] = hc_bytealign_S (w3[2], w3[3], offset); + c1[0] = hc_bytealign_S (w3[1], w3[2], offset); + c0[3] = hc_bytealign_S (w3[0], w3[1], offset); + c0[2] = hc_bytealign_S (w2[3], w3[0], offset); + c0[1] = hc_bytealign_S (w2[2], w2[3], offset); + c0[0] = hc_bytealign_S (w2[1], w2[2], offset); + w3[3] = hc_bytealign_S (w2[0], w2[1], offset); + w3[2] = hc_bytealign_S (w1[3], w2[0], offset); + w3[1] = hc_bytealign_S (w1[2], w1[3], offset); + w3[0] = hc_bytealign_S (w1[1], w1[2], offset); + w2[3] = hc_bytealign_S (w1[0], w1[1], offset); + w2[2] = hc_bytealign_S (w0[3], w1[0], offset); + w2[1] = hc_bytealign_S (w0[2], w0[3], offset); + w2[0] = hc_bytealign_S (w0[1], w0[2], offset); + w1[3] = hc_bytealign_S (w0[0], w0[1], offset); + w1[2] = hc_bytealign_S ( 0, w0[0], offset); w1[1] = 0; w1[0] = 0; w0[3] = 0; @@ -34060,23 +34060,23 @@ DECLSPEC void switch_buffer_by_offset_carry_be_S (u32 *w0, u32 *w1, u32 *w2, u32 break; case 7: - c1[3] = amd_bytealign_S (w3[3], 0, offset); - c1[2] = amd_bytealign_S (w3[2], w3[3], offset); - c1[1] = amd_bytealign_S (w3[1], w3[2], offset); - c1[0] = amd_bytealign_S (w3[0], w3[1], offset); - c0[3] = amd_bytealign_S (w2[3], w3[0], offset); - c0[2] = amd_bytealign_S (w2[2], w2[3], offset); - c0[1] = amd_bytealign_S (w2[1], w2[2], offset); - c0[0] = amd_bytealign_S (w2[0], w2[1], offset); - w3[3] = amd_bytealign_S (w1[3], w2[0], offset); - w3[2] = amd_bytealign_S (w1[2], w1[3], offset); - w3[1] = amd_bytealign_S (w1[1], w1[2], offset); - w3[0] = amd_bytealign_S (w1[0], w1[1], offset); - w2[3] = amd_bytealign_S (w0[3], w1[0], offset); - w2[2] = amd_bytealign_S (w0[2], w0[3], offset); - w2[1] = amd_bytealign_S (w0[1], w0[2], offset); - w2[0] = amd_bytealign_S (w0[0], w0[1], offset); - w1[3] = amd_bytealign_S ( 0, w0[0], offset); + c1[3] = hc_bytealign_S (w3[3], 0, offset); + c1[2] = hc_bytealign_S (w3[2], w3[3], offset); + c1[1] = hc_bytealign_S (w3[1], w3[2], offset); + c1[0] = hc_bytealign_S (w3[0], w3[1], offset); + c0[3] = hc_bytealign_S (w2[3], w3[0], offset); + c0[2] = hc_bytealign_S (w2[2], w2[3], offset); + c0[1] = hc_bytealign_S (w2[1], w2[2], offset); + c0[0] = hc_bytealign_S (w2[0], w2[1], offset); + w3[3] = hc_bytealign_S (w1[3], w2[0], offset); + w3[2] = hc_bytealign_S (w1[2], w1[3], offset); + w3[1] = hc_bytealign_S (w1[1], w1[2], offset); + w3[0] = hc_bytealign_S (w1[0], w1[1], offset); + w2[3] = hc_bytealign_S (w0[3], w1[0], offset); + w2[2] = hc_bytealign_S (w0[2], w0[3], offset); + w2[1] = hc_bytealign_S (w0[1], w0[2], offset); + w2[0] = hc_bytealign_S (w0[0], w0[1], offset); + w1[3] = hc_bytealign_S ( 0, w0[0], offset); w1[2] = 0; w1[1] = 0; w1[0] = 0; @@ -34088,23 +34088,23 @@ DECLSPEC void switch_buffer_by_offset_carry_be_S (u32 *w0, u32 *w1, u32 *w2, u32 break; case 8: - c2[0] = amd_bytealign_S (w3[3], 0, offset); - c1[3] = amd_bytealign_S (w3[2], w3[3], offset); - c1[2] = amd_bytealign_S (w3[1], w3[2], offset); - c1[1] = amd_bytealign_S (w3[0], w3[1], offset); - c1[0] = amd_bytealign_S (w2[3], w3[0], offset); - c0[3] = amd_bytealign_S (w2[2], w2[3], offset); - c0[2] = amd_bytealign_S (w2[1], w2[2], offset); - c0[1] = amd_bytealign_S (w2[0], w2[1], offset); - c0[0] = amd_bytealign_S (w1[3], w2[0], offset); - w3[3] = amd_bytealign_S (w1[2], w1[3], offset); - w3[2] = amd_bytealign_S (w1[1], w1[2], offset); - w3[1] = amd_bytealign_S (w1[0], w1[1], offset); - w3[0] = amd_bytealign_S (w0[3], w1[0], offset); - w2[3] = amd_bytealign_S (w0[2], w0[3], offset); - w2[2] = amd_bytealign_S (w0[1], w0[2], offset); - w2[1] = amd_bytealign_S (w0[0], w0[1], offset); - w2[0] = amd_bytealign_S ( 0, w0[0], offset); + c2[0] = hc_bytealign_S (w3[3], 0, offset); + c1[3] = hc_bytealign_S (w3[2], w3[3], offset); + c1[2] = hc_bytealign_S (w3[1], w3[2], offset); + c1[1] = hc_bytealign_S (w3[0], w3[1], offset); + c1[0] = hc_bytealign_S (w2[3], w3[0], offset); + c0[3] = hc_bytealign_S (w2[2], w2[3], offset); + c0[2] = hc_bytealign_S (w2[1], w2[2], offset); + c0[1] = hc_bytealign_S (w2[0], w2[1], offset); + c0[0] = hc_bytealign_S (w1[3], w2[0], offset); + w3[3] = hc_bytealign_S (w1[2], w1[3], offset); + w3[2] = hc_bytealign_S (w1[1], w1[2], offset); + w3[1] = hc_bytealign_S (w1[0], w1[1], offset); + w3[0] = hc_bytealign_S (w0[3], w1[0], offset); + w2[3] = hc_bytealign_S (w0[2], w0[3], offset); + w2[2] = hc_bytealign_S (w0[1], w0[2], offset); + w2[1] = hc_bytealign_S (w0[0], w0[1], offset); + w2[0] = hc_bytealign_S ( 0, w0[0], offset); w1[3] = 0; w1[2] = 0; w1[1] = 0; @@ -34117,23 +34117,23 @@ DECLSPEC void switch_buffer_by_offset_carry_be_S (u32 *w0, u32 *w1, u32 *w2, u32 break; case 9: - c2[1] = amd_bytealign_S (w3[3], 0, offset); - c2[0] = amd_bytealign_S (w3[2], w3[3], offset); - c1[3] = amd_bytealign_S (w3[1], w3[2], offset); - c1[2] = amd_bytealign_S (w3[0], w3[1], offset); - c1[1] = amd_bytealign_S (w2[3], w3[0], offset); - c1[0] = amd_bytealign_S (w2[2], w2[3], offset); - c0[3] = amd_bytealign_S (w2[1], w2[2], offset); - c0[2] = amd_bytealign_S (w2[0], w2[1], offset); - c0[1] = amd_bytealign_S (w1[3], w2[0], offset); - c0[0] = amd_bytealign_S (w1[2], w1[3], offset); - w3[3] = amd_bytealign_S (w1[1], w1[2], offset); - w3[2] = amd_bytealign_S (w1[0], w1[1], offset); - w3[1] = amd_bytealign_S (w0[3], w1[0], offset); - w3[0] = amd_bytealign_S (w0[2], w0[3], offset); - w2[3] = amd_bytealign_S (w0[1], w0[2], offset); - w2[2] = amd_bytealign_S (w0[0], w0[1], offset); - w2[1] = amd_bytealign_S ( 0, w0[0], offset); + c2[1] = hc_bytealign_S (w3[3], 0, offset); + c2[0] = hc_bytealign_S (w3[2], w3[3], offset); + c1[3] = hc_bytealign_S (w3[1], w3[2], offset); + c1[2] = hc_bytealign_S (w3[0], w3[1], offset); + c1[1] = hc_bytealign_S (w2[3], w3[0], offset); + c1[0] = hc_bytealign_S (w2[2], w2[3], offset); + c0[3] = hc_bytealign_S (w2[1], w2[2], offset); + c0[2] = hc_bytealign_S (w2[0], w2[1], offset); + c0[1] = hc_bytealign_S (w1[3], w2[0], offset); + c0[0] = hc_bytealign_S (w1[2], w1[3], offset); + w3[3] = hc_bytealign_S (w1[1], w1[2], offset); + w3[2] = hc_bytealign_S (w1[0], w1[1], offset); + w3[1] = hc_bytealign_S (w0[3], w1[0], offset); + w3[0] = hc_bytealign_S (w0[2], w0[3], offset); + w2[3] = hc_bytealign_S (w0[1], w0[2], offset); + w2[2] = hc_bytealign_S (w0[0], w0[1], offset); + w2[1] = hc_bytealign_S ( 0, w0[0], offset); w2[0] = 0; w1[3] = 0; w1[2] = 0; @@ -34147,23 +34147,23 @@ DECLSPEC void switch_buffer_by_offset_carry_be_S (u32 *w0, u32 *w1, u32 *w2, u32 break; case 10: - c2[2] = amd_bytealign_S (w3[3], 0, offset); - c2[1] = amd_bytealign_S (w3[2], w3[3], offset); - c2[0] = amd_bytealign_S (w3[1], w3[2], offset); - c1[3] = amd_bytealign_S (w3[0], w3[1], offset); - c1[2] = amd_bytealign_S (w2[3], w3[0], offset); - c1[1] = amd_bytealign_S (w2[2], w2[3], offset); - c1[0] = amd_bytealign_S (w2[1], w2[2], offset); - c0[3] = amd_bytealign_S (w2[0], w2[1], offset); - c0[2] = amd_bytealign_S (w1[3], w2[0], offset); - c0[1] = amd_bytealign_S (w1[2], w1[3], offset); - c0[0] = amd_bytealign_S (w1[1], w1[2], offset); - w3[3] = amd_bytealign_S (w1[0], w1[1], offset); - w3[2] = amd_bytealign_S (w0[3], w1[0], offset); - w3[1] = amd_bytealign_S (w0[2], w0[3], offset); - w3[0] = amd_bytealign_S (w0[1], w0[2], offset); - w2[3] = amd_bytealign_S (w0[0], w0[1], offset); - w2[2] = amd_bytealign_S ( 0, w0[0], offset); + c2[2] = hc_bytealign_S (w3[3], 0, offset); + c2[1] = hc_bytealign_S (w3[2], w3[3], offset); + c2[0] = hc_bytealign_S (w3[1], w3[2], offset); + c1[3] = hc_bytealign_S (w3[0], w3[1], offset); + c1[2] = hc_bytealign_S (w2[3], w3[0], offset); + c1[1] = hc_bytealign_S (w2[2], w2[3], offset); + c1[0] = hc_bytealign_S (w2[1], w2[2], offset); + c0[3] = hc_bytealign_S (w2[0], w2[1], offset); + c0[2] = hc_bytealign_S (w1[3], w2[0], offset); + c0[1] = hc_bytealign_S (w1[2], w1[3], offset); + c0[0] = hc_bytealign_S (w1[1], w1[2], offset); + w3[3] = hc_bytealign_S (w1[0], w1[1], offset); + w3[2] = hc_bytealign_S (w0[3], w1[0], offset); + w3[1] = hc_bytealign_S (w0[2], w0[3], offset); + w3[0] = hc_bytealign_S (w0[1], w0[2], offset); + w2[3] = hc_bytealign_S (w0[0], w0[1], offset); + w2[2] = hc_bytealign_S ( 0, w0[0], offset); w2[1] = 0; w2[0] = 0; w1[3] = 0; @@ -34178,23 +34178,23 @@ DECLSPEC void switch_buffer_by_offset_carry_be_S (u32 *w0, u32 *w1, u32 *w2, u32 break; case 11: - c2[3] = amd_bytealign_S (w3[3], 0, offset); - c2[2] = amd_bytealign_S (w3[2], w3[3], offset); - c2[1] = amd_bytealign_S (w3[1], w3[2], offset); - c2[0] = amd_bytealign_S (w3[0], w3[1], offset); - c1[3] = amd_bytealign_S (w2[3], w3[0], offset); - c1[2] = amd_bytealign_S (w2[2], w2[3], offset); - c1[1] = amd_bytealign_S (w2[1], w2[2], offset); - c1[0] = amd_bytealign_S (w2[0], w2[1], offset); - c0[3] = amd_bytealign_S (w1[3], w2[0], offset); - c0[2] = amd_bytealign_S (w1[2], w1[3], offset); - c0[1] = amd_bytealign_S (w1[1], w1[2], offset); - c0[0] = amd_bytealign_S (w1[0], w1[1], offset); - w3[3] = amd_bytealign_S (w0[3], w1[0], offset); - w3[2] = amd_bytealign_S (w0[2], w0[3], offset); - w3[1] = amd_bytealign_S (w0[1], w0[2], offset); - w3[0] = amd_bytealign_S (w0[0], w0[1], offset); - w2[3] = amd_bytealign_S ( 0, w0[0], offset); + c2[3] = hc_bytealign_S (w3[3], 0, offset); + c2[2] = hc_bytealign_S (w3[2], w3[3], offset); + c2[1] = hc_bytealign_S (w3[1], w3[2], offset); + c2[0] = hc_bytealign_S (w3[0], w3[1], offset); + c1[3] = hc_bytealign_S (w2[3], w3[0], offset); + c1[2] = hc_bytealign_S (w2[2], w2[3], offset); + c1[1] = hc_bytealign_S (w2[1], w2[2], offset); + c1[0] = hc_bytealign_S (w2[0], w2[1], offset); + c0[3] = hc_bytealign_S (w1[3], w2[0], offset); + c0[2] = hc_bytealign_S (w1[2], w1[3], offset); + c0[1] = hc_bytealign_S (w1[1], w1[2], offset); + c0[0] = hc_bytealign_S (w1[0], w1[1], offset); + w3[3] = hc_bytealign_S (w0[3], w1[0], offset); + w3[2] = hc_bytealign_S (w0[2], w0[3], offset); + w3[1] = hc_bytealign_S (w0[1], w0[2], offset); + w3[0] = hc_bytealign_S (w0[0], w0[1], offset); + w2[3] = hc_bytealign_S ( 0, w0[0], offset); w2[2] = 0; w2[1] = 0; w2[0] = 0; @@ -34210,23 +34210,23 @@ DECLSPEC void switch_buffer_by_offset_carry_be_S (u32 *w0, u32 *w1, u32 *w2, u32 break; case 12: - c3[0] = amd_bytealign_S (w3[3], 0, offset); - c2[3] = amd_bytealign_S (w3[2], w3[3], offset); - c2[2] = amd_bytealign_S (w3[1], w3[2], offset); - c2[1] = amd_bytealign_S (w3[0], w3[1], offset); - c2[0] = amd_bytealign_S (w2[3], w3[0], offset); - c1[3] = amd_bytealign_S (w2[2], w2[3], offset); - c1[2] = amd_bytealign_S (w2[1], w2[2], offset); - c1[1] = amd_bytealign_S (w2[0], w2[1], offset); - c1[0] = amd_bytealign_S (w1[3], w2[0], offset); - c0[3] = amd_bytealign_S (w1[2], w1[3], offset); - c0[2] = amd_bytealign_S (w1[1], w1[2], offset); - c0[1] = amd_bytealign_S (w1[0], w1[1], offset); - c0[0] = amd_bytealign_S (w0[3], w1[0], offset); - w3[3] = amd_bytealign_S (w0[2], w0[3], offset); - w3[2] = amd_bytealign_S (w0[1], w0[2], offset); - w3[1] = amd_bytealign_S (w0[0], w0[1], offset); - w3[0] = amd_bytealign_S ( 0, w0[0], offset); + c3[0] = hc_bytealign_S (w3[3], 0, offset); + c2[3] = hc_bytealign_S (w3[2], w3[3], offset); + c2[2] = hc_bytealign_S (w3[1], w3[2], offset); + c2[1] = hc_bytealign_S (w3[0], w3[1], offset); + c2[0] = hc_bytealign_S (w2[3], w3[0], offset); + c1[3] = hc_bytealign_S (w2[2], w2[3], offset); + c1[2] = hc_bytealign_S (w2[1], w2[2], offset); + c1[1] = hc_bytealign_S (w2[0], w2[1], offset); + c1[0] = hc_bytealign_S (w1[3], w2[0], offset); + c0[3] = hc_bytealign_S (w1[2], w1[3], offset); + c0[2] = hc_bytealign_S (w1[1], w1[2], offset); + c0[1] = hc_bytealign_S (w1[0], w1[1], offset); + c0[0] = hc_bytealign_S (w0[3], w1[0], offset); + w3[3] = hc_bytealign_S (w0[2], w0[3], offset); + w3[2] = hc_bytealign_S (w0[1], w0[2], offset); + w3[1] = hc_bytealign_S (w0[0], w0[1], offset); + w3[0] = hc_bytealign_S ( 0, w0[0], offset); w2[3] = 0; w2[2] = 0; w2[1] = 0; @@ -34243,23 +34243,23 @@ DECLSPEC void switch_buffer_by_offset_carry_be_S (u32 *w0, u32 *w1, u32 *w2, u32 break; case 13: - c3[1] = amd_bytealign_S (w3[3], 0, offset); - c3[0] = amd_bytealign_S (w3[2], w3[3], offset); - c2[3] = amd_bytealign_S (w3[1], w3[2], offset); - c2[2] = amd_bytealign_S (w3[0], w3[1], offset); - c2[1] = amd_bytealign_S (w2[3], w3[0], offset); - c2[0] = amd_bytealign_S (w2[2], w2[3], offset); - c1[3] = amd_bytealign_S (w2[1], w2[2], offset); - c1[2] = amd_bytealign_S (w2[0], w2[1], offset); - c1[1] = amd_bytealign_S (w1[3], w2[0], offset); - c1[0] = amd_bytealign_S (w1[2], w1[3], offset); - c0[3] = amd_bytealign_S (w1[1], w1[2], offset); - c0[2] = amd_bytealign_S (w1[0], w1[1], offset); - c0[1] = amd_bytealign_S (w0[3], w1[0], offset); - c0[0] = amd_bytealign_S (w0[2], w0[3], offset); - w3[3] = amd_bytealign_S (w0[1], w0[2], offset); - w3[2] = amd_bytealign_S (w0[0], w0[1], offset); - w3[1] = amd_bytealign_S ( 0, w0[0], offset); + c3[1] = hc_bytealign_S (w3[3], 0, offset); + c3[0] = hc_bytealign_S (w3[2], w3[3], offset); + c2[3] = hc_bytealign_S (w3[1], w3[2], offset); + c2[2] = hc_bytealign_S (w3[0], w3[1], offset); + c2[1] = hc_bytealign_S (w2[3], w3[0], offset); + c2[0] = hc_bytealign_S (w2[2], w2[3], offset); + c1[3] = hc_bytealign_S (w2[1], w2[2], offset); + c1[2] = hc_bytealign_S (w2[0], w2[1], offset); + c1[1] = hc_bytealign_S (w1[3], w2[0], offset); + c1[0] = hc_bytealign_S (w1[2], w1[3], offset); + c0[3] = hc_bytealign_S (w1[1], w1[2], offset); + c0[2] = hc_bytealign_S (w1[0], w1[1], offset); + c0[1] = hc_bytealign_S (w0[3], w1[0], offset); + c0[0] = hc_bytealign_S (w0[2], w0[3], offset); + w3[3] = hc_bytealign_S (w0[1], w0[2], offset); + w3[2] = hc_bytealign_S (w0[0], w0[1], offset); + w3[1] = hc_bytealign_S ( 0, w0[0], offset); w3[0] = 0; w2[3] = 0; w2[2] = 0; @@ -34277,23 +34277,23 @@ DECLSPEC void switch_buffer_by_offset_carry_be_S (u32 *w0, u32 *w1, u32 *w2, u32 break; case 14: - c3[2] = amd_bytealign_S (w3[3], 0, offset); - c3[1] = amd_bytealign_S (w3[2], w3[3], offset); - c3[0] = amd_bytealign_S (w3[1], w3[2], offset); - c2[3] = amd_bytealign_S (w3[0], w3[1], offset); - c2[2] = amd_bytealign_S (w2[3], w3[0], offset); - c2[1] = amd_bytealign_S (w2[2], w2[3], offset); - c2[0] = amd_bytealign_S (w2[1], w2[2], offset); - c1[3] = amd_bytealign_S (w2[0], w2[1], offset); - c1[2] = amd_bytealign_S (w1[3], w2[0], offset); - c1[1] = amd_bytealign_S (w1[2], w1[3], offset); - c1[0] = amd_bytealign_S (w1[1], w1[2], offset); - c0[3] = amd_bytealign_S (w1[0], w1[1], offset); - c0[2] = amd_bytealign_S (w0[3], w1[0], offset); - c0[1] = amd_bytealign_S (w0[2], w0[3], offset); - c0[0] = amd_bytealign_S (w0[1], w0[2], offset); - w3[3] = amd_bytealign_S (w0[0], w0[1], offset); - w3[2] = amd_bytealign_S ( 0, w0[0], offset); + c3[2] = hc_bytealign_S (w3[3], 0, offset); + c3[1] = hc_bytealign_S (w3[2], w3[3], offset); + c3[0] = hc_bytealign_S (w3[1], w3[2], offset); + c2[3] = hc_bytealign_S (w3[0], w3[1], offset); + c2[2] = hc_bytealign_S (w2[3], w3[0], offset); + c2[1] = hc_bytealign_S (w2[2], w2[3], offset); + c2[0] = hc_bytealign_S (w2[1], w2[2], offset); + c1[3] = hc_bytealign_S (w2[0], w2[1], offset); + c1[2] = hc_bytealign_S (w1[3], w2[0], offset); + c1[1] = hc_bytealign_S (w1[2], w1[3], offset); + c1[0] = hc_bytealign_S (w1[1], w1[2], offset); + c0[3] = hc_bytealign_S (w1[0], w1[1], offset); + c0[2] = hc_bytealign_S (w0[3], w1[0], offset); + c0[1] = hc_bytealign_S (w0[2], w0[3], offset); + c0[0] = hc_bytealign_S (w0[1], w0[2], offset); + w3[3] = hc_bytealign_S (w0[0], w0[1], offset); + w3[2] = hc_bytealign_S ( 0, w0[0], offset); w3[1] = 0; w3[0] = 0; w2[3] = 0; @@ -34312,23 +34312,23 @@ DECLSPEC void switch_buffer_by_offset_carry_be_S (u32 *w0, u32 *w1, u32 *w2, u32 break; case 15: - c3[3] = amd_bytealign_S (w3[3], 0, offset); - c3[2] = amd_bytealign_S (w3[2], w3[3], offset); - c3[1] = amd_bytealign_S (w3[1], w3[2], offset); - c3[0] = amd_bytealign_S (w3[0], w3[1], offset); - c2[3] = amd_bytealign_S (w2[3], w3[0], offset); - c2[2] = amd_bytealign_S (w2[2], w2[3], offset); - c2[1] = amd_bytealign_S (w2[1], w2[2], offset); - c2[0] = amd_bytealign_S (w2[0], w2[1], offset); - c1[3] = amd_bytealign_S (w1[3], w2[0], offset); - c1[2] = amd_bytealign_S (w1[2], w1[3], offset); - c1[1] = amd_bytealign_S (w1[1], w1[2], offset); - c1[0] = amd_bytealign_S (w1[0], w1[1], offset); - c0[3] = amd_bytealign_S (w0[3], w1[0], offset); - c0[2] = amd_bytealign_S (w0[2], w0[3], offset); - c0[1] = amd_bytealign_S (w0[1], w0[2], offset); - c0[0] = amd_bytealign_S (w0[0], w0[1], offset); - w3[3] = amd_bytealign_S ( 0, w0[0], offset); + c3[3] = hc_bytealign_S (w3[3], 0, offset); + c3[2] = hc_bytealign_S (w3[2], w3[3], offset); + c3[1] = hc_bytealign_S (w3[1], w3[2], offset); + c3[0] = hc_bytealign_S (w3[0], w3[1], offset); + c2[3] = hc_bytealign_S (w2[3], w3[0], offset); + c2[2] = hc_bytealign_S (w2[2], w2[3], offset); + c2[1] = hc_bytealign_S (w2[1], w2[2], offset); + c2[0] = hc_bytealign_S (w2[0], w2[1], offset); + c1[3] = hc_bytealign_S (w1[3], w2[0], offset); + c1[2] = hc_bytealign_S (w1[2], w1[3], offset); + c1[1] = hc_bytealign_S (w1[1], w1[2], offset); + c1[0] = hc_bytealign_S (w1[0], w1[1], offset); + c0[3] = hc_bytealign_S (w0[3], w1[0], offset); + c0[2] = hc_bytealign_S (w0[2], w0[3], offset); + c0[1] = hc_bytealign_S (w0[1], w0[2], offset); + c0[0] = hc_bytealign_S (w0[0], w0[1], offset); + w3[3] = hc_bytealign_S ( 0, w0[0], offset); w3[2] = 0; w3[1] = 0; w3[0] = 0; @@ -34362,89 +34362,89 @@ DECLSPEC void switch_buffer_by_offset_carry_be_S (u32 *w0, u32 *w1, u32 *w2, u32 switch (offset_switch) { case 0: - c0[0] = __byte_perm_S ( 0, w3[3], selector); - w3[3] = __byte_perm_S (w3[3], w3[2], selector); - w3[2] = __byte_perm_S (w3[2], w3[1], selector); - w3[1] = __byte_perm_S (w3[1], w3[0], selector); - w3[0] = __byte_perm_S (w3[0], w2[3], selector); - w2[3] = __byte_perm_S (w2[3], w2[2], selector); - w2[2] = __byte_perm_S (w2[2], w2[1], selector); - w2[1] = __byte_perm_S (w2[1], w2[0], selector); - w2[0] = __byte_perm_S (w2[0], w1[3], selector); - w1[3] = __byte_perm_S (w1[3], w1[2], selector); - w1[2] = __byte_perm_S (w1[2], w1[1], selector); - w1[1] = __byte_perm_S (w1[1], w1[0], selector); - w1[0] = __byte_perm_S (w1[0], w0[3], selector); - w0[3] = __byte_perm_S (w0[3], w0[2], selector); - w0[2] = __byte_perm_S (w0[2], w0[1], selector); - w0[1] = __byte_perm_S (w0[1], w0[0], selector); - w0[0] = __byte_perm_S (w0[0], 0, selector); + c0[0] = hc_byte_perm_S ( 0, w3[3], selector); + w3[3] = hc_byte_perm_S (w3[3], w3[2], selector); + w3[2] = hc_byte_perm_S (w3[2], w3[1], selector); + w3[1] = hc_byte_perm_S (w3[1], w3[0], selector); + w3[0] = hc_byte_perm_S (w3[0], w2[3], selector); + w2[3] = hc_byte_perm_S (w2[3], w2[2], selector); + w2[2] = hc_byte_perm_S (w2[2], w2[1], selector); + w2[1] = hc_byte_perm_S (w2[1], w2[0], selector); + w2[0] = hc_byte_perm_S (w2[0], w1[3], selector); + w1[3] = hc_byte_perm_S (w1[3], w1[2], selector); + w1[2] = hc_byte_perm_S (w1[2], w1[1], selector); + w1[1] = hc_byte_perm_S (w1[1], w1[0], selector); + w1[0] = hc_byte_perm_S (w1[0], w0[3], selector); + w0[3] = hc_byte_perm_S (w0[3], w0[2], selector); + w0[2] = hc_byte_perm_S (w0[2], w0[1], selector); + w0[1] = hc_byte_perm_S (w0[1], w0[0], selector); + w0[0] = hc_byte_perm_S (w0[0], 0, selector); break; case 1: - c0[1] = __byte_perm_S ( 0, w3[3], selector); - c0[0] = __byte_perm_S (w3[3], w3[2], selector); - w3[3] = __byte_perm_S (w3[2], w3[1], selector); - w3[2] = __byte_perm_S (w3[1], w3[0], selector); - w3[1] = __byte_perm_S (w3[0], w2[3], selector); - w3[0] = __byte_perm_S (w2[3], w2[2], selector); - w2[3] = __byte_perm_S (w2[2], w2[1], selector); - w2[2] = __byte_perm_S (w2[1], w2[0], selector); - w2[1] = __byte_perm_S (w2[0], w1[3], selector); - w2[0] = __byte_perm_S (w1[3], w1[2], selector); - w1[3] = __byte_perm_S (w1[2], w1[1], selector); - w1[2] = __byte_perm_S (w1[1], w1[0], selector); - w1[1] = __byte_perm_S (w1[0], w0[3], selector); - w1[0] = __byte_perm_S (w0[3], w0[2], selector); - w0[3] = __byte_perm_S (w0[2], w0[1], selector); - w0[2] = __byte_perm_S (w0[1], w0[0], selector); - w0[1] = __byte_perm_S (w0[0], 0, selector); + c0[1] = hc_byte_perm_S ( 0, w3[3], selector); + c0[0] = hc_byte_perm_S (w3[3], w3[2], selector); + w3[3] = hc_byte_perm_S (w3[2], w3[1], selector); + w3[2] = hc_byte_perm_S (w3[1], w3[0], selector); + w3[1] = hc_byte_perm_S (w3[0], w2[3], selector); + w3[0] = hc_byte_perm_S (w2[3], w2[2], selector); + w2[3] = hc_byte_perm_S (w2[2], w2[1], selector); + w2[2] = hc_byte_perm_S (w2[1], w2[0], selector); + w2[1] = hc_byte_perm_S (w2[0], w1[3], selector); + w2[0] = hc_byte_perm_S (w1[3], w1[2], selector); + w1[3] = hc_byte_perm_S (w1[2], w1[1], selector); + w1[2] = hc_byte_perm_S (w1[1], w1[0], selector); + w1[1] = hc_byte_perm_S (w1[0], w0[3], selector); + w1[0] = hc_byte_perm_S (w0[3], w0[2], selector); + w0[3] = hc_byte_perm_S (w0[2], w0[1], selector); + w0[2] = hc_byte_perm_S (w0[1], w0[0], selector); + w0[1] = hc_byte_perm_S (w0[0], 0, selector); w0[0] = 0; break; case 2: - c0[2] = __byte_perm_S ( 0, w3[3], selector); - c0[1] = __byte_perm_S (w3[3], w3[2], selector); - c0[0] = __byte_perm_S (w3[2], w3[1], selector); - w3[3] = __byte_perm_S (w3[1], w3[0], selector); - w3[2] = __byte_perm_S (w3[0], w2[3], selector); - w3[1] = __byte_perm_S (w2[3], w2[2], selector); - w3[0] = __byte_perm_S (w2[2], w2[1], selector); - w2[3] = __byte_perm_S (w2[1], w2[0], selector); - w2[2] = __byte_perm_S (w2[0], w1[3], selector); - w2[1] = __byte_perm_S (w1[3], w1[2], selector); - w2[0] = __byte_perm_S (w1[2], w1[1], selector); - w1[3] = __byte_perm_S (w1[1], w1[0], selector); - w1[2] = __byte_perm_S (w1[0], w0[3], selector); - w1[1] = __byte_perm_S (w0[3], w0[2], selector); - w1[0] = __byte_perm_S (w0[2], w0[1], selector); - w0[3] = __byte_perm_S (w0[1], w0[0], selector); - w0[2] = __byte_perm_S (w0[0], 0, selector); + c0[2] = hc_byte_perm_S ( 0, w3[3], selector); + c0[1] = hc_byte_perm_S (w3[3], w3[2], selector); + c0[0] = hc_byte_perm_S (w3[2], w3[1], selector); + w3[3] = hc_byte_perm_S (w3[1], w3[0], selector); + w3[2] = hc_byte_perm_S (w3[0], w2[3], selector); + w3[1] = hc_byte_perm_S (w2[3], w2[2], selector); + w3[0] = hc_byte_perm_S (w2[2], w2[1], selector); + w2[3] = hc_byte_perm_S (w2[1], w2[0], selector); + w2[2] = hc_byte_perm_S (w2[0], w1[3], selector); + w2[1] = hc_byte_perm_S (w1[3], w1[2], selector); + w2[0] = hc_byte_perm_S (w1[2], w1[1], selector); + w1[3] = hc_byte_perm_S (w1[1], w1[0], selector); + w1[2] = hc_byte_perm_S (w1[0], w0[3], selector); + w1[1] = hc_byte_perm_S (w0[3], w0[2], selector); + w1[0] = hc_byte_perm_S (w0[2], w0[1], selector); + w0[3] = hc_byte_perm_S (w0[1], w0[0], selector); + w0[2] = hc_byte_perm_S (w0[0], 0, selector); w0[1] = 0; w0[0] = 0; break; case 3: - c0[3] = __byte_perm_S ( 0, w3[3], selector); - c0[2] = __byte_perm_S (w3[3], w3[2], selector); - c0[1] = __byte_perm_S (w3[2], w3[1], selector); - c0[0] = __byte_perm_S (w3[1], w3[0], selector); - w3[3] = __byte_perm_S (w3[0], w2[3], selector); - w3[2] = __byte_perm_S (w2[3], w2[2], selector); - w3[1] = __byte_perm_S (w2[2], w2[1], selector); - w3[0] = __byte_perm_S (w2[1], w2[0], selector); - w2[3] = __byte_perm_S (w2[0], w1[3], selector); - w2[2] = __byte_perm_S (w1[3], w1[2], selector); - w2[1] = __byte_perm_S (w1[2], w1[1], selector); - w2[0] = __byte_perm_S (w1[1], w1[0], selector); - w1[3] = __byte_perm_S (w1[0], w0[3], selector); - w1[2] = __byte_perm_S (w0[3], w0[2], selector); - w1[1] = __byte_perm_S (w0[2], w0[1], selector); - w1[0] = __byte_perm_S (w0[1], w0[0], selector); - w0[3] = __byte_perm_S (w0[0], 0, selector); + c0[3] = hc_byte_perm_S ( 0, w3[3], selector); + c0[2] = hc_byte_perm_S (w3[3], w3[2], selector); + c0[1] = hc_byte_perm_S (w3[2], w3[1], selector); + c0[0] = hc_byte_perm_S (w3[1], w3[0], selector); + w3[3] = hc_byte_perm_S (w3[0], w2[3], selector); + w3[2] = hc_byte_perm_S (w2[3], w2[2], selector); + w3[1] = hc_byte_perm_S (w2[2], w2[1], selector); + w3[0] = hc_byte_perm_S (w2[1], w2[0], selector); + w2[3] = hc_byte_perm_S (w2[0], w1[3], selector); + w2[2] = hc_byte_perm_S (w1[3], w1[2], selector); + w2[1] = hc_byte_perm_S (w1[2], w1[1], selector); + w2[0] = hc_byte_perm_S (w1[1], w1[0], selector); + w1[3] = hc_byte_perm_S (w1[0], w0[3], selector); + w1[2] = hc_byte_perm_S (w0[3], w0[2], selector); + w1[1] = hc_byte_perm_S (w0[2], w0[1], selector); + w1[0] = hc_byte_perm_S (w0[1], w0[0], selector); + w0[3] = hc_byte_perm_S (w0[0], 0, selector); w0[2] = 0; w0[1] = 0; w0[0] = 0; @@ -34452,23 +34452,23 @@ DECLSPEC void switch_buffer_by_offset_carry_be_S (u32 *w0, u32 *w1, u32 *w2, u32 break; case 4: - c1[0] = __byte_perm_S ( 0, w3[3], selector); - c0[3] = __byte_perm_S (w3[3], w3[2], selector); - c0[2] = __byte_perm_S (w3[2], w3[1], selector); - c0[1] = __byte_perm_S (w3[1], w3[0], selector); - c0[0] = __byte_perm_S (w3[0], w2[3], selector); - w3[3] = __byte_perm_S (w2[3], w2[2], selector); - w3[2] = __byte_perm_S (w2[2], w2[1], selector); - w3[1] = __byte_perm_S (w2[1], w2[0], selector); - w3[0] = __byte_perm_S (w2[0], w1[3], selector); - w2[3] = __byte_perm_S (w1[3], w1[2], selector); - w2[2] = __byte_perm_S (w1[2], w1[1], selector); - w2[1] = __byte_perm_S (w1[1], w1[0], selector); - w2[0] = __byte_perm_S (w1[0], w0[3], selector); - w1[3] = __byte_perm_S (w0[3], w0[2], selector); - w1[2] = __byte_perm_S (w0[2], w0[1], selector); - w1[1] = __byte_perm_S (w0[1], w0[0], selector); - w1[0] = __byte_perm_S (w0[0], 0, selector); + c1[0] = hc_byte_perm_S ( 0, w3[3], selector); + c0[3] = hc_byte_perm_S (w3[3], w3[2], selector); + c0[2] = hc_byte_perm_S (w3[2], w3[1], selector); + c0[1] = hc_byte_perm_S (w3[1], w3[0], selector); + c0[0] = hc_byte_perm_S (w3[0], w2[3], selector); + w3[3] = hc_byte_perm_S (w2[3], w2[2], selector); + w3[2] = hc_byte_perm_S (w2[2], w2[1], selector); + w3[1] = hc_byte_perm_S (w2[1], w2[0], selector); + w3[0] = hc_byte_perm_S (w2[0], w1[3], selector); + w2[3] = hc_byte_perm_S (w1[3], w1[2], selector); + w2[2] = hc_byte_perm_S (w1[2], w1[1], selector); + w2[1] = hc_byte_perm_S (w1[1], w1[0], selector); + w2[0] = hc_byte_perm_S (w1[0], w0[3], selector); + w1[3] = hc_byte_perm_S (w0[3], w0[2], selector); + w1[2] = hc_byte_perm_S (w0[2], w0[1], selector); + w1[1] = hc_byte_perm_S (w0[1], w0[0], selector); + w1[0] = hc_byte_perm_S (w0[0], 0, selector); w0[3] = 0; w0[2] = 0; w0[1] = 0; @@ -34477,23 +34477,23 @@ DECLSPEC void switch_buffer_by_offset_carry_be_S (u32 *w0, u32 *w1, u32 *w2, u32 break; case 5: - c1[1] = __byte_perm_S ( 0, w3[3], selector); - c1[0] = __byte_perm_S (w3[3], w3[2], selector); - c0[3] = __byte_perm_S (w3[2], w3[1], selector); - c0[2] = __byte_perm_S (w3[1], w3[0], selector); - c0[1] = __byte_perm_S (w3[0], w2[3], selector); - c0[0] = __byte_perm_S (w2[3], w2[2], selector); - w3[3] = __byte_perm_S (w2[2], w2[1], selector); - w3[2] = __byte_perm_S (w2[1], w2[0], selector); - w3[1] = __byte_perm_S (w2[0], w1[3], selector); - w3[0] = __byte_perm_S (w1[3], w1[2], selector); - w2[3] = __byte_perm_S (w1[2], w1[1], selector); - w2[2] = __byte_perm_S (w1[1], w1[0], selector); - w2[1] = __byte_perm_S (w1[0], w0[3], selector); - w2[0] = __byte_perm_S (w0[3], w0[2], selector); - w1[3] = __byte_perm_S (w0[2], w0[1], selector); - w1[2] = __byte_perm_S (w0[1], w0[0], selector); - w1[1] = __byte_perm_S (w0[0], 0, selector); + c1[1] = hc_byte_perm_S ( 0, w3[3], selector); + c1[0] = hc_byte_perm_S (w3[3], w3[2], selector); + c0[3] = hc_byte_perm_S (w3[2], w3[1], selector); + c0[2] = hc_byte_perm_S (w3[1], w3[0], selector); + c0[1] = hc_byte_perm_S (w3[0], w2[3], selector); + c0[0] = hc_byte_perm_S (w2[3], w2[2], selector); + w3[3] = hc_byte_perm_S (w2[2], w2[1], selector); + w3[2] = hc_byte_perm_S (w2[1], w2[0], selector); + w3[1] = hc_byte_perm_S (w2[0], w1[3], selector); + w3[0] = hc_byte_perm_S (w1[3], w1[2], selector); + w2[3] = hc_byte_perm_S (w1[2], w1[1], selector); + w2[2] = hc_byte_perm_S (w1[1], w1[0], selector); + w2[1] = hc_byte_perm_S (w1[0], w0[3], selector); + w2[0] = hc_byte_perm_S (w0[3], w0[2], selector); + w1[3] = hc_byte_perm_S (w0[2], w0[1], selector); + w1[2] = hc_byte_perm_S (w0[1], w0[0], selector); + w1[1] = hc_byte_perm_S (w0[0], 0, selector); w1[0] = 0; w0[3] = 0; w0[2] = 0; @@ -34503,23 +34503,23 @@ DECLSPEC void switch_buffer_by_offset_carry_be_S (u32 *w0, u32 *w1, u32 *w2, u32 break; case 6: - c1[2] = __byte_perm_S ( 0, w3[3], selector); - c1[1] = __byte_perm_S (w3[3], w3[2], selector); - c1[0] = __byte_perm_S (w3[2], w3[1], selector); - c0[3] = __byte_perm_S (w3[1], w3[0], selector); - c0[2] = __byte_perm_S (w3[0], w2[3], selector); - c0[1] = __byte_perm_S (w2[3], w2[2], selector); - c0[0] = __byte_perm_S (w2[2], w2[1], selector); - w3[3] = __byte_perm_S (w2[1], w2[0], selector); - w3[2] = __byte_perm_S (w2[0], w1[3], selector); - w3[1] = __byte_perm_S (w1[3], w1[2], selector); - w3[0] = __byte_perm_S (w1[2], w1[1], selector); - w2[3] = __byte_perm_S (w1[1], w1[0], selector); - w2[2] = __byte_perm_S (w1[0], w0[3], selector); - w2[1] = __byte_perm_S (w0[3], w0[2], selector); - w2[0] = __byte_perm_S (w0[2], w0[1], selector); - w1[3] = __byte_perm_S (w0[1], w0[0], selector); - w1[2] = __byte_perm_S (w0[0], 0, selector); + c1[2] = hc_byte_perm_S ( 0, w3[3], selector); + c1[1] = hc_byte_perm_S (w3[3], w3[2], selector); + c1[0] = hc_byte_perm_S (w3[2], w3[1], selector); + c0[3] = hc_byte_perm_S (w3[1], w3[0], selector); + c0[2] = hc_byte_perm_S (w3[0], w2[3], selector); + c0[1] = hc_byte_perm_S (w2[3], w2[2], selector); + c0[0] = hc_byte_perm_S (w2[2], w2[1], selector); + w3[3] = hc_byte_perm_S (w2[1], w2[0], selector); + w3[2] = hc_byte_perm_S (w2[0], w1[3], selector); + w3[1] = hc_byte_perm_S (w1[3], w1[2], selector); + w3[0] = hc_byte_perm_S (w1[2], w1[1], selector); + w2[3] = hc_byte_perm_S (w1[1], w1[0], selector); + w2[2] = hc_byte_perm_S (w1[0], w0[3], selector); + w2[1] = hc_byte_perm_S (w0[3], w0[2], selector); + w2[0] = hc_byte_perm_S (w0[2], w0[1], selector); + w1[3] = hc_byte_perm_S (w0[1], w0[0], selector); + w1[2] = hc_byte_perm_S (w0[0], 0, selector); w1[1] = 0; w1[0] = 0; w0[3] = 0; @@ -34530,23 +34530,23 @@ DECLSPEC void switch_buffer_by_offset_carry_be_S (u32 *w0, u32 *w1, u32 *w2, u32 break; case 7: - c1[3] = __byte_perm_S ( 0, w3[3], selector); - c1[2] = __byte_perm_S (w3[3], w3[2], selector); - c1[1] = __byte_perm_S (w3[2], w3[1], selector); - c1[0] = __byte_perm_S (w3[1], w3[0], selector); - c0[3] = __byte_perm_S (w3[0], w2[3], selector); - c0[2] = __byte_perm_S (w2[3], w2[2], selector); - c0[1] = __byte_perm_S (w2[2], w2[1], selector); - c0[0] = __byte_perm_S (w2[1], w2[0], selector); - w3[3] = __byte_perm_S (w2[0], w1[3], selector); - w3[2] = __byte_perm_S (w1[3], w1[2], selector); - w3[1] = __byte_perm_S (w1[2], w1[1], selector); - w3[0] = __byte_perm_S (w1[1], w1[0], selector); - w2[3] = __byte_perm_S (w1[0], w0[3], selector); - w2[2] = __byte_perm_S (w0[3], w0[2], selector); - w2[1] = __byte_perm_S (w0[2], w0[1], selector); - w2[0] = __byte_perm_S (w0[1], w0[0], selector); - w1[3] = __byte_perm_S (w0[0], 0, selector); + c1[3] = hc_byte_perm_S ( 0, w3[3], selector); + c1[2] = hc_byte_perm_S (w3[3], w3[2], selector); + c1[1] = hc_byte_perm_S (w3[2], w3[1], selector); + c1[0] = hc_byte_perm_S (w3[1], w3[0], selector); + c0[3] = hc_byte_perm_S (w3[0], w2[3], selector); + c0[2] = hc_byte_perm_S (w2[3], w2[2], selector); + c0[1] = hc_byte_perm_S (w2[2], w2[1], selector); + c0[0] = hc_byte_perm_S (w2[1], w2[0], selector); + w3[3] = hc_byte_perm_S (w2[0], w1[3], selector); + w3[2] = hc_byte_perm_S (w1[3], w1[2], selector); + w3[1] = hc_byte_perm_S (w1[2], w1[1], selector); + w3[0] = hc_byte_perm_S (w1[1], w1[0], selector); + w2[3] = hc_byte_perm_S (w1[0], w0[3], selector); + w2[2] = hc_byte_perm_S (w0[3], w0[2], selector); + w2[1] = hc_byte_perm_S (w0[2], w0[1], selector); + w2[0] = hc_byte_perm_S (w0[1], w0[0], selector); + w1[3] = hc_byte_perm_S (w0[0], 0, selector); w1[2] = 0; w1[1] = 0; w1[0] = 0; @@ -34558,23 +34558,23 @@ DECLSPEC void switch_buffer_by_offset_carry_be_S (u32 *w0, u32 *w1, u32 *w2, u32 break; case 8: - c2[0] = __byte_perm_S ( 0, w3[3], selector); - c1[3] = __byte_perm_S (w3[3], w3[2], selector); - c1[2] = __byte_perm_S (w3[2], w3[1], selector); - c1[1] = __byte_perm_S (w3[1], w3[0], selector); - c1[0] = __byte_perm_S (w3[0], w2[3], selector); - c0[3] = __byte_perm_S (w2[3], w2[2], selector); - c0[2] = __byte_perm_S (w2[2], w2[1], selector); - c0[1] = __byte_perm_S (w2[1], w2[0], selector); - c0[0] = __byte_perm_S (w2[0], w1[3], selector); - w3[3] = __byte_perm_S (w1[3], w1[2], selector); - w3[2] = __byte_perm_S (w1[2], w1[1], selector); - w3[1] = __byte_perm_S (w1[1], w1[0], selector); - w3[0] = __byte_perm_S (w1[0], w0[3], selector); - w2[3] = __byte_perm_S (w0[3], w0[2], selector); - w2[2] = __byte_perm_S (w0[2], w0[1], selector); - w2[1] = __byte_perm_S (w0[1], w0[0], selector); - w2[0] = __byte_perm_S (w0[0], 0, selector); + c2[0] = hc_byte_perm_S ( 0, w3[3], selector); + c1[3] = hc_byte_perm_S (w3[3], w3[2], selector); + c1[2] = hc_byte_perm_S (w3[2], w3[1], selector); + c1[1] = hc_byte_perm_S (w3[1], w3[0], selector); + c1[0] = hc_byte_perm_S (w3[0], w2[3], selector); + c0[3] = hc_byte_perm_S (w2[3], w2[2], selector); + c0[2] = hc_byte_perm_S (w2[2], w2[1], selector); + c0[1] = hc_byte_perm_S (w2[1], w2[0], selector); + c0[0] = hc_byte_perm_S (w2[0], w1[3], selector); + w3[3] = hc_byte_perm_S (w1[3], w1[2], selector); + w3[2] = hc_byte_perm_S (w1[2], w1[1], selector); + w3[1] = hc_byte_perm_S (w1[1], w1[0], selector); + w3[0] = hc_byte_perm_S (w1[0], w0[3], selector); + w2[3] = hc_byte_perm_S (w0[3], w0[2], selector); + w2[2] = hc_byte_perm_S (w0[2], w0[1], selector); + w2[1] = hc_byte_perm_S (w0[1], w0[0], selector); + w2[0] = hc_byte_perm_S (w0[0], 0, selector); w1[3] = 0; w1[2] = 0; w1[1] = 0; @@ -34587,23 +34587,23 @@ DECLSPEC void switch_buffer_by_offset_carry_be_S (u32 *w0, u32 *w1, u32 *w2, u32 break; case 9: - c2[1] = __byte_perm_S ( 0, w3[3], selector); - c2[0] = __byte_perm_S (w3[3], w3[2], selector); - c1[3] = __byte_perm_S (w3[2], w3[1], selector); - c1[2] = __byte_perm_S (w3[1], w3[0], selector); - c1[1] = __byte_perm_S (w3[0], w2[3], selector); - c1[0] = __byte_perm_S (w2[3], w2[2], selector); - c0[3] = __byte_perm_S (w2[2], w2[1], selector); - c0[2] = __byte_perm_S (w2[1], w2[0], selector); - c0[1] = __byte_perm_S (w2[0], w1[3], selector); - c0[0] = __byte_perm_S (w1[3], w1[2], selector); - w3[3] = __byte_perm_S (w1[2], w1[1], selector); - w3[2] = __byte_perm_S (w1[1], w1[0], selector); - w3[1] = __byte_perm_S (w1[0], w0[3], selector); - w3[0] = __byte_perm_S (w0[3], w0[2], selector); - w2[3] = __byte_perm_S (w0[2], w0[1], selector); - w2[2] = __byte_perm_S (w0[1], w0[0], selector); - w2[1] = __byte_perm_S (w0[0], 0, selector); + c2[1] = hc_byte_perm_S ( 0, w3[3], selector); + c2[0] = hc_byte_perm_S (w3[3], w3[2], selector); + c1[3] = hc_byte_perm_S (w3[2], w3[1], selector); + c1[2] = hc_byte_perm_S (w3[1], w3[0], selector); + c1[1] = hc_byte_perm_S (w3[0], w2[3], selector); + c1[0] = hc_byte_perm_S (w2[3], w2[2], selector); + c0[3] = hc_byte_perm_S (w2[2], w2[1], selector); + c0[2] = hc_byte_perm_S (w2[1], w2[0], selector); + c0[1] = hc_byte_perm_S (w2[0], w1[3], selector); + c0[0] = hc_byte_perm_S (w1[3], w1[2], selector); + w3[3] = hc_byte_perm_S (w1[2], w1[1], selector); + w3[2] = hc_byte_perm_S (w1[1], w1[0], selector); + w3[1] = hc_byte_perm_S (w1[0], w0[3], selector); + w3[0] = hc_byte_perm_S (w0[3], w0[2], selector); + w2[3] = hc_byte_perm_S (w0[2], w0[1], selector); + w2[2] = hc_byte_perm_S (w0[1], w0[0], selector); + w2[1] = hc_byte_perm_S (w0[0], 0, selector); w2[0] = 0; w1[3] = 0; w1[2] = 0; @@ -34617,23 +34617,23 @@ DECLSPEC void switch_buffer_by_offset_carry_be_S (u32 *w0, u32 *w1, u32 *w2, u32 break; case 10: - c2[2] = __byte_perm_S ( 0, w3[3], selector); - c2[1] = __byte_perm_S (w3[3], w3[2], selector); - c2[0] = __byte_perm_S (w3[2], w3[1], selector); - c1[3] = __byte_perm_S (w3[1], w3[0], selector); - c1[2] = __byte_perm_S (w3[0], w2[3], selector); - c1[1] = __byte_perm_S (w2[3], w2[2], selector); - c1[0] = __byte_perm_S (w2[2], w2[1], selector); - c0[3] = __byte_perm_S (w2[1], w2[0], selector); - c0[2] = __byte_perm_S (w2[0], w1[3], selector); - c0[1] = __byte_perm_S (w1[3], w1[2], selector); - c0[0] = __byte_perm_S (w1[2], w1[1], selector); - w3[3] = __byte_perm_S (w1[1], w1[0], selector); - w3[2] = __byte_perm_S (w1[0], w0[3], selector); - w3[1] = __byte_perm_S (w0[3], w0[2], selector); - w3[0] = __byte_perm_S (w0[2], w0[1], selector); - w2[3] = __byte_perm_S (w0[1], w0[0], selector); - w2[2] = __byte_perm_S (w0[0], 0, selector); + c2[2] = hc_byte_perm_S ( 0, w3[3], selector); + c2[1] = hc_byte_perm_S (w3[3], w3[2], selector); + c2[0] = hc_byte_perm_S (w3[2], w3[1], selector); + c1[3] = hc_byte_perm_S (w3[1], w3[0], selector); + c1[2] = hc_byte_perm_S (w3[0], w2[3], selector); + c1[1] = hc_byte_perm_S (w2[3], w2[2], selector); + c1[0] = hc_byte_perm_S (w2[2], w2[1], selector); + c0[3] = hc_byte_perm_S (w2[1], w2[0], selector); + c0[2] = hc_byte_perm_S (w2[0], w1[3], selector); + c0[1] = hc_byte_perm_S (w1[3], w1[2], selector); + c0[0] = hc_byte_perm_S (w1[2], w1[1], selector); + w3[3] = hc_byte_perm_S (w1[1], w1[0], selector); + w3[2] = hc_byte_perm_S (w1[0], w0[3], selector); + w3[1] = hc_byte_perm_S (w0[3], w0[2], selector); + w3[0] = hc_byte_perm_S (w0[2], w0[1], selector); + w2[3] = hc_byte_perm_S (w0[1], w0[0], selector); + w2[2] = hc_byte_perm_S (w0[0], 0, selector); w2[1] = 0; w2[0] = 0; w1[3] = 0; @@ -34648,23 +34648,23 @@ DECLSPEC void switch_buffer_by_offset_carry_be_S (u32 *w0, u32 *w1, u32 *w2, u32 break; case 11: - c2[3] = __byte_perm_S ( 0, w3[3], selector); - c2[2] = __byte_perm_S (w3[3], w3[2], selector); - c2[1] = __byte_perm_S (w3[2], w3[1], selector); - c2[0] = __byte_perm_S (w3[1], w3[0], selector); - c1[3] = __byte_perm_S (w3[0], w2[3], selector); - c1[2] = __byte_perm_S (w2[3], w2[2], selector); - c1[1] = __byte_perm_S (w2[2], w2[1], selector); - c1[0] = __byte_perm_S (w2[1], w2[0], selector); - c0[3] = __byte_perm_S (w2[0], w1[3], selector); - c0[2] = __byte_perm_S (w1[3], w1[2], selector); - c0[1] = __byte_perm_S (w1[2], w1[1], selector); - c0[0] = __byte_perm_S (w1[1], w1[0], selector); - w3[3] = __byte_perm_S (w1[0], w0[3], selector); - w3[2] = __byte_perm_S (w0[3], w0[2], selector); - w3[1] = __byte_perm_S (w0[2], w0[1], selector); - w3[0] = __byte_perm_S (w0[1], w0[0], selector); - w2[3] = __byte_perm_S (w0[0], 0, selector); + c2[3] = hc_byte_perm_S ( 0, w3[3], selector); + c2[2] = hc_byte_perm_S (w3[3], w3[2], selector); + c2[1] = hc_byte_perm_S (w3[2], w3[1], selector); + c2[0] = hc_byte_perm_S (w3[1], w3[0], selector); + c1[3] = hc_byte_perm_S (w3[0], w2[3], selector); + c1[2] = hc_byte_perm_S (w2[3], w2[2], selector); + c1[1] = hc_byte_perm_S (w2[2], w2[1], selector); + c1[0] = hc_byte_perm_S (w2[1], w2[0], selector); + c0[3] = hc_byte_perm_S (w2[0], w1[3], selector); + c0[2] = hc_byte_perm_S (w1[3], w1[2], selector); + c0[1] = hc_byte_perm_S (w1[2], w1[1], selector); + c0[0] = hc_byte_perm_S (w1[1], w1[0], selector); + w3[3] = hc_byte_perm_S (w1[0], w0[3], selector); + w3[2] = hc_byte_perm_S (w0[3], w0[2], selector); + w3[1] = hc_byte_perm_S (w0[2], w0[1], selector); + w3[0] = hc_byte_perm_S (w0[1], w0[0], selector); + w2[3] = hc_byte_perm_S (w0[0], 0, selector); w2[2] = 0; w2[1] = 0; w2[0] = 0; @@ -34680,23 +34680,23 @@ DECLSPEC void switch_buffer_by_offset_carry_be_S (u32 *w0, u32 *w1, u32 *w2, u32 break; case 12: - c3[0] = __byte_perm_S ( 0, w3[3], selector); - c2[3] = __byte_perm_S (w3[3], w3[2], selector); - c2[2] = __byte_perm_S (w3[2], w3[1], selector); - c2[1] = __byte_perm_S (w3[1], w3[0], selector); - c2[0] = __byte_perm_S (w3[0], w2[3], selector); - c1[3] = __byte_perm_S (w2[3], w2[2], selector); - c1[2] = __byte_perm_S (w2[2], w2[1], selector); - c1[1] = __byte_perm_S (w2[1], w2[0], selector); - c1[0] = __byte_perm_S (w2[0], w1[3], selector); - c0[3] = __byte_perm_S (w1[3], w1[2], selector); - c0[2] = __byte_perm_S (w1[2], w1[1], selector); - c0[1] = __byte_perm_S (w1[1], w1[0], selector); - c0[0] = __byte_perm_S (w1[0], w0[3], selector); - w3[3] = __byte_perm_S (w0[3], w0[2], selector); - w3[2] = __byte_perm_S (w0[2], w0[1], selector); - w3[1] = __byte_perm_S (w0[1], w0[0], selector); - w3[0] = __byte_perm_S (w0[0], 0, selector); + c3[0] = hc_byte_perm_S ( 0, w3[3], selector); + c2[3] = hc_byte_perm_S (w3[3], w3[2], selector); + c2[2] = hc_byte_perm_S (w3[2], w3[1], selector); + c2[1] = hc_byte_perm_S (w3[1], w3[0], selector); + c2[0] = hc_byte_perm_S (w3[0], w2[3], selector); + c1[3] = hc_byte_perm_S (w2[3], w2[2], selector); + c1[2] = hc_byte_perm_S (w2[2], w2[1], selector); + c1[1] = hc_byte_perm_S (w2[1], w2[0], selector); + c1[0] = hc_byte_perm_S (w2[0], w1[3], selector); + c0[3] = hc_byte_perm_S (w1[3], w1[2], selector); + c0[2] = hc_byte_perm_S (w1[2], w1[1], selector); + c0[1] = hc_byte_perm_S (w1[1], w1[0], selector); + c0[0] = hc_byte_perm_S (w1[0], w0[3], selector); + w3[3] = hc_byte_perm_S (w0[3], w0[2], selector); + w3[2] = hc_byte_perm_S (w0[2], w0[1], selector); + w3[1] = hc_byte_perm_S (w0[1], w0[0], selector); + w3[0] = hc_byte_perm_S (w0[0], 0, selector); w2[3] = 0; w2[2] = 0; w2[1] = 0; @@ -34713,23 +34713,23 @@ DECLSPEC void switch_buffer_by_offset_carry_be_S (u32 *w0, u32 *w1, u32 *w2, u32 break; case 13: - c3[1] = __byte_perm_S ( 0, w3[3], selector); - c3[0] = __byte_perm_S (w3[3], w3[2], selector); - c2[3] = __byte_perm_S (w3[2], w3[1], selector); - c2[2] = __byte_perm_S (w3[1], w3[0], selector); - c2[1] = __byte_perm_S (w3[0], w2[3], selector); - c2[0] = __byte_perm_S (w2[3], w2[2], selector); - c1[3] = __byte_perm_S (w2[2], w2[1], selector); - c1[2] = __byte_perm_S (w2[1], w2[0], selector); - c1[1] = __byte_perm_S (w2[0], w1[3], selector); - c1[0] = __byte_perm_S (w1[3], w1[2], selector); - c0[3] = __byte_perm_S (w1[2], w1[1], selector); - c0[2] = __byte_perm_S (w1[1], w1[0], selector); - c0[1] = __byte_perm_S (w1[0], w0[3], selector); - c0[0] = __byte_perm_S (w0[3], w0[2], selector); - w3[3] = __byte_perm_S (w0[2], w0[1], selector); - w3[2] = __byte_perm_S (w0[1], w0[0], selector); - w3[1] = __byte_perm_S (w0[0], 0, selector); + c3[1] = hc_byte_perm_S ( 0, w3[3], selector); + c3[0] = hc_byte_perm_S (w3[3], w3[2], selector); + c2[3] = hc_byte_perm_S (w3[2], w3[1], selector); + c2[2] = hc_byte_perm_S (w3[1], w3[0], selector); + c2[1] = hc_byte_perm_S (w3[0], w2[3], selector); + c2[0] = hc_byte_perm_S (w2[3], w2[2], selector); + c1[3] = hc_byte_perm_S (w2[2], w2[1], selector); + c1[2] = hc_byte_perm_S (w2[1], w2[0], selector); + c1[1] = hc_byte_perm_S (w2[0], w1[3], selector); + c1[0] = hc_byte_perm_S (w1[3], w1[2], selector); + c0[3] = hc_byte_perm_S (w1[2], w1[1], selector); + c0[2] = hc_byte_perm_S (w1[1], w1[0], selector); + c0[1] = hc_byte_perm_S (w1[0], w0[3], selector); + c0[0] = hc_byte_perm_S (w0[3], w0[2], selector); + w3[3] = hc_byte_perm_S (w0[2], w0[1], selector); + w3[2] = hc_byte_perm_S (w0[1], w0[0], selector); + w3[1] = hc_byte_perm_S (w0[0], 0, selector); w3[0] = 0; w2[3] = 0; w2[2] = 0; @@ -34747,23 +34747,23 @@ DECLSPEC void switch_buffer_by_offset_carry_be_S (u32 *w0, u32 *w1, u32 *w2, u32 break; case 14: - c3[2] = __byte_perm_S ( 0, w3[3], selector); - c3[1] = __byte_perm_S (w3[3], w3[2], selector); - c3[0] = __byte_perm_S (w3[2], w3[1], selector); - c2[3] = __byte_perm_S (w3[1], w3[0], selector); - c2[2] = __byte_perm_S (w3[0], w2[3], selector); - c2[1] = __byte_perm_S (w2[3], w2[2], selector); - c2[0] = __byte_perm_S (w2[2], w2[1], selector); - c1[3] = __byte_perm_S (w2[1], w2[0], selector); - c1[2] = __byte_perm_S (w2[0], w1[3], selector); - c1[1] = __byte_perm_S (w1[3], w1[2], selector); - c1[0] = __byte_perm_S (w1[2], w1[1], selector); - c0[3] = __byte_perm_S (w1[1], w1[0], selector); - c0[2] = __byte_perm_S (w1[0], w0[3], selector); - c0[1] = __byte_perm_S (w0[3], w0[2], selector); - c0[0] = __byte_perm_S (w0[2], w0[1], selector); - w3[3] = __byte_perm_S (w0[1], w0[0], selector); - w3[2] = __byte_perm_S (w0[0], 0, selector); + c3[2] = hc_byte_perm_S ( 0, w3[3], selector); + c3[1] = hc_byte_perm_S (w3[3], w3[2], selector); + c3[0] = hc_byte_perm_S (w3[2], w3[1], selector); + c2[3] = hc_byte_perm_S (w3[1], w3[0], selector); + c2[2] = hc_byte_perm_S (w3[0], w2[3], selector); + c2[1] = hc_byte_perm_S (w2[3], w2[2], selector); + c2[0] = hc_byte_perm_S (w2[2], w2[1], selector); + c1[3] = hc_byte_perm_S (w2[1], w2[0], selector); + c1[2] = hc_byte_perm_S (w2[0], w1[3], selector); + c1[1] = hc_byte_perm_S (w1[3], w1[2], selector); + c1[0] = hc_byte_perm_S (w1[2], w1[1], selector); + c0[3] = hc_byte_perm_S (w1[1], w1[0], selector); + c0[2] = hc_byte_perm_S (w1[0], w0[3], selector); + c0[1] = hc_byte_perm_S (w0[3], w0[2], selector); + c0[0] = hc_byte_perm_S (w0[2], w0[1], selector); + w3[3] = hc_byte_perm_S (w0[1], w0[0], selector); + w3[2] = hc_byte_perm_S (w0[0], 0, selector); w3[1] = 0; w3[0] = 0; w2[3] = 0; @@ -34782,23 +34782,23 @@ DECLSPEC void switch_buffer_by_offset_carry_be_S (u32 *w0, u32 *w1, u32 *w2, u32 break; case 15: - c3[3] = __byte_perm_S ( 0, w3[3], selector); - c3[2] = __byte_perm_S (w3[3], w3[2], selector); - c3[1] = __byte_perm_S (w3[2], w3[1], selector); - c3[0] = __byte_perm_S (w3[1], w3[0], selector); - c2[3] = __byte_perm_S (w3[0], w2[3], selector); - c2[2] = __byte_perm_S (w2[3], w2[2], selector); - c2[1] = __byte_perm_S (w2[2], w2[1], selector); - c2[0] = __byte_perm_S (w2[1], w2[0], selector); - c1[3] = __byte_perm_S (w2[0], w1[3], selector); - c1[2] = __byte_perm_S (w1[3], w1[2], selector); - c1[1] = __byte_perm_S (w1[2], w1[1], selector); - c1[0] = __byte_perm_S (w1[1], w1[0], selector); - c0[3] = __byte_perm_S (w1[0], w0[3], selector); - c0[2] = __byte_perm_S (w0[3], w0[2], selector); - c0[1] = __byte_perm_S (w0[2], w0[1], selector); - c0[0] = __byte_perm_S (w0[1], w0[0], selector); - w3[3] = __byte_perm_S (w0[0], 0, selector); + c3[3] = hc_byte_perm_S ( 0, w3[3], selector); + c3[2] = hc_byte_perm_S (w3[3], w3[2], selector); + c3[1] = hc_byte_perm_S (w3[2], w3[1], selector); + c3[0] = hc_byte_perm_S (w3[1], w3[0], selector); + c2[3] = hc_byte_perm_S (w3[0], w2[3], selector); + c2[2] = hc_byte_perm_S (w2[3], w2[2], selector); + c2[1] = hc_byte_perm_S (w2[2], w2[1], selector); + c2[0] = hc_byte_perm_S (w2[1], w2[0], selector); + c1[3] = hc_byte_perm_S (w2[0], w1[3], selector); + c1[2] = hc_byte_perm_S (w1[3], w1[2], selector); + c1[1] = hc_byte_perm_S (w1[2], w1[1], selector); + c1[0] = hc_byte_perm_S (w1[1], w1[0], selector); + c0[3] = hc_byte_perm_S (w1[0], w0[3], selector); + c0[2] = hc_byte_perm_S (w0[3], w0[2], selector); + c0[1] = hc_byte_perm_S (w0[2], w0[1], selector); + c0[0] = hc_byte_perm_S (w0[1], w0[0], selector); + w3[3] = hc_byte_perm_S (w0[0], 0, selector); w3[2] = 0; w3[1] = 0; w3[0] = 0; @@ -34865,143 +34865,143 @@ DECLSPEC void switch_buffer_by_offset_8x4_le_S (u32 *w0, u32 *w1, u32 *w2, u32 * switch (offset_switch) { case 0: - w7[3] = amd_bytealign_S (w7[2], w7[3], offset); - w7[2] = amd_bytealign_S (w7[1], w7[2], offset); - w7[1] = amd_bytealign_S (w7[0], w7[1], offset); - w7[0] = amd_bytealign_S (w6[3], w7[0], offset); - w6[3] = amd_bytealign_S (w6[2], w6[3], offset); - w6[2] = amd_bytealign_S (w6[1], w6[2], offset); - w6[1] = amd_bytealign_S (w6[0], w6[1], offset); - w6[0] = amd_bytealign_S (w5[3], w6[0], offset); - w5[3] = amd_bytealign_S (w5[2], w5[3], offset); - w5[2] = amd_bytealign_S (w5[1], w5[2], offset); - w5[1] = amd_bytealign_S (w5[0], w5[1], offset); - w5[0] = amd_bytealign_S (w4[3], w5[0], offset); - w4[3] = amd_bytealign_S (w4[2], w4[3], offset); - w4[2] = amd_bytealign_S (w4[1], w4[2], offset); - w4[1] = amd_bytealign_S (w4[0], w4[1], offset); - w4[0] = amd_bytealign_S (w3[3], w4[0], offset); - w3[3] = amd_bytealign_S (w3[2], w3[3], offset); - w3[2] = amd_bytealign_S (w3[1], w3[2], offset); - w3[1] = amd_bytealign_S (w3[0], w3[1], offset); - w3[0] = amd_bytealign_S (w2[3], w3[0], offset); - w2[3] = amd_bytealign_S (w2[2], w2[3], offset); - w2[2] = amd_bytealign_S (w2[1], w2[2], offset); - w2[1] = amd_bytealign_S (w2[0], w2[1], offset); - w2[0] = amd_bytealign_S (w1[3], w2[0], offset); - w1[3] = amd_bytealign_S (w1[2], w1[3], offset); - w1[2] = amd_bytealign_S (w1[1], w1[2], offset); - w1[1] = amd_bytealign_S (w1[0], w1[1], offset); - w1[0] = amd_bytealign_S (w0[3], w1[0], offset); - w0[3] = amd_bytealign_S (w0[2], w0[3], offset); - w0[2] = amd_bytealign_S (w0[1], w0[2], offset); - w0[1] = amd_bytealign_S (w0[0], w0[1], offset); - w0[0] = amd_bytealign_S ( 0, w0[0], offset); + w7[3] = hc_bytealign_S (w7[2], w7[3], offset); + w7[2] = hc_bytealign_S (w7[1], w7[2], offset); + w7[1] = hc_bytealign_S (w7[0], w7[1], offset); + w7[0] = hc_bytealign_S (w6[3], w7[0], offset); + w6[3] = hc_bytealign_S (w6[2], w6[3], offset); + w6[2] = hc_bytealign_S (w6[1], w6[2], offset); + w6[1] = hc_bytealign_S (w6[0], w6[1], offset); + w6[0] = hc_bytealign_S (w5[3], w6[0], offset); + w5[3] = hc_bytealign_S (w5[2], w5[3], offset); + w5[2] = hc_bytealign_S (w5[1], w5[2], offset); + w5[1] = hc_bytealign_S (w5[0], w5[1], offset); + w5[0] = hc_bytealign_S (w4[3], w5[0], offset); + w4[3] = hc_bytealign_S (w4[2], w4[3], offset); + w4[2] = hc_bytealign_S (w4[1], w4[2], offset); + w4[1] = hc_bytealign_S (w4[0], w4[1], offset); + w4[0] = hc_bytealign_S (w3[3], w4[0], offset); + w3[3] = hc_bytealign_S (w3[2], w3[3], offset); + w3[2] = hc_bytealign_S (w3[1], w3[2], offset); + w3[1] = hc_bytealign_S (w3[0], w3[1], offset); + w3[0] = hc_bytealign_S (w2[3], w3[0], offset); + w2[3] = hc_bytealign_S (w2[2], w2[3], offset); + w2[2] = hc_bytealign_S (w2[1], w2[2], offset); + w2[1] = hc_bytealign_S (w2[0], w2[1], offset); + w2[0] = hc_bytealign_S (w1[3], w2[0], offset); + w1[3] = hc_bytealign_S (w1[2], w1[3], offset); + w1[2] = hc_bytealign_S (w1[1], w1[2], offset); + w1[1] = hc_bytealign_S (w1[0], w1[1], offset); + w1[0] = hc_bytealign_S (w0[3], w1[0], offset); + w0[3] = hc_bytealign_S (w0[2], w0[3], offset); + w0[2] = hc_bytealign_S (w0[1], w0[2], offset); + w0[1] = hc_bytealign_S (w0[0], w0[1], offset); + w0[0] = hc_bytealign_S ( 0, w0[0], offset); break; case 1: - w7[3] = amd_bytealign_S (w7[1], w7[2], offset); - w7[2] = amd_bytealign_S (w7[0], w7[1], offset); - w7[1] = amd_bytealign_S (w6[3], w7[0], offset); - w7[0] = amd_bytealign_S (w6[2], w6[3], offset); - w6[3] = amd_bytealign_S (w6[1], w6[2], offset); - w6[2] = amd_bytealign_S (w6[0], w6[1], offset); - w6[1] = amd_bytealign_S (w5[3], w6[0], offset); - w6[0] = amd_bytealign_S (w5[2], w5[3], offset); - w5[3] = amd_bytealign_S (w5[1], w5[2], offset); - w5[2] = amd_bytealign_S (w5[0], w5[1], offset); - w5[1] = amd_bytealign_S (w4[3], w5[0], offset); - w5[0] = amd_bytealign_S (w4[2], w4[3], offset); - w4[3] = amd_bytealign_S (w4[1], w4[2], offset); - w4[2] = amd_bytealign_S (w4[0], w4[1], offset); - w4[1] = amd_bytealign_S (w3[3], w4[0], offset); - w4[0] = amd_bytealign_S (w3[2], w3[3], offset); - w3[3] = amd_bytealign_S (w3[1], w3[2], offset); - w3[2] = amd_bytealign_S (w3[0], w3[1], offset); - w3[1] = amd_bytealign_S (w2[3], w3[0], offset); - w3[0] = amd_bytealign_S (w2[2], w2[3], offset); - w2[3] = amd_bytealign_S (w2[1], w2[2], offset); - w2[2] = amd_bytealign_S (w2[0], w2[1], offset); - w2[1] = amd_bytealign_S (w1[3], w2[0], offset); - w2[0] = amd_bytealign_S (w1[2], w1[3], offset); - w1[3] = amd_bytealign_S (w1[1], w1[2], offset); - w1[2] = amd_bytealign_S (w1[0], w1[1], offset); - w1[1] = amd_bytealign_S (w0[3], w1[0], offset); - w1[0] = amd_bytealign_S (w0[2], w0[3], offset); - w0[3] = amd_bytealign_S (w0[1], w0[2], offset); - w0[2] = amd_bytealign_S (w0[0], w0[1], offset); - w0[1] = amd_bytealign_S ( 0, w0[0], offset); + w7[3] = hc_bytealign_S (w7[1], w7[2], offset); + w7[2] = hc_bytealign_S (w7[0], w7[1], offset); + w7[1] = hc_bytealign_S (w6[3], w7[0], offset); + w7[0] = hc_bytealign_S (w6[2], w6[3], offset); + w6[3] = hc_bytealign_S (w6[1], w6[2], offset); + w6[2] = hc_bytealign_S (w6[0], w6[1], offset); + w6[1] = hc_bytealign_S (w5[3], w6[0], offset); + w6[0] = hc_bytealign_S (w5[2], w5[3], offset); + w5[3] = hc_bytealign_S (w5[1], w5[2], offset); + w5[2] = hc_bytealign_S (w5[0], w5[1], offset); + w5[1] = hc_bytealign_S (w4[3], w5[0], offset); + w5[0] = hc_bytealign_S (w4[2], w4[3], offset); + w4[3] = hc_bytealign_S (w4[1], w4[2], offset); + w4[2] = hc_bytealign_S (w4[0], w4[1], offset); + w4[1] = hc_bytealign_S (w3[3], w4[0], offset); + w4[0] = hc_bytealign_S (w3[2], w3[3], offset); + w3[3] = hc_bytealign_S (w3[1], w3[2], offset); + w3[2] = hc_bytealign_S (w3[0], w3[1], offset); + w3[1] = hc_bytealign_S (w2[3], w3[0], offset); + w3[0] = hc_bytealign_S (w2[2], w2[3], offset); + w2[3] = hc_bytealign_S (w2[1], w2[2], offset); + w2[2] = hc_bytealign_S (w2[0], w2[1], offset); + w2[1] = hc_bytealign_S (w1[3], w2[0], offset); + w2[0] = hc_bytealign_S (w1[2], w1[3], offset); + w1[3] = hc_bytealign_S (w1[1], w1[2], offset); + w1[2] = hc_bytealign_S (w1[0], w1[1], offset); + w1[1] = hc_bytealign_S (w0[3], w1[0], offset); + w1[0] = hc_bytealign_S (w0[2], w0[3], offset); + w0[3] = hc_bytealign_S (w0[1], w0[2], offset); + w0[2] = hc_bytealign_S (w0[0], w0[1], offset); + w0[1] = hc_bytealign_S ( 0, w0[0], offset); w0[0] = 0; break; case 2: - w7[3] = amd_bytealign_S (w7[0], w7[1], offset); - w7[2] = amd_bytealign_S (w6[3], w7[0], offset); - w7[1] = amd_bytealign_S (w6[2], w6[3], offset); - w7[0] = amd_bytealign_S (w6[1], w6[2], offset); - w6[3] = amd_bytealign_S (w6[0], w6[1], offset); - w6[2] = amd_bytealign_S (w5[3], w6[0], offset); - w6[1] = amd_bytealign_S (w5[2], w5[3], offset); - w6[0] = amd_bytealign_S (w5[1], w5[2], offset); - w5[3] = amd_bytealign_S (w5[0], w5[1], offset); - w5[2] = amd_bytealign_S (w4[3], w5[0], offset); - w5[1] = amd_bytealign_S (w4[2], w4[3], offset); - w5[0] = amd_bytealign_S (w4[1], w4[2], offset); - w4[3] = amd_bytealign_S (w4[0], w4[1], offset); - w4[2] = amd_bytealign_S (w3[3], w4[0], offset); - w4[1] = amd_bytealign_S (w3[2], w3[3], offset); - w4[0] = amd_bytealign_S (w3[1], w3[2], offset); - w3[3] = amd_bytealign_S (w3[0], w3[1], offset); - w3[2] = amd_bytealign_S (w2[3], w3[0], offset); - w3[1] = amd_bytealign_S (w2[2], w2[3], offset); - w3[0] = amd_bytealign_S (w2[1], w2[2], offset); - w2[3] = amd_bytealign_S (w2[0], w2[1], offset); - w2[2] = amd_bytealign_S (w1[3], w2[0], offset); - w2[1] = amd_bytealign_S (w1[2], w1[3], offset); - w2[0] = amd_bytealign_S (w1[1], w1[2], offset); - w1[3] = amd_bytealign_S (w1[0], w1[1], offset); - w1[2] = amd_bytealign_S (w0[3], w1[0], offset); - w1[1] = amd_bytealign_S (w0[2], w0[3], offset); - w1[0] = amd_bytealign_S (w0[1], w0[2], offset); - w0[3] = amd_bytealign_S (w0[0], w0[1], offset); - w0[2] = amd_bytealign_S ( 0, w0[0], offset); + w7[3] = hc_bytealign_S (w7[0], w7[1], offset); + w7[2] = hc_bytealign_S (w6[3], w7[0], offset); + w7[1] = hc_bytealign_S (w6[2], w6[3], offset); + w7[0] = hc_bytealign_S (w6[1], w6[2], offset); + w6[3] = hc_bytealign_S (w6[0], w6[1], offset); + w6[2] = hc_bytealign_S (w5[3], w6[0], offset); + w6[1] = hc_bytealign_S (w5[2], w5[3], offset); + w6[0] = hc_bytealign_S (w5[1], w5[2], offset); + w5[3] = hc_bytealign_S (w5[0], w5[1], offset); + w5[2] = hc_bytealign_S (w4[3], w5[0], offset); + w5[1] = hc_bytealign_S (w4[2], w4[3], offset); + w5[0] = hc_bytealign_S (w4[1], w4[2], offset); + w4[3] = hc_bytealign_S (w4[0], w4[1], offset); + w4[2] = hc_bytealign_S (w3[3], w4[0], offset); + w4[1] = hc_bytealign_S (w3[2], w3[3], offset); + w4[0] = hc_bytealign_S (w3[1], w3[2], offset); + w3[3] = hc_bytealign_S (w3[0], w3[1], offset); + w3[2] = hc_bytealign_S (w2[3], w3[0], offset); + w3[1] = hc_bytealign_S (w2[2], w2[3], offset); + w3[0] = hc_bytealign_S (w2[1], w2[2], offset); + w2[3] = hc_bytealign_S (w2[0], w2[1], offset); + w2[2] = hc_bytealign_S (w1[3], w2[0], offset); + w2[1] = hc_bytealign_S (w1[2], w1[3], offset); + w2[0] = hc_bytealign_S (w1[1], w1[2], offset); + w1[3] = hc_bytealign_S (w1[0], w1[1], offset); + w1[2] = hc_bytealign_S (w0[3], w1[0], offset); + w1[1] = hc_bytealign_S (w0[2], w0[3], offset); + w1[0] = hc_bytealign_S (w0[1], w0[2], offset); + w0[3] = hc_bytealign_S (w0[0], w0[1], offset); + w0[2] = hc_bytealign_S ( 0, w0[0], offset); w0[1] = 0; w0[0] = 0; break; case 3: - w7[3] = amd_bytealign_S (w6[3], w7[0], offset); - w7[2] = amd_bytealign_S (w6[2], w6[3], offset); - w7[1] = amd_bytealign_S (w6[1], w6[2], offset); - w7[0] = amd_bytealign_S (w6[0], w6[1], offset); - w6[3] = amd_bytealign_S (w5[3], w6[0], offset); - w6[2] = amd_bytealign_S (w5[2], w5[3], offset); - w6[1] = amd_bytealign_S (w5[1], w5[2], offset); - w6[0] = amd_bytealign_S (w5[0], w5[1], offset); - w5[3] = amd_bytealign_S (w4[3], w5[0], offset); - w5[2] = amd_bytealign_S (w4[2], w4[3], offset); - w5[1] = amd_bytealign_S (w4[1], w4[2], offset); - w5[0] = amd_bytealign_S (w4[0], w4[1], offset); - w4[3] = amd_bytealign_S (w3[3], w4[0], offset); - w4[2] = amd_bytealign_S (w3[2], w3[3], offset); - w4[1] = amd_bytealign_S (w3[1], w3[2], offset); - w4[0] = amd_bytealign_S (w3[0], w3[1], offset); - w3[3] = amd_bytealign_S (w2[3], w3[0], offset); - w3[2] = amd_bytealign_S (w2[2], w2[3], offset); - w3[1] = amd_bytealign_S (w2[1], w2[2], offset); - w3[0] = amd_bytealign_S (w2[0], w2[1], offset); - w2[3] = amd_bytealign_S (w1[3], w2[0], offset); - w2[2] = amd_bytealign_S (w1[2], w1[3], offset); - w2[1] = amd_bytealign_S (w1[1], w1[2], offset); - w2[0] = amd_bytealign_S (w1[0], w1[1], offset); - w1[3] = amd_bytealign_S (w0[3], w1[0], offset); - w1[2] = amd_bytealign_S (w0[2], w0[3], offset); - w1[1] = amd_bytealign_S (w0[1], w0[2], offset); - w1[0] = amd_bytealign_S (w0[0], w0[1], offset); - w0[3] = amd_bytealign_S ( 0, w0[0], offset); + w7[3] = hc_bytealign_S (w6[3], w7[0], offset); + w7[2] = hc_bytealign_S (w6[2], w6[3], offset); + w7[1] = hc_bytealign_S (w6[1], w6[2], offset); + w7[0] = hc_bytealign_S (w6[0], w6[1], offset); + w6[3] = hc_bytealign_S (w5[3], w6[0], offset); + w6[2] = hc_bytealign_S (w5[2], w5[3], offset); + w6[1] = hc_bytealign_S (w5[1], w5[2], offset); + w6[0] = hc_bytealign_S (w5[0], w5[1], offset); + w5[3] = hc_bytealign_S (w4[3], w5[0], offset); + w5[2] = hc_bytealign_S (w4[2], w4[3], offset); + w5[1] = hc_bytealign_S (w4[1], w4[2], offset); + w5[0] = hc_bytealign_S (w4[0], w4[1], offset); + w4[3] = hc_bytealign_S (w3[3], w4[0], offset); + w4[2] = hc_bytealign_S (w3[2], w3[3], offset); + w4[1] = hc_bytealign_S (w3[1], w3[2], offset); + w4[0] = hc_bytealign_S (w3[0], w3[1], offset); + w3[3] = hc_bytealign_S (w2[3], w3[0], offset); + w3[2] = hc_bytealign_S (w2[2], w2[3], offset); + w3[1] = hc_bytealign_S (w2[1], w2[2], offset); + w3[0] = hc_bytealign_S (w2[0], w2[1], offset); + w2[3] = hc_bytealign_S (w1[3], w2[0], offset); + w2[2] = hc_bytealign_S (w1[2], w1[3], offset); + w2[1] = hc_bytealign_S (w1[1], w1[2], offset); + w2[0] = hc_bytealign_S (w1[0], w1[1], offset); + w1[3] = hc_bytealign_S (w0[3], w1[0], offset); + w1[2] = hc_bytealign_S (w0[2], w0[3], offset); + w1[1] = hc_bytealign_S (w0[1], w0[2], offset); + w1[0] = hc_bytealign_S (w0[0], w0[1], offset); + w0[3] = hc_bytealign_S ( 0, w0[0], offset); w0[2] = 0; w0[1] = 0; w0[0] = 0; @@ -35009,34 +35009,34 @@ DECLSPEC void switch_buffer_by_offset_8x4_le_S (u32 *w0, u32 *w1, u32 *w2, u32 * break; case 4: - w7[3] = amd_bytealign_S (w6[2], w6[3], offset); - w7[2] = amd_bytealign_S (w6[1], w6[2], offset); - w7[1] = amd_bytealign_S (w6[0], w6[1], offset); - w7[0] = amd_bytealign_S (w5[3], w6[0], offset); - w6[3] = amd_bytealign_S (w5[2], w5[3], offset); - w6[2] = amd_bytealign_S (w5[1], w5[2], offset); - w6[1] = amd_bytealign_S (w5[0], w5[1], offset); - w6[0] = amd_bytealign_S (w4[3], w5[0], offset); - w5[3] = amd_bytealign_S (w4[2], w4[3], offset); - w5[2] = amd_bytealign_S (w4[1], w4[2], offset); - w5[1] = amd_bytealign_S (w4[0], w4[1], offset); - w5[0] = amd_bytealign_S (w3[3], w4[0], offset); - w4[3] = amd_bytealign_S (w3[2], w3[3], offset); - w4[2] = amd_bytealign_S (w3[1], w3[2], offset); - w4[1] = amd_bytealign_S (w3[0], w3[1], offset); - w4[0] = amd_bytealign_S (w2[3], w3[0], offset); - w3[3] = amd_bytealign_S (w2[2], w2[3], offset); - w3[2] = amd_bytealign_S (w2[1], w2[2], offset); - w3[1] = amd_bytealign_S (w2[0], w2[1], offset); - w3[0] = amd_bytealign_S (w1[3], w2[0], offset); - w2[3] = amd_bytealign_S (w1[2], w1[3], offset); - w2[2] = amd_bytealign_S (w1[1], w1[2], offset); - w2[1] = amd_bytealign_S (w1[0], w1[1], offset); - w2[0] = amd_bytealign_S (w0[3], w1[0], offset); - w1[3] = amd_bytealign_S (w0[2], w0[3], offset); - w1[2] = amd_bytealign_S (w0[1], w0[2], offset); - w1[1] = amd_bytealign_S (w0[0], w0[1], offset); - w1[0] = amd_bytealign_S ( 0, w0[0], offset); + w7[3] = hc_bytealign_S (w6[2], w6[3], offset); + w7[2] = hc_bytealign_S (w6[1], w6[2], offset); + w7[1] = hc_bytealign_S (w6[0], w6[1], offset); + w7[0] = hc_bytealign_S (w5[3], w6[0], offset); + w6[3] = hc_bytealign_S (w5[2], w5[3], offset); + w6[2] = hc_bytealign_S (w5[1], w5[2], offset); + w6[1] = hc_bytealign_S (w5[0], w5[1], offset); + w6[0] = hc_bytealign_S (w4[3], w5[0], offset); + w5[3] = hc_bytealign_S (w4[2], w4[3], offset); + w5[2] = hc_bytealign_S (w4[1], w4[2], offset); + w5[1] = hc_bytealign_S (w4[0], w4[1], offset); + w5[0] = hc_bytealign_S (w3[3], w4[0], offset); + w4[3] = hc_bytealign_S (w3[2], w3[3], offset); + w4[2] = hc_bytealign_S (w3[1], w3[2], offset); + w4[1] = hc_bytealign_S (w3[0], w3[1], offset); + w4[0] = hc_bytealign_S (w2[3], w3[0], offset); + w3[3] = hc_bytealign_S (w2[2], w2[3], offset); + w3[2] = hc_bytealign_S (w2[1], w2[2], offset); + w3[1] = hc_bytealign_S (w2[0], w2[1], offset); + w3[0] = hc_bytealign_S (w1[3], w2[0], offset); + w2[3] = hc_bytealign_S (w1[2], w1[3], offset); + w2[2] = hc_bytealign_S (w1[1], w1[2], offset); + w2[1] = hc_bytealign_S (w1[0], w1[1], offset); + w2[0] = hc_bytealign_S (w0[3], w1[0], offset); + w1[3] = hc_bytealign_S (w0[2], w0[3], offset); + w1[2] = hc_bytealign_S (w0[1], w0[2], offset); + w1[1] = hc_bytealign_S (w0[0], w0[1], offset); + w1[0] = hc_bytealign_S ( 0, w0[0], offset); w0[3] = 0; w0[2] = 0; w0[1] = 0; @@ -35045,33 +35045,33 @@ DECLSPEC void switch_buffer_by_offset_8x4_le_S (u32 *w0, u32 *w1, u32 *w2, u32 * break; case 5: - w7[3] = amd_bytealign_S (w6[1], w6[2], offset); - w7[2] = amd_bytealign_S (w6[0], w6[1], offset); - w7[1] = amd_bytealign_S (w5[3], w6[0], offset); - w7[0] = amd_bytealign_S (w5[2], w5[3], offset); - w6[3] = amd_bytealign_S (w5[1], w5[2], offset); - w6[2] = amd_bytealign_S (w5[0], w5[1], offset); - w6[1] = amd_bytealign_S (w4[3], w5[0], offset); - w6[0] = amd_bytealign_S (w4[2], w4[3], offset); - w5[3] = amd_bytealign_S (w4[1], w4[2], offset); - w5[2] = amd_bytealign_S (w4[0], w4[1], offset); - w5[1] = amd_bytealign_S (w3[3], w4[0], offset); - w5[0] = amd_bytealign_S (w3[2], w3[3], offset); - w4[3] = amd_bytealign_S (w3[1], w3[2], offset); - w4[2] = amd_bytealign_S (w3[0], w3[1], offset); - w4[1] = amd_bytealign_S (w2[3], w3[0], offset); - w4[0] = amd_bytealign_S (w2[2], w2[3], offset); - w3[3] = amd_bytealign_S (w2[1], w2[2], offset); - w3[2] = amd_bytealign_S (w2[0], w2[1], offset); - w3[1] = amd_bytealign_S (w1[3], w2[0], offset); - w3[0] = amd_bytealign_S (w1[2], w1[3], offset); - w2[3] = amd_bytealign_S (w1[1], w1[2], offset); - w2[2] = amd_bytealign_S (w1[0], w1[1], offset); - w2[1] = amd_bytealign_S (w0[3], w1[0], offset); - w2[0] = amd_bytealign_S (w0[2], w0[3], offset); - w1[3] = amd_bytealign_S (w0[1], w0[2], offset); - w1[2] = amd_bytealign_S (w0[0], w0[1], offset); - w1[1] = amd_bytealign_S ( 0, w0[0], offset); + w7[3] = hc_bytealign_S (w6[1], w6[2], offset); + w7[2] = hc_bytealign_S (w6[0], w6[1], offset); + w7[1] = hc_bytealign_S (w5[3], w6[0], offset); + w7[0] = hc_bytealign_S (w5[2], w5[3], offset); + w6[3] = hc_bytealign_S (w5[1], w5[2], offset); + w6[2] = hc_bytealign_S (w5[0], w5[1], offset); + w6[1] = hc_bytealign_S (w4[3], w5[0], offset); + w6[0] = hc_bytealign_S (w4[2], w4[3], offset); + w5[3] = hc_bytealign_S (w4[1], w4[2], offset); + w5[2] = hc_bytealign_S (w4[0], w4[1], offset); + w5[1] = hc_bytealign_S (w3[3], w4[0], offset); + w5[0] = hc_bytealign_S (w3[2], w3[3], offset); + w4[3] = hc_bytealign_S (w3[1], w3[2], offset); + w4[2] = hc_bytealign_S (w3[0], w3[1], offset); + w4[1] = hc_bytealign_S (w2[3], w3[0], offset); + w4[0] = hc_bytealign_S (w2[2], w2[3], offset); + w3[3] = hc_bytealign_S (w2[1], w2[2], offset); + w3[2] = hc_bytealign_S (w2[0], w2[1], offset); + w3[1] = hc_bytealign_S (w1[3], w2[0], offset); + w3[0] = hc_bytealign_S (w1[2], w1[3], offset); + w2[3] = hc_bytealign_S (w1[1], w1[2], offset); + w2[2] = hc_bytealign_S (w1[0], w1[1], offset); + w2[1] = hc_bytealign_S (w0[3], w1[0], offset); + w2[0] = hc_bytealign_S (w0[2], w0[3], offset); + w1[3] = hc_bytealign_S (w0[1], w0[2], offset); + w1[2] = hc_bytealign_S (w0[0], w0[1], offset); + w1[1] = hc_bytealign_S ( 0, w0[0], offset); w1[0] = 0; w0[3] = 0; w0[2] = 0; @@ -35081,32 +35081,32 @@ DECLSPEC void switch_buffer_by_offset_8x4_le_S (u32 *w0, u32 *w1, u32 *w2, u32 * break; case 6: - w7[3] = amd_bytealign_S (w6[0], w6[1], offset); - w7[2] = amd_bytealign_S (w5[3], w6[0], offset); - w7[1] = amd_bytealign_S (w5[2], w5[3], offset); - w7[0] = amd_bytealign_S (w5[1], w5[2], offset); - w6[3] = amd_bytealign_S (w5[0], w5[1], offset); - w6[2] = amd_bytealign_S (w4[3], w5[0], offset); - w6[1] = amd_bytealign_S (w4[2], w4[3], offset); - w6[0] = amd_bytealign_S (w4[1], w4[2], offset); - w5[3] = amd_bytealign_S (w4[0], w4[1], offset); - w5[2] = amd_bytealign_S (w3[3], w4[0], offset); - w5[1] = amd_bytealign_S (w3[2], w3[3], offset); - w5[0] = amd_bytealign_S (w3[1], w3[2], offset); - w4[3] = amd_bytealign_S (w3[0], w3[1], offset); - w4[2] = amd_bytealign_S (w2[3], w3[0], offset); - w4[1] = amd_bytealign_S (w2[2], w2[3], offset); - w4[0] = amd_bytealign_S (w2[1], w2[2], offset); - w3[3] = amd_bytealign_S (w2[0], w2[1], offset); - w3[2] = amd_bytealign_S (w1[3], w2[0], offset); - w3[1] = amd_bytealign_S (w1[2], w1[3], offset); - w3[0] = amd_bytealign_S (w1[1], w1[2], offset); - w2[3] = amd_bytealign_S (w1[0], w1[1], offset); - w2[2] = amd_bytealign_S (w0[3], w1[0], offset); - w2[1] = amd_bytealign_S (w0[2], w0[3], offset); - w2[0] = amd_bytealign_S (w0[1], w0[2], offset); - w1[3] = amd_bytealign_S (w0[0], w0[1], offset); - w1[2] = amd_bytealign_S ( 0, w0[0], offset); + w7[3] = hc_bytealign_S (w6[0], w6[1], offset); + w7[2] = hc_bytealign_S (w5[3], w6[0], offset); + w7[1] = hc_bytealign_S (w5[2], w5[3], offset); + w7[0] = hc_bytealign_S (w5[1], w5[2], offset); + w6[3] = hc_bytealign_S (w5[0], w5[1], offset); + w6[2] = hc_bytealign_S (w4[3], w5[0], offset); + w6[1] = hc_bytealign_S (w4[2], w4[3], offset); + w6[0] = hc_bytealign_S (w4[1], w4[2], offset); + w5[3] = hc_bytealign_S (w4[0], w4[1], offset); + w5[2] = hc_bytealign_S (w3[3], w4[0], offset); + w5[1] = hc_bytealign_S (w3[2], w3[3], offset); + w5[0] = hc_bytealign_S (w3[1], w3[2], offset); + w4[3] = hc_bytealign_S (w3[0], w3[1], offset); + w4[2] = hc_bytealign_S (w2[3], w3[0], offset); + w4[1] = hc_bytealign_S (w2[2], w2[3], offset); + w4[0] = hc_bytealign_S (w2[1], w2[2], offset); + w3[3] = hc_bytealign_S (w2[0], w2[1], offset); + w3[2] = hc_bytealign_S (w1[3], w2[0], offset); + w3[1] = hc_bytealign_S (w1[2], w1[3], offset); + w3[0] = hc_bytealign_S (w1[1], w1[2], offset); + w2[3] = hc_bytealign_S (w1[0], w1[1], offset); + w2[2] = hc_bytealign_S (w0[3], w1[0], offset); + w2[1] = hc_bytealign_S (w0[2], w0[3], offset); + w2[0] = hc_bytealign_S (w0[1], w0[2], offset); + w1[3] = hc_bytealign_S (w0[0], w0[1], offset); + w1[2] = hc_bytealign_S ( 0, w0[0], offset); w1[1] = 0; w1[0] = 0; w0[3] = 0; @@ -35117,31 +35117,31 @@ DECLSPEC void switch_buffer_by_offset_8x4_le_S (u32 *w0, u32 *w1, u32 *w2, u32 * break; case 7: - w7[3] = amd_bytealign_S (w5[3], w6[0], offset); - w7[2] = amd_bytealign_S (w5[2], w5[3], offset); - w7[1] = amd_bytealign_S (w5[1], w5[2], offset); - w7[0] = amd_bytealign_S (w5[0], w5[1], offset); - w6[3] = amd_bytealign_S (w4[3], w5[0], offset); - w6[2] = amd_bytealign_S (w4[2], w4[3], offset); - w6[1] = amd_bytealign_S (w4[1], w4[2], offset); - w6[0] = amd_bytealign_S (w4[0], w4[1], offset); - w5[3] = amd_bytealign_S (w3[3], w4[0], offset); - w5[2] = amd_bytealign_S (w3[2], w3[3], offset); - w5[1] = amd_bytealign_S (w3[1], w3[2], offset); - w5[0] = amd_bytealign_S (w3[0], w3[1], offset); - w4[3] = amd_bytealign_S (w2[3], w3[0], offset); - w4[2] = amd_bytealign_S (w2[2], w2[3], offset); - w4[1] = amd_bytealign_S (w2[1], w2[2], offset); - w4[0] = amd_bytealign_S (w2[0], w2[1], offset); - w3[3] = amd_bytealign_S (w1[3], w2[0], offset); - w3[2] = amd_bytealign_S (w1[2], w1[3], offset); - w3[1] = amd_bytealign_S (w1[1], w1[2], offset); - w3[0] = amd_bytealign_S (w1[0], w1[1], offset); - w2[3] = amd_bytealign_S (w0[3], w1[0], offset); - w2[2] = amd_bytealign_S (w0[2], w0[3], offset); - w2[1] = amd_bytealign_S (w0[1], w0[2], offset); - w2[0] = amd_bytealign_S (w0[0], w0[1], offset); - w1[3] = amd_bytealign_S ( 0, w0[0], offset); + w7[3] = hc_bytealign_S (w5[3], w6[0], offset); + w7[2] = hc_bytealign_S (w5[2], w5[3], offset); + w7[1] = hc_bytealign_S (w5[1], w5[2], offset); + w7[0] = hc_bytealign_S (w5[0], w5[1], offset); + w6[3] = hc_bytealign_S (w4[3], w5[0], offset); + w6[2] = hc_bytealign_S (w4[2], w4[3], offset); + w6[1] = hc_bytealign_S (w4[1], w4[2], offset); + w6[0] = hc_bytealign_S (w4[0], w4[1], offset); + w5[3] = hc_bytealign_S (w3[3], w4[0], offset); + w5[2] = hc_bytealign_S (w3[2], w3[3], offset); + w5[1] = hc_bytealign_S (w3[1], w3[2], offset); + w5[0] = hc_bytealign_S (w3[0], w3[1], offset); + w4[3] = hc_bytealign_S (w2[3], w3[0], offset); + w4[2] = hc_bytealign_S (w2[2], w2[3], offset); + w4[1] = hc_bytealign_S (w2[1], w2[2], offset); + w4[0] = hc_bytealign_S (w2[0], w2[1], offset); + w3[3] = hc_bytealign_S (w1[3], w2[0], offset); + w3[2] = hc_bytealign_S (w1[2], w1[3], offset); + w3[1] = hc_bytealign_S (w1[1], w1[2], offset); + w3[0] = hc_bytealign_S (w1[0], w1[1], offset); + w2[3] = hc_bytealign_S (w0[3], w1[0], offset); + w2[2] = hc_bytealign_S (w0[2], w0[3], offset); + w2[1] = hc_bytealign_S (w0[1], w0[2], offset); + w2[0] = hc_bytealign_S (w0[0], w0[1], offset); + w1[3] = hc_bytealign_S ( 0, w0[0], offset); w1[2] = 0; w1[1] = 0; w1[0] = 0; @@ -35153,30 +35153,30 @@ DECLSPEC void switch_buffer_by_offset_8x4_le_S (u32 *w0, u32 *w1, u32 *w2, u32 * break; case 8: - w7[3] = amd_bytealign_S (w5[2], w5[3], offset); - w7[2] = amd_bytealign_S (w5[1], w5[2], offset); - w7[1] = amd_bytealign_S (w5[0], w5[1], offset); - w7[0] = amd_bytealign_S (w4[3], w5[0], offset); - w6[3] = amd_bytealign_S (w4[2], w4[3], offset); - w6[2] = amd_bytealign_S (w4[1], w4[2], offset); - w6[1] = amd_bytealign_S (w4[0], w4[1], offset); - w6[0] = amd_bytealign_S (w3[3], w4[0], offset); - w5[3] = amd_bytealign_S (w3[2], w3[3], offset); - w5[2] = amd_bytealign_S (w3[1], w3[2], offset); - w5[1] = amd_bytealign_S (w3[0], w3[1], offset); - w5[0] = amd_bytealign_S (w2[3], w3[0], offset); - w4[3] = amd_bytealign_S (w2[2], w2[3], offset); - w4[2] = amd_bytealign_S (w2[1], w2[2], offset); - w4[1] = amd_bytealign_S (w2[0], w2[1], offset); - w4[0] = amd_bytealign_S (w1[3], w2[0], offset); - w3[3] = amd_bytealign_S (w1[2], w1[3], offset); - w3[2] = amd_bytealign_S (w1[1], w1[2], offset); - w3[1] = amd_bytealign_S (w1[0], w1[1], offset); - w3[0] = amd_bytealign_S (w0[3], w1[0], offset); - w2[3] = amd_bytealign_S (w0[2], w0[3], offset); - w2[2] = amd_bytealign_S (w0[1], w0[2], offset); - w2[1] = amd_bytealign_S (w0[0], w0[1], offset); - w2[0] = amd_bytealign_S ( 0, w0[0], offset); + w7[3] = hc_bytealign_S (w5[2], w5[3], offset); + w7[2] = hc_bytealign_S (w5[1], w5[2], offset); + w7[1] = hc_bytealign_S (w5[0], w5[1], offset); + w7[0] = hc_bytealign_S (w4[3], w5[0], offset); + w6[3] = hc_bytealign_S (w4[2], w4[3], offset); + w6[2] = hc_bytealign_S (w4[1], w4[2], offset); + w6[1] = hc_bytealign_S (w4[0], w4[1], offset); + w6[0] = hc_bytealign_S (w3[3], w4[0], offset); + w5[3] = hc_bytealign_S (w3[2], w3[3], offset); + w5[2] = hc_bytealign_S (w3[1], w3[2], offset); + w5[1] = hc_bytealign_S (w3[0], w3[1], offset); + w5[0] = hc_bytealign_S (w2[3], w3[0], offset); + w4[3] = hc_bytealign_S (w2[2], w2[3], offset); + w4[2] = hc_bytealign_S (w2[1], w2[2], offset); + w4[1] = hc_bytealign_S (w2[0], w2[1], offset); + w4[0] = hc_bytealign_S (w1[3], w2[0], offset); + w3[3] = hc_bytealign_S (w1[2], w1[3], offset); + w3[2] = hc_bytealign_S (w1[1], w1[2], offset); + w3[1] = hc_bytealign_S (w1[0], w1[1], offset); + w3[0] = hc_bytealign_S (w0[3], w1[0], offset); + w2[3] = hc_bytealign_S (w0[2], w0[3], offset); + w2[2] = hc_bytealign_S (w0[1], w0[2], offset); + w2[1] = hc_bytealign_S (w0[0], w0[1], offset); + w2[0] = hc_bytealign_S ( 0, w0[0], offset); w1[3] = 0; w1[2] = 0; w1[1] = 0; @@ -35189,29 +35189,29 @@ DECLSPEC void switch_buffer_by_offset_8x4_le_S (u32 *w0, u32 *w1, u32 *w2, u32 * break; case 9: - w7[3] = amd_bytealign_S (w5[1], w5[2], offset); - w7[2] = amd_bytealign_S (w5[0], w5[1], offset); - w7[1] = amd_bytealign_S (w4[3], w5[0], offset); - w7[0] = amd_bytealign_S (w4[2], w4[3], offset); - w6[3] = amd_bytealign_S (w4[1], w4[2], offset); - w6[2] = amd_bytealign_S (w4[0], w4[1], offset); - w6[1] = amd_bytealign_S (w3[3], w4[0], offset); - w6[0] = amd_bytealign_S (w3[2], w3[3], offset); - w5[3] = amd_bytealign_S (w3[1], w3[2], offset); - w5[2] = amd_bytealign_S (w3[0], w3[1], offset); - w5[1] = amd_bytealign_S (w2[3], w3[0], offset); - w5[0] = amd_bytealign_S (w2[2], w2[3], offset); - w4[3] = amd_bytealign_S (w2[1], w2[2], offset); - w4[2] = amd_bytealign_S (w2[0], w2[1], offset); - w4[1] = amd_bytealign_S (w1[3], w2[0], offset); - w4[0] = amd_bytealign_S (w1[2], w1[3], offset); - w3[3] = amd_bytealign_S (w1[1], w1[2], offset); - w3[2] = amd_bytealign_S (w1[0], w1[1], offset); - w3[1] = amd_bytealign_S (w0[3], w1[0], offset); - w3[0] = amd_bytealign_S (w0[2], w0[3], offset); - w2[3] = amd_bytealign_S (w0[1], w0[2], offset); - w2[2] = amd_bytealign_S (w0[0], w0[1], offset); - w2[1] = amd_bytealign_S ( 0, w0[0], offset); + w7[3] = hc_bytealign_S (w5[1], w5[2], offset); + w7[2] = hc_bytealign_S (w5[0], w5[1], offset); + w7[1] = hc_bytealign_S (w4[3], w5[0], offset); + w7[0] = hc_bytealign_S (w4[2], w4[3], offset); + w6[3] = hc_bytealign_S (w4[1], w4[2], offset); + w6[2] = hc_bytealign_S (w4[0], w4[1], offset); + w6[1] = hc_bytealign_S (w3[3], w4[0], offset); + w6[0] = hc_bytealign_S (w3[2], w3[3], offset); + w5[3] = hc_bytealign_S (w3[1], w3[2], offset); + w5[2] = hc_bytealign_S (w3[0], w3[1], offset); + w5[1] = hc_bytealign_S (w2[3], w3[0], offset); + w5[0] = hc_bytealign_S (w2[2], w2[3], offset); + w4[3] = hc_bytealign_S (w2[1], w2[2], offset); + w4[2] = hc_bytealign_S (w2[0], w2[1], offset); + w4[1] = hc_bytealign_S (w1[3], w2[0], offset); + w4[0] = hc_bytealign_S (w1[2], w1[3], offset); + w3[3] = hc_bytealign_S (w1[1], w1[2], offset); + w3[2] = hc_bytealign_S (w1[0], w1[1], offset); + w3[1] = hc_bytealign_S (w0[3], w1[0], offset); + w3[0] = hc_bytealign_S (w0[2], w0[3], offset); + w2[3] = hc_bytealign_S (w0[1], w0[2], offset); + w2[2] = hc_bytealign_S (w0[0], w0[1], offset); + w2[1] = hc_bytealign_S ( 0, w0[0], offset); w2[0] = 0; w1[3] = 0; w1[2] = 0; @@ -35225,28 +35225,28 @@ DECLSPEC void switch_buffer_by_offset_8x4_le_S (u32 *w0, u32 *w1, u32 *w2, u32 * break; case 10: - w7[3] = amd_bytealign_S (w5[0], w5[1], offset); - w7[2] = amd_bytealign_S (w4[3], w5[0], offset); - w7[1] = amd_bytealign_S (w4[2], w4[3], offset); - w7[0] = amd_bytealign_S (w4[1], w4[2], offset); - w6[3] = amd_bytealign_S (w4[0], w4[1], offset); - w6[2] = amd_bytealign_S (w3[3], w4[0], offset); - w6[1] = amd_bytealign_S (w3[2], w3[3], offset); - w6[0] = amd_bytealign_S (w3[1], w3[2], offset); - w5[3] = amd_bytealign_S (w3[0], w3[1], offset); - w5[2] = amd_bytealign_S (w2[3], w3[0], offset); - w5[1] = amd_bytealign_S (w2[2], w2[3], offset); - w5[0] = amd_bytealign_S (w2[1], w2[2], offset); - w4[3] = amd_bytealign_S (w2[0], w2[1], offset); - w4[2] = amd_bytealign_S (w1[3], w2[0], offset); - w4[1] = amd_bytealign_S (w1[2], w1[3], offset); - w4[0] = amd_bytealign_S (w1[1], w1[2], offset); - w3[3] = amd_bytealign_S (w1[0], w1[1], offset); - w3[2] = amd_bytealign_S (w0[3], w1[0], offset); - w3[1] = amd_bytealign_S (w0[2], w0[3], offset); - w3[0] = amd_bytealign_S (w0[1], w0[2], offset); - w2[3] = amd_bytealign_S (w0[0], w0[1], offset); - w2[2] = amd_bytealign_S ( 0, w0[0], offset); + w7[3] = hc_bytealign_S (w5[0], w5[1], offset); + w7[2] = hc_bytealign_S (w4[3], w5[0], offset); + w7[1] = hc_bytealign_S (w4[2], w4[3], offset); + w7[0] = hc_bytealign_S (w4[1], w4[2], offset); + w6[3] = hc_bytealign_S (w4[0], w4[1], offset); + w6[2] = hc_bytealign_S (w3[3], w4[0], offset); + w6[1] = hc_bytealign_S (w3[2], w3[3], offset); + w6[0] = hc_bytealign_S (w3[1], w3[2], offset); + w5[3] = hc_bytealign_S (w3[0], w3[1], offset); + w5[2] = hc_bytealign_S (w2[3], w3[0], offset); + w5[1] = hc_bytealign_S (w2[2], w2[3], offset); + w5[0] = hc_bytealign_S (w2[1], w2[2], offset); + w4[3] = hc_bytealign_S (w2[0], w2[1], offset); + w4[2] = hc_bytealign_S (w1[3], w2[0], offset); + w4[1] = hc_bytealign_S (w1[2], w1[3], offset); + w4[0] = hc_bytealign_S (w1[1], w1[2], offset); + w3[3] = hc_bytealign_S (w1[0], w1[1], offset); + w3[2] = hc_bytealign_S (w0[3], w1[0], offset); + w3[1] = hc_bytealign_S (w0[2], w0[3], offset); + w3[0] = hc_bytealign_S (w0[1], w0[2], offset); + w2[3] = hc_bytealign_S (w0[0], w0[1], offset); + w2[2] = hc_bytealign_S ( 0, w0[0], offset); w2[1] = 0; w2[0] = 0; w1[3] = 0; @@ -35261,27 +35261,27 @@ DECLSPEC void switch_buffer_by_offset_8x4_le_S (u32 *w0, u32 *w1, u32 *w2, u32 * break; case 11: - w7[3] = amd_bytealign_S (w4[3], w5[0], offset); - w7[2] = amd_bytealign_S (w4[2], w4[3], offset); - w7[1] = amd_bytealign_S (w4[1], w4[2], offset); - w7[0] = amd_bytealign_S (w4[0], w4[1], offset); - w6[3] = amd_bytealign_S (w3[3], w4[0], offset); - w6[2] = amd_bytealign_S (w3[2], w3[3], offset); - w6[1] = amd_bytealign_S (w3[1], w3[2], offset); - w6[0] = amd_bytealign_S (w3[0], w3[1], offset); - w5[3] = amd_bytealign_S (w2[3], w3[0], offset); - w5[2] = amd_bytealign_S (w2[2], w2[3], offset); - w5[1] = amd_bytealign_S (w2[1], w2[2], offset); - w5[0] = amd_bytealign_S (w2[0], w2[1], offset); - w4[3] = amd_bytealign_S (w1[3], w2[0], offset); - w4[2] = amd_bytealign_S (w1[2], w1[3], offset); - w4[1] = amd_bytealign_S (w1[1], w1[2], offset); - w4[0] = amd_bytealign_S (w1[0], w1[1], offset); - w3[3] = amd_bytealign_S (w0[3], w1[0], offset); - w3[2] = amd_bytealign_S (w0[2], w0[3], offset); - w3[1] = amd_bytealign_S (w0[1], w0[2], offset); - w3[0] = amd_bytealign_S (w0[0], w0[1], offset); - w2[3] = amd_bytealign_S ( 0, w0[0], offset); + w7[3] = hc_bytealign_S (w4[3], w5[0], offset); + w7[2] = hc_bytealign_S (w4[2], w4[3], offset); + w7[1] = hc_bytealign_S (w4[1], w4[2], offset); + w7[0] = hc_bytealign_S (w4[0], w4[1], offset); + w6[3] = hc_bytealign_S (w3[3], w4[0], offset); + w6[2] = hc_bytealign_S (w3[2], w3[3], offset); + w6[1] = hc_bytealign_S (w3[1], w3[2], offset); + w6[0] = hc_bytealign_S (w3[0], w3[1], offset); + w5[3] = hc_bytealign_S (w2[3], w3[0], offset); + w5[2] = hc_bytealign_S (w2[2], w2[3], offset); + w5[1] = hc_bytealign_S (w2[1], w2[2], offset); + w5[0] = hc_bytealign_S (w2[0], w2[1], offset); + w4[3] = hc_bytealign_S (w1[3], w2[0], offset); + w4[2] = hc_bytealign_S (w1[2], w1[3], offset); + w4[1] = hc_bytealign_S (w1[1], w1[2], offset); + w4[0] = hc_bytealign_S (w1[0], w1[1], offset); + w3[3] = hc_bytealign_S (w0[3], w1[0], offset); + w3[2] = hc_bytealign_S (w0[2], w0[3], offset); + w3[1] = hc_bytealign_S (w0[1], w0[2], offset); + w3[0] = hc_bytealign_S (w0[0], w0[1], offset); + w2[3] = hc_bytealign_S ( 0, w0[0], offset); w2[2] = 0; w2[1] = 0; w2[0] = 0; @@ -35297,26 +35297,26 @@ DECLSPEC void switch_buffer_by_offset_8x4_le_S (u32 *w0, u32 *w1, u32 *w2, u32 * break; case 12: - w7[3] = amd_bytealign_S (w4[2], w4[3], offset); - w7[2] = amd_bytealign_S (w4[1], w4[2], offset); - w7[1] = amd_bytealign_S (w4[0], w4[1], offset); - w7[0] = amd_bytealign_S (w3[3], w4[0], offset); - w6[3] = amd_bytealign_S (w3[2], w3[3], offset); - w6[2] = amd_bytealign_S (w3[1], w3[2], offset); - w6[1] = amd_bytealign_S (w3[0], w3[1], offset); - w6[0] = amd_bytealign_S (w2[3], w3[0], offset); - w5[3] = amd_bytealign_S (w2[2], w2[3], offset); - w5[2] = amd_bytealign_S (w2[1], w2[2], offset); - w5[1] = amd_bytealign_S (w2[0], w2[1], offset); - w5[0] = amd_bytealign_S (w1[3], w2[0], offset); - w4[3] = amd_bytealign_S (w1[2], w1[3], offset); - w4[2] = amd_bytealign_S (w1[1], w1[2], offset); - w4[1] = amd_bytealign_S (w1[0], w1[1], offset); - w4[0] = amd_bytealign_S (w0[3], w1[0], offset); - w3[3] = amd_bytealign_S (w0[2], w0[3], offset); - w3[2] = amd_bytealign_S (w0[1], w0[2], offset); - w3[1] = amd_bytealign_S (w0[0], w0[1], offset); - w3[0] = amd_bytealign_S ( 0, w0[0], offset); + w7[3] = hc_bytealign_S (w4[2], w4[3], offset); + w7[2] = hc_bytealign_S (w4[1], w4[2], offset); + w7[1] = hc_bytealign_S (w4[0], w4[1], offset); + w7[0] = hc_bytealign_S (w3[3], w4[0], offset); + w6[3] = hc_bytealign_S (w3[2], w3[3], offset); + w6[2] = hc_bytealign_S (w3[1], w3[2], offset); + w6[1] = hc_bytealign_S (w3[0], w3[1], offset); + w6[0] = hc_bytealign_S (w2[3], w3[0], offset); + w5[3] = hc_bytealign_S (w2[2], w2[3], offset); + w5[2] = hc_bytealign_S (w2[1], w2[2], offset); + w5[1] = hc_bytealign_S (w2[0], w2[1], offset); + w5[0] = hc_bytealign_S (w1[3], w2[0], offset); + w4[3] = hc_bytealign_S (w1[2], w1[3], offset); + w4[2] = hc_bytealign_S (w1[1], w1[2], offset); + w4[1] = hc_bytealign_S (w1[0], w1[1], offset); + w4[0] = hc_bytealign_S (w0[3], w1[0], offset); + w3[3] = hc_bytealign_S (w0[2], w0[3], offset); + w3[2] = hc_bytealign_S (w0[1], w0[2], offset); + w3[1] = hc_bytealign_S (w0[0], w0[1], offset); + w3[0] = hc_bytealign_S ( 0, w0[0], offset); w2[3] = 0; w2[2] = 0; w2[1] = 0; @@ -35333,25 +35333,25 @@ DECLSPEC void switch_buffer_by_offset_8x4_le_S (u32 *w0, u32 *w1, u32 *w2, u32 * break; case 13: - w7[3] = amd_bytealign_S (w4[1], w4[2], offset); - w7[2] = amd_bytealign_S (w4[0], w4[1], offset); - w7[1] = amd_bytealign_S (w3[3], w4[0], offset); - w7[0] = amd_bytealign_S (w3[2], w3[3], offset); - w6[3] = amd_bytealign_S (w3[1], w3[2], offset); - w6[2] = amd_bytealign_S (w3[0], w3[1], offset); - w6[1] = amd_bytealign_S (w2[3], w3[0], offset); - w6[0] = amd_bytealign_S (w2[2], w2[3], offset); - w5[3] = amd_bytealign_S (w2[1], w2[2], offset); - w5[2] = amd_bytealign_S (w2[0], w2[1], offset); - w5[1] = amd_bytealign_S (w1[3], w2[0], offset); - w5[0] = amd_bytealign_S (w1[2], w1[3], offset); - w4[3] = amd_bytealign_S (w1[1], w1[2], offset); - w4[2] = amd_bytealign_S (w1[0], w1[1], offset); - w4[1] = amd_bytealign_S (w0[3], w1[0], offset); - w4[0] = amd_bytealign_S (w0[2], w0[3], offset); - w3[3] = amd_bytealign_S (w0[1], w0[2], offset); - w3[2] = amd_bytealign_S (w0[0], w0[1], offset); - w3[1] = amd_bytealign_S ( 0, w0[0], offset); + w7[3] = hc_bytealign_S (w4[1], w4[2], offset); + w7[2] = hc_bytealign_S (w4[0], w4[1], offset); + w7[1] = hc_bytealign_S (w3[3], w4[0], offset); + w7[0] = hc_bytealign_S (w3[2], w3[3], offset); + w6[3] = hc_bytealign_S (w3[1], w3[2], offset); + w6[2] = hc_bytealign_S (w3[0], w3[1], offset); + w6[1] = hc_bytealign_S (w2[3], w3[0], offset); + w6[0] = hc_bytealign_S (w2[2], w2[3], offset); + w5[3] = hc_bytealign_S (w2[1], w2[2], offset); + w5[2] = hc_bytealign_S (w2[0], w2[1], offset); + w5[1] = hc_bytealign_S (w1[3], w2[0], offset); + w5[0] = hc_bytealign_S (w1[2], w1[3], offset); + w4[3] = hc_bytealign_S (w1[1], w1[2], offset); + w4[2] = hc_bytealign_S (w1[0], w1[1], offset); + w4[1] = hc_bytealign_S (w0[3], w1[0], offset); + w4[0] = hc_bytealign_S (w0[2], w0[3], offset); + w3[3] = hc_bytealign_S (w0[1], w0[2], offset); + w3[2] = hc_bytealign_S (w0[0], w0[1], offset); + w3[1] = hc_bytealign_S ( 0, w0[0], offset); w3[0] = 0; w2[3] = 0; w2[2] = 0; @@ -35369,24 +35369,24 @@ DECLSPEC void switch_buffer_by_offset_8x4_le_S (u32 *w0, u32 *w1, u32 *w2, u32 * break; case 14: - w7[3] = amd_bytealign_S (w4[0], w4[1], offset); - w7[2] = amd_bytealign_S (w3[3], w4[0], offset); - w7[1] = amd_bytealign_S (w3[2], w3[3], offset); - w7[0] = amd_bytealign_S (w3[1], w3[2], offset); - w6[3] = amd_bytealign_S (w3[0], w3[1], offset); - w6[2] = amd_bytealign_S (w2[3], w3[0], offset); - w6[1] = amd_bytealign_S (w2[2], w2[3], offset); - w6[0] = amd_bytealign_S (w2[1], w2[2], offset); - w5[3] = amd_bytealign_S (w2[0], w2[1], offset); - w5[2] = amd_bytealign_S (w1[3], w2[0], offset); - w5[1] = amd_bytealign_S (w1[2], w1[3], offset); - w5[0] = amd_bytealign_S (w1[1], w1[2], offset); - w4[3] = amd_bytealign_S (w1[0], w1[1], offset); - w4[2] = amd_bytealign_S (w0[3], w1[0], offset); - w4[1] = amd_bytealign_S (w0[2], w0[3], offset); - w4[0] = amd_bytealign_S (w0[1], w0[2], offset); - w3[3] = amd_bytealign_S (w0[0], w0[1], offset); - w3[2] = amd_bytealign_S ( 0, w0[0], offset); + w7[3] = hc_bytealign_S (w4[0], w4[1], offset); + w7[2] = hc_bytealign_S (w3[3], w4[0], offset); + w7[1] = hc_bytealign_S (w3[2], w3[3], offset); + w7[0] = hc_bytealign_S (w3[1], w3[2], offset); + w6[3] = hc_bytealign_S (w3[0], w3[1], offset); + w6[2] = hc_bytealign_S (w2[3], w3[0], offset); + w6[1] = hc_bytealign_S (w2[2], w2[3], offset); + w6[0] = hc_bytealign_S (w2[1], w2[2], offset); + w5[3] = hc_bytealign_S (w2[0], w2[1], offset); + w5[2] = hc_bytealign_S (w1[3], w2[0], offset); + w5[1] = hc_bytealign_S (w1[2], w1[3], offset); + w5[0] = hc_bytealign_S (w1[1], w1[2], offset); + w4[3] = hc_bytealign_S (w1[0], w1[1], offset); + w4[2] = hc_bytealign_S (w0[3], w1[0], offset); + w4[1] = hc_bytealign_S (w0[2], w0[3], offset); + w4[0] = hc_bytealign_S (w0[1], w0[2], offset); + w3[3] = hc_bytealign_S (w0[0], w0[1], offset); + w3[2] = hc_bytealign_S ( 0, w0[0], offset); w3[1] = 0; w3[0] = 0; w2[3] = 0; @@ -35405,23 +35405,23 @@ DECLSPEC void switch_buffer_by_offset_8x4_le_S (u32 *w0, u32 *w1, u32 *w2, u32 * break; case 15: - w7[3] = amd_bytealign_S (w3[3], w4[0], offset); - w7[2] = amd_bytealign_S (w3[2], w3[3], offset); - w7[1] = amd_bytealign_S (w3[1], w3[2], offset); - w7[0] = amd_bytealign_S (w3[0], w3[1], offset); - w6[3] = amd_bytealign_S (w2[3], w3[0], offset); - w6[2] = amd_bytealign_S (w2[2], w2[3], offset); - w6[1] = amd_bytealign_S (w2[1], w2[2], offset); - w6[0] = amd_bytealign_S (w2[0], w2[1], offset); - w5[3] = amd_bytealign_S (w1[3], w2[0], offset); - w5[2] = amd_bytealign_S (w1[2], w1[3], offset); - w5[1] = amd_bytealign_S (w1[1], w1[2], offset); - w5[0] = amd_bytealign_S (w1[0], w1[1], offset); - w4[3] = amd_bytealign_S (w0[3], w1[0], offset); - w4[2] = amd_bytealign_S (w0[2], w0[3], offset); - w4[1] = amd_bytealign_S (w0[1], w0[2], offset); - w4[0] = amd_bytealign_S (w0[0], w0[1], offset); - w3[3] = amd_bytealign_S ( 0, w0[0], offset); + w7[3] = hc_bytealign_S (w3[3], w4[0], offset); + w7[2] = hc_bytealign_S (w3[2], w3[3], offset); + w7[1] = hc_bytealign_S (w3[1], w3[2], offset); + w7[0] = hc_bytealign_S (w3[0], w3[1], offset); + w6[3] = hc_bytealign_S (w2[3], w3[0], offset); + w6[2] = hc_bytealign_S (w2[2], w2[3], offset); + w6[1] = hc_bytealign_S (w2[1], w2[2], offset); + w6[0] = hc_bytealign_S (w2[0], w2[1], offset); + w5[3] = hc_bytealign_S (w1[3], w2[0], offset); + w5[2] = hc_bytealign_S (w1[2], w1[3], offset); + w5[1] = hc_bytealign_S (w1[1], w1[2], offset); + w5[0] = hc_bytealign_S (w1[0], w1[1], offset); + w4[3] = hc_bytealign_S (w0[3], w1[0], offset); + w4[2] = hc_bytealign_S (w0[2], w0[3], offset); + w4[1] = hc_bytealign_S (w0[1], w0[2], offset); + w4[0] = hc_bytealign_S (w0[0], w0[1], offset); + w3[3] = hc_bytealign_S ( 0, w0[0], offset); w3[2] = 0; w3[1] = 0; w3[0] = 0; @@ -35441,22 +35441,22 @@ DECLSPEC void switch_buffer_by_offset_8x4_le_S (u32 *w0, u32 *w1, u32 *w2, u32 * break; case 16: - w7[3] = amd_bytealign_S (w3[2], w3[3], offset); - w7[2] = amd_bytealign_S (w3[1], w3[2], offset); - w7[1] = amd_bytealign_S (w3[0], w3[1], offset); - w7[0] = amd_bytealign_S (w2[3], w3[0], offset); - w6[3] = amd_bytealign_S (w2[2], w2[3], offset); - w6[2] = amd_bytealign_S (w2[1], w2[2], offset); - w6[1] = amd_bytealign_S (w2[0], w2[1], offset); - w6[0] = amd_bytealign_S (w1[3], w2[0], offset); - w5[3] = amd_bytealign_S (w1[2], w1[3], offset); - w5[2] = amd_bytealign_S (w1[1], w1[2], offset); - w5[1] = amd_bytealign_S (w1[0], w1[1], offset); - w5[0] = amd_bytealign_S (w0[3], w1[0], offset); - w4[3] = amd_bytealign_S (w0[2], w0[3], offset); - w4[2] = amd_bytealign_S (w0[1], w0[2], offset); - w4[1] = amd_bytealign_S (w0[0], w0[1], offset); - w4[0] = amd_bytealign_S ( 0, w0[0], offset); + w7[3] = hc_bytealign_S (w3[2], w3[3], offset); + w7[2] = hc_bytealign_S (w3[1], w3[2], offset); + w7[1] = hc_bytealign_S (w3[0], w3[1], offset); + w7[0] = hc_bytealign_S (w2[3], w3[0], offset); + w6[3] = hc_bytealign_S (w2[2], w2[3], offset); + w6[2] = hc_bytealign_S (w2[1], w2[2], offset); + w6[1] = hc_bytealign_S (w2[0], w2[1], offset); + w6[0] = hc_bytealign_S (w1[3], w2[0], offset); + w5[3] = hc_bytealign_S (w1[2], w1[3], offset); + w5[2] = hc_bytealign_S (w1[1], w1[2], offset); + w5[1] = hc_bytealign_S (w1[0], w1[1], offset); + w5[0] = hc_bytealign_S (w0[3], w1[0], offset); + w4[3] = hc_bytealign_S (w0[2], w0[3], offset); + w4[2] = hc_bytealign_S (w0[1], w0[2], offset); + w4[1] = hc_bytealign_S (w0[0], w0[1], offset); + w4[0] = hc_bytealign_S ( 0, w0[0], offset); w3[3] = 0; w3[2] = 0; w3[1] = 0; @@ -35477,21 +35477,21 @@ DECLSPEC void switch_buffer_by_offset_8x4_le_S (u32 *w0, u32 *w1, u32 *w2, u32 * break; case 17: - w7[3] = amd_bytealign_S (w3[1], w3[2], offset); - w7[2] = amd_bytealign_S (w3[0], w3[1], offset); - w7[1] = amd_bytealign_S (w2[3], w3[0], offset); - w7[0] = amd_bytealign_S (w2[2], w2[3], offset); - w6[3] = amd_bytealign_S (w2[1], w2[2], offset); - w6[2] = amd_bytealign_S (w2[0], w2[1], offset); - w6[1] = amd_bytealign_S (w1[3], w2[0], offset); - w6[0] = amd_bytealign_S (w1[2], w1[3], offset); - w5[3] = amd_bytealign_S (w1[1], w1[2], offset); - w5[2] = amd_bytealign_S (w1[0], w1[1], offset); - w5[1] = amd_bytealign_S (w0[3], w1[0], offset); - w5[0] = amd_bytealign_S (w0[2], w0[3], offset); - w4[3] = amd_bytealign_S (w0[1], w0[2], offset); - w4[2] = amd_bytealign_S (w0[0], w0[1], offset); - w4[1] = amd_bytealign_S ( 0, w0[0], offset); + w7[3] = hc_bytealign_S (w3[1], w3[2], offset); + w7[2] = hc_bytealign_S (w3[0], w3[1], offset); + w7[1] = hc_bytealign_S (w2[3], w3[0], offset); + w7[0] = hc_bytealign_S (w2[2], w2[3], offset); + w6[3] = hc_bytealign_S (w2[1], w2[2], offset); + w6[2] = hc_bytealign_S (w2[0], w2[1], offset); + w6[1] = hc_bytealign_S (w1[3], w2[0], offset); + w6[0] = hc_bytealign_S (w1[2], w1[3], offset); + w5[3] = hc_bytealign_S (w1[1], w1[2], offset); + w5[2] = hc_bytealign_S (w1[0], w1[1], offset); + w5[1] = hc_bytealign_S (w0[3], w1[0], offset); + w5[0] = hc_bytealign_S (w0[2], w0[3], offset); + w4[3] = hc_bytealign_S (w0[1], w0[2], offset); + w4[2] = hc_bytealign_S (w0[0], w0[1], offset); + w4[1] = hc_bytealign_S ( 0, w0[0], offset); w4[0] = 0; w3[3] = 0; w3[2] = 0; @@ -35513,20 +35513,20 @@ DECLSPEC void switch_buffer_by_offset_8x4_le_S (u32 *w0, u32 *w1, u32 *w2, u32 * break; case 18: - w7[3] = amd_bytealign_S (w3[0], w3[1], offset); - w7[2] = amd_bytealign_S (w2[3], w3[0], offset); - w7[1] = amd_bytealign_S (w2[2], w2[3], offset); - w7[0] = amd_bytealign_S (w2[1], w2[2], offset); - w6[3] = amd_bytealign_S (w2[0], w2[1], offset); - w6[2] = amd_bytealign_S (w1[3], w2[0], offset); - w6[1] = amd_bytealign_S (w1[2], w1[3], offset); - w6[0] = amd_bytealign_S (w1[1], w1[2], offset); - w5[3] = amd_bytealign_S (w1[0], w1[1], offset); - w5[2] = amd_bytealign_S (w0[3], w1[0], offset); - w5[1] = amd_bytealign_S (w0[2], w0[3], offset); - w5[0] = amd_bytealign_S (w0[1], w0[2], offset); - w4[3] = amd_bytealign_S (w0[0], w0[1], offset); - w4[2] = amd_bytealign_S ( 0, w0[0], offset); + w7[3] = hc_bytealign_S (w3[0], w3[1], offset); + w7[2] = hc_bytealign_S (w2[3], w3[0], offset); + w7[1] = hc_bytealign_S (w2[2], w2[3], offset); + w7[0] = hc_bytealign_S (w2[1], w2[2], offset); + w6[3] = hc_bytealign_S (w2[0], w2[1], offset); + w6[2] = hc_bytealign_S (w1[3], w2[0], offset); + w6[1] = hc_bytealign_S (w1[2], w1[3], offset); + w6[0] = hc_bytealign_S (w1[1], w1[2], offset); + w5[3] = hc_bytealign_S (w1[0], w1[1], offset); + w5[2] = hc_bytealign_S (w0[3], w1[0], offset); + w5[1] = hc_bytealign_S (w0[2], w0[3], offset); + w5[0] = hc_bytealign_S (w0[1], w0[2], offset); + w4[3] = hc_bytealign_S (w0[0], w0[1], offset); + w4[2] = hc_bytealign_S ( 0, w0[0], offset); w4[1] = 0; w4[0] = 0; w3[3] = 0; @@ -35549,19 +35549,19 @@ DECLSPEC void switch_buffer_by_offset_8x4_le_S (u32 *w0, u32 *w1, u32 *w2, u32 * break; case 19: - w7[3] = amd_bytealign_S (w2[3], w3[0], offset); - w7[2] = amd_bytealign_S (w2[2], w2[3], offset); - w7[1] = amd_bytealign_S (w2[1], w2[2], offset); - w7[0] = amd_bytealign_S (w2[0], w2[1], offset); - w6[3] = amd_bytealign_S (w1[3], w2[0], offset); - w6[2] = amd_bytealign_S (w1[2], w1[3], offset); - w6[1] = amd_bytealign_S (w1[1], w1[2], offset); - w6[0] = amd_bytealign_S (w1[0], w1[1], offset); - w5[3] = amd_bytealign_S (w0[3], w1[0], offset); - w5[2] = amd_bytealign_S (w0[2], w0[3], offset); - w5[1] = amd_bytealign_S (w0[1], w0[2], offset); - w5[0] = amd_bytealign_S (w0[0], w0[1], offset); - w4[3] = amd_bytealign_S ( 0, w0[0], offset); + w7[3] = hc_bytealign_S (w2[3], w3[0], offset); + w7[2] = hc_bytealign_S (w2[2], w2[3], offset); + w7[1] = hc_bytealign_S (w2[1], w2[2], offset); + w7[0] = hc_bytealign_S (w2[0], w2[1], offset); + w6[3] = hc_bytealign_S (w1[3], w2[0], offset); + w6[2] = hc_bytealign_S (w1[2], w1[3], offset); + w6[1] = hc_bytealign_S (w1[1], w1[2], offset); + w6[0] = hc_bytealign_S (w1[0], w1[1], offset); + w5[3] = hc_bytealign_S (w0[3], w1[0], offset); + w5[2] = hc_bytealign_S (w0[2], w0[3], offset); + w5[1] = hc_bytealign_S (w0[1], w0[2], offset); + w5[0] = hc_bytealign_S (w0[0], w0[1], offset); + w4[3] = hc_bytealign_S ( 0, w0[0], offset); w4[2] = 0; w4[1] = 0; w4[0] = 0; @@ -35585,18 +35585,18 @@ DECLSPEC void switch_buffer_by_offset_8x4_le_S (u32 *w0, u32 *w1, u32 *w2, u32 * break; case 20: - w7[3] = amd_bytealign_S (w2[2], w2[3], offset); - w7[2] = amd_bytealign_S (w2[1], w2[2], offset); - w7[1] = amd_bytealign_S (w2[0], w2[1], offset); - w7[0] = amd_bytealign_S (w1[3], w2[0], offset); - w6[3] = amd_bytealign_S (w1[2], w1[3], offset); - w6[2] = amd_bytealign_S (w1[1], w1[2], offset); - w6[1] = amd_bytealign_S (w1[0], w1[1], offset); - w6[0] = amd_bytealign_S (w0[3], w1[0], offset); - w5[3] = amd_bytealign_S (w0[2], w0[3], offset); - w5[2] = amd_bytealign_S (w0[1], w0[2], offset); - w5[1] = amd_bytealign_S (w0[0], w0[1], offset); - w5[0] = amd_bytealign_S ( 0, w0[0], offset); + w7[3] = hc_bytealign_S (w2[2], w2[3], offset); + w7[2] = hc_bytealign_S (w2[1], w2[2], offset); + w7[1] = hc_bytealign_S (w2[0], w2[1], offset); + w7[0] = hc_bytealign_S (w1[3], w2[0], offset); + w6[3] = hc_bytealign_S (w1[2], w1[3], offset); + w6[2] = hc_bytealign_S (w1[1], w1[2], offset); + w6[1] = hc_bytealign_S (w1[0], w1[1], offset); + w6[0] = hc_bytealign_S (w0[3], w1[0], offset); + w5[3] = hc_bytealign_S (w0[2], w0[3], offset); + w5[2] = hc_bytealign_S (w0[1], w0[2], offset); + w5[1] = hc_bytealign_S (w0[0], w0[1], offset); + w5[0] = hc_bytealign_S ( 0, w0[0], offset); w4[3] = 0; w4[2] = 0; w4[1] = 0; @@ -35621,17 +35621,17 @@ DECLSPEC void switch_buffer_by_offset_8x4_le_S (u32 *w0, u32 *w1, u32 *w2, u32 * break; case 21: - w7[3] = amd_bytealign_S (w2[1], w2[2], offset); - w7[2] = amd_bytealign_S (w2[0], w2[1], offset); - w7[1] = amd_bytealign_S (w1[3], w2[0], offset); - w7[0] = amd_bytealign_S (w1[2], w1[3], offset); - w6[3] = amd_bytealign_S (w1[1], w1[2], offset); - w6[2] = amd_bytealign_S (w1[0], w1[1], offset); - w6[1] = amd_bytealign_S (w0[3], w1[0], offset); - w6[0] = amd_bytealign_S (w0[2], w0[3], offset); - w5[3] = amd_bytealign_S (w0[1], w0[2], offset); - w5[2] = amd_bytealign_S (w0[0], w0[1], offset); - w5[1] = amd_bytealign_S ( 0, w0[0], offset); + w7[3] = hc_bytealign_S (w2[1], w2[2], offset); + w7[2] = hc_bytealign_S (w2[0], w2[1], offset); + w7[1] = hc_bytealign_S (w1[3], w2[0], offset); + w7[0] = hc_bytealign_S (w1[2], w1[3], offset); + w6[3] = hc_bytealign_S (w1[1], w1[2], offset); + w6[2] = hc_bytealign_S (w1[0], w1[1], offset); + w6[1] = hc_bytealign_S (w0[3], w1[0], offset); + w6[0] = hc_bytealign_S (w0[2], w0[3], offset); + w5[3] = hc_bytealign_S (w0[1], w0[2], offset); + w5[2] = hc_bytealign_S (w0[0], w0[1], offset); + w5[1] = hc_bytealign_S ( 0, w0[0], offset); w5[0] = 0; w4[3] = 0; w4[2] = 0; @@ -35657,16 +35657,16 @@ DECLSPEC void switch_buffer_by_offset_8x4_le_S (u32 *w0, u32 *w1, u32 *w2, u32 * break; case 22: - w7[3] = amd_bytealign_S (w2[0], w2[1], offset); - w7[2] = amd_bytealign_S (w1[3], w2[0], offset); - w7[1] = amd_bytealign_S (w1[2], w1[3], offset); - w7[0] = amd_bytealign_S (w1[1], w1[2], offset); - w6[3] = amd_bytealign_S (w1[0], w1[1], offset); - w6[2] = amd_bytealign_S (w0[3], w1[0], offset); - w6[1] = amd_bytealign_S (w0[2], w0[3], offset); - w6[0] = amd_bytealign_S (w0[1], w0[2], offset); - w5[3] = amd_bytealign_S (w0[0], w0[1], offset); - w5[2] = amd_bytealign_S ( 0, w0[0], offset); + w7[3] = hc_bytealign_S (w2[0], w2[1], offset); + w7[2] = hc_bytealign_S (w1[3], w2[0], offset); + w7[1] = hc_bytealign_S (w1[2], w1[3], offset); + w7[0] = hc_bytealign_S (w1[1], w1[2], offset); + w6[3] = hc_bytealign_S (w1[0], w1[1], offset); + w6[2] = hc_bytealign_S (w0[3], w1[0], offset); + w6[1] = hc_bytealign_S (w0[2], w0[3], offset); + w6[0] = hc_bytealign_S (w0[1], w0[2], offset); + w5[3] = hc_bytealign_S (w0[0], w0[1], offset); + w5[2] = hc_bytealign_S ( 0, w0[0], offset); w5[1] = 0; w5[0] = 0; w4[3] = 0; @@ -35693,15 +35693,15 @@ DECLSPEC void switch_buffer_by_offset_8x4_le_S (u32 *w0, u32 *w1, u32 *w2, u32 * break; case 23: - w7[3] = amd_bytealign_S (w1[3], w2[0], offset); - w7[2] = amd_bytealign_S (w1[2], w1[3], offset); - w7[1] = amd_bytealign_S (w1[1], w1[2], offset); - w7[0] = amd_bytealign_S (w1[0], w1[1], offset); - w6[3] = amd_bytealign_S (w0[3], w1[0], offset); - w6[2] = amd_bytealign_S (w0[2], w0[3], offset); - w6[1] = amd_bytealign_S (w0[1], w0[2], offset); - w6[0] = amd_bytealign_S (w0[0], w0[1], offset); - w5[3] = amd_bytealign_S ( 0, w0[0], offset); + w7[3] = hc_bytealign_S (w1[3], w2[0], offset); + w7[2] = hc_bytealign_S (w1[2], w1[3], offset); + w7[1] = hc_bytealign_S (w1[1], w1[2], offset); + w7[0] = hc_bytealign_S (w1[0], w1[1], offset); + w6[3] = hc_bytealign_S (w0[3], w1[0], offset); + w6[2] = hc_bytealign_S (w0[2], w0[3], offset); + w6[1] = hc_bytealign_S (w0[1], w0[2], offset); + w6[0] = hc_bytealign_S (w0[0], w0[1], offset); + w5[3] = hc_bytealign_S ( 0, w0[0], offset); w5[2] = 0; w5[1] = 0; w5[0] = 0; @@ -35729,14 +35729,14 @@ DECLSPEC void switch_buffer_by_offset_8x4_le_S (u32 *w0, u32 *w1, u32 *w2, u32 * break; case 24: - w7[3] = amd_bytealign_S (w1[2], w1[3], offset); - w7[2] = amd_bytealign_S (w1[1], w1[2], offset); - w7[1] = amd_bytealign_S (w1[0], w1[1], offset); - w7[0] = amd_bytealign_S (w0[3], w1[0], offset); - w6[3] = amd_bytealign_S (w0[2], w0[3], offset); - w6[2] = amd_bytealign_S (w0[1], w0[2], offset); - w6[1] = amd_bytealign_S (w0[0], w0[1], offset); - w6[0] = amd_bytealign_S ( 0, w0[0], offset); + w7[3] = hc_bytealign_S (w1[2], w1[3], offset); + w7[2] = hc_bytealign_S (w1[1], w1[2], offset); + w7[1] = hc_bytealign_S (w1[0], w1[1], offset); + w7[0] = hc_bytealign_S (w0[3], w1[0], offset); + w6[3] = hc_bytealign_S (w0[2], w0[3], offset); + w6[2] = hc_bytealign_S (w0[1], w0[2], offset); + w6[1] = hc_bytealign_S (w0[0], w0[1], offset); + w6[0] = hc_bytealign_S ( 0, w0[0], offset); w5[3] = 0; w5[2] = 0; w5[1] = 0; @@ -35765,13 +35765,13 @@ DECLSPEC void switch_buffer_by_offset_8x4_le_S (u32 *w0, u32 *w1, u32 *w2, u32 * break; case 25: - w7[3] = amd_bytealign_S (w1[1], w1[2], offset); - w7[2] = amd_bytealign_S (w1[0], w1[1], offset); - w7[1] = amd_bytealign_S (w0[3], w1[0], offset); - w7[0] = amd_bytealign_S (w0[2], w0[3], offset); - w6[3] = amd_bytealign_S (w0[1], w0[2], offset); - w6[2] = amd_bytealign_S (w0[0], w0[1], offset); - w6[1] = amd_bytealign_S ( 0, w0[0], offset); + w7[3] = hc_bytealign_S (w1[1], w1[2], offset); + w7[2] = hc_bytealign_S (w1[0], w1[1], offset); + w7[1] = hc_bytealign_S (w0[3], w1[0], offset); + w7[0] = hc_bytealign_S (w0[2], w0[3], offset); + w6[3] = hc_bytealign_S (w0[1], w0[2], offset); + w6[2] = hc_bytealign_S (w0[0], w0[1], offset); + w6[1] = hc_bytealign_S ( 0, w0[0], offset); w6[0] = 0; w5[3] = 0; w5[2] = 0; @@ -35801,12 +35801,12 @@ DECLSPEC void switch_buffer_by_offset_8x4_le_S (u32 *w0, u32 *w1, u32 *w2, u32 * break; case 26: - w7[3] = amd_bytealign_S (w1[0], w1[1], offset); - w7[2] = amd_bytealign_S (w0[3], w1[0], offset); - w7[1] = amd_bytealign_S (w0[2], w0[3], offset); - w7[0] = amd_bytealign_S (w0[1], w0[2], offset); - w6[3] = amd_bytealign_S (w0[0], w0[1], offset); - w6[2] = amd_bytealign_S ( 0, w0[0], offset); + w7[3] = hc_bytealign_S (w1[0], w1[1], offset); + w7[2] = hc_bytealign_S (w0[3], w1[0], offset); + w7[1] = hc_bytealign_S (w0[2], w0[3], offset); + w7[0] = hc_bytealign_S (w0[1], w0[2], offset); + w6[3] = hc_bytealign_S (w0[0], w0[1], offset); + w6[2] = hc_bytealign_S ( 0, w0[0], offset); w6[1] = 0; w6[0] = 0; w5[3] = 0; @@ -35837,11 +35837,11 @@ DECLSPEC void switch_buffer_by_offset_8x4_le_S (u32 *w0, u32 *w1, u32 *w2, u32 * break; case 27: - w7[3] = amd_bytealign_S (w0[3], w1[0], offset); - w7[2] = amd_bytealign_S (w0[2], w0[3], offset); - w7[1] = amd_bytealign_S (w0[1], w0[2], offset); - w7[0] = amd_bytealign_S (w0[0], w0[1], offset); - w6[3] = amd_bytealign_S ( 0, w0[0], offset); + w7[3] = hc_bytealign_S (w0[3], w1[0], offset); + w7[2] = hc_bytealign_S (w0[2], w0[3], offset); + w7[1] = hc_bytealign_S (w0[1], w0[2], offset); + w7[0] = hc_bytealign_S (w0[0], w0[1], offset); + w6[3] = hc_bytealign_S ( 0, w0[0], offset); w6[2] = 0; w6[1] = 0; w6[0] = 0; @@ -35873,10 +35873,10 @@ DECLSPEC void switch_buffer_by_offset_8x4_le_S (u32 *w0, u32 *w1, u32 *w2, u32 * break; case 28: - w7[3] = amd_bytealign_S (w0[2], w0[3], offset); - w7[2] = amd_bytealign_S (w0[1], w0[2], offset); - w7[1] = amd_bytealign_S (w0[0], w0[1], offset); - w7[0] = amd_bytealign_S ( 0, w0[0], offset); + w7[3] = hc_bytealign_S (w0[2], w0[3], offset); + w7[2] = hc_bytealign_S (w0[1], w0[2], offset); + w7[1] = hc_bytealign_S (w0[0], w0[1], offset); + w7[0] = hc_bytealign_S ( 0, w0[0], offset); w6[3] = 0; w6[2] = 0; w6[1] = 0; @@ -35909,9 +35909,9 @@ DECLSPEC void switch_buffer_by_offset_8x4_le_S (u32 *w0, u32 *w1, u32 *w2, u32 * break; case 29: - w7[3] = amd_bytealign_S (w0[1], w0[2], offset); - w7[2] = amd_bytealign_S (w0[0], w0[1], offset); - w7[1] = amd_bytealign_S ( 0, w0[0], offset); + w7[3] = hc_bytealign_S (w0[1], w0[2], offset); + w7[2] = hc_bytealign_S (w0[0], w0[1], offset); + w7[1] = hc_bytealign_S ( 0, w0[0], offset); w7[0] = 0; w6[3] = 0; w6[2] = 0; @@ -35945,8 +35945,8 @@ DECLSPEC void switch_buffer_by_offset_8x4_le_S (u32 *w0, u32 *w1, u32 *w2, u32 * break; case 30: - w7[3] = amd_bytealign_S (w0[0], w0[1], offset); - w7[2] = amd_bytealign_S ( 0, w0[0], offset); + w7[3] = hc_bytealign_S (w0[0], w0[1], offset); + w7[2] = hc_bytealign_S ( 0, w0[0], offset); w7[1] = 0; w7[0] = 0; w6[3] = 0; @@ -35981,7 +35981,7 @@ DECLSPEC void switch_buffer_by_offset_8x4_le_S (u32 *w0, u32 *w1, u32 *w2, u32 * break; case 31: - w7[3] = amd_bytealign_S ( 0, w0[0], offset); + w7[3] = hc_bytealign_S ( 0, w0[0], offset); w7[2] = 0; w7[1] = 0; w7[0] = 0; @@ -36064,174 +36064,174 @@ DECLSPEC void switch_buffer_by_offset_8x4_le_S (u32 *w0, u32 *w1, u32 *w2, u32 * switch (offset_switch) { case 0: - w7[3] = __byte_perm_S (w7[2], w7[3], selector); - w7[2] = __byte_perm_S (w7[1], w7[2], selector); - w7[1] = __byte_perm_S (w7[0], w7[1], selector); - w7[0] = __byte_perm_S (w6[3], w7[0], selector); - w6[3] = __byte_perm_S (w6[2], w6[3], selector); - w6[2] = __byte_perm_S (w6[1], w6[2], selector); - w6[1] = __byte_perm_S (w6[0], w6[1], selector); - w6[0] = __byte_perm_S (w5[3], w6[0], selector); - w5[3] = __byte_perm_S (w5[2], w5[3], selector); - w5[2] = __byte_perm_S (w5[1], w5[2], selector); - w5[1] = __byte_perm_S (w5[0], w5[1], selector); - w5[0] = __byte_perm_S (w4[3], w5[0], selector); - w4[3] = __byte_perm_S (w4[2], w4[3], selector); - w4[2] = __byte_perm_S (w4[1], w4[2], selector); - w4[1] = __byte_perm_S (w4[0], w4[1], selector); - w4[0] = __byte_perm_S (w3[3], w4[0], selector); - w3[3] = __byte_perm_S (w3[2], w3[3], selector); - w3[2] = __byte_perm_S (w3[1], w3[2], selector); - w3[1] = __byte_perm_S (w3[0], w3[1], selector); - w3[0] = __byte_perm_S (w2[3], w3[0], selector); - w2[3] = __byte_perm_S (w2[2], w2[3], selector); - w2[2] = __byte_perm_S (w2[1], w2[2], selector); - w2[1] = __byte_perm_S (w2[0], w2[1], selector); - w2[0] = __byte_perm_S (w1[3], w2[0], selector); - w1[3] = __byte_perm_S (w1[2], w1[3], selector); - w1[2] = __byte_perm_S (w1[1], w1[2], selector); - w1[1] = __byte_perm_S (w1[0], w1[1], selector); - w1[0] = __byte_perm_S (w0[3], w1[0], selector); - w0[3] = __byte_perm_S (w0[2], w0[3], selector); - w0[2] = __byte_perm_S (w0[1], w0[2], selector); - w0[1] = __byte_perm_S (w0[0], w0[1], selector); - w0[0] = __byte_perm_S ( 0, w0[0], selector); + w7[3] = hc_byte_perm_S (w7[2], w7[3], selector); + w7[2] = hc_byte_perm_S (w7[1], w7[2], selector); + w7[1] = hc_byte_perm_S (w7[0], w7[1], selector); + w7[0] = hc_byte_perm_S (w6[3], w7[0], selector); + w6[3] = hc_byte_perm_S (w6[2], w6[3], selector); + w6[2] = hc_byte_perm_S (w6[1], w6[2], selector); + w6[1] = hc_byte_perm_S (w6[0], w6[1], selector); + w6[0] = hc_byte_perm_S (w5[3], w6[0], selector); + w5[3] = hc_byte_perm_S (w5[2], w5[3], selector); + w5[2] = hc_byte_perm_S (w5[1], w5[2], selector); + w5[1] = hc_byte_perm_S (w5[0], w5[1], selector); + w5[0] = hc_byte_perm_S (w4[3], w5[0], selector); + w4[3] = hc_byte_perm_S (w4[2], w4[3], selector); + w4[2] = hc_byte_perm_S (w4[1], w4[2], selector); + w4[1] = hc_byte_perm_S (w4[0], w4[1], selector); + w4[0] = hc_byte_perm_S (w3[3], w4[0], selector); + w3[3] = hc_byte_perm_S (w3[2], w3[3], selector); + w3[2] = hc_byte_perm_S (w3[1], w3[2], selector); + w3[1] = hc_byte_perm_S (w3[0], w3[1], selector); + w3[0] = hc_byte_perm_S (w2[3], w3[0], selector); + w2[3] = hc_byte_perm_S (w2[2], w2[3], selector); + w2[2] = hc_byte_perm_S (w2[1], w2[2], selector); + w2[1] = hc_byte_perm_S (w2[0], w2[1], selector); + w2[0] = hc_byte_perm_S (w1[3], w2[0], selector); + w1[3] = hc_byte_perm_S (w1[2], w1[3], selector); + w1[2] = hc_byte_perm_S (w1[1], w1[2], selector); + w1[1] = hc_byte_perm_S (w1[0], w1[1], selector); + w1[0] = hc_byte_perm_S (w0[3], w1[0], selector); + w0[3] = hc_byte_perm_S (w0[2], w0[3], selector); + w0[2] = hc_byte_perm_S (w0[1], w0[2], selector); + w0[1] = hc_byte_perm_S (w0[0], w0[1], selector); + w0[0] = hc_byte_perm_S ( 0, w0[0], selector); break; case 1: - w7[3] = __byte_perm_S (w7[1], w7[2], selector); - w7[2] = __byte_perm_S (w7[0], w7[1], selector); - w7[1] = __byte_perm_S (w6[3], w7[0], selector); - w7[0] = __byte_perm_S (w6[2], w6[3], selector); - w6[3] = __byte_perm_S (w6[1], w6[2], selector); - w6[2] = __byte_perm_S (w6[0], w6[1], selector); - w6[1] = __byte_perm_S (w5[3], w6[0], selector); - w6[0] = __byte_perm_S (w5[2], w5[3], selector); - w5[3] = __byte_perm_S (w5[1], w5[2], selector); - w5[2] = __byte_perm_S (w5[0], w5[1], selector); - w5[1] = __byte_perm_S (w4[3], w5[0], selector); - w5[0] = __byte_perm_S (w4[2], w4[3], selector); - w4[3] = __byte_perm_S (w4[1], w4[2], selector); - w4[2] = __byte_perm_S (w4[0], w4[1], selector); - w4[1] = __byte_perm_S (w3[3], w4[0], selector); - w4[0] = __byte_perm_S (w3[2], w3[3], selector); - w3[3] = __byte_perm_S (w3[1], w3[2], selector); - w3[2] = __byte_perm_S (w3[0], w3[1], selector); - w3[1] = __byte_perm_S (w2[3], w3[0], selector); - w3[0] = __byte_perm_S (w2[2], w2[3], selector); - w2[3] = __byte_perm_S (w2[1], w2[2], selector); - w2[2] = __byte_perm_S (w2[0], w2[1], selector); - w2[1] = __byte_perm_S (w1[3], w2[0], selector); - w2[0] = __byte_perm_S (w1[2], w1[3], selector); - w1[3] = __byte_perm_S (w1[1], w1[2], selector); - w1[2] = __byte_perm_S (w1[0], w1[1], selector); - w1[1] = __byte_perm_S (w0[3], w1[0], selector); - w1[0] = __byte_perm_S (w0[2], w0[3], selector); - w0[3] = __byte_perm_S (w0[1], w0[2], selector); - w0[2] = __byte_perm_S (w0[0], w0[1], selector); - w0[1] = __byte_perm_S ( 0, w0[0], selector); + w7[3] = hc_byte_perm_S (w7[1], w7[2], selector); + w7[2] = hc_byte_perm_S (w7[0], w7[1], selector); + w7[1] = hc_byte_perm_S (w6[3], w7[0], selector); + w7[0] = hc_byte_perm_S (w6[2], w6[3], selector); + w6[3] = hc_byte_perm_S (w6[1], w6[2], selector); + w6[2] = hc_byte_perm_S (w6[0], w6[1], selector); + w6[1] = hc_byte_perm_S (w5[3], w6[0], selector); + w6[0] = hc_byte_perm_S (w5[2], w5[3], selector); + w5[3] = hc_byte_perm_S (w5[1], w5[2], selector); + w5[2] = hc_byte_perm_S (w5[0], w5[1], selector); + w5[1] = hc_byte_perm_S (w4[3], w5[0], selector); + w5[0] = hc_byte_perm_S (w4[2], w4[3], selector); + w4[3] = hc_byte_perm_S (w4[1], w4[2], selector); + w4[2] = hc_byte_perm_S (w4[0], w4[1], selector); + w4[1] = hc_byte_perm_S (w3[3], w4[0], selector); + w4[0] = hc_byte_perm_S (w3[2], w3[3], selector); + w3[3] = hc_byte_perm_S (w3[1], w3[2], selector); + w3[2] = hc_byte_perm_S (w3[0], w3[1], selector); + w3[1] = hc_byte_perm_S (w2[3], w3[0], selector); + w3[0] = hc_byte_perm_S (w2[2], w2[3], selector); + w2[3] = hc_byte_perm_S (w2[1], w2[2], selector); + w2[2] = hc_byte_perm_S (w2[0], w2[1], selector); + w2[1] = hc_byte_perm_S (w1[3], w2[0], selector); + w2[0] = hc_byte_perm_S (w1[2], w1[3], selector); + w1[3] = hc_byte_perm_S (w1[1], w1[2], selector); + w1[2] = hc_byte_perm_S (w1[0], w1[1], selector); + w1[1] = hc_byte_perm_S (w0[3], w1[0], selector); + w1[0] = hc_byte_perm_S (w0[2], w0[3], selector); + w0[3] = hc_byte_perm_S (w0[1], w0[2], selector); + w0[2] = hc_byte_perm_S (w0[0], w0[1], selector); + w0[1] = hc_byte_perm_S ( 0, w0[0], selector); w0[0] = 0; break; case 2: - w7[3] = __byte_perm_S (w7[0], w7[1], selector); - w7[2] = __byte_perm_S (w6[3], w7[0], selector); - w7[1] = __byte_perm_S (w6[2], w6[3], selector); - w7[0] = __byte_perm_S (w6[1], w6[2], selector); - w6[3] = __byte_perm_S (w6[0], w6[1], selector); - w6[2] = __byte_perm_S (w5[3], w6[0], selector); - w6[1] = __byte_perm_S (w5[2], w5[3], selector); - w6[0] = __byte_perm_S (w5[1], w5[2], selector); - w5[3] = __byte_perm_S (w5[0], w5[1], selector); - w5[2] = __byte_perm_S (w4[3], w5[0], selector); - w5[1] = __byte_perm_S (w4[2], w4[3], selector); - w5[0] = __byte_perm_S (w4[1], w4[2], selector); - w4[3] = __byte_perm_S (w4[0], w4[1], selector); - w4[2] = __byte_perm_S (w3[3], w4[0], selector); - w4[1] = __byte_perm_S (w3[2], w3[3], selector); - w4[0] = __byte_perm_S (w3[1], w3[2], selector); - w3[3] = __byte_perm_S (w3[0], w3[1], selector); - w3[2] = __byte_perm_S (w2[3], w3[0], selector); - w3[1] = __byte_perm_S (w2[2], w2[3], selector); - w3[0] = __byte_perm_S (w2[1], w2[2], selector); - w2[3] = __byte_perm_S (w2[0], w2[1], selector); - w2[2] = __byte_perm_S (w1[3], w2[0], selector); - w2[1] = __byte_perm_S (w1[2], w1[3], selector); - w2[0] = __byte_perm_S (w1[1], w1[2], selector); - w1[3] = __byte_perm_S (w1[0], w1[1], selector); - w1[2] = __byte_perm_S (w0[3], w1[0], selector); - w1[1] = __byte_perm_S (w0[2], w0[3], selector); - w1[0] = __byte_perm_S (w0[1], w0[2], selector); - w0[3] = __byte_perm_S (w0[0], w0[1], selector); - w0[2] = __byte_perm_S ( 0, w0[0], selector); + w7[3] = hc_byte_perm_S (w7[0], w7[1], selector); + w7[2] = hc_byte_perm_S (w6[3], w7[0], selector); + w7[1] = hc_byte_perm_S (w6[2], w6[3], selector); + w7[0] = hc_byte_perm_S (w6[1], w6[2], selector); + w6[3] = hc_byte_perm_S (w6[0], w6[1], selector); + w6[2] = hc_byte_perm_S (w5[3], w6[0], selector); + w6[1] = hc_byte_perm_S (w5[2], w5[3], selector); + w6[0] = hc_byte_perm_S (w5[1], w5[2], selector); + w5[3] = hc_byte_perm_S (w5[0], w5[1], selector); + w5[2] = hc_byte_perm_S (w4[3], w5[0], selector); + w5[1] = hc_byte_perm_S (w4[2], w4[3], selector); + w5[0] = hc_byte_perm_S (w4[1], w4[2], selector); + w4[3] = hc_byte_perm_S (w4[0], w4[1], selector); + w4[2] = hc_byte_perm_S (w3[3], w4[0], selector); + w4[1] = hc_byte_perm_S (w3[2], w3[3], selector); + w4[0] = hc_byte_perm_S (w3[1], w3[2], selector); + w3[3] = hc_byte_perm_S (w3[0], w3[1], selector); + w3[2] = hc_byte_perm_S (w2[3], w3[0], selector); + w3[1] = hc_byte_perm_S (w2[2], w2[3], selector); + w3[0] = hc_byte_perm_S (w2[1], w2[2], selector); + w2[3] = hc_byte_perm_S (w2[0], w2[1], selector); + w2[2] = hc_byte_perm_S (w1[3], w2[0], selector); + w2[1] = hc_byte_perm_S (w1[2], w1[3], selector); + w2[0] = hc_byte_perm_S (w1[1], w1[2], selector); + w1[3] = hc_byte_perm_S (w1[0], w1[1], selector); + w1[2] = hc_byte_perm_S (w0[3], w1[0], selector); + w1[1] = hc_byte_perm_S (w0[2], w0[3], selector); + w1[0] = hc_byte_perm_S (w0[1], w0[2], selector); + w0[3] = hc_byte_perm_S (w0[0], w0[1], selector); + w0[2] = hc_byte_perm_S ( 0, w0[0], selector); w0[1] = 0; w0[0] = 0; break; case 3: - w7[3] = __byte_perm_S (w6[3], w7[0], selector); - w7[2] = __byte_perm_S (w6[2], w6[3], selector); - w7[1] = __byte_perm_S (w6[1], w6[2], selector); - w7[0] = __byte_perm_S (w6[0], w6[1], selector); - w6[3] = __byte_perm_S (w5[3], w6[0], selector); - w6[2] = __byte_perm_S (w5[2], w5[3], selector); - w6[1] = __byte_perm_S (w5[1], w5[2], selector); - w6[0] = __byte_perm_S (w5[0], w5[1], selector); - w5[3] = __byte_perm_S (w4[3], w5[0], selector); - w5[2] = __byte_perm_S (w4[2], w4[3], selector); - w5[1] = __byte_perm_S (w4[1], w4[2], selector); - w5[0] = __byte_perm_S (w4[0], w4[1], selector); - w4[3] = __byte_perm_S (w3[3], w4[0], selector); - w4[2] = __byte_perm_S (w3[2], w3[3], selector); - w4[1] = __byte_perm_S (w3[1], w3[2], selector); - w4[0] = __byte_perm_S (w3[0], w3[1], selector); - w3[3] = __byte_perm_S (w2[3], w3[0], selector); - w3[2] = __byte_perm_S (w2[2], w2[3], selector); - w3[1] = __byte_perm_S (w2[1], w2[2], selector); - w3[0] = __byte_perm_S (w2[0], w2[1], selector); - w2[3] = __byte_perm_S (w1[3], w2[0], selector); - w2[2] = __byte_perm_S (w1[2], w1[3], selector); - w2[1] = __byte_perm_S (w1[1], w1[2], selector); - w2[0] = __byte_perm_S (w1[0], w1[1], selector); - w1[3] = __byte_perm_S (w0[3], w1[0], selector); - w1[2] = __byte_perm_S (w0[2], w0[3], selector); - w1[1] = __byte_perm_S (w0[1], w0[2], selector); - w1[0] = __byte_perm_S (w0[0], w0[1], selector); - w0[3] = __byte_perm_S ( 0, w0[0], selector); + w7[3] = hc_byte_perm_S (w6[3], w7[0], selector); + w7[2] = hc_byte_perm_S (w6[2], w6[3], selector); + w7[1] = hc_byte_perm_S (w6[1], w6[2], selector); + w7[0] = hc_byte_perm_S (w6[0], w6[1], selector); + w6[3] = hc_byte_perm_S (w5[3], w6[0], selector); + w6[2] = hc_byte_perm_S (w5[2], w5[3], selector); + w6[1] = hc_byte_perm_S (w5[1], w5[2], selector); + w6[0] = hc_byte_perm_S (w5[0], w5[1], selector); + w5[3] = hc_byte_perm_S (w4[3], w5[0], selector); + w5[2] = hc_byte_perm_S (w4[2], w4[3], selector); + w5[1] = hc_byte_perm_S (w4[1], w4[2], selector); + w5[0] = hc_byte_perm_S (w4[0], w4[1], selector); + w4[3] = hc_byte_perm_S (w3[3], w4[0], selector); + w4[2] = hc_byte_perm_S (w3[2], w3[3], selector); + w4[1] = hc_byte_perm_S (w3[1], w3[2], selector); + w4[0] = hc_byte_perm_S (w3[0], w3[1], selector); + w3[3] = hc_byte_perm_S (w2[3], w3[0], selector); + w3[2] = hc_byte_perm_S (w2[2], w2[3], selector); + w3[1] = hc_byte_perm_S (w2[1], w2[2], selector); + w3[0] = hc_byte_perm_S (w2[0], w2[1], selector); + w2[3] = hc_byte_perm_S (w1[3], w2[0], selector); + w2[2] = hc_byte_perm_S (w1[2], w1[3], selector); + w2[1] = hc_byte_perm_S (w1[1], w1[2], selector); + w2[0] = hc_byte_perm_S (w1[0], w1[1], selector); + w1[3] = hc_byte_perm_S (w0[3], w1[0], selector); + w1[2] = hc_byte_perm_S (w0[2], w0[3], selector); + w1[1] = hc_byte_perm_S (w0[1], w0[2], selector); + w1[0] = hc_byte_perm_S (w0[0], w0[1], selector); + w0[3] = hc_byte_perm_S ( 0, w0[0], selector); w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 4: - w7[3] = __byte_perm_S (w6[2], w6[3], selector); - w7[2] = __byte_perm_S (w6[1], w6[2], selector); - w7[1] = __byte_perm_S (w6[0], w6[1], selector); - w7[0] = __byte_perm_S (w5[3], w6[0], selector); - w6[3] = __byte_perm_S (w5[2], w5[3], selector); - w6[2] = __byte_perm_S (w5[1], w5[2], selector); - w6[1] = __byte_perm_S (w5[0], w5[1], selector); - w6[0] = __byte_perm_S (w4[3], w5[0], selector); - w5[3] = __byte_perm_S (w4[2], w4[3], selector); - w5[2] = __byte_perm_S (w4[1], w4[2], selector); - w5[1] = __byte_perm_S (w4[0], w4[1], selector); - w5[0] = __byte_perm_S (w3[3], w4[0], selector); - w4[3] = __byte_perm_S (w3[2], w3[3], selector); - w4[2] = __byte_perm_S (w3[1], w3[2], selector); - w4[1] = __byte_perm_S (w3[0], w3[1], selector); - w4[0] = __byte_perm_S (w2[3], w3[0], selector); - w3[3] = __byte_perm_S (w2[2], w2[3], selector); - w3[2] = __byte_perm_S (w2[1], w2[2], selector); - w3[1] = __byte_perm_S (w2[0], w2[1], selector); - w3[0] = __byte_perm_S (w1[3], w2[0], selector); - w2[3] = __byte_perm_S (w1[2], w1[3], selector); - w2[2] = __byte_perm_S (w1[1], w1[2], selector); - w2[1] = __byte_perm_S (w1[0], w1[1], selector); - w2[0] = __byte_perm_S (w0[3], w1[0], selector); - w1[3] = __byte_perm_S (w0[2], w0[3], selector); - w1[2] = __byte_perm_S (w0[1], w0[2], selector); - w1[1] = __byte_perm_S (w0[0], w0[1], selector); - w1[0] = __byte_perm_S ( 0, w0[0], selector); + w7[3] = hc_byte_perm_S (w6[2], w6[3], selector); + w7[2] = hc_byte_perm_S (w6[1], w6[2], selector); + w7[1] = hc_byte_perm_S (w6[0], w6[1], selector); + w7[0] = hc_byte_perm_S (w5[3], w6[0], selector); + w6[3] = hc_byte_perm_S (w5[2], w5[3], selector); + w6[2] = hc_byte_perm_S (w5[1], w5[2], selector); + w6[1] = hc_byte_perm_S (w5[0], w5[1], selector); + w6[0] = hc_byte_perm_S (w4[3], w5[0], selector); + w5[3] = hc_byte_perm_S (w4[2], w4[3], selector); + w5[2] = hc_byte_perm_S (w4[1], w4[2], selector); + w5[1] = hc_byte_perm_S (w4[0], w4[1], selector); + w5[0] = hc_byte_perm_S (w3[3], w4[0], selector); + w4[3] = hc_byte_perm_S (w3[2], w3[3], selector); + w4[2] = hc_byte_perm_S (w3[1], w3[2], selector); + w4[1] = hc_byte_perm_S (w3[0], w3[1], selector); + w4[0] = hc_byte_perm_S (w2[3], w3[0], selector); + w3[3] = hc_byte_perm_S (w2[2], w2[3], selector); + w3[2] = hc_byte_perm_S (w2[1], w2[2], selector); + w3[1] = hc_byte_perm_S (w2[0], w2[1], selector); + w3[0] = hc_byte_perm_S (w1[3], w2[0], selector); + w2[3] = hc_byte_perm_S (w1[2], w1[3], selector); + w2[2] = hc_byte_perm_S (w1[1], w1[2], selector); + w2[1] = hc_byte_perm_S (w1[0], w1[1], selector); + w2[0] = hc_byte_perm_S (w0[3], w1[0], selector); + w1[3] = hc_byte_perm_S (w0[2], w0[3], selector); + w1[2] = hc_byte_perm_S (w0[1], w0[2], selector); + w1[1] = hc_byte_perm_S (w0[0], w0[1], selector); + w1[0] = hc_byte_perm_S ( 0, w0[0], selector); w0[3] = 0; w0[2] = 0; w0[1] = 0; @@ -36239,33 +36239,33 @@ DECLSPEC void switch_buffer_by_offset_8x4_le_S (u32 *w0, u32 *w1, u32 *w2, u32 * break; case 5: - w7[3] = __byte_perm_S (w6[1], w6[2], selector); - w7[2] = __byte_perm_S (w6[0], w6[1], selector); - w7[1] = __byte_perm_S (w5[3], w6[0], selector); - w7[0] = __byte_perm_S (w5[2], w5[3], selector); - w6[3] = __byte_perm_S (w5[1], w5[2], selector); - w6[2] = __byte_perm_S (w5[0], w5[1], selector); - w6[1] = __byte_perm_S (w4[3], w5[0], selector); - w6[0] = __byte_perm_S (w4[2], w4[3], selector); - w5[3] = __byte_perm_S (w4[1], w4[2], selector); - w5[2] = __byte_perm_S (w4[0], w4[1], selector); - w5[1] = __byte_perm_S (w3[3], w4[0], selector); - w5[0] = __byte_perm_S (w3[2], w3[3], selector); - w4[3] = __byte_perm_S (w3[1], w3[2], selector); - w4[2] = __byte_perm_S (w3[0], w3[1], selector); - w4[1] = __byte_perm_S (w2[3], w3[0], selector); - w4[0] = __byte_perm_S (w2[2], w2[3], selector); - w3[3] = __byte_perm_S (w2[1], w2[2], selector); - w3[2] = __byte_perm_S (w2[0], w2[1], selector); - w3[1] = __byte_perm_S (w1[3], w2[0], selector); - w3[0] = __byte_perm_S (w1[2], w1[3], selector); - w2[3] = __byte_perm_S (w1[1], w1[2], selector); - w2[2] = __byte_perm_S (w1[0], w1[1], selector); - w2[1] = __byte_perm_S (w0[3], w1[0], selector); - w2[0] = __byte_perm_S (w0[2], w0[3], selector); - w1[3] = __byte_perm_S (w0[1], w0[2], selector); - w1[2] = __byte_perm_S (w0[0], w0[1], selector); - w1[1] = __byte_perm_S ( 0, w0[0], selector); + w7[3] = hc_byte_perm_S (w6[1], w6[2], selector); + w7[2] = hc_byte_perm_S (w6[0], w6[1], selector); + w7[1] = hc_byte_perm_S (w5[3], w6[0], selector); + w7[0] = hc_byte_perm_S (w5[2], w5[3], selector); + w6[3] = hc_byte_perm_S (w5[1], w5[2], selector); + w6[2] = hc_byte_perm_S (w5[0], w5[1], selector); + w6[1] = hc_byte_perm_S (w4[3], w5[0], selector); + w6[0] = hc_byte_perm_S (w4[2], w4[3], selector); + w5[3] = hc_byte_perm_S (w4[1], w4[2], selector); + w5[2] = hc_byte_perm_S (w4[0], w4[1], selector); + w5[1] = hc_byte_perm_S (w3[3], w4[0], selector); + w5[0] = hc_byte_perm_S (w3[2], w3[3], selector); + w4[3] = hc_byte_perm_S (w3[1], w3[2], selector); + w4[2] = hc_byte_perm_S (w3[0], w3[1], selector); + w4[1] = hc_byte_perm_S (w2[3], w3[0], selector); + w4[0] = hc_byte_perm_S (w2[2], w2[3], selector); + w3[3] = hc_byte_perm_S (w2[1], w2[2], selector); + w3[2] = hc_byte_perm_S (w2[0], w2[1], selector); + w3[1] = hc_byte_perm_S (w1[3], w2[0], selector); + w3[0] = hc_byte_perm_S (w1[2], w1[3], selector); + w2[3] = hc_byte_perm_S (w1[1], w1[2], selector); + w2[2] = hc_byte_perm_S (w1[0], w1[1], selector); + w2[1] = hc_byte_perm_S (w0[3], w1[0], selector); + w2[0] = hc_byte_perm_S (w0[2], w0[3], selector); + w1[3] = hc_byte_perm_S (w0[1], w0[2], selector); + w1[2] = hc_byte_perm_S (w0[0], w0[1], selector); + w1[1] = hc_byte_perm_S ( 0, w0[0], selector); w1[0] = 0; w0[3] = 0; w0[2] = 0; @@ -36274,32 +36274,32 @@ DECLSPEC void switch_buffer_by_offset_8x4_le_S (u32 *w0, u32 *w1, u32 *w2, u32 * break; case 6: - w7[3] = __byte_perm_S (w6[0], w6[1], selector); - w7[2] = __byte_perm_S (w5[3], w6[0], selector); - w7[1] = __byte_perm_S (w5[2], w5[3], selector); - w7[0] = __byte_perm_S (w5[1], w5[2], selector); - w6[3] = __byte_perm_S (w5[0], w5[1], selector); - w6[2] = __byte_perm_S (w4[3], w5[0], selector); - w6[1] = __byte_perm_S (w4[2], w4[3], selector); - w6[0] = __byte_perm_S (w4[1], w4[2], selector); - w5[3] = __byte_perm_S (w4[0], w4[1], selector); - w5[2] = __byte_perm_S (w3[3], w4[0], selector); - w5[1] = __byte_perm_S (w3[2], w3[3], selector); - w5[0] = __byte_perm_S (w3[1], w3[2], selector); - w4[3] = __byte_perm_S (w3[0], w3[1], selector); - w4[2] = __byte_perm_S (w2[3], w3[0], selector); - w4[1] = __byte_perm_S (w2[2], w2[3], selector); - w4[0] = __byte_perm_S (w2[1], w2[2], selector); - w3[3] = __byte_perm_S (w2[0], w2[1], selector); - w3[2] = __byte_perm_S (w1[3], w2[0], selector); - w3[1] = __byte_perm_S (w1[2], w1[3], selector); - w3[0] = __byte_perm_S (w1[1], w1[2], selector); - w2[3] = __byte_perm_S (w1[0], w1[1], selector); - w2[2] = __byte_perm_S (w0[3], w1[0], selector); - w2[1] = __byte_perm_S (w0[2], w0[3], selector); - w2[0] = __byte_perm_S (w0[1], w0[2], selector); - w1[3] = __byte_perm_S (w0[0], w0[1], selector); - w1[2] = __byte_perm_S ( 0, w0[0], selector); + w7[3] = hc_byte_perm_S (w6[0], w6[1], selector); + w7[2] = hc_byte_perm_S (w5[3], w6[0], selector); + w7[1] = hc_byte_perm_S (w5[2], w5[3], selector); + w7[0] = hc_byte_perm_S (w5[1], w5[2], selector); + w6[3] = hc_byte_perm_S (w5[0], w5[1], selector); + w6[2] = hc_byte_perm_S (w4[3], w5[0], selector); + w6[1] = hc_byte_perm_S (w4[2], w4[3], selector); + w6[0] = hc_byte_perm_S (w4[1], w4[2], selector); + w5[3] = hc_byte_perm_S (w4[0], w4[1], selector); + w5[2] = hc_byte_perm_S (w3[3], w4[0], selector); + w5[1] = hc_byte_perm_S (w3[2], w3[3], selector); + w5[0] = hc_byte_perm_S (w3[1], w3[2], selector); + w4[3] = hc_byte_perm_S (w3[0], w3[1], selector); + w4[2] = hc_byte_perm_S (w2[3], w3[0], selector); + w4[1] = hc_byte_perm_S (w2[2], w2[3], selector); + w4[0] = hc_byte_perm_S (w2[1], w2[2], selector); + w3[3] = hc_byte_perm_S (w2[0], w2[1], selector); + w3[2] = hc_byte_perm_S (w1[3], w2[0], selector); + w3[1] = hc_byte_perm_S (w1[2], w1[3], selector); + w3[0] = hc_byte_perm_S (w1[1], w1[2], selector); + w2[3] = hc_byte_perm_S (w1[0], w1[1], selector); + w2[2] = hc_byte_perm_S (w0[3], w1[0], selector); + w2[1] = hc_byte_perm_S (w0[2], w0[3], selector); + w2[0] = hc_byte_perm_S (w0[1], w0[2], selector); + w1[3] = hc_byte_perm_S (w0[0], w0[1], selector); + w1[2] = hc_byte_perm_S ( 0, w0[0], selector); w1[1] = 0; w1[0] = 0; w0[3] = 0; @@ -36309,31 +36309,31 @@ DECLSPEC void switch_buffer_by_offset_8x4_le_S (u32 *w0, u32 *w1, u32 *w2, u32 * break; case 7: - w7[3] = __byte_perm_S (w5[3], w6[0], selector); - w7[2] = __byte_perm_S (w5[2], w5[3], selector); - w7[1] = __byte_perm_S (w5[1], w5[2], selector); - w7[0] = __byte_perm_S (w5[0], w5[1], selector); - w6[3] = __byte_perm_S (w4[3], w5[0], selector); - w6[2] = __byte_perm_S (w4[2], w4[3], selector); - w6[1] = __byte_perm_S (w4[1], w4[2], selector); - w6[0] = __byte_perm_S (w4[0], w4[1], selector); - w5[3] = __byte_perm_S (w3[3], w4[0], selector); - w5[2] = __byte_perm_S (w3[2], w3[3], selector); - w5[1] = __byte_perm_S (w3[1], w3[2], selector); - w5[0] = __byte_perm_S (w3[0], w3[1], selector); - w4[3] = __byte_perm_S (w2[3], w3[0], selector); - w4[2] = __byte_perm_S (w2[2], w2[3], selector); - w4[1] = __byte_perm_S (w2[1], w2[2], selector); - w4[0] = __byte_perm_S (w2[0], w2[1], selector); - w3[3] = __byte_perm_S (w1[3], w2[0], selector); - w3[2] = __byte_perm_S (w1[2], w1[3], selector); - w3[1] = __byte_perm_S (w1[1], w1[2], selector); - w3[0] = __byte_perm_S (w1[0], w1[1], selector); - w2[3] = __byte_perm_S (w0[3], w1[0], selector); - w2[2] = __byte_perm_S (w0[2], w0[3], selector); - w2[1] = __byte_perm_S (w0[1], w0[2], selector); - w2[0] = __byte_perm_S (w0[0], w0[1], selector); - w1[3] = __byte_perm_S ( 0, w0[0], selector); + w7[3] = hc_byte_perm_S (w5[3], w6[0], selector); + w7[2] = hc_byte_perm_S (w5[2], w5[3], selector); + w7[1] = hc_byte_perm_S (w5[1], w5[2], selector); + w7[0] = hc_byte_perm_S (w5[0], w5[1], selector); + w6[3] = hc_byte_perm_S (w4[3], w5[0], selector); + w6[2] = hc_byte_perm_S (w4[2], w4[3], selector); + w6[1] = hc_byte_perm_S (w4[1], w4[2], selector); + w6[0] = hc_byte_perm_S (w4[0], w4[1], selector); + w5[3] = hc_byte_perm_S (w3[3], w4[0], selector); + w5[2] = hc_byte_perm_S (w3[2], w3[3], selector); + w5[1] = hc_byte_perm_S (w3[1], w3[2], selector); + w5[0] = hc_byte_perm_S (w3[0], w3[1], selector); + w4[3] = hc_byte_perm_S (w2[3], w3[0], selector); + w4[2] = hc_byte_perm_S (w2[2], w2[3], selector); + w4[1] = hc_byte_perm_S (w2[1], w2[2], selector); + w4[0] = hc_byte_perm_S (w2[0], w2[1], selector); + w3[3] = hc_byte_perm_S (w1[3], w2[0], selector); + w3[2] = hc_byte_perm_S (w1[2], w1[3], selector); + w3[1] = hc_byte_perm_S (w1[1], w1[2], selector); + w3[0] = hc_byte_perm_S (w1[0], w1[1], selector); + w2[3] = hc_byte_perm_S (w0[3], w1[0], selector); + w2[2] = hc_byte_perm_S (w0[2], w0[3], selector); + w2[1] = hc_byte_perm_S (w0[1], w0[2], selector); + w2[0] = hc_byte_perm_S (w0[0], w0[1], selector); + w1[3] = hc_byte_perm_S ( 0, w0[0], selector); w1[2] = 0; w1[1] = 0; w1[0] = 0; @@ -36344,30 +36344,30 @@ DECLSPEC void switch_buffer_by_offset_8x4_le_S (u32 *w0, u32 *w1, u32 *w2, u32 * break; case 8: - w7[3] = __byte_perm_S (w5[2], w5[3], selector); - w7[2] = __byte_perm_S (w5[1], w5[2], selector); - w7[1] = __byte_perm_S (w5[0], w5[1], selector); - w7[0] = __byte_perm_S (w4[3], w5[0], selector); - w6[3] = __byte_perm_S (w4[2], w4[3], selector); - w6[2] = __byte_perm_S (w4[1], w4[2], selector); - w6[1] = __byte_perm_S (w4[0], w4[1], selector); - w6[0] = __byte_perm_S (w3[3], w4[0], selector); - w5[3] = __byte_perm_S (w3[2], w3[3], selector); - w5[2] = __byte_perm_S (w3[1], w3[2], selector); - w5[1] = __byte_perm_S (w3[0], w3[1], selector); - w5[0] = __byte_perm_S (w2[3], w3[0], selector); - w4[3] = __byte_perm_S (w2[2], w2[3], selector); - w4[2] = __byte_perm_S (w2[1], w2[2], selector); - w4[1] = __byte_perm_S (w2[0], w2[1], selector); - w4[0] = __byte_perm_S (w1[3], w2[0], selector); - w3[3] = __byte_perm_S (w1[2], w1[3], selector); - w3[2] = __byte_perm_S (w1[1], w1[2], selector); - w3[1] = __byte_perm_S (w1[0], w1[1], selector); - w3[0] = __byte_perm_S (w0[3], w1[0], selector); - w2[3] = __byte_perm_S (w0[2], w0[3], selector); - w2[2] = __byte_perm_S (w0[1], w0[2], selector); - w2[1] = __byte_perm_S (w0[0], w0[1], selector); - w2[0] = __byte_perm_S ( 0, w0[0], selector); + w7[3] = hc_byte_perm_S (w5[2], w5[3], selector); + w7[2] = hc_byte_perm_S (w5[1], w5[2], selector); + w7[1] = hc_byte_perm_S (w5[0], w5[1], selector); + w7[0] = hc_byte_perm_S (w4[3], w5[0], selector); + w6[3] = hc_byte_perm_S (w4[2], w4[3], selector); + w6[2] = hc_byte_perm_S (w4[1], w4[2], selector); + w6[1] = hc_byte_perm_S (w4[0], w4[1], selector); + w6[0] = hc_byte_perm_S (w3[3], w4[0], selector); + w5[3] = hc_byte_perm_S (w3[2], w3[3], selector); + w5[2] = hc_byte_perm_S (w3[1], w3[2], selector); + w5[1] = hc_byte_perm_S (w3[0], w3[1], selector); + w5[0] = hc_byte_perm_S (w2[3], w3[0], selector); + w4[3] = hc_byte_perm_S (w2[2], w2[3], selector); + w4[2] = hc_byte_perm_S (w2[1], w2[2], selector); + w4[1] = hc_byte_perm_S (w2[0], w2[1], selector); + w4[0] = hc_byte_perm_S (w1[3], w2[0], selector); + w3[3] = hc_byte_perm_S (w1[2], w1[3], selector); + w3[2] = hc_byte_perm_S (w1[1], w1[2], selector); + w3[1] = hc_byte_perm_S (w1[0], w1[1], selector); + w3[0] = hc_byte_perm_S (w0[3], w1[0], selector); + w2[3] = hc_byte_perm_S (w0[2], w0[3], selector); + w2[2] = hc_byte_perm_S (w0[1], w0[2], selector); + w2[1] = hc_byte_perm_S (w0[0], w0[1], selector); + w2[0] = hc_byte_perm_S ( 0, w0[0], selector); w1[3] = 0; w1[2] = 0; w1[1] = 0; @@ -36379,29 +36379,29 @@ DECLSPEC void switch_buffer_by_offset_8x4_le_S (u32 *w0, u32 *w1, u32 *w2, u32 * break; case 9: - w7[3] = __byte_perm_S (w5[1], w5[2], selector); - w7[2] = __byte_perm_S (w5[0], w5[1], selector); - w7[1] = __byte_perm_S (w4[3], w5[0], selector); - w7[0] = __byte_perm_S (w4[2], w4[3], selector); - w6[3] = __byte_perm_S (w4[1], w4[2], selector); - w6[2] = __byte_perm_S (w4[0], w4[1], selector); - w6[1] = __byte_perm_S (w3[3], w4[0], selector); - w6[0] = __byte_perm_S (w3[2], w3[3], selector); - w5[3] = __byte_perm_S (w3[1], w3[2], selector); - w5[2] = __byte_perm_S (w3[0], w3[1], selector); - w5[1] = __byte_perm_S (w2[3], w3[0], selector); - w5[0] = __byte_perm_S (w2[2], w2[3], selector); - w4[3] = __byte_perm_S (w2[1], w2[2], selector); - w4[2] = __byte_perm_S (w2[0], w2[1], selector); - w4[1] = __byte_perm_S (w1[3], w2[0], selector); - w4[0] = __byte_perm_S (w1[2], w1[3], selector); - w3[3] = __byte_perm_S (w1[1], w1[2], selector); - w3[2] = __byte_perm_S (w1[0], w1[1], selector); - w3[1] = __byte_perm_S (w0[3], w1[0], selector); - w3[0] = __byte_perm_S (w0[2], w0[3], selector); - w2[3] = __byte_perm_S (w0[1], w0[2], selector); - w2[2] = __byte_perm_S (w0[0], w0[1], selector); - w2[1] = __byte_perm_S ( 0, w0[0], selector); + w7[3] = hc_byte_perm_S (w5[1], w5[2], selector); + w7[2] = hc_byte_perm_S (w5[0], w5[1], selector); + w7[1] = hc_byte_perm_S (w4[3], w5[0], selector); + w7[0] = hc_byte_perm_S (w4[2], w4[3], selector); + w6[3] = hc_byte_perm_S (w4[1], w4[2], selector); + w6[2] = hc_byte_perm_S (w4[0], w4[1], selector); + w6[1] = hc_byte_perm_S (w3[3], w4[0], selector); + w6[0] = hc_byte_perm_S (w3[2], w3[3], selector); + w5[3] = hc_byte_perm_S (w3[1], w3[2], selector); + w5[2] = hc_byte_perm_S (w3[0], w3[1], selector); + w5[1] = hc_byte_perm_S (w2[3], w3[0], selector); + w5[0] = hc_byte_perm_S (w2[2], w2[3], selector); + w4[3] = hc_byte_perm_S (w2[1], w2[2], selector); + w4[2] = hc_byte_perm_S (w2[0], w2[1], selector); + w4[1] = hc_byte_perm_S (w1[3], w2[0], selector); + w4[0] = hc_byte_perm_S (w1[2], w1[3], selector); + w3[3] = hc_byte_perm_S (w1[1], w1[2], selector); + w3[2] = hc_byte_perm_S (w1[0], w1[1], selector); + w3[1] = hc_byte_perm_S (w0[3], w1[0], selector); + w3[0] = hc_byte_perm_S (w0[2], w0[3], selector); + w2[3] = hc_byte_perm_S (w0[1], w0[2], selector); + w2[2] = hc_byte_perm_S (w0[0], w0[1], selector); + w2[1] = hc_byte_perm_S ( 0, w0[0], selector); w2[0] = 0; w1[3] = 0; w1[2] = 0; @@ -36414,28 +36414,28 @@ DECLSPEC void switch_buffer_by_offset_8x4_le_S (u32 *w0, u32 *w1, u32 *w2, u32 * break; case 10: - w7[3] = __byte_perm_S (w5[0], w5[1], selector); - w7[2] = __byte_perm_S (w4[3], w5[0], selector); - w7[1] = __byte_perm_S (w4[2], w4[3], selector); - w7[0] = __byte_perm_S (w4[1], w4[2], selector); - w6[3] = __byte_perm_S (w4[0], w4[1], selector); - w6[2] = __byte_perm_S (w3[3], w4[0], selector); - w6[1] = __byte_perm_S (w3[2], w3[3], selector); - w6[0] = __byte_perm_S (w3[1], w3[2], selector); - w5[3] = __byte_perm_S (w3[0], w3[1], selector); - w5[2] = __byte_perm_S (w2[3], w3[0], selector); - w5[1] = __byte_perm_S (w2[2], w2[3], selector); - w5[0] = __byte_perm_S (w2[1], w2[2], selector); - w4[3] = __byte_perm_S (w2[0], w2[1], selector); - w4[2] = __byte_perm_S (w1[3], w2[0], selector); - w4[1] = __byte_perm_S (w1[2], w1[3], selector); - w4[0] = __byte_perm_S (w1[1], w1[2], selector); - w3[3] = __byte_perm_S (w1[0], w1[1], selector); - w3[2] = __byte_perm_S (w0[3], w1[0], selector); - w3[1] = __byte_perm_S (w0[2], w0[3], selector); - w3[0] = __byte_perm_S (w0[1], w0[2], selector); - w2[3] = __byte_perm_S (w0[0], w0[1], selector); - w2[2] = __byte_perm_S ( 0, w0[0], selector); + w7[3] = hc_byte_perm_S (w5[0], w5[1], selector); + w7[2] = hc_byte_perm_S (w4[3], w5[0], selector); + w7[1] = hc_byte_perm_S (w4[2], w4[3], selector); + w7[0] = hc_byte_perm_S (w4[1], w4[2], selector); + w6[3] = hc_byte_perm_S (w4[0], w4[1], selector); + w6[2] = hc_byte_perm_S (w3[3], w4[0], selector); + w6[1] = hc_byte_perm_S (w3[2], w3[3], selector); + w6[0] = hc_byte_perm_S (w3[1], w3[2], selector); + w5[3] = hc_byte_perm_S (w3[0], w3[1], selector); + w5[2] = hc_byte_perm_S (w2[3], w3[0], selector); + w5[1] = hc_byte_perm_S (w2[2], w2[3], selector); + w5[0] = hc_byte_perm_S (w2[1], w2[2], selector); + w4[3] = hc_byte_perm_S (w2[0], w2[1], selector); + w4[2] = hc_byte_perm_S (w1[3], w2[0], selector); + w4[1] = hc_byte_perm_S (w1[2], w1[3], selector); + w4[0] = hc_byte_perm_S (w1[1], w1[2], selector); + w3[3] = hc_byte_perm_S (w1[0], w1[1], selector); + w3[2] = hc_byte_perm_S (w0[3], w1[0], selector); + w3[1] = hc_byte_perm_S (w0[2], w0[3], selector); + w3[0] = hc_byte_perm_S (w0[1], w0[2], selector); + w2[3] = hc_byte_perm_S (w0[0], w0[1], selector); + w2[2] = hc_byte_perm_S ( 0, w0[0], selector); w2[1] = 0; w2[0] = 0; w1[3] = 0; @@ -36449,27 +36449,27 @@ DECLSPEC void switch_buffer_by_offset_8x4_le_S (u32 *w0, u32 *w1, u32 *w2, u32 * break; case 11: - w7[3] = __byte_perm_S (w4[3], w5[0], selector); - w7[2] = __byte_perm_S (w4[2], w4[3], selector); - w7[1] = __byte_perm_S (w4[1], w4[2], selector); - w7[0] = __byte_perm_S (w4[0], w4[1], selector); - w6[3] = __byte_perm_S (w3[3], w4[0], selector); - w6[2] = __byte_perm_S (w3[2], w3[3], selector); - w6[1] = __byte_perm_S (w3[1], w3[2], selector); - w6[0] = __byte_perm_S (w3[0], w3[1], selector); - w5[3] = __byte_perm_S (w2[3], w3[0], selector); - w5[2] = __byte_perm_S (w2[2], w2[3], selector); - w5[1] = __byte_perm_S (w2[1], w2[2], selector); - w5[0] = __byte_perm_S (w2[0], w2[1], selector); - w4[3] = __byte_perm_S (w1[3], w2[0], selector); - w4[2] = __byte_perm_S (w1[2], w1[3], selector); - w4[1] = __byte_perm_S (w1[1], w1[2], selector); - w4[0] = __byte_perm_S (w1[0], w1[1], selector); - w3[3] = __byte_perm_S (w0[3], w1[0], selector); - w3[2] = __byte_perm_S (w0[2], w0[3], selector); - w3[1] = __byte_perm_S (w0[1], w0[2], selector); - w3[0] = __byte_perm_S (w0[0], w0[1], selector); - w2[3] = __byte_perm_S ( 0, w0[0], selector); + w7[3] = hc_byte_perm_S (w4[3], w5[0], selector); + w7[2] = hc_byte_perm_S (w4[2], w4[3], selector); + w7[1] = hc_byte_perm_S (w4[1], w4[2], selector); + w7[0] = hc_byte_perm_S (w4[0], w4[1], selector); + w6[3] = hc_byte_perm_S (w3[3], w4[0], selector); + w6[2] = hc_byte_perm_S (w3[2], w3[3], selector); + w6[1] = hc_byte_perm_S (w3[1], w3[2], selector); + w6[0] = hc_byte_perm_S (w3[0], w3[1], selector); + w5[3] = hc_byte_perm_S (w2[3], w3[0], selector); + w5[2] = hc_byte_perm_S (w2[2], w2[3], selector); + w5[1] = hc_byte_perm_S (w2[1], w2[2], selector); + w5[0] = hc_byte_perm_S (w2[0], w2[1], selector); + w4[3] = hc_byte_perm_S (w1[3], w2[0], selector); + w4[2] = hc_byte_perm_S (w1[2], w1[3], selector); + w4[1] = hc_byte_perm_S (w1[1], w1[2], selector); + w4[0] = hc_byte_perm_S (w1[0], w1[1], selector); + w3[3] = hc_byte_perm_S (w0[3], w1[0], selector); + w3[2] = hc_byte_perm_S (w0[2], w0[3], selector); + w3[1] = hc_byte_perm_S (w0[1], w0[2], selector); + w3[0] = hc_byte_perm_S (w0[0], w0[1], selector); + w2[3] = hc_byte_perm_S ( 0, w0[0], selector); w2[2] = 0; w2[1] = 0; w2[0] = 0; @@ -36484,26 +36484,26 @@ DECLSPEC void switch_buffer_by_offset_8x4_le_S (u32 *w0, u32 *w1, u32 *w2, u32 * break; case 12: - w7[3] = __byte_perm_S (w4[2], w4[3], selector); - w7[2] = __byte_perm_S (w4[1], w4[2], selector); - w7[1] = __byte_perm_S (w4[0], w4[1], selector); - w7[0] = __byte_perm_S (w3[3], w4[0], selector); - w6[3] = __byte_perm_S (w3[2], w3[3], selector); - w6[2] = __byte_perm_S (w3[1], w3[2], selector); - w6[1] = __byte_perm_S (w3[0], w3[1], selector); - w6[0] = __byte_perm_S (w2[3], w3[0], selector); - w5[3] = __byte_perm_S (w2[2], w2[3], selector); - w5[2] = __byte_perm_S (w2[1], w2[2], selector); - w5[1] = __byte_perm_S (w2[0], w2[1], selector); - w5[0] = __byte_perm_S (w1[3], w2[0], selector); - w4[3] = __byte_perm_S (w1[2], w1[3], selector); - w4[2] = __byte_perm_S (w1[1], w1[2], selector); - w4[1] = __byte_perm_S (w1[0], w1[1], selector); - w4[0] = __byte_perm_S (w0[3], w1[0], selector); - w3[3] = __byte_perm_S (w0[2], w0[3], selector); - w3[2] = __byte_perm_S (w0[1], w0[2], selector); - w3[1] = __byte_perm_S (w0[0], w0[1], selector); - w3[0] = __byte_perm_S ( 0, w0[0], selector); + w7[3] = hc_byte_perm_S (w4[2], w4[3], selector); + w7[2] = hc_byte_perm_S (w4[1], w4[2], selector); + w7[1] = hc_byte_perm_S (w4[0], w4[1], selector); + w7[0] = hc_byte_perm_S (w3[3], w4[0], selector); + w6[3] = hc_byte_perm_S (w3[2], w3[3], selector); + w6[2] = hc_byte_perm_S (w3[1], w3[2], selector); + w6[1] = hc_byte_perm_S (w3[0], w3[1], selector); + w6[0] = hc_byte_perm_S (w2[3], w3[0], selector); + w5[3] = hc_byte_perm_S (w2[2], w2[3], selector); + w5[2] = hc_byte_perm_S (w2[1], w2[2], selector); + w5[1] = hc_byte_perm_S (w2[0], w2[1], selector); + w5[0] = hc_byte_perm_S (w1[3], w2[0], selector); + w4[3] = hc_byte_perm_S (w1[2], w1[3], selector); + w4[2] = hc_byte_perm_S (w1[1], w1[2], selector); + w4[1] = hc_byte_perm_S (w1[0], w1[1], selector); + w4[0] = hc_byte_perm_S (w0[3], w1[0], selector); + w3[3] = hc_byte_perm_S (w0[2], w0[3], selector); + w3[2] = hc_byte_perm_S (w0[1], w0[2], selector); + w3[1] = hc_byte_perm_S (w0[0], w0[1], selector); + w3[0] = hc_byte_perm_S ( 0, w0[0], selector); w2[3] = 0; w2[2] = 0; w2[1] = 0; @@ -36519,25 +36519,25 @@ DECLSPEC void switch_buffer_by_offset_8x4_le_S (u32 *w0, u32 *w1, u32 *w2, u32 * break; case 13: - w7[3] = __byte_perm_S (w4[1], w4[2], selector); - w7[2] = __byte_perm_S (w4[0], w4[1], selector); - w7[1] = __byte_perm_S (w3[3], w4[0], selector); - w7[0] = __byte_perm_S (w3[2], w3[3], selector); - w6[3] = __byte_perm_S (w3[1], w3[2], selector); - w6[2] = __byte_perm_S (w3[0], w3[1], selector); - w6[1] = __byte_perm_S (w2[3], w3[0], selector); - w6[0] = __byte_perm_S (w2[2], w2[3], selector); - w5[3] = __byte_perm_S (w2[1], w2[2], selector); - w5[2] = __byte_perm_S (w2[0], w2[1], selector); - w5[1] = __byte_perm_S (w1[3], w2[0], selector); - w5[0] = __byte_perm_S (w1[2], w1[3], selector); - w4[3] = __byte_perm_S (w1[1], w1[2], selector); - w4[2] = __byte_perm_S (w1[0], w1[1], selector); - w4[1] = __byte_perm_S (w0[3], w1[0], selector); - w4[0] = __byte_perm_S (w0[2], w0[3], selector); - w3[3] = __byte_perm_S (w0[1], w0[2], selector); - w3[2] = __byte_perm_S (w0[0], w0[1], selector); - w3[1] = __byte_perm_S ( 0, w0[0], selector); + w7[3] = hc_byte_perm_S (w4[1], w4[2], selector); + w7[2] = hc_byte_perm_S (w4[0], w4[1], selector); + w7[1] = hc_byte_perm_S (w3[3], w4[0], selector); + w7[0] = hc_byte_perm_S (w3[2], w3[3], selector); + w6[3] = hc_byte_perm_S (w3[1], w3[2], selector); + w6[2] = hc_byte_perm_S (w3[0], w3[1], selector); + w6[1] = hc_byte_perm_S (w2[3], w3[0], selector); + w6[0] = hc_byte_perm_S (w2[2], w2[3], selector); + w5[3] = hc_byte_perm_S (w2[1], w2[2], selector); + w5[2] = hc_byte_perm_S (w2[0], w2[1], selector); + w5[1] = hc_byte_perm_S (w1[3], w2[0], selector); + w5[0] = hc_byte_perm_S (w1[2], w1[3], selector); + w4[3] = hc_byte_perm_S (w1[1], w1[2], selector); + w4[2] = hc_byte_perm_S (w1[0], w1[1], selector); + w4[1] = hc_byte_perm_S (w0[3], w1[0], selector); + w4[0] = hc_byte_perm_S (w0[2], w0[3], selector); + w3[3] = hc_byte_perm_S (w0[1], w0[2], selector); + w3[2] = hc_byte_perm_S (w0[0], w0[1], selector); + w3[1] = hc_byte_perm_S ( 0, w0[0], selector); w3[0] = 0; w2[3] = 0; w2[2] = 0; @@ -36554,24 +36554,24 @@ DECLSPEC void switch_buffer_by_offset_8x4_le_S (u32 *w0, u32 *w1, u32 *w2, u32 * break; case 14: - w7[3] = __byte_perm_S (w4[0], w4[1], selector); - w7[2] = __byte_perm_S (w3[3], w4[0], selector); - w7[1] = __byte_perm_S (w3[2], w3[3], selector); - w7[0] = __byte_perm_S (w3[1], w3[2], selector); - w6[3] = __byte_perm_S (w3[0], w3[1], selector); - w6[2] = __byte_perm_S (w2[3], w3[0], selector); - w6[1] = __byte_perm_S (w2[2], w2[3], selector); - w6[0] = __byte_perm_S (w2[1], w2[2], selector); - w5[3] = __byte_perm_S (w2[0], w2[1], selector); - w5[2] = __byte_perm_S (w1[3], w2[0], selector); - w5[1] = __byte_perm_S (w1[2], w1[3], selector); - w5[0] = __byte_perm_S (w1[1], w1[2], selector); - w4[3] = __byte_perm_S (w1[0], w1[1], selector); - w4[2] = __byte_perm_S (w0[3], w1[0], selector); - w4[1] = __byte_perm_S (w0[2], w0[3], selector); - w4[0] = __byte_perm_S (w0[1], w0[2], selector); - w3[3] = __byte_perm_S (w0[0], w0[1], selector); - w3[2] = __byte_perm_S ( 0, w0[0], selector); + w7[3] = hc_byte_perm_S (w4[0], w4[1], selector); + w7[2] = hc_byte_perm_S (w3[3], w4[0], selector); + w7[1] = hc_byte_perm_S (w3[2], w3[3], selector); + w7[0] = hc_byte_perm_S (w3[1], w3[2], selector); + w6[3] = hc_byte_perm_S (w3[0], w3[1], selector); + w6[2] = hc_byte_perm_S (w2[3], w3[0], selector); + w6[1] = hc_byte_perm_S (w2[2], w2[3], selector); + w6[0] = hc_byte_perm_S (w2[1], w2[2], selector); + w5[3] = hc_byte_perm_S (w2[0], w2[1], selector); + w5[2] = hc_byte_perm_S (w1[3], w2[0], selector); + w5[1] = hc_byte_perm_S (w1[2], w1[3], selector); + w5[0] = hc_byte_perm_S (w1[1], w1[2], selector); + w4[3] = hc_byte_perm_S (w1[0], w1[1], selector); + w4[2] = hc_byte_perm_S (w0[3], w1[0], selector); + w4[1] = hc_byte_perm_S (w0[2], w0[3], selector); + w4[0] = hc_byte_perm_S (w0[1], w0[2], selector); + w3[3] = hc_byte_perm_S (w0[0], w0[1], selector); + w3[2] = hc_byte_perm_S ( 0, w0[0], selector); w3[1] = 0; w3[0] = 0; w2[3] = 0; @@ -36589,23 +36589,23 @@ DECLSPEC void switch_buffer_by_offset_8x4_le_S (u32 *w0, u32 *w1, u32 *w2, u32 * break; case 15: - w7[3] = __byte_perm_S (w3[3], w4[0], selector); - w7[2] = __byte_perm_S (w3[2], w3[3], selector); - w7[1] = __byte_perm_S (w3[1], w3[2], selector); - w7[0] = __byte_perm_S (w3[0], w3[1], selector); - w6[3] = __byte_perm_S (w2[3], w3[0], selector); - w6[2] = __byte_perm_S (w2[2], w2[3], selector); - w6[1] = __byte_perm_S (w2[1], w2[2], selector); - w6[0] = __byte_perm_S (w2[0], w2[1], selector); - w5[3] = __byte_perm_S (w1[3], w2[0], selector); - w5[2] = __byte_perm_S (w1[2], w1[3], selector); - w5[1] = __byte_perm_S (w1[1], w1[2], selector); - w5[0] = __byte_perm_S (w1[0], w1[1], selector); - w4[3] = __byte_perm_S (w0[3], w1[0], selector); - w4[2] = __byte_perm_S (w0[2], w0[3], selector); - w4[1] = __byte_perm_S (w0[1], w0[2], selector); - w4[0] = __byte_perm_S (w0[0], w0[1], selector); - w3[3] = __byte_perm_S ( 0, w0[0], selector); + w7[3] = hc_byte_perm_S (w3[3], w4[0], selector); + w7[2] = hc_byte_perm_S (w3[2], w3[3], selector); + w7[1] = hc_byte_perm_S (w3[1], w3[2], selector); + w7[0] = hc_byte_perm_S (w3[0], w3[1], selector); + w6[3] = hc_byte_perm_S (w2[3], w3[0], selector); + w6[2] = hc_byte_perm_S (w2[2], w2[3], selector); + w6[1] = hc_byte_perm_S (w2[1], w2[2], selector); + w6[0] = hc_byte_perm_S (w2[0], w2[1], selector); + w5[3] = hc_byte_perm_S (w1[3], w2[0], selector); + w5[2] = hc_byte_perm_S (w1[2], w1[3], selector); + w5[1] = hc_byte_perm_S (w1[1], w1[2], selector); + w5[0] = hc_byte_perm_S (w1[0], w1[1], selector); + w4[3] = hc_byte_perm_S (w0[3], w1[0], selector); + w4[2] = hc_byte_perm_S (w0[2], w0[3], selector); + w4[1] = hc_byte_perm_S (w0[1], w0[2], selector); + w4[0] = hc_byte_perm_S (w0[0], w0[1], selector); + w3[3] = hc_byte_perm_S ( 0, w0[0], selector); w3[2] = 0; w3[1] = 0; w3[0] = 0; @@ -36634,143 +36634,143 @@ DECLSPEC void switch_buffer_by_offset_8x4_be_S (u32 *w0, u32 *w1, u32 *w2, u32 * switch (offset_switch) { case 0: - w7[3] = amd_bytealign_S (w7[2], w7[3], offset); - w7[2] = amd_bytealign_S (w7[1], w7[2], offset); - w7[1] = amd_bytealign_S (w7[0], w7[1], offset); - w7[0] = amd_bytealign_S (w6[3], w7[0], offset); - w6[3] = amd_bytealign_S (w6[2], w6[3], offset); - w6[2] = amd_bytealign_S (w6[1], w6[2], offset); - w6[1] = amd_bytealign_S (w6[0], w6[1], offset); - w6[0] = amd_bytealign_S (w5[3], w6[0], offset); - w5[3] = amd_bytealign_S (w5[2], w5[3], offset); - w5[2] = amd_bytealign_S (w5[1], w5[2], offset); - w5[1] = amd_bytealign_S (w5[0], w5[1], offset); - w5[0] = amd_bytealign_S (w4[3], w5[0], offset); - w4[3] = amd_bytealign_S (w4[2], w4[3], offset); - w4[2] = amd_bytealign_S (w4[1], w4[2], offset); - w4[1] = amd_bytealign_S (w4[0], w4[1], offset); - w4[0] = amd_bytealign_S (w3[3], w4[0], offset); - w3[3] = amd_bytealign_S (w3[2], w3[3], offset); - w3[2] = amd_bytealign_S (w3[1], w3[2], offset); - w3[1] = amd_bytealign_S (w3[0], w3[1], offset); - w3[0] = amd_bytealign_S (w2[3], w3[0], offset); - w2[3] = amd_bytealign_S (w2[2], w2[3], offset); - w2[2] = amd_bytealign_S (w2[1], w2[2], offset); - w2[1] = amd_bytealign_S (w2[0], w2[1], offset); - w2[0] = amd_bytealign_S (w1[3], w2[0], offset); - w1[3] = amd_bytealign_S (w1[2], w1[3], offset); - w1[2] = amd_bytealign_S (w1[1], w1[2], offset); - w1[1] = amd_bytealign_S (w1[0], w1[1], offset); - w1[0] = amd_bytealign_S (w0[3], w1[0], offset); - w0[3] = amd_bytealign_S (w0[2], w0[3], offset); - w0[2] = amd_bytealign_S (w0[1], w0[2], offset); - w0[1] = amd_bytealign_S (w0[0], w0[1], offset); - w0[0] = amd_bytealign_S ( 0, w0[0], offset); + w7[3] = hc_bytealign_S (w7[2], w7[3], offset); + w7[2] = hc_bytealign_S (w7[1], w7[2], offset); + w7[1] = hc_bytealign_S (w7[0], w7[1], offset); + w7[0] = hc_bytealign_S (w6[3], w7[0], offset); + w6[3] = hc_bytealign_S (w6[2], w6[3], offset); + w6[2] = hc_bytealign_S (w6[1], w6[2], offset); + w6[1] = hc_bytealign_S (w6[0], w6[1], offset); + w6[0] = hc_bytealign_S (w5[3], w6[0], offset); + w5[3] = hc_bytealign_S (w5[2], w5[3], offset); + w5[2] = hc_bytealign_S (w5[1], w5[2], offset); + w5[1] = hc_bytealign_S (w5[0], w5[1], offset); + w5[0] = hc_bytealign_S (w4[3], w5[0], offset); + w4[3] = hc_bytealign_S (w4[2], w4[3], offset); + w4[2] = hc_bytealign_S (w4[1], w4[2], offset); + w4[1] = hc_bytealign_S (w4[0], w4[1], offset); + w4[0] = hc_bytealign_S (w3[3], w4[0], offset); + w3[3] = hc_bytealign_S (w3[2], w3[3], offset); + w3[2] = hc_bytealign_S (w3[1], w3[2], offset); + w3[1] = hc_bytealign_S (w3[0], w3[1], offset); + w3[0] = hc_bytealign_S (w2[3], w3[0], offset); + w2[3] = hc_bytealign_S (w2[2], w2[3], offset); + w2[2] = hc_bytealign_S (w2[1], w2[2], offset); + w2[1] = hc_bytealign_S (w2[0], w2[1], offset); + w2[0] = hc_bytealign_S (w1[3], w2[0], offset); + w1[3] = hc_bytealign_S (w1[2], w1[3], offset); + w1[2] = hc_bytealign_S (w1[1], w1[2], offset); + w1[1] = hc_bytealign_S (w1[0], w1[1], offset); + w1[0] = hc_bytealign_S (w0[3], w1[0], offset); + w0[3] = hc_bytealign_S (w0[2], w0[3], offset); + w0[2] = hc_bytealign_S (w0[1], w0[2], offset); + w0[1] = hc_bytealign_S (w0[0], w0[1], offset); + w0[0] = hc_bytealign_S ( 0, w0[0], offset); break; case 1: - w7[3] = amd_bytealign_S (w7[1], w7[2], offset); - w7[2] = amd_bytealign_S (w7[0], w7[1], offset); - w7[1] = amd_bytealign_S (w6[3], w7[0], offset); - w7[0] = amd_bytealign_S (w6[2], w6[3], offset); - w6[3] = amd_bytealign_S (w6[1], w6[2], offset); - w6[2] = amd_bytealign_S (w6[0], w6[1], offset); - w6[1] = amd_bytealign_S (w5[3], w6[0], offset); - w6[0] = amd_bytealign_S (w5[2], w5[3], offset); - w5[3] = amd_bytealign_S (w5[1], w5[2], offset); - w5[2] = amd_bytealign_S (w5[0], w5[1], offset); - w5[1] = amd_bytealign_S (w4[3], w5[0], offset); - w5[0] = amd_bytealign_S (w4[2], w4[3], offset); - w4[3] = amd_bytealign_S (w4[1], w4[2], offset); - w4[2] = amd_bytealign_S (w4[0], w4[1], offset); - w4[1] = amd_bytealign_S (w3[3], w4[0], offset); - w4[0] = amd_bytealign_S (w3[2], w3[3], offset); - w3[3] = amd_bytealign_S (w3[1], w3[2], offset); - w3[2] = amd_bytealign_S (w3[0], w3[1], offset); - w3[1] = amd_bytealign_S (w2[3], w3[0], offset); - w3[0] = amd_bytealign_S (w2[2], w2[3], offset); - w2[3] = amd_bytealign_S (w2[1], w2[2], offset); - w2[2] = amd_bytealign_S (w2[0], w2[1], offset); - w2[1] = amd_bytealign_S (w1[3], w2[0], offset); - w2[0] = amd_bytealign_S (w1[2], w1[3], offset); - w1[3] = amd_bytealign_S (w1[1], w1[2], offset); - w1[2] = amd_bytealign_S (w1[0], w1[1], offset); - w1[1] = amd_bytealign_S (w0[3], w1[0], offset); - w1[0] = amd_bytealign_S (w0[2], w0[3], offset); - w0[3] = amd_bytealign_S (w0[1], w0[2], offset); - w0[2] = amd_bytealign_S (w0[0], w0[1], offset); - w0[1] = amd_bytealign_S ( 0, w0[0], offset); + w7[3] = hc_bytealign_S (w7[1], w7[2], offset); + w7[2] = hc_bytealign_S (w7[0], w7[1], offset); + w7[1] = hc_bytealign_S (w6[3], w7[0], offset); + w7[0] = hc_bytealign_S (w6[2], w6[3], offset); + w6[3] = hc_bytealign_S (w6[1], w6[2], offset); + w6[2] = hc_bytealign_S (w6[0], w6[1], offset); + w6[1] = hc_bytealign_S (w5[3], w6[0], offset); + w6[0] = hc_bytealign_S (w5[2], w5[3], offset); + w5[3] = hc_bytealign_S (w5[1], w5[2], offset); + w5[2] = hc_bytealign_S (w5[0], w5[1], offset); + w5[1] = hc_bytealign_S (w4[3], w5[0], offset); + w5[0] = hc_bytealign_S (w4[2], w4[3], offset); + w4[3] = hc_bytealign_S (w4[1], w4[2], offset); + w4[2] = hc_bytealign_S (w4[0], w4[1], offset); + w4[1] = hc_bytealign_S (w3[3], w4[0], offset); + w4[0] = hc_bytealign_S (w3[2], w3[3], offset); + w3[3] = hc_bytealign_S (w3[1], w3[2], offset); + w3[2] = hc_bytealign_S (w3[0], w3[1], offset); + w3[1] = hc_bytealign_S (w2[3], w3[0], offset); + w3[0] = hc_bytealign_S (w2[2], w2[3], offset); + w2[3] = hc_bytealign_S (w2[1], w2[2], offset); + w2[2] = hc_bytealign_S (w2[0], w2[1], offset); + w2[1] = hc_bytealign_S (w1[3], w2[0], offset); + w2[0] = hc_bytealign_S (w1[2], w1[3], offset); + w1[3] = hc_bytealign_S (w1[1], w1[2], offset); + w1[2] = hc_bytealign_S (w1[0], w1[1], offset); + w1[1] = hc_bytealign_S (w0[3], w1[0], offset); + w1[0] = hc_bytealign_S (w0[2], w0[3], offset); + w0[3] = hc_bytealign_S (w0[1], w0[2], offset); + w0[2] = hc_bytealign_S (w0[0], w0[1], offset); + w0[1] = hc_bytealign_S ( 0, w0[0], offset); w0[0] = 0; break; case 2: - w7[3] = amd_bytealign_S (w7[0], w7[1], offset); - w7[2] = amd_bytealign_S (w6[3], w7[0], offset); - w7[1] = amd_bytealign_S (w6[2], w6[3], offset); - w7[0] = amd_bytealign_S (w6[1], w6[2], offset); - w6[3] = amd_bytealign_S (w6[0], w6[1], offset); - w6[2] = amd_bytealign_S (w5[3], w6[0], offset); - w6[1] = amd_bytealign_S (w5[2], w5[3], offset); - w6[0] = amd_bytealign_S (w5[1], w5[2], offset); - w5[3] = amd_bytealign_S (w5[0], w5[1], offset); - w5[2] = amd_bytealign_S (w4[3], w5[0], offset); - w5[1] = amd_bytealign_S (w4[2], w4[3], offset); - w5[0] = amd_bytealign_S (w4[1], w4[2], offset); - w4[3] = amd_bytealign_S (w4[0], w4[1], offset); - w4[2] = amd_bytealign_S (w3[3], w4[0], offset); - w4[1] = amd_bytealign_S (w3[2], w3[3], offset); - w4[0] = amd_bytealign_S (w3[1], w3[2], offset); - w3[3] = amd_bytealign_S (w3[0], w3[1], offset); - w3[2] = amd_bytealign_S (w2[3], w3[0], offset); - w3[1] = amd_bytealign_S (w2[2], w2[3], offset); - w3[0] = amd_bytealign_S (w2[1], w2[2], offset); - w2[3] = amd_bytealign_S (w2[0], w2[1], offset); - w2[2] = amd_bytealign_S (w1[3], w2[0], offset); - w2[1] = amd_bytealign_S (w1[2], w1[3], offset); - w2[0] = amd_bytealign_S (w1[1], w1[2], offset); - w1[3] = amd_bytealign_S (w1[0], w1[1], offset); - w1[2] = amd_bytealign_S (w0[3], w1[0], offset); - w1[1] = amd_bytealign_S (w0[2], w0[3], offset); - w1[0] = amd_bytealign_S (w0[1], w0[2], offset); - w0[3] = amd_bytealign_S (w0[0], w0[1], offset); - w0[2] = amd_bytealign_S ( 0, w0[0], offset); + w7[3] = hc_bytealign_S (w7[0], w7[1], offset); + w7[2] = hc_bytealign_S (w6[3], w7[0], offset); + w7[1] = hc_bytealign_S (w6[2], w6[3], offset); + w7[0] = hc_bytealign_S (w6[1], w6[2], offset); + w6[3] = hc_bytealign_S (w6[0], w6[1], offset); + w6[2] = hc_bytealign_S (w5[3], w6[0], offset); + w6[1] = hc_bytealign_S (w5[2], w5[3], offset); + w6[0] = hc_bytealign_S (w5[1], w5[2], offset); + w5[3] = hc_bytealign_S (w5[0], w5[1], offset); + w5[2] = hc_bytealign_S (w4[3], w5[0], offset); + w5[1] = hc_bytealign_S (w4[2], w4[3], offset); + w5[0] = hc_bytealign_S (w4[1], w4[2], offset); + w4[3] = hc_bytealign_S (w4[0], w4[1], offset); + w4[2] = hc_bytealign_S (w3[3], w4[0], offset); + w4[1] = hc_bytealign_S (w3[2], w3[3], offset); + w4[0] = hc_bytealign_S (w3[1], w3[2], offset); + w3[3] = hc_bytealign_S (w3[0], w3[1], offset); + w3[2] = hc_bytealign_S (w2[3], w3[0], offset); + w3[1] = hc_bytealign_S (w2[2], w2[3], offset); + w3[0] = hc_bytealign_S (w2[1], w2[2], offset); + w2[3] = hc_bytealign_S (w2[0], w2[1], offset); + w2[2] = hc_bytealign_S (w1[3], w2[0], offset); + w2[1] = hc_bytealign_S (w1[2], w1[3], offset); + w2[0] = hc_bytealign_S (w1[1], w1[2], offset); + w1[3] = hc_bytealign_S (w1[0], w1[1], offset); + w1[2] = hc_bytealign_S (w0[3], w1[0], offset); + w1[1] = hc_bytealign_S (w0[2], w0[3], offset); + w1[0] = hc_bytealign_S (w0[1], w0[2], offset); + w0[3] = hc_bytealign_S (w0[0], w0[1], offset); + w0[2] = hc_bytealign_S ( 0, w0[0], offset); w0[1] = 0; w0[0] = 0; break; case 3: - w7[3] = amd_bytealign_S (w6[3], w7[0], offset); - w7[2] = amd_bytealign_S (w6[2], w6[3], offset); - w7[1] = amd_bytealign_S (w6[1], w6[2], offset); - w7[0] = amd_bytealign_S (w6[0], w6[1], offset); - w6[3] = amd_bytealign_S (w5[3], w6[0], offset); - w6[2] = amd_bytealign_S (w5[2], w5[3], offset); - w6[1] = amd_bytealign_S (w5[1], w5[2], offset); - w6[0] = amd_bytealign_S (w5[0], w5[1], offset); - w5[3] = amd_bytealign_S (w4[3], w5[0], offset); - w5[2] = amd_bytealign_S (w4[2], w4[3], offset); - w5[1] = amd_bytealign_S (w4[1], w4[2], offset); - w5[0] = amd_bytealign_S (w4[0], w4[1], offset); - w4[3] = amd_bytealign_S (w3[3], w4[0], offset); - w4[2] = amd_bytealign_S (w3[2], w3[3], offset); - w4[1] = amd_bytealign_S (w3[1], w3[2], offset); - w4[0] = amd_bytealign_S (w3[0], w3[1], offset); - w3[3] = amd_bytealign_S (w2[3], w3[0], offset); - w3[2] = amd_bytealign_S (w2[2], w2[3], offset); - w3[1] = amd_bytealign_S (w2[1], w2[2], offset); - w3[0] = amd_bytealign_S (w2[0], w2[1], offset); - w2[3] = amd_bytealign_S (w1[3], w2[0], offset); - w2[2] = amd_bytealign_S (w1[2], w1[3], offset); - w2[1] = amd_bytealign_S (w1[1], w1[2], offset); - w2[0] = amd_bytealign_S (w1[0], w1[1], offset); - w1[3] = amd_bytealign_S (w0[3], w1[0], offset); - w1[2] = amd_bytealign_S (w0[2], w0[3], offset); - w1[1] = amd_bytealign_S (w0[1], w0[2], offset); - w1[0] = amd_bytealign_S (w0[0], w0[1], offset); - w0[3] = amd_bytealign_S ( 0, w0[0], offset); + w7[3] = hc_bytealign_S (w6[3], w7[0], offset); + w7[2] = hc_bytealign_S (w6[2], w6[3], offset); + w7[1] = hc_bytealign_S (w6[1], w6[2], offset); + w7[0] = hc_bytealign_S (w6[0], w6[1], offset); + w6[3] = hc_bytealign_S (w5[3], w6[0], offset); + w6[2] = hc_bytealign_S (w5[2], w5[3], offset); + w6[1] = hc_bytealign_S (w5[1], w5[2], offset); + w6[0] = hc_bytealign_S (w5[0], w5[1], offset); + w5[3] = hc_bytealign_S (w4[3], w5[0], offset); + w5[2] = hc_bytealign_S (w4[2], w4[3], offset); + w5[1] = hc_bytealign_S (w4[1], w4[2], offset); + w5[0] = hc_bytealign_S (w4[0], w4[1], offset); + w4[3] = hc_bytealign_S (w3[3], w4[0], offset); + w4[2] = hc_bytealign_S (w3[2], w3[3], offset); + w4[1] = hc_bytealign_S (w3[1], w3[2], offset); + w4[0] = hc_bytealign_S (w3[0], w3[1], offset); + w3[3] = hc_bytealign_S (w2[3], w3[0], offset); + w3[2] = hc_bytealign_S (w2[2], w2[3], offset); + w3[1] = hc_bytealign_S (w2[1], w2[2], offset); + w3[0] = hc_bytealign_S (w2[0], w2[1], offset); + w2[3] = hc_bytealign_S (w1[3], w2[0], offset); + w2[2] = hc_bytealign_S (w1[2], w1[3], offset); + w2[1] = hc_bytealign_S (w1[1], w1[2], offset); + w2[0] = hc_bytealign_S (w1[0], w1[1], offset); + w1[3] = hc_bytealign_S (w0[3], w1[0], offset); + w1[2] = hc_bytealign_S (w0[2], w0[3], offset); + w1[1] = hc_bytealign_S (w0[1], w0[2], offset); + w1[0] = hc_bytealign_S (w0[0], w0[1], offset); + w0[3] = hc_bytealign_S ( 0, w0[0], offset); w0[2] = 0; w0[1] = 0; w0[0] = 0; @@ -36778,34 +36778,34 @@ DECLSPEC void switch_buffer_by_offset_8x4_be_S (u32 *w0, u32 *w1, u32 *w2, u32 * break; case 4: - w7[3] = amd_bytealign_S (w6[2], w6[3], offset); - w7[2] = amd_bytealign_S (w6[1], w6[2], offset); - w7[1] = amd_bytealign_S (w6[0], w6[1], offset); - w7[0] = amd_bytealign_S (w5[3], w6[0], offset); - w6[3] = amd_bytealign_S (w5[2], w5[3], offset); - w6[2] = amd_bytealign_S (w5[1], w5[2], offset); - w6[1] = amd_bytealign_S (w5[0], w5[1], offset); - w6[0] = amd_bytealign_S (w4[3], w5[0], offset); - w5[3] = amd_bytealign_S (w4[2], w4[3], offset); - w5[2] = amd_bytealign_S (w4[1], w4[2], offset); - w5[1] = amd_bytealign_S (w4[0], w4[1], offset); - w5[0] = amd_bytealign_S (w3[3], w4[0], offset); - w4[3] = amd_bytealign_S (w3[2], w3[3], offset); - w4[2] = amd_bytealign_S (w3[1], w3[2], offset); - w4[1] = amd_bytealign_S (w3[0], w3[1], offset); - w4[0] = amd_bytealign_S (w2[3], w3[0], offset); - w3[3] = amd_bytealign_S (w2[2], w2[3], offset); - w3[2] = amd_bytealign_S (w2[1], w2[2], offset); - w3[1] = amd_bytealign_S (w2[0], w2[1], offset); - w3[0] = amd_bytealign_S (w1[3], w2[0], offset); - w2[3] = amd_bytealign_S (w1[2], w1[3], offset); - w2[2] = amd_bytealign_S (w1[1], w1[2], offset); - w2[1] = amd_bytealign_S (w1[0], w1[1], offset); - w2[0] = amd_bytealign_S (w0[3], w1[0], offset); - w1[3] = amd_bytealign_S (w0[2], w0[3], offset); - w1[2] = amd_bytealign_S (w0[1], w0[2], offset); - w1[1] = amd_bytealign_S (w0[0], w0[1], offset); - w1[0] = amd_bytealign_S ( 0, w0[0], offset); + w7[3] = hc_bytealign_S (w6[2], w6[3], offset); + w7[2] = hc_bytealign_S (w6[1], w6[2], offset); + w7[1] = hc_bytealign_S (w6[0], w6[1], offset); + w7[0] = hc_bytealign_S (w5[3], w6[0], offset); + w6[3] = hc_bytealign_S (w5[2], w5[3], offset); + w6[2] = hc_bytealign_S (w5[1], w5[2], offset); + w6[1] = hc_bytealign_S (w5[0], w5[1], offset); + w6[0] = hc_bytealign_S (w4[3], w5[0], offset); + w5[3] = hc_bytealign_S (w4[2], w4[3], offset); + w5[2] = hc_bytealign_S (w4[1], w4[2], offset); + w5[1] = hc_bytealign_S (w4[0], w4[1], offset); + w5[0] = hc_bytealign_S (w3[3], w4[0], offset); + w4[3] = hc_bytealign_S (w3[2], w3[3], offset); + w4[2] = hc_bytealign_S (w3[1], w3[2], offset); + w4[1] = hc_bytealign_S (w3[0], w3[1], offset); + w4[0] = hc_bytealign_S (w2[3], w3[0], offset); + w3[3] = hc_bytealign_S (w2[2], w2[3], offset); + w3[2] = hc_bytealign_S (w2[1], w2[2], offset); + w3[1] = hc_bytealign_S (w2[0], w2[1], offset); + w3[0] = hc_bytealign_S (w1[3], w2[0], offset); + w2[3] = hc_bytealign_S (w1[2], w1[3], offset); + w2[2] = hc_bytealign_S (w1[1], w1[2], offset); + w2[1] = hc_bytealign_S (w1[0], w1[1], offset); + w2[0] = hc_bytealign_S (w0[3], w1[0], offset); + w1[3] = hc_bytealign_S (w0[2], w0[3], offset); + w1[2] = hc_bytealign_S (w0[1], w0[2], offset); + w1[1] = hc_bytealign_S (w0[0], w0[1], offset); + w1[0] = hc_bytealign_S ( 0, w0[0], offset); w0[3] = 0; w0[2] = 0; w0[1] = 0; @@ -36814,33 +36814,33 @@ DECLSPEC void switch_buffer_by_offset_8x4_be_S (u32 *w0, u32 *w1, u32 *w2, u32 * break; case 5: - w7[3] = amd_bytealign_S (w6[1], w6[2], offset); - w7[2] = amd_bytealign_S (w6[0], w6[1], offset); - w7[1] = amd_bytealign_S (w5[3], w6[0], offset); - w7[0] = amd_bytealign_S (w5[2], w5[3], offset); - w6[3] = amd_bytealign_S (w5[1], w5[2], offset); - w6[2] = amd_bytealign_S (w5[0], w5[1], offset); - w6[1] = amd_bytealign_S (w4[3], w5[0], offset); - w6[0] = amd_bytealign_S (w4[2], w4[3], offset); - w5[3] = amd_bytealign_S (w4[1], w4[2], offset); - w5[2] = amd_bytealign_S (w4[0], w4[1], offset); - w5[1] = amd_bytealign_S (w3[3], w4[0], offset); - w5[0] = amd_bytealign_S (w3[2], w3[3], offset); - w4[3] = amd_bytealign_S (w3[1], w3[2], offset); - w4[2] = amd_bytealign_S (w3[0], w3[1], offset); - w4[1] = amd_bytealign_S (w2[3], w3[0], offset); - w4[0] = amd_bytealign_S (w2[2], w2[3], offset); - w3[3] = amd_bytealign_S (w2[1], w2[2], offset); - w3[2] = amd_bytealign_S (w2[0], w2[1], offset); - w3[1] = amd_bytealign_S (w1[3], w2[0], offset); - w3[0] = amd_bytealign_S (w1[2], w1[3], offset); - w2[3] = amd_bytealign_S (w1[1], w1[2], offset); - w2[2] = amd_bytealign_S (w1[0], w1[1], offset); - w2[1] = amd_bytealign_S (w0[3], w1[0], offset); - w2[0] = amd_bytealign_S (w0[2], w0[3], offset); - w1[3] = amd_bytealign_S (w0[1], w0[2], offset); - w1[2] = amd_bytealign_S (w0[0], w0[1], offset); - w1[1] = amd_bytealign_S ( 0, w0[0], offset); + w7[3] = hc_bytealign_S (w6[1], w6[2], offset); + w7[2] = hc_bytealign_S (w6[0], w6[1], offset); + w7[1] = hc_bytealign_S (w5[3], w6[0], offset); + w7[0] = hc_bytealign_S (w5[2], w5[3], offset); + w6[3] = hc_bytealign_S (w5[1], w5[2], offset); + w6[2] = hc_bytealign_S (w5[0], w5[1], offset); + w6[1] = hc_bytealign_S (w4[3], w5[0], offset); + w6[0] = hc_bytealign_S (w4[2], w4[3], offset); + w5[3] = hc_bytealign_S (w4[1], w4[2], offset); + w5[2] = hc_bytealign_S (w4[0], w4[1], offset); + w5[1] = hc_bytealign_S (w3[3], w4[0], offset); + w5[0] = hc_bytealign_S (w3[2], w3[3], offset); + w4[3] = hc_bytealign_S (w3[1], w3[2], offset); + w4[2] = hc_bytealign_S (w3[0], w3[1], offset); + w4[1] = hc_bytealign_S (w2[3], w3[0], offset); + w4[0] = hc_bytealign_S (w2[2], w2[3], offset); + w3[3] = hc_bytealign_S (w2[1], w2[2], offset); + w3[2] = hc_bytealign_S (w2[0], w2[1], offset); + w3[1] = hc_bytealign_S (w1[3], w2[0], offset); + w3[0] = hc_bytealign_S (w1[2], w1[3], offset); + w2[3] = hc_bytealign_S (w1[1], w1[2], offset); + w2[2] = hc_bytealign_S (w1[0], w1[1], offset); + w2[1] = hc_bytealign_S (w0[3], w1[0], offset); + w2[0] = hc_bytealign_S (w0[2], w0[3], offset); + w1[3] = hc_bytealign_S (w0[1], w0[2], offset); + w1[2] = hc_bytealign_S (w0[0], w0[1], offset); + w1[1] = hc_bytealign_S ( 0, w0[0], offset); w1[0] = 0; w0[3] = 0; w0[2] = 0; @@ -36850,32 +36850,32 @@ DECLSPEC void switch_buffer_by_offset_8x4_be_S (u32 *w0, u32 *w1, u32 *w2, u32 * break; case 6: - w7[3] = amd_bytealign_S (w6[0], w6[1], offset); - w7[2] = amd_bytealign_S (w5[3], w6[0], offset); - w7[1] = amd_bytealign_S (w5[2], w5[3], offset); - w7[0] = amd_bytealign_S (w5[1], w5[2], offset); - w6[3] = amd_bytealign_S (w5[0], w5[1], offset); - w6[2] = amd_bytealign_S (w4[3], w5[0], offset); - w6[1] = amd_bytealign_S (w4[2], w4[3], offset); - w6[0] = amd_bytealign_S (w4[1], w4[2], offset); - w5[3] = amd_bytealign_S (w4[0], w4[1], offset); - w5[2] = amd_bytealign_S (w3[3], w4[0], offset); - w5[1] = amd_bytealign_S (w3[2], w3[3], offset); - w5[0] = amd_bytealign_S (w3[1], w3[2], offset); - w4[3] = amd_bytealign_S (w3[0], w3[1], offset); - w4[2] = amd_bytealign_S (w2[3], w3[0], offset); - w4[1] = amd_bytealign_S (w2[2], w2[3], offset); - w4[0] = amd_bytealign_S (w2[1], w2[2], offset); - w3[3] = amd_bytealign_S (w2[0], w2[1], offset); - w3[2] = amd_bytealign_S (w1[3], w2[0], offset); - w3[1] = amd_bytealign_S (w1[2], w1[3], offset); - w3[0] = amd_bytealign_S (w1[1], w1[2], offset); - w2[3] = amd_bytealign_S (w1[0], w1[1], offset); - w2[2] = amd_bytealign_S (w0[3], w1[0], offset); - w2[1] = amd_bytealign_S (w0[2], w0[3], offset); - w2[0] = amd_bytealign_S (w0[1], w0[2], offset); - w1[3] = amd_bytealign_S (w0[0], w0[1], offset); - w1[2] = amd_bytealign_S ( 0, w0[0], offset); + w7[3] = hc_bytealign_S (w6[0], w6[1], offset); + w7[2] = hc_bytealign_S (w5[3], w6[0], offset); + w7[1] = hc_bytealign_S (w5[2], w5[3], offset); + w7[0] = hc_bytealign_S (w5[1], w5[2], offset); + w6[3] = hc_bytealign_S (w5[0], w5[1], offset); + w6[2] = hc_bytealign_S (w4[3], w5[0], offset); + w6[1] = hc_bytealign_S (w4[2], w4[3], offset); + w6[0] = hc_bytealign_S (w4[1], w4[2], offset); + w5[3] = hc_bytealign_S (w4[0], w4[1], offset); + w5[2] = hc_bytealign_S (w3[3], w4[0], offset); + w5[1] = hc_bytealign_S (w3[2], w3[3], offset); + w5[0] = hc_bytealign_S (w3[1], w3[2], offset); + w4[3] = hc_bytealign_S (w3[0], w3[1], offset); + w4[2] = hc_bytealign_S (w2[3], w3[0], offset); + w4[1] = hc_bytealign_S (w2[2], w2[3], offset); + w4[0] = hc_bytealign_S (w2[1], w2[2], offset); + w3[3] = hc_bytealign_S (w2[0], w2[1], offset); + w3[2] = hc_bytealign_S (w1[3], w2[0], offset); + w3[1] = hc_bytealign_S (w1[2], w1[3], offset); + w3[0] = hc_bytealign_S (w1[1], w1[2], offset); + w2[3] = hc_bytealign_S (w1[0], w1[1], offset); + w2[2] = hc_bytealign_S (w0[3], w1[0], offset); + w2[1] = hc_bytealign_S (w0[2], w0[3], offset); + w2[0] = hc_bytealign_S (w0[1], w0[2], offset); + w1[3] = hc_bytealign_S (w0[0], w0[1], offset); + w1[2] = hc_bytealign_S ( 0, w0[0], offset); w1[1] = 0; w1[0] = 0; w0[3] = 0; @@ -36886,31 +36886,31 @@ DECLSPEC void switch_buffer_by_offset_8x4_be_S (u32 *w0, u32 *w1, u32 *w2, u32 * break; case 7: - w7[3] = amd_bytealign_S (w5[3], w6[0], offset); - w7[2] = amd_bytealign_S (w5[2], w5[3], offset); - w7[1] = amd_bytealign_S (w5[1], w5[2], offset); - w7[0] = amd_bytealign_S (w5[0], w5[1], offset); - w6[3] = amd_bytealign_S (w4[3], w5[0], offset); - w6[2] = amd_bytealign_S (w4[2], w4[3], offset); - w6[1] = amd_bytealign_S (w4[1], w4[2], offset); - w6[0] = amd_bytealign_S (w4[0], w4[1], offset); - w5[3] = amd_bytealign_S (w3[3], w4[0], offset); - w5[2] = amd_bytealign_S (w3[2], w3[3], offset); - w5[1] = amd_bytealign_S (w3[1], w3[2], offset); - w5[0] = amd_bytealign_S (w3[0], w3[1], offset); - w4[3] = amd_bytealign_S (w2[3], w3[0], offset); - w4[2] = amd_bytealign_S (w2[2], w2[3], offset); - w4[1] = amd_bytealign_S (w2[1], w2[2], offset); - w4[0] = amd_bytealign_S (w2[0], w2[1], offset); - w3[3] = amd_bytealign_S (w1[3], w2[0], offset); - w3[2] = amd_bytealign_S (w1[2], w1[3], offset); - w3[1] = amd_bytealign_S (w1[1], w1[2], offset); - w3[0] = amd_bytealign_S (w1[0], w1[1], offset); - w2[3] = amd_bytealign_S (w0[3], w1[0], offset); - w2[2] = amd_bytealign_S (w0[2], w0[3], offset); - w2[1] = amd_bytealign_S (w0[1], w0[2], offset); - w2[0] = amd_bytealign_S (w0[0], w0[1], offset); - w1[3] = amd_bytealign_S ( 0, w0[0], offset); + w7[3] = hc_bytealign_S (w5[3], w6[0], offset); + w7[2] = hc_bytealign_S (w5[2], w5[3], offset); + w7[1] = hc_bytealign_S (w5[1], w5[2], offset); + w7[0] = hc_bytealign_S (w5[0], w5[1], offset); + w6[3] = hc_bytealign_S (w4[3], w5[0], offset); + w6[2] = hc_bytealign_S (w4[2], w4[3], offset); + w6[1] = hc_bytealign_S (w4[1], w4[2], offset); + w6[0] = hc_bytealign_S (w4[0], w4[1], offset); + w5[3] = hc_bytealign_S (w3[3], w4[0], offset); + w5[2] = hc_bytealign_S (w3[2], w3[3], offset); + w5[1] = hc_bytealign_S (w3[1], w3[2], offset); + w5[0] = hc_bytealign_S (w3[0], w3[1], offset); + w4[3] = hc_bytealign_S (w2[3], w3[0], offset); + w4[2] = hc_bytealign_S (w2[2], w2[3], offset); + w4[1] = hc_bytealign_S (w2[1], w2[2], offset); + w4[0] = hc_bytealign_S (w2[0], w2[1], offset); + w3[3] = hc_bytealign_S (w1[3], w2[0], offset); + w3[2] = hc_bytealign_S (w1[2], w1[3], offset); + w3[1] = hc_bytealign_S (w1[1], w1[2], offset); + w3[0] = hc_bytealign_S (w1[0], w1[1], offset); + w2[3] = hc_bytealign_S (w0[3], w1[0], offset); + w2[2] = hc_bytealign_S (w0[2], w0[3], offset); + w2[1] = hc_bytealign_S (w0[1], w0[2], offset); + w2[0] = hc_bytealign_S (w0[0], w0[1], offset); + w1[3] = hc_bytealign_S ( 0, w0[0], offset); w1[2] = 0; w1[1] = 0; w1[0] = 0; @@ -36922,30 +36922,30 @@ DECLSPEC void switch_buffer_by_offset_8x4_be_S (u32 *w0, u32 *w1, u32 *w2, u32 * break; case 8: - w7[3] = amd_bytealign_S (w5[2], w5[3], offset); - w7[2] = amd_bytealign_S (w5[1], w5[2], offset); - w7[1] = amd_bytealign_S (w5[0], w5[1], offset); - w7[0] = amd_bytealign_S (w4[3], w5[0], offset); - w6[3] = amd_bytealign_S (w4[2], w4[3], offset); - w6[2] = amd_bytealign_S (w4[1], w4[2], offset); - w6[1] = amd_bytealign_S (w4[0], w4[1], offset); - w6[0] = amd_bytealign_S (w3[3], w4[0], offset); - w5[3] = amd_bytealign_S (w3[2], w3[3], offset); - w5[2] = amd_bytealign_S (w3[1], w3[2], offset); - w5[1] = amd_bytealign_S (w3[0], w3[1], offset); - w5[0] = amd_bytealign_S (w2[3], w3[0], offset); - w4[3] = amd_bytealign_S (w2[2], w2[3], offset); - w4[2] = amd_bytealign_S (w2[1], w2[2], offset); - w4[1] = amd_bytealign_S (w2[0], w2[1], offset); - w4[0] = amd_bytealign_S (w1[3], w2[0], offset); - w3[3] = amd_bytealign_S (w1[2], w1[3], offset); - w3[2] = amd_bytealign_S (w1[1], w1[2], offset); - w3[1] = amd_bytealign_S (w1[0], w1[1], offset); - w3[0] = amd_bytealign_S (w0[3], w1[0], offset); - w2[3] = amd_bytealign_S (w0[2], w0[3], offset); - w2[2] = amd_bytealign_S (w0[1], w0[2], offset); - w2[1] = amd_bytealign_S (w0[0], w0[1], offset); - w2[0] = amd_bytealign_S ( 0, w0[0], offset); + w7[3] = hc_bytealign_S (w5[2], w5[3], offset); + w7[2] = hc_bytealign_S (w5[1], w5[2], offset); + w7[1] = hc_bytealign_S (w5[0], w5[1], offset); + w7[0] = hc_bytealign_S (w4[3], w5[0], offset); + w6[3] = hc_bytealign_S (w4[2], w4[3], offset); + w6[2] = hc_bytealign_S (w4[1], w4[2], offset); + w6[1] = hc_bytealign_S (w4[0], w4[1], offset); + w6[0] = hc_bytealign_S (w3[3], w4[0], offset); + w5[3] = hc_bytealign_S (w3[2], w3[3], offset); + w5[2] = hc_bytealign_S (w3[1], w3[2], offset); + w5[1] = hc_bytealign_S (w3[0], w3[1], offset); + w5[0] = hc_bytealign_S (w2[3], w3[0], offset); + w4[3] = hc_bytealign_S (w2[2], w2[3], offset); + w4[2] = hc_bytealign_S (w2[1], w2[2], offset); + w4[1] = hc_bytealign_S (w2[0], w2[1], offset); + w4[0] = hc_bytealign_S (w1[3], w2[0], offset); + w3[3] = hc_bytealign_S (w1[2], w1[3], offset); + w3[2] = hc_bytealign_S (w1[1], w1[2], offset); + w3[1] = hc_bytealign_S (w1[0], w1[1], offset); + w3[0] = hc_bytealign_S (w0[3], w1[0], offset); + w2[3] = hc_bytealign_S (w0[2], w0[3], offset); + w2[2] = hc_bytealign_S (w0[1], w0[2], offset); + w2[1] = hc_bytealign_S (w0[0], w0[1], offset); + w2[0] = hc_bytealign_S ( 0, w0[0], offset); w1[3] = 0; w1[2] = 0; w1[1] = 0; @@ -36958,29 +36958,29 @@ DECLSPEC void switch_buffer_by_offset_8x4_be_S (u32 *w0, u32 *w1, u32 *w2, u32 * break; case 9: - w7[3] = amd_bytealign_S (w5[1], w5[2], offset); - w7[2] = amd_bytealign_S (w5[0], w5[1], offset); - w7[1] = amd_bytealign_S (w4[3], w5[0], offset); - w7[0] = amd_bytealign_S (w4[2], w4[3], offset); - w6[3] = amd_bytealign_S (w4[1], w4[2], offset); - w6[2] = amd_bytealign_S (w4[0], w4[1], offset); - w6[1] = amd_bytealign_S (w3[3], w4[0], offset); - w6[0] = amd_bytealign_S (w3[2], w3[3], offset); - w5[3] = amd_bytealign_S (w3[1], w3[2], offset); - w5[2] = amd_bytealign_S (w3[0], w3[1], offset); - w5[1] = amd_bytealign_S (w2[3], w3[0], offset); - w5[0] = amd_bytealign_S (w2[2], w2[3], offset); - w4[3] = amd_bytealign_S (w2[1], w2[2], offset); - w4[2] = amd_bytealign_S (w2[0], w2[1], offset); - w4[1] = amd_bytealign_S (w1[3], w2[0], offset); - w4[0] = amd_bytealign_S (w1[2], w1[3], offset); - w3[3] = amd_bytealign_S (w1[1], w1[2], offset); - w3[2] = amd_bytealign_S (w1[0], w1[1], offset); - w3[1] = amd_bytealign_S (w0[3], w1[0], offset); - w3[0] = amd_bytealign_S (w0[2], w0[3], offset); - w2[3] = amd_bytealign_S (w0[1], w0[2], offset); - w2[2] = amd_bytealign_S (w0[0], w0[1], offset); - w2[1] = amd_bytealign_S ( 0, w0[0], offset); + w7[3] = hc_bytealign_S (w5[1], w5[2], offset); + w7[2] = hc_bytealign_S (w5[0], w5[1], offset); + w7[1] = hc_bytealign_S (w4[3], w5[0], offset); + w7[0] = hc_bytealign_S (w4[2], w4[3], offset); + w6[3] = hc_bytealign_S (w4[1], w4[2], offset); + w6[2] = hc_bytealign_S (w4[0], w4[1], offset); + w6[1] = hc_bytealign_S (w3[3], w4[0], offset); + w6[0] = hc_bytealign_S (w3[2], w3[3], offset); + w5[3] = hc_bytealign_S (w3[1], w3[2], offset); + w5[2] = hc_bytealign_S (w3[0], w3[1], offset); + w5[1] = hc_bytealign_S (w2[3], w3[0], offset); + w5[0] = hc_bytealign_S (w2[2], w2[3], offset); + w4[3] = hc_bytealign_S (w2[1], w2[2], offset); + w4[2] = hc_bytealign_S (w2[0], w2[1], offset); + w4[1] = hc_bytealign_S (w1[3], w2[0], offset); + w4[0] = hc_bytealign_S (w1[2], w1[3], offset); + w3[3] = hc_bytealign_S (w1[1], w1[2], offset); + w3[2] = hc_bytealign_S (w1[0], w1[1], offset); + w3[1] = hc_bytealign_S (w0[3], w1[0], offset); + w3[0] = hc_bytealign_S (w0[2], w0[3], offset); + w2[3] = hc_bytealign_S (w0[1], w0[2], offset); + w2[2] = hc_bytealign_S (w0[0], w0[1], offset); + w2[1] = hc_bytealign_S ( 0, w0[0], offset); w2[0] = 0; w1[3] = 0; w1[2] = 0; @@ -36994,28 +36994,28 @@ DECLSPEC void switch_buffer_by_offset_8x4_be_S (u32 *w0, u32 *w1, u32 *w2, u32 * break; case 10: - w7[3] = amd_bytealign_S (w5[0], w5[1], offset); - w7[2] = amd_bytealign_S (w4[3], w5[0], offset); - w7[1] = amd_bytealign_S (w4[2], w4[3], offset); - w7[0] = amd_bytealign_S (w4[1], w4[2], offset); - w6[3] = amd_bytealign_S (w4[0], w4[1], offset); - w6[2] = amd_bytealign_S (w3[3], w4[0], offset); - w6[1] = amd_bytealign_S (w3[2], w3[3], offset); - w6[0] = amd_bytealign_S (w3[1], w3[2], offset); - w5[3] = amd_bytealign_S (w3[0], w3[1], offset); - w5[2] = amd_bytealign_S (w2[3], w3[0], offset); - w5[1] = amd_bytealign_S (w2[2], w2[3], offset); - w5[0] = amd_bytealign_S (w2[1], w2[2], offset); - w4[3] = amd_bytealign_S (w2[0], w2[1], offset); - w4[2] = amd_bytealign_S (w1[3], w2[0], offset); - w4[1] = amd_bytealign_S (w1[2], w1[3], offset); - w4[0] = amd_bytealign_S (w1[1], w1[2], offset); - w3[3] = amd_bytealign_S (w1[0], w1[1], offset); - w3[2] = amd_bytealign_S (w0[3], w1[0], offset); - w3[1] = amd_bytealign_S (w0[2], w0[3], offset); - w3[0] = amd_bytealign_S (w0[1], w0[2], offset); - w2[3] = amd_bytealign_S (w0[0], w0[1], offset); - w2[2] = amd_bytealign_S ( 0, w0[0], offset); + w7[3] = hc_bytealign_S (w5[0], w5[1], offset); + w7[2] = hc_bytealign_S (w4[3], w5[0], offset); + w7[1] = hc_bytealign_S (w4[2], w4[3], offset); + w7[0] = hc_bytealign_S (w4[1], w4[2], offset); + w6[3] = hc_bytealign_S (w4[0], w4[1], offset); + w6[2] = hc_bytealign_S (w3[3], w4[0], offset); + w6[1] = hc_bytealign_S (w3[2], w3[3], offset); + w6[0] = hc_bytealign_S (w3[1], w3[2], offset); + w5[3] = hc_bytealign_S (w3[0], w3[1], offset); + w5[2] = hc_bytealign_S (w2[3], w3[0], offset); + w5[1] = hc_bytealign_S (w2[2], w2[3], offset); + w5[0] = hc_bytealign_S (w2[1], w2[2], offset); + w4[3] = hc_bytealign_S (w2[0], w2[1], offset); + w4[2] = hc_bytealign_S (w1[3], w2[0], offset); + w4[1] = hc_bytealign_S (w1[2], w1[3], offset); + w4[0] = hc_bytealign_S (w1[1], w1[2], offset); + w3[3] = hc_bytealign_S (w1[0], w1[1], offset); + w3[2] = hc_bytealign_S (w0[3], w1[0], offset); + w3[1] = hc_bytealign_S (w0[2], w0[3], offset); + w3[0] = hc_bytealign_S (w0[1], w0[2], offset); + w2[3] = hc_bytealign_S (w0[0], w0[1], offset); + w2[2] = hc_bytealign_S ( 0, w0[0], offset); w2[1] = 0; w2[0] = 0; w1[3] = 0; @@ -37030,27 +37030,27 @@ DECLSPEC void switch_buffer_by_offset_8x4_be_S (u32 *w0, u32 *w1, u32 *w2, u32 * break; case 11: - w7[3] = amd_bytealign_S (w4[3], w5[0], offset); - w7[2] = amd_bytealign_S (w4[2], w4[3], offset); - w7[1] = amd_bytealign_S (w4[1], w4[2], offset); - w7[0] = amd_bytealign_S (w4[0], w4[1], offset); - w6[3] = amd_bytealign_S (w3[3], w4[0], offset); - w6[2] = amd_bytealign_S (w3[2], w3[3], offset); - w6[1] = amd_bytealign_S (w3[1], w3[2], offset); - w6[0] = amd_bytealign_S (w3[0], w3[1], offset); - w5[3] = amd_bytealign_S (w2[3], w3[0], offset); - w5[2] = amd_bytealign_S (w2[2], w2[3], offset); - w5[1] = amd_bytealign_S (w2[1], w2[2], offset); - w5[0] = amd_bytealign_S (w2[0], w2[1], offset); - w4[3] = amd_bytealign_S (w1[3], w2[0], offset); - w4[2] = amd_bytealign_S (w1[2], w1[3], offset); - w4[1] = amd_bytealign_S (w1[1], w1[2], offset); - w4[0] = amd_bytealign_S (w1[0], w1[1], offset); - w3[3] = amd_bytealign_S (w0[3], w1[0], offset); - w3[2] = amd_bytealign_S (w0[2], w0[3], offset); - w3[1] = amd_bytealign_S (w0[1], w0[2], offset); - w3[0] = amd_bytealign_S (w0[0], w0[1], offset); - w2[3] = amd_bytealign_S ( 0, w0[0], offset); + w7[3] = hc_bytealign_S (w4[3], w5[0], offset); + w7[2] = hc_bytealign_S (w4[2], w4[3], offset); + w7[1] = hc_bytealign_S (w4[1], w4[2], offset); + w7[0] = hc_bytealign_S (w4[0], w4[1], offset); + w6[3] = hc_bytealign_S (w3[3], w4[0], offset); + w6[2] = hc_bytealign_S (w3[2], w3[3], offset); + w6[1] = hc_bytealign_S (w3[1], w3[2], offset); + w6[0] = hc_bytealign_S (w3[0], w3[1], offset); + w5[3] = hc_bytealign_S (w2[3], w3[0], offset); + w5[2] = hc_bytealign_S (w2[2], w2[3], offset); + w5[1] = hc_bytealign_S (w2[1], w2[2], offset); + w5[0] = hc_bytealign_S (w2[0], w2[1], offset); + w4[3] = hc_bytealign_S (w1[3], w2[0], offset); + w4[2] = hc_bytealign_S (w1[2], w1[3], offset); + w4[1] = hc_bytealign_S (w1[1], w1[2], offset); + w4[0] = hc_bytealign_S (w1[0], w1[1], offset); + w3[3] = hc_bytealign_S (w0[3], w1[0], offset); + w3[2] = hc_bytealign_S (w0[2], w0[3], offset); + w3[1] = hc_bytealign_S (w0[1], w0[2], offset); + w3[0] = hc_bytealign_S (w0[0], w0[1], offset); + w2[3] = hc_bytealign_S ( 0, w0[0], offset); w2[2] = 0; w2[1] = 0; w2[0] = 0; @@ -37066,26 +37066,26 @@ DECLSPEC void switch_buffer_by_offset_8x4_be_S (u32 *w0, u32 *w1, u32 *w2, u32 * break; case 12: - w7[3] = amd_bytealign_S (w4[2], w4[3], offset); - w7[2] = amd_bytealign_S (w4[1], w4[2], offset); - w7[1] = amd_bytealign_S (w4[0], w4[1], offset); - w7[0] = amd_bytealign_S (w3[3], w4[0], offset); - w6[3] = amd_bytealign_S (w3[2], w3[3], offset); - w6[2] = amd_bytealign_S (w3[1], w3[2], offset); - w6[1] = amd_bytealign_S (w3[0], w3[1], offset); - w6[0] = amd_bytealign_S (w2[3], w3[0], offset); - w5[3] = amd_bytealign_S (w2[2], w2[3], offset); - w5[2] = amd_bytealign_S (w2[1], w2[2], offset); - w5[1] = amd_bytealign_S (w2[0], w2[1], offset); - w5[0] = amd_bytealign_S (w1[3], w2[0], offset); - w4[3] = amd_bytealign_S (w1[2], w1[3], offset); - w4[2] = amd_bytealign_S (w1[1], w1[2], offset); - w4[1] = amd_bytealign_S (w1[0], w1[1], offset); - w4[0] = amd_bytealign_S (w0[3], w1[0], offset); - w3[3] = amd_bytealign_S (w0[2], w0[3], offset); - w3[2] = amd_bytealign_S (w0[1], w0[2], offset); - w3[1] = amd_bytealign_S (w0[0], w0[1], offset); - w3[0] = amd_bytealign_S ( 0, w0[0], offset); + w7[3] = hc_bytealign_S (w4[2], w4[3], offset); + w7[2] = hc_bytealign_S (w4[1], w4[2], offset); + w7[1] = hc_bytealign_S (w4[0], w4[1], offset); + w7[0] = hc_bytealign_S (w3[3], w4[0], offset); + w6[3] = hc_bytealign_S (w3[2], w3[3], offset); + w6[2] = hc_bytealign_S (w3[1], w3[2], offset); + w6[1] = hc_bytealign_S (w3[0], w3[1], offset); + w6[0] = hc_bytealign_S (w2[3], w3[0], offset); + w5[3] = hc_bytealign_S (w2[2], w2[3], offset); + w5[2] = hc_bytealign_S (w2[1], w2[2], offset); + w5[1] = hc_bytealign_S (w2[0], w2[1], offset); + w5[0] = hc_bytealign_S (w1[3], w2[0], offset); + w4[3] = hc_bytealign_S (w1[2], w1[3], offset); + w4[2] = hc_bytealign_S (w1[1], w1[2], offset); + w4[1] = hc_bytealign_S (w1[0], w1[1], offset); + w4[0] = hc_bytealign_S (w0[3], w1[0], offset); + w3[3] = hc_bytealign_S (w0[2], w0[3], offset); + w3[2] = hc_bytealign_S (w0[1], w0[2], offset); + w3[1] = hc_bytealign_S (w0[0], w0[1], offset); + w3[0] = hc_bytealign_S ( 0, w0[0], offset); w2[3] = 0; w2[2] = 0; w2[1] = 0; @@ -37102,25 +37102,25 @@ DECLSPEC void switch_buffer_by_offset_8x4_be_S (u32 *w0, u32 *w1, u32 *w2, u32 * break; case 13: - w7[3] = amd_bytealign_S (w4[1], w4[2], offset); - w7[2] = amd_bytealign_S (w4[0], w4[1], offset); - w7[1] = amd_bytealign_S (w3[3], w4[0], offset); - w7[0] = amd_bytealign_S (w3[2], w3[3], offset); - w6[3] = amd_bytealign_S (w3[1], w3[2], offset); - w6[2] = amd_bytealign_S (w3[0], w3[1], offset); - w6[1] = amd_bytealign_S (w2[3], w3[0], offset); - w6[0] = amd_bytealign_S (w2[2], w2[3], offset); - w5[3] = amd_bytealign_S (w2[1], w2[2], offset); - w5[2] = amd_bytealign_S (w2[0], w2[1], offset); - w5[1] = amd_bytealign_S (w1[3], w2[0], offset); - w5[0] = amd_bytealign_S (w1[2], w1[3], offset); - w4[3] = amd_bytealign_S (w1[1], w1[2], offset); - w4[2] = amd_bytealign_S (w1[0], w1[1], offset); - w4[1] = amd_bytealign_S (w0[3], w1[0], offset); - w4[0] = amd_bytealign_S (w0[2], w0[3], offset); - w3[3] = amd_bytealign_S (w0[1], w0[2], offset); - w3[2] = amd_bytealign_S (w0[0], w0[1], offset); - w3[1] = amd_bytealign_S ( 0, w0[0], offset); + w7[3] = hc_bytealign_S (w4[1], w4[2], offset); + w7[2] = hc_bytealign_S (w4[0], w4[1], offset); + w7[1] = hc_bytealign_S (w3[3], w4[0], offset); + w7[0] = hc_bytealign_S (w3[2], w3[3], offset); + w6[3] = hc_bytealign_S (w3[1], w3[2], offset); + w6[2] = hc_bytealign_S (w3[0], w3[1], offset); + w6[1] = hc_bytealign_S (w2[3], w3[0], offset); + w6[0] = hc_bytealign_S (w2[2], w2[3], offset); + w5[3] = hc_bytealign_S (w2[1], w2[2], offset); + w5[2] = hc_bytealign_S (w2[0], w2[1], offset); + w5[1] = hc_bytealign_S (w1[3], w2[0], offset); + w5[0] = hc_bytealign_S (w1[2], w1[3], offset); + w4[3] = hc_bytealign_S (w1[1], w1[2], offset); + w4[2] = hc_bytealign_S (w1[0], w1[1], offset); + w4[1] = hc_bytealign_S (w0[3], w1[0], offset); + w4[0] = hc_bytealign_S (w0[2], w0[3], offset); + w3[3] = hc_bytealign_S (w0[1], w0[2], offset); + w3[2] = hc_bytealign_S (w0[0], w0[1], offset); + w3[1] = hc_bytealign_S ( 0, w0[0], offset); w3[0] = 0; w2[3] = 0; w2[2] = 0; @@ -37138,24 +37138,24 @@ DECLSPEC void switch_buffer_by_offset_8x4_be_S (u32 *w0, u32 *w1, u32 *w2, u32 * break; case 14: - w7[3] = amd_bytealign_S (w4[0], w4[1], offset); - w7[2] = amd_bytealign_S (w3[3], w4[0], offset); - w7[1] = amd_bytealign_S (w3[2], w3[3], offset); - w7[0] = amd_bytealign_S (w3[1], w3[2], offset); - w6[3] = amd_bytealign_S (w3[0], w3[1], offset); - w6[2] = amd_bytealign_S (w2[3], w3[0], offset); - w6[1] = amd_bytealign_S (w2[2], w2[3], offset); - w6[0] = amd_bytealign_S (w2[1], w2[2], offset); - w5[3] = amd_bytealign_S (w2[0], w2[1], offset); - w5[2] = amd_bytealign_S (w1[3], w2[0], offset); - w5[1] = amd_bytealign_S (w1[2], w1[3], offset); - w5[0] = amd_bytealign_S (w1[1], w1[2], offset); - w4[3] = amd_bytealign_S (w1[0], w1[1], offset); - w4[2] = amd_bytealign_S (w0[3], w1[0], offset); - w4[1] = amd_bytealign_S (w0[2], w0[3], offset); - w4[0] = amd_bytealign_S (w0[1], w0[2], offset); - w3[3] = amd_bytealign_S (w0[0], w0[1], offset); - w3[2] = amd_bytealign_S ( 0, w0[0], offset); + w7[3] = hc_bytealign_S (w4[0], w4[1], offset); + w7[2] = hc_bytealign_S (w3[3], w4[0], offset); + w7[1] = hc_bytealign_S (w3[2], w3[3], offset); + w7[0] = hc_bytealign_S (w3[1], w3[2], offset); + w6[3] = hc_bytealign_S (w3[0], w3[1], offset); + w6[2] = hc_bytealign_S (w2[3], w3[0], offset); + w6[1] = hc_bytealign_S (w2[2], w2[3], offset); + w6[0] = hc_bytealign_S (w2[1], w2[2], offset); + w5[3] = hc_bytealign_S (w2[0], w2[1], offset); + w5[2] = hc_bytealign_S (w1[3], w2[0], offset); + w5[1] = hc_bytealign_S (w1[2], w1[3], offset); + w5[0] = hc_bytealign_S (w1[1], w1[2], offset); + w4[3] = hc_bytealign_S (w1[0], w1[1], offset); + w4[2] = hc_bytealign_S (w0[3], w1[0], offset); + w4[1] = hc_bytealign_S (w0[2], w0[3], offset); + w4[0] = hc_bytealign_S (w0[1], w0[2], offset); + w3[3] = hc_bytealign_S (w0[0], w0[1], offset); + w3[2] = hc_bytealign_S ( 0, w0[0], offset); w3[1] = 0; w3[0] = 0; w2[3] = 0; @@ -37174,23 +37174,23 @@ DECLSPEC void switch_buffer_by_offset_8x4_be_S (u32 *w0, u32 *w1, u32 *w2, u32 * break; case 15: - w7[3] = amd_bytealign_S (w3[3], w4[0], offset); - w7[2] = amd_bytealign_S (w3[2], w3[3], offset); - w7[1] = amd_bytealign_S (w3[1], w3[2], offset); - w7[0] = amd_bytealign_S (w3[0], w3[1], offset); - w6[3] = amd_bytealign_S (w2[3], w3[0], offset); - w6[2] = amd_bytealign_S (w2[2], w2[3], offset); - w6[1] = amd_bytealign_S (w2[1], w2[2], offset); - w6[0] = amd_bytealign_S (w2[0], w2[1], offset); - w5[3] = amd_bytealign_S (w1[3], w2[0], offset); - w5[2] = amd_bytealign_S (w1[2], w1[3], offset); - w5[1] = amd_bytealign_S (w1[1], w1[2], offset); - w5[0] = amd_bytealign_S (w1[0], w1[1], offset); - w4[3] = amd_bytealign_S (w0[3], w1[0], offset); - w4[2] = amd_bytealign_S (w0[2], w0[3], offset); - w4[1] = amd_bytealign_S (w0[1], w0[2], offset); - w4[0] = amd_bytealign_S (w0[0], w0[1], offset); - w3[3] = amd_bytealign_S ( 0, w0[0], offset); + w7[3] = hc_bytealign_S (w3[3], w4[0], offset); + w7[2] = hc_bytealign_S (w3[2], w3[3], offset); + w7[1] = hc_bytealign_S (w3[1], w3[2], offset); + w7[0] = hc_bytealign_S (w3[0], w3[1], offset); + w6[3] = hc_bytealign_S (w2[3], w3[0], offset); + w6[2] = hc_bytealign_S (w2[2], w2[3], offset); + w6[1] = hc_bytealign_S (w2[1], w2[2], offset); + w6[0] = hc_bytealign_S (w2[0], w2[1], offset); + w5[3] = hc_bytealign_S (w1[3], w2[0], offset); + w5[2] = hc_bytealign_S (w1[2], w1[3], offset); + w5[1] = hc_bytealign_S (w1[1], w1[2], offset); + w5[0] = hc_bytealign_S (w1[0], w1[1], offset); + w4[3] = hc_bytealign_S (w0[3], w1[0], offset); + w4[2] = hc_bytealign_S (w0[2], w0[3], offset); + w4[1] = hc_bytealign_S (w0[1], w0[2], offset); + w4[0] = hc_bytealign_S (w0[0], w0[1], offset); + w3[3] = hc_bytealign_S ( 0, w0[0], offset); w3[2] = 0; w3[1] = 0; w3[0] = 0; @@ -37210,22 +37210,22 @@ DECLSPEC void switch_buffer_by_offset_8x4_be_S (u32 *w0, u32 *w1, u32 *w2, u32 * break; case 16: - w7[3] = amd_bytealign_S (w3[2], w3[3], offset); - w7[2] = amd_bytealign_S (w3[1], w3[2], offset); - w7[1] = amd_bytealign_S (w3[0], w3[1], offset); - w7[0] = amd_bytealign_S (w2[3], w3[0], offset); - w6[3] = amd_bytealign_S (w2[2], w2[3], offset); - w6[2] = amd_bytealign_S (w2[1], w2[2], offset); - w6[1] = amd_bytealign_S (w2[0], w2[1], offset); - w6[0] = amd_bytealign_S (w1[3], w2[0], offset); - w5[3] = amd_bytealign_S (w1[2], w1[3], offset); - w5[2] = amd_bytealign_S (w1[1], w1[2], offset); - w5[1] = amd_bytealign_S (w1[0], w1[1], offset); - w5[0] = amd_bytealign_S (w0[3], w1[0], offset); - w4[3] = amd_bytealign_S (w0[2], w0[3], offset); - w4[2] = amd_bytealign_S (w0[1], w0[2], offset); - w4[1] = amd_bytealign_S (w0[0], w0[1], offset); - w4[0] = amd_bytealign_S ( 0, w0[0], offset); + w7[3] = hc_bytealign_S (w3[2], w3[3], offset); + w7[2] = hc_bytealign_S (w3[1], w3[2], offset); + w7[1] = hc_bytealign_S (w3[0], w3[1], offset); + w7[0] = hc_bytealign_S (w2[3], w3[0], offset); + w6[3] = hc_bytealign_S (w2[2], w2[3], offset); + w6[2] = hc_bytealign_S (w2[1], w2[2], offset); + w6[1] = hc_bytealign_S (w2[0], w2[1], offset); + w6[0] = hc_bytealign_S (w1[3], w2[0], offset); + w5[3] = hc_bytealign_S (w1[2], w1[3], offset); + w5[2] = hc_bytealign_S (w1[1], w1[2], offset); + w5[1] = hc_bytealign_S (w1[0], w1[1], offset); + w5[0] = hc_bytealign_S (w0[3], w1[0], offset); + w4[3] = hc_bytealign_S (w0[2], w0[3], offset); + w4[2] = hc_bytealign_S (w0[1], w0[2], offset); + w4[1] = hc_bytealign_S (w0[0], w0[1], offset); + w4[0] = hc_bytealign_S ( 0, w0[0], offset); w3[3] = 0; w3[2] = 0; w3[1] = 0; @@ -37246,21 +37246,21 @@ DECLSPEC void switch_buffer_by_offset_8x4_be_S (u32 *w0, u32 *w1, u32 *w2, u32 * break; case 17: - w7[3] = amd_bytealign_S (w3[1], w3[2], offset); - w7[2] = amd_bytealign_S (w3[0], w3[1], offset); - w7[1] = amd_bytealign_S (w2[3], w3[0], offset); - w7[0] = amd_bytealign_S (w2[2], w2[3], offset); - w6[3] = amd_bytealign_S (w2[1], w2[2], offset); - w6[2] = amd_bytealign_S (w2[0], w2[1], offset); - w6[1] = amd_bytealign_S (w1[3], w2[0], offset); - w6[0] = amd_bytealign_S (w1[2], w1[3], offset); - w5[3] = amd_bytealign_S (w1[1], w1[2], offset); - w5[2] = amd_bytealign_S (w1[0], w1[1], offset); - w5[1] = amd_bytealign_S (w0[3], w1[0], offset); - w5[0] = amd_bytealign_S (w0[2], w0[3], offset); - w4[3] = amd_bytealign_S (w0[1], w0[2], offset); - w4[2] = amd_bytealign_S (w0[0], w0[1], offset); - w4[1] = amd_bytealign_S ( 0, w0[0], offset); + w7[3] = hc_bytealign_S (w3[1], w3[2], offset); + w7[2] = hc_bytealign_S (w3[0], w3[1], offset); + w7[1] = hc_bytealign_S (w2[3], w3[0], offset); + w7[0] = hc_bytealign_S (w2[2], w2[3], offset); + w6[3] = hc_bytealign_S (w2[1], w2[2], offset); + w6[2] = hc_bytealign_S (w2[0], w2[1], offset); + w6[1] = hc_bytealign_S (w1[3], w2[0], offset); + w6[0] = hc_bytealign_S (w1[2], w1[3], offset); + w5[3] = hc_bytealign_S (w1[1], w1[2], offset); + w5[2] = hc_bytealign_S (w1[0], w1[1], offset); + w5[1] = hc_bytealign_S (w0[3], w1[0], offset); + w5[0] = hc_bytealign_S (w0[2], w0[3], offset); + w4[3] = hc_bytealign_S (w0[1], w0[2], offset); + w4[2] = hc_bytealign_S (w0[0], w0[1], offset); + w4[1] = hc_bytealign_S ( 0, w0[0], offset); w4[0] = 0; w3[3] = 0; w3[2] = 0; @@ -37282,20 +37282,20 @@ DECLSPEC void switch_buffer_by_offset_8x4_be_S (u32 *w0, u32 *w1, u32 *w2, u32 * break; case 18: - w7[3] = amd_bytealign_S (w3[0], w3[1], offset); - w7[2] = amd_bytealign_S (w2[3], w3[0], offset); - w7[1] = amd_bytealign_S (w2[2], w2[3], offset); - w7[0] = amd_bytealign_S (w2[1], w2[2], offset); - w6[3] = amd_bytealign_S (w2[0], w2[1], offset); - w6[2] = amd_bytealign_S (w1[3], w2[0], offset); - w6[1] = amd_bytealign_S (w1[2], w1[3], offset); - w6[0] = amd_bytealign_S (w1[1], w1[2], offset); - w5[3] = amd_bytealign_S (w1[0], w1[1], offset); - w5[2] = amd_bytealign_S (w0[3], w1[0], offset); - w5[1] = amd_bytealign_S (w0[2], w0[3], offset); - w5[0] = amd_bytealign_S (w0[1], w0[2], offset); - w4[3] = amd_bytealign_S (w0[0], w0[1], offset); - w4[2] = amd_bytealign_S ( 0, w0[0], offset); + w7[3] = hc_bytealign_S (w3[0], w3[1], offset); + w7[2] = hc_bytealign_S (w2[3], w3[0], offset); + w7[1] = hc_bytealign_S (w2[2], w2[3], offset); + w7[0] = hc_bytealign_S (w2[1], w2[2], offset); + w6[3] = hc_bytealign_S (w2[0], w2[1], offset); + w6[2] = hc_bytealign_S (w1[3], w2[0], offset); + w6[1] = hc_bytealign_S (w1[2], w1[3], offset); + w6[0] = hc_bytealign_S (w1[1], w1[2], offset); + w5[3] = hc_bytealign_S (w1[0], w1[1], offset); + w5[2] = hc_bytealign_S (w0[3], w1[0], offset); + w5[1] = hc_bytealign_S (w0[2], w0[3], offset); + w5[0] = hc_bytealign_S (w0[1], w0[2], offset); + w4[3] = hc_bytealign_S (w0[0], w0[1], offset); + w4[2] = hc_bytealign_S ( 0, w0[0], offset); w4[1] = 0; w4[0] = 0; w3[3] = 0; @@ -37318,19 +37318,19 @@ DECLSPEC void switch_buffer_by_offset_8x4_be_S (u32 *w0, u32 *w1, u32 *w2, u32 * break; case 19: - w7[3] = amd_bytealign_S (w2[3], w3[0], offset); - w7[2] = amd_bytealign_S (w2[2], w2[3], offset); - w7[1] = amd_bytealign_S (w2[1], w2[2], offset); - w7[0] = amd_bytealign_S (w2[0], w2[1], offset); - w6[3] = amd_bytealign_S (w1[3], w2[0], offset); - w6[2] = amd_bytealign_S (w1[2], w1[3], offset); - w6[1] = amd_bytealign_S (w1[1], w1[2], offset); - w6[0] = amd_bytealign_S (w1[0], w1[1], offset); - w5[3] = amd_bytealign_S (w0[3], w1[0], offset); - w5[2] = amd_bytealign_S (w0[2], w0[3], offset); - w5[1] = amd_bytealign_S (w0[1], w0[2], offset); - w5[0] = amd_bytealign_S (w0[0], w0[1], offset); - w4[3] = amd_bytealign_S ( 0, w0[0], offset); + w7[3] = hc_bytealign_S (w2[3], w3[0], offset); + w7[2] = hc_bytealign_S (w2[2], w2[3], offset); + w7[1] = hc_bytealign_S (w2[1], w2[2], offset); + w7[0] = hc_bytealign_S (w2[0], w2[1], offset); + w6[3] = hc_bytealign_S (w1[3], w2[0], offset); + w6[2] = hc_bytealign_S (w1[2], w1[3], offset); + w6[1] = hc_bytealign_S (w1[1], w1[2], offset); + w6[0] = hc_bytealign_S (w1[0], w1[1], offset); + w5[3] = hc_bytealign_S (w0[3], w1[0], offset); + w5[2] = hc_bytealign_S (w0[2], w0[3], offset); + w5[1] = hc_bytealign_S (w0[1], w0[2], offset); + w5[0] = hc_bytealign_S (w0[0], w0[1], offset); + w4[3] = hc_bytealign_S ( 0, w0[0], offset); w4[2] = 0; w4[1] = 0; w4[0] = 0; @@ -37354,18 +37354,18 @@ DECLSPEC void switch_buffer_by_offset_8x4_be_S (u32 *w0, u32 *w1, u32 *w2, u32 * break; case 20: - w7[3] = amd_bytealign_S (w2[2], w2[3], offset); - w7[2] = amd_bytealign_S (w2[1], w2[2], offset); - w7[1] = amd_bytealign_S (w2[0], w2[1], offset); - w7[0] = amd_bytealign_S (w1[3], w2[0], offset); - w6[3] = amd_bytealign_S (w1[2], w1[3], offset); - w6[2] = amd_bytealign_S (w1[1], w1[2], offset); - w6[1] = amd_bytealign_S (w1[0], w1[1], offset); - w6[0] = amd_bytealign_S (w0[3], w1[0], offset); - w5[3] = amd_bytealign_S (w0[2], w0[3], offset); - w5[2] = amd_bytealign_S (w0[1], w0[2], offset); - w5[1] = amd_bytealign_S (w0[0], w0[1], offset); - w5[0] = amd_bytealign_S ( 0, w0[0], offset); + w7[3] = hc_bytealign_S (w2[2], w2[3], offset); + w7[2] = hc_bytealign_S (w2[1], w2[2], offset); + w7[1] = hc_bytealign_S (w2[0], w2[1], offset); + w7[0] = hc_bytealign_S (w1[3], w2[0], offset); + w6[3] = hc_bytealign_S (w1[2], w1[3], offset); + w6[2] = hc_bytealign_S (w1[1], w1[2], offset); + w6[1] = hc_bytealign_S (w1[0], w1[1], offset); + w6[0] = hc_bytealign_S (w0[3], w1[0], offset); + w5[3] = hc_bytealign_S (w0[2], w0[3], offset); + w5[2] = hc_bytealign_S (w0[1], w0[2], offset); + w5[1] = hc_bytealign_S (w0[0], w0[1], offset); + w5[0] = hc_bytealign_S ( 0, w0[0], offset); w4[3] = 0; w4[2] = 0; w4[1] = 0; @@ -37390,17 +37390,17 @@ DECLSPEC void switch_buffer_by_offset_8x4_be_S (u32 *w0, u32 *w1, u32 *w2, u32 * break; case 21: - w7[3] = amd_bytealign_S (w2[1], w2[2], offset); - w7[2] = amd_bytealign_S (w2[0], w2[1], offset); - w7[1] = amd_bytealign_S (w1[3], w2[0], offset); - w7[0] = amd_bytealign_S (w1[2], w1[3], offset); - w6[3] = amd_bytealign_S (w1[1], w1[2], offset); - w6[2] = amd_bytealign_S (w1[0], w1[1], offset); - w6[1] = amd_bytealign_S (w0[3], w1[0], offset); - w6[0] = amd_bytealign_S (w0[2], w0[3], offset); - w5[3] = amd_bytealign_S (w0[1], w0[2], offset); - w5[2] = amd_bytealign_S (w0[0], w0[1], offset); - w5[1] = amd_bytealign_S ( 0, w0[0], offset); + w7[3] = hc_bytealign_S (w2[1], w2[2], offset); + w7[2] = hc_bytealign_S (w2[0], w2[1], offset); + w7[1] = hc_bytealign_S (w1[3], w2[0], offset); + w7[0] = hc_bytealign_S (w1[2], w1[3], offset); + w6[3] = hc_bytealign_S (w1[1], w1[2], offset); + w6[2] = hc_bytealign_S (w1[0], w1[1], offset); + w6[1] = hc_bytealign_S (w0[3], w1[0], offset); + w6[0] = hc_bytealign_S (w0[2], w0[3], offset); + w5[3] = hc_bytealign_S (w0[1], w0[2], offset); + w5[2] = hc_bytealign_S (w0[0], w0[1], offset); + w5[1] = hc_bytealign_S ( 0, w0[0], offset); w5[0] = 0; w4[3] = 0; w4[2] = 0; @@ -37426,16 +37426,16 @@ DECLSPEC void switch_buffer_by_offset_8x4_be_S (u32 *w0, u32 *w1, u32 *w2, u32 * break; case 22: - w7[3] = amd_bytealign_S (w2[0], w2[1], offset); - w7[2] = amd_bytealign_S (w1[3], w2[0], offset); - w7[1] = amd_bytealign_S (w1[2], w1[3], offset); - w7[0] = amd_bytealign_S (w1[1], w1[2], offset); - w6[3] = amd_bytealign_S (w1[0], w1[1], offset); - w6[2] = amd_bytealign_S (w0[3], w1[0], offset); - w6[1] = amd_bytealign_S (w0[2], w0[3], offset); - w6[0] = amd_bytealign_S (w0[1], w0[2], offset); - w5[3] = amd_bytealign_S (w0[0], w0[1], offset); - w5[2] = amd_bytealign_S ( 0, w0[0], offset); + w7[3] = hc_bytealign_S (w2[0], w2[1], offset); + w7[2] = hc_bytealign_S (w1[3], w2[0], offset); + w7[1] = hc_bytealign_S (w1[2], w1[3], offset); + w7[0] = hc_bytealign_S (w1[1], w1[2], offset); + w6[3] = hc_bytealign_S (w1[0], w1[1], offset); + w6[2] = hc_bytealign_S (w0[3], w1[0], offset); + w6[1] = hc_bytealign_S (w0[2], w0[3], offset); + w6[0] = hc_bytealign_S (w0[1], w0[2], offset); + w5[3] = hc_bytealign_S (w0[0], w0[1], offset); + w5[2] = hc_bytealign_S ( 0, w0[0], offset); w5[1] = 0; w5[0] = 0; w4[3] = 0; @@ -37462,15 +37462,15 @@ DECLSPEC void switch_buffer_by_offset_8x4_be_S (u32 *w0, u32 *w1, u32 *w2, u32 * break; case 23: - w7[3] = amd_bytealign_S (w1[3], w2[0], offset); - w7[2] = amd_bytealign_S (w1[2], w1[3], offset); - w7[1] = amd_bytealign_S (w1[1], w1[2], offset); - w7[0] = amd_bytealign_S (w1[0], w1[1], offset); - w6[3] = amd_bytealign_S (w0[3], w1[0], offset); - w6[2] = amd_bytealign_S (w0[2], w0[3], offset); - w6[1] = amd_bytealign_S (w0[1], w0[2], offset); - w6[0] = amd_bytealign_S (w0[0], w0[1], offset); - w5[3] = amd_bytealign_S ( 0, w0[0], offset); + w7[3] = hc_bytealign_S (w1[3], w2[0], offset); + w7[2] = hc_bytealign_S (w1[2], w1[3], offset); + w7[1] = hc_bytealign_S (w1[1], w1[2], offset); + w7[0] = hc_bytealign_S (w1[0], w1[1], offset); + w6[3] = hc_bytealign_S (w0[3], w1[0], offset); + w6[2] = hc_bytealign_S (w0[2], w0[3], offset); + w6[1] = hc_bytealign_S (w0[1], w0[2], offset); + w6[0] = hc_bytealign_S (w0[0], w0[1], offset); + w5[3] = hc_bytealign_S ( 0, w0[0], offset); w5[2] = 0; w5[1] = 0; w5[0] = 0; @@ -37498,14 +37498,14 @@ DECLSPEC void switch_buffer_by_offset_8x4_be_S (u32 *w0, u32 *w1, u32 *w2, u32 * break; case 24: - w7[3] = amd_bytealign_S (w1[2], w1[3], offset); - w7[2] = amd_bytealign_S (w1[1], w1[2], offset); - w7[1] = amd_bytealign_S (w1[0], w1[1], offset); - w7[0] = amd_bytealign_S (w0[3], w1[0], offset); - w6[3] = amd_bytealign_S (w0[2], w0[3], offset); - w6[2] = amd_bytealign_S (w0[1], w0[2], offset); - w6[1] = amd_bytealign_S (w0[0], w0[1], offset); - w6[0] = amd_bytealign_S ( 0, w0[0], offset); + w7[3] = hc_bytealign_S (w1[2], w1[3], offset); + w7[2] = hc_bytealign_S (w1[1], w1[2], offset); + w7[1] = hc_bytealign_S (w1[0], w1[1], offset); + w7[0] = hc_bytealign_S (w0[3], w1[0], offset); + w6[3] = hc_bytealign_S (w0[2], w0[3], offset); + w6[2] = hc_bytealign_S (w0[1], w0[2], offset); + w6[1] = hc_bytealign_S (w0[0], w0[1], offset); + w6[0] = hc_bytealign_S ( 0, w0[0], offset); w5[3] = 0; w5[2] = 0; w5[1] = 0; @@ -37534,13 +37534,13 @@ DECLSPEC void switch_buffer_by_offset_8x4_be_S (u32 *w0, u32 *w1, u32 *w2, u32 * break; case 25: - w7[3] = amd_bytealign_S (w1[1], w1[2], offset); - w7[2] = amd_bytealign_S (w1[0], w1[1], offset); - w7[1] = amd_bytealign_S (w0[3], w1[0], offset); - w7[0] = amd_bytealign_S (w0[2], w0[3], offset); - w6[3] = amd_bytealign_S (w0[1], w0[2], offset); - w6[2] = amd_bytealign_S (w0[0], w0[1], offset); - w6[1] = amd_bytealign_S ( 0, w0[0], offset); + w7[3] = hc_bytealign_S (w1[1], w1[2], offset); + w7[2] = hc_bytealign_S (w1[0], w1[1], offset); + w7[1] = hc_bytealign_S (w0[3], w1[0], offset); + w7[0] = hc_bytealign_S (w0[2], w0[3], offset); + w6[3] = hc_bytealign_S (w0[1], w0[2], offset); + w6[2] = hc_bytealign_S (w0[0], w0[1], offset); + w6[1] = hc_bytealign_S ( 0, w0[0], offset); w6[0] = 0; w5[3] = 0; w5[2] = 0; @@ -37570,12 +37570,12 @@ DECLSPEC void switch_buffer_by_offset_8x4_be_S (u32 *w0, u32 *w1, u32 *w2, u32 * break; case 26: - w7[3] = amd_bytealign_S (w1[0], w1[1], offset); - w7[2] = amd_bytealign_S (w0[3], w1[0], offset); - w7[1] = amd_bytealign_S (w0[2], w0[3], offset); - w7[0] = amd_bytealign_S (w0[1], w0[2], offset); - w6[3] = amd_bytealign_S (w0[0], w0[1], offset); - w6[2] = amd_bytealign_S ( 0, w0[0], offset); + w7[3] = hc_bytealign_S (w1[0], w1[1], offset); + w7[2] = hc_bytealign_S (w0[3], w1[0], offset); + w7[1] = hc_bytealign_S (w0[2], w0[3], offset); + w7[0] = hc_bytealign_S (w0[1], w0[2], offset); + w6[3] = hc_bytealign_S (w0[0], w0[1], offset); + w6[2] = hc_bytealign_S ( 0, w0[0], offset); w6[1] = 0; w6[0] = 0; w5[3] = 0; @@ -37606,11 +37606,11 @@ DECLSPEC void switch_buffer_by_offset_8x4_be_S (u32 *w0, u32 *w1, u32 *w2, u32 * break; case 27: - w7[3] = amd_bytealign_S (w0[3], w1[0], offset); - w7[2] = amd_bytealign_S (w0[2], w0[3], offset); - w7[1] = amd_bytealign_S (w0[1], w0[2], offset); - w7[0] = amd_bytealign_S (w0[0], w0[1], offset); - w6[3] = amd_bytealign_S ( 0, w0[0], offset); + w7[3] = hc_bytealign_S (w0[3], w1[0], offset); + w7[2] = hc_bytealign_S (w0[2], w0[3], offset); + w7[1] = hc_bytealign_S (w0[1], w0[2], offset); + w7[0] = hc_bytealign_S (w0[0], w0[1], offset); + w6[3] = hc_bytealign_S ( 0, w0[0], offset); w6[2] = 0; w6[1] = 0; w6[0] = 0; @@ -37642,10 +37642,10 @@ DECLSPEC void switch_buffer_by_offset_8x4_be_S (u32 *w0, u32 *w1, u32 *w2, u32 * break; case 28: - w7[3] = amd_bytealign_S (w0[2], w0[3], offset); - w7[2] = amd_bytealign_S (w0[1], w0[2], offset); - w7[1] = amd_bytealign_S (w0[0], w0[1], offset); - w7[0] = amd_bytealign_S ( 0, w0[0], offset); + w7[3] = hc_bytealign_S (w0[2], w0[3], offset); + w7[2] = hc_bytealign_S (w0[1], w0[2], offset); + w7[1] = hc_bytealign_S (w0[0], w0[1], offset); + w7[0] = hc_bytealign_S ( 0, w0[0], offset); w6[3] = 0; w6[2] = 0; w6[1] = 0; @@ -37678,9 +37678,9 @@ DECLSPEC void switch_buffer_by_offset_8x4_be_S (u32 *w0, u32 *w1, u32 *w2, u32 * break; case 29: - w7[3] = amd_bytealign_S (w0[1], w0[2], offset); - w7[2] = amd_bytealign_S (w0[0], w0[1], offset); - w7[1] = amd_bytealign_S ( 0, w0[0], offset); + w7[3] = hc_bytealign_S (w0[1], w0[2], offset); + w7[2] = hc_bytealign_S (w0[0], w0[1], offset); + w7[1] = hc_bytealign_S ( 0, w0[0], offset); w7[0] = 0; w6[3] = 0; w6[2] = 0; @@ -37714,8 +37714,8 @@ DECLSPEC void switch_buffer_by_offset_8x4_be_S (u32 *w0, u32 *w1, u32 *w2, u32 * break; case 30: - w7[3] = amd_bytealign_S (w0[0], w0[1], offset); - w7[2] = amd_bytealign_S ( 0, w0[0], offset); + w7[3] = hc_bytealign_S (w0[0], w0[1], offset); + w7[2] = hc_bytealign_S ( 0, w0[0], offset); w7[1] = 0; w7[0] = 0; w6[3] = 0; @@ -37750,7 +37750,7 @@ DECLSPEC void switch_buffer_by_offset_8x4_be_S (u32 *w0, u32 *w1, u32 *w2, u32 * break; case 31: - w7[3] = amd_bytealign_S ( 0, w0[0], offset); + w7[3] = hc_bytealign_S ( 0, w0[0], offset); w7[2] = 0; w7[1] = 0; w7[0] = 0; @@ -37800,143 +37800,143 @@ DECLSPEC void switch_buffer_by_offset_8x4_be_S (u32 *w0, u32 *w1, u32 *w2, u32 * switch (offset_switch) { case 0: - w7[3] = __byte_perm_S (w7[3], w7[2], selector); - w7[2] = __byte_perm_S (w7[2], w7[1], selector); - w7[1] = __byte_perm_S (w7[1], w7[0], selector); - w7[0] = __byte_perm_S (w7[0], w6[3], selector); - w6[3] = __byte_perm_S (w6[3], w6[2], selector); - w6[2] = __byte_perm_S (w6[2], w6[1], selector); - w6[1] = __byte_perm_S (w6[1], w6[0], selector); - w6[0] = __byte_perm_S (w6[0], w5[3], selector); - w5[3] = __byte_perm_S (w5[3], w5[2], selector); - w5[2] = __byte_perm_S (w5[2], w5[1], selector); - w5[1] = __byte_perm_S (w5[1], w5[0], selector); - w5[0] = __byte_perm_S (w5[0], w4[3], selector); - w4[3] = __byte_perm_S (w4[3], w4[2], selector); - w4[2] = __byte_perm_S (w4[2], w4[1], selector); - w4[1] = __byte_perm_S (w4[1], w4[0], selector); - w4[0] = __byte_perm_S (w4[0], w3[3], selector); - w3[3] = __byte_perm_S (w3[3], w3[2], selector); - w3[2] = __byte_perm_S (w3[2], w3[1], selector); - w3[1] = __byte_perm_S (w3[1], w3[0], selector); - w3[0] = __byte_perm_S (w3[0], w2[3], selector); - w2[3] = __byte_perm_S (w2[3], w2[2], selector); - w2[2] = __byte_perm_S (w2[2], w2[1], selector); - w2[1] = __byte_perm_S (w2[1], w2[0], selector); - w2[0] = __byte_perm_S (w2[0], w1[3], selector); - w1[3] = __byte_perm_S (w1[3], w1[2], selector); - w1[2] = __byte_perm_S (w1[2], w1[1], selector); - w1[1] = __byte_perm_S (w1[1], w1[0], selector); - w1[0] = __byte_perm_S (w1[0], w0[3], selector); - w0[3] = __byte_perm_S (w0[3], w0[2], selector); - w0[2] = __byte_perm_S (w0[2], w0[1], selector); - w0[1] = __byte_perm_S (w0[1], w0[0], selector); - w0[0] = __byte_perm_S (w0[0], 0, selector); + w7[3] = hc_byte_perm_S (w7[3], w7[2], selector); + w7[2] = hc_byte_perm_S (w7[2], w7[1], selector); + w7[1] = hc_byte_perm_S (w7[1], w7[0], selector); + w7[0] = hc_byte_perm_S (w7[0], w6[3], selector); + w6[3] = hc_byte_perm_S (w6[3], w6[2], selector); + w6[2] = hc_byte_perm_S (w6[2], w6[1], selector); + w6[1] = hc_byte_perm_S (w6[1], w6[0], selector); + w6[0] = hc_byte_perm_S (w6[0], w5[3], selector); + w5[3] = hc_byte_perm_S (w5[3], w5[2], selector); + w5[2] = hc_byte_perm_S (w5[2], w5[1], selector); + w5[1] = hc_byte_perm_S (w5[1], w5[0], selector); + w5[0] = hc_byte_perm_S (w5[0], w4[3], selector); + w4[3] = hc_byte_perm_S (w4[3], w4[2], selector); + w4[2] = hc_byte_perm_S (w4[2], w4[1], selector); + w4[1] = hc_byte_perm_S (w4[1], w4[0], selector); + w4[0] = hc_byte_perm_S (w4[0], w3[3], selector); + w3[3] = hc_byte_perm_S (w3[3], w3[2], selector); + w3[2] = hc_byte_perm_S (w3[2], w3[1], selector); + w3[1] = hc_byte_perm_S (w3[1], w3[0], selector); + w3[0] = hc_byte_perm_S (w3[0], w2[3], selector); + w2[3] = hc_byte_perm_S (w2[3], w2[2], selector); + w2[2] = hc_byte_perm_S (w2[2], w2[1], selector); + w2[1] = hc_byte_perm_S (w2[1], w2[0], selector); + w2[0] = hc_byte_perm_S (w2[0], w1[3], selector); + w1[3] = hc_byte_perm_S (w1[3], w1[2], selector); + w1[2] = hc_byte_perm_S (w1[2], w1[1], selector); + w1[1] = hc_byte_perm_S (w1[1], w1[0], selector); + w1[0] = hc_byte_perm_S (w1[0], w0[3], selector); + w0[3] = hc_byte_perm_S (w0[3], w0[2], selector); + w0[2] = hc_byte_perm_S (w0[2], w0[1], selector); + w0[1] = hc_byte_perm_S (w0[1], w0[0], selector); + w0[0] = hc_byte_perm_S (w0[0], 0, selector); break; case 1: - w7[3] = __byte_perm_S (w7[2], w7[1], selector); - w7[2] = __byte_perm_S (w7[1], w7[0], selector); - w7[1] = __byte_perm_S (w7[0], w6[3], selector); - w7[0] = __byte_perm_S (w6[3], w6[2], selector); - w6[3] = __byte_perm_S (w6[2], w6[1], selector); - w6[2] = __byte_perm_S (w6[1], w6[0], selector); - w6[1] = __byte_perm_S (w6[0], w5[3], selector); - w6[0] = __byte_perm_S (w5[3], w5[2], selector); - w5[3] = __byte_perm_S (w5[2], w5[1], selector); - w5[2] = __byte_perm_S (w5[1], w5[0], selector); - w5[1] = __byte_perm_S (w5[0], w4[3], selector); - w5[0] = __byte_perm_S (w4[3], w4[2], selector); - w4[3] = __byte_perm_S (w4[2], w4[1], selector); - w4[2] = __byte_perm_S (w4[1], w4[0], selector); - w4[1] = __byte_perm_S (w4[0], w3[3], selector); - w4[0] = __byte_perm_S (w3[3], w3[2], selector); - w3[3] = __byte_perm_S (w3[2], w3[1], selector); - w3[2] = __byte_perm_S (w3[1], w3[0], selector); - w3[1] = __byte_perm_S (w3[0], w2[3], selector); - w3[0] = __byte_perm_S (w2[3], w2[2], selector); - w2[3] = __byte_perm_S (w2[2], w2[1], selector); - w2[2] = __byte_perm_S (w2[1], w2[0], selector); - w2[1] = __byte_perm_S (w2[0], w1[3], selector); - w2[0] = __byte_perm_S (w1[3], w1[2], selector); - w1[3] = __byte_perm_S (w1[2], w1[1], selector); - w1[2] = __byte_perm_S (w1[1], w1[0], selector); - w1[1] = __byte_perm_S (w1[0], w0[3], selector); - w1[0] = __byte_perm_S (w0[3], w0[2], selector); - w0[3] = __byte_perm_S (w0[2], w0[1], selector); - w0[2] = __byte_perm_S (w0[1], w0[0], selector); - w0[1] = __byte_perm_S (w0[0], 0, selector); + w7[3] = hc_byte_perm_S (w7[2], w7[1], selector); + w7[2] = hc_byte_perm_S (w7[1], w7[0], selector); + w7[1] = hc_byte_perm_S (w7[0], w6[3], selector); + w7[0] = hc_byte_perm_S (w6[3], w6[2], selector); + w6[3] = hc_byte_perm_S (w6[2], w6[1], selector); + w6[2] = hc_byte_perm_S (w6[1], w6[0], selector); + w6[1] = hc_byte_perm_S (w6[0], w5[3], selector); + w6[0] = hc_byte_perm_S (w5[3], w5[2], selector); + w5[3] = hc_byte_perm_S (w5[2], w5[1], selector); + w5[2] = hc_byte_perm_S (w5[1], w5[0], selector); + w5[1] = hc_byte_perm_S (w5[0], w4[3], selector); + w5[0] = hc_byte_perm_S (w4[3], w4[2], selector); + w4[3] = hc_byte_perm_S (w4[2], w4[1], selector); + w4[2] = hc_byte_perm_S (w4[1], w4[0], selector); + w4[1] = hc_byte_perm_S (w4[0], w3[3], selector); + w4[0] = hc_byte_perm_S (w3[3], w3[2], selector); + w3[3] = hc_byte_perm_S (w3[2], w3[1], selector); + w3[2] = hc_byte_perm_S (w3[1], w3[0], selector); + w3[1] = hc_byte_perm_S (w3[0], w2[3], selector); + w3[0] = hc_byte_perm_S (w2[3], w2[2], selector); + w2[3] = hc_byte_perm_S (w2[2], w2[1], selector); + w2[2] = hc_byte_perm_S (w2[1], w2[0], selector); + w2[1] = hc_byte_perm_S (w2[0], w1[3], selector); + w2[0] = hc_byte_perm_S (w1[3], w1[2], selector); + w1[3] = hc_byte_perm_S (w1[2], w1[1], selector); + w1[2] = hc_byte_perm_S (w1[1], w1[0], selector); + w1[1] = hc_byte_perm_S (w1[0], w0[3], selector); + w1[0] = hc_byte_perm_S (w0[3], w0[2], selector); + w0[3] = hc_byte_perm_S (w0[2], w0[1], selector); + w0[2] = hc_byte_perm_S (w0[1], w0[0], selector); + w0[1] = hc_byte_perm_S (w0[0], 0, selector); w0[0] = 0; break; case 2: - w7[3] = __byte_perm_S (w7[1], w7[0], selector); - w7[2] = __byte_perm_S (w7[0], w6[3], selector); - w7[1] = __byte_perm_S (w6[3], w6[2], selector); - w7[0] = __byte_perm_S (w6[2], w6[1], selector); - w6[3] = __byte_perm_S (w6[1], w6[0], selector); - w6[2] = __byte_perm_S (w6[0], w5[3], selector); - w6[1] = __byte_perm_S (w5[3], w5[2], selector); - w6[0] = __byte_perm_S (w5[2], w5[1], selector); - w5[3] = __byte_perm_S (w5[1], w5[0], selector); - w5[2] = __byte_perm_S (w5[0], w4[3], selector); - w5[1] = __byte_perm_S (w4[3], w4[2], selector); - w5[0] = __byte_perm_S (w4[2], w4[1], selector); - w4[3] = __byte_perm_S (w4[1], w4[0], selector); - w4[2] = __byte_perm_S (w4[0], w3[3], selector); - w4[1] = __byte_perm_S (w3[3], w3[2], selector); - w4[0] = __byte_perm_S (w3[2], w3[1], selector); - w3[3] = __byte_perm_S (w3[1], w3[0], selector); - w3[2] = __byte_perm_S (w3[0], w2[3], selector); - w3[1] = __byte_perm_S (w2[3], w2[2], selector); - w3[0] = __byte_perm_S (w2[2], w2[1], selector); - w2[3] = __byte_perm_S (w2[1], w2[0], selector); - w2[2] = __byte_perm_S (w2[0], w1[3], selector); - w2[1] = __byte_perm_S (w1[3], w1[2], selector); - w2[0] = __byte_perm_S (w1[2], w1[1], selector); - w1[3] = __byte_perm_S (w1[1], w1[0], selector); - w1[2] = __byte_perm_S (w1[0], w0[3], selector); - w1[1] = __byte_perm_S (w0[3], w0[2], selector); - w1[0] = __byte_perm_S (w0[2], w0[1], selector); - w0[3] = __byte_perm_S (w0[1], w0[0], selector); - w0[2] = __byte_perm_S (w0[0], 0, selector); + w7[3] = hc_byte_perm_S (w7[1], w7[0], selector); + w7[2] = hc_byte_perm_S (w7[0], w6[3], selector); + w7[1] = hc_byte_perm_S (w6[3], w6[2], selector); + w7[0] = hc_byte_perm_S (w6[2], w6[1], selector); + w6[3] = hc_byte_perm_S (w6[1], w6[0], selector); + w6[2] = hc_byte_perm_S (w6[0], w5[3], selector); + w6[1] = hc_byte_perm_S (w5[3], w5[2], selector); + w6[0] = hc_byte_perm_S (w5[2], w5[1], selector); + w5[3] = hc_byte_perm_S (w5[1], w5[0], selector); + w5[2] = hc_byte_perm_S (w5[0], w4[3], selector); + w5[1] = hc_byte_perm_S (w4[3], w4[2], selector); + w5[0] = hc_byte_perm_S (w4[2], w4[1], selector); + w4[3] = hc_byte_perm_S (w4[1], w4[0], selector); + w4[2] = hc_byte_perm_S (w4[0], w3[3], selector); + w4[1] = hc_byte_perm_S (w3[3], w3[2], selector); + w4[0] = hc_byte_perm_S (w3[2], w3[1], selector); + w3[3] = hc_byte_perm_S (w3[1], w3[0], selector); + w3[2] = hc_byte_perm_S (w3[0], w2[3], selector); + w3[1] = hc_byte_perm_S (w2[3], w2[2], selector); + w3[0] = hc_byte_perm_S (w2[2], w2[1], selector); + w2[3] = hc_byte_perm_S (w2[1], w2[0], selector); + w2[2] = hc_byte_perm_S (w2[0], w1[3], selector); + w2[1] = hc_byte_perm_S (w1[3], w1[2], selector); + w2[0] = hc_byte_perm_S (w1[2], w1[1], selector); + w1[3] = hc_byte_perm_S (w1[1], w1[0], selector); + w1[2] = hc_byte_perm_S (w1[0], w0[3], selector); + w1[1] = hc_byte_perm_S (w0[3], w0[2], selector); + w1[0] = hc_byte_perm_S (w0[2], w0[1], selector); + w0[3] = hc_byte_perm_S (w0[1], w0[0], selector); + w0[2] = hc_byte_perm_S (w0[0], 0, selector); w0[1] = 0; w0[0] = 0; break; case 3: - w7[3] = __byte_perm_S (w7[0], w6[3], selector); - w7[2] = __byte_perm_S (w6[3], w6[2], selector); - w7[1] = __byte_perm_S (w6[2], w6[1], selector); - w7[0] = __byte_perm_S (w6[1], w6[0], selector); - w6[3] = __byte_perm_S (w6[0], w5[3], selector); - w6[2] = __byte_perm_S (w5[3], w5[2], selector); - w6[1] = __byte_perm_S (w5[2], w5[1], selector); - w6[0] = __byte_perm_S (w5[1], w5[0], selector); - w5[3] = __byte_perm_S (w5[0], w4[3], selector); - w5[2] = __byte_perm_S (w4[3], w4[2], selector); - w5[1] = __byte_perm_S (w4[2], w4[1], selector); - w5[0] = __byte_perm_S (w4[1], w4[0], selector); - w4[3] = __byte_perm_S (w4[0], w3[3], selector); - w4[2] = __byte_perm_S (w3[3], w3[2], selector); - w4[1] = __byte_perm_S (w3[2], w3[1], selector); - w4[0] = __byte_perm_S (w3[1], w3[0], selector); - w3[3] = __byte_perm_S (w3[0], w2[3], selector); - w3[2] = __byte_perm_S (w2[3], w2[2], selector); - w3[1] = __byte_perm_S (w2[2], w2[1], selector); - w3[0] = __byte_perm_S (w2[1], w2[0], selector); - w2[3] = __byte_perm_S (w2[0], w1[3], selector); - w2[2] = __byte_perm_S (w1[3], w1[2], selector); - w2[1] = __byte_perm_S (w1[2], w1[1], selector); - w2[0] = __byte_perm_S (w1[1], w1[0], selector); - w1[3] = __byte_perm_S (w1[0], w0[3], selector); - w1[2] = __byte_perm_S (w0[3], w0[2], selector); - w1[1] = __byte_perm_S (w0[2], w0[1], selector); - w1[0] = __byte_perm_S (w0[1], w0[0], selector); - w0[3] = __byte_perm_S (w0[0], 0, selector); + w7[3] = hc_byte_perm_S (w7[0], w6[3], selector); + w7[2] = hc_byte_perm_S (w6[3], w6[2], selector); + w7[1] = hc_byte_perm_S (w6[2], w6[1], selector); + w7[0] = hc_byte_perm_S (w6[1], w6[0], selector); + w6[3] = hc_byte_perm_S (w6[0], w5[3], selector); + w6[2] = hc_byte_perm_S (w5[3], w5[2], selector); + w6[1] = hc_byte_perm_S (w5[2], w5[1], selector); + w6[0] = hc_byte_perm_S (w5[1], w5[0], selector); + w5[3] = hc_byte_perm_S (w5[0], w4[3], selector); + w5[2] = hc_byte_perm_S (w4[3], w4[2], selector); + w5[1] = hc_byte_perm_S (w4[2], w4[1], selector); + w5[0] = hc_byte_perm_S (w4[1], w4[0], selector); + w4[3] = hc_byte_perm_S (w4[0], w3[3], selector); + w4[2] = hc_byte_perm_S (w3[3], w3[2], selector); + w4[1] = hc_byte_perm_S (w3[2], w3[1], selector); + w4[0] = hc_byte_perm_S (w3[1], w3[0], selector); + w3[3] = hc_byte_perm_S (w3[0], w2[3], selector); + w3[2] = hc_byte_perm_S (w2[3], w2[2], selector); + w3[1] = hc_byte_perm_S (w2[2], w2[1], selector); + w3[0] = hc_byte_perm_S (w2[1], w2[0], selector); + w2[3] = hc_byte_perm_S (w2[0], w1[3], selector); + w2[2] = hc_byte_perm_S (w1[3], w1[2], selector); + w2[1] = hc_byte_perm_S (w1[2], w1[1], selector); + w2[0] = hc_byte_perm_S (w1[1], w1[0], selector); + w1[3] = hc_byte_perm_S (w1[0], w0[3], selector); + w1[2] = hc_byte_perm_S (w0[3], w0[2], selector); + w1[1] = hc_byte_perm_S (w0[2], w0[1], selector); + w1[0] = hc_byte_perm_S (w0[1], w0[0], selector); + w0[3] = hc_byte_perm_S (w0[0], 0, selector); w0[2] = 0; w0[1] = 0; w0[0] = 0; @@ -37944,34 +37944,34 @@ DECLSPEC void switch_buffer_by_offset_8x4_be_S (u32 *w0, u32 *w1, u32 *w2, u32 * break; case 4: - w7[3] = __byte_perm_S (w6[3], w6[2], selector); - w7[2] = __byte_perm_S (w6[2], w6[1], selector); - w7[1] = __byte_perm_S (w6[1], w6[0], selector); - w7[0] = __byte_perm_S (w6[0], w5[3], selector); - w6[3] = __byte_perm_S (w5[3], w5[2], selector); - w6[2] = __byte_perm_S (w5[2], w5[1], selector); - w6[1] = __byte_perm_S (w5[1], w5[0], selector); - w6[0] = __byte_perm_S (w5[0], w4[3], selector); - w5[3] = __byte_perm_S (w4[3], w4[2], selector); - w5[2] = __byte_perm_S (w4[2], w4[1], selector); - w5[1] = __byte_perm_S (w4[1], w4[0], selector); - w5[0] = __byte_perm_S (w4[0], w3[3], selector); - w4[3] = __byte_perm_S (w3[3], w3[2], selector); - w4[2] = __byte_perm_S (w3[2], w3[1], selector); - w4[1] = __byte_perm_S (w3[1], w3[0], selector); - w4[0] = __byte_perm_S (w3[0], w2[3], selector); - w3[3] = __byte_perm_S (w2[3], w2[2], selector); - w3[2] = __byte_perm_S (w2[2], w2[1], selector); - w3[1] = __byte_perm_S (w2[1], w2[0], selector); - w3[0] = __byte_perm_S (w2[0], w1[3], selector); - w2[3] = __byte_perm_S (w1[3], w1[2], selector); - w2[2] = __byte_perm_S (w1[2], w1[1], selector); - w2[1] = __byte_perm_S (w1[1], w1[0], selector); - w2[0] = __byte_perm_S (w1[0], w0[3], selector); - w1[3] = __byte_perm_S (w0[3], w0[2], selector); - w1[2] = __byte_perm_S (w0[2], w0[1], selector); - w1[1] = __byte_perm_S (w0[1], w0[0], selector); - w1[0] = __byte_perm_S (w0[0], 0, selector); + w7[3] = hc_byte_perm_S (w6[3], w6[2], selector); + w7[2] = hc_byte_perm_S (w6[2], w6[1], selector); + w7[1] = hc_byte_perm_S (w6[1], w6[0], selector); + w7[0] = hc_byte_perm_S (w6[0], w5[3], selector); + w6[3] = hc_byte_perm_S (w5[3], w5[2], selector); + w6[2] = hc_byte_perm_S (w5[2], w5[1], selector); + w6[1] = hc_byte_perm_S (w5[1], w5[0], selector); + w6[0] = hc_byte_perm_S (w5[0], w4[3], selector); + w5[3] = hc_byte_perm_S (w4[3], w4[2], selector); + w5[2] = hc_byte_perm_S (w4[2], w4[1], selector); + w5[1] = hc_byte_perm_S (w4[1], w4[0], selector); + w5[0] = hc_byte_perm_S (w4[0], w3[3], selector); + w4[3] = hc_byte_perm_S (w3[3], w3[2], selector); + w4[2] = hc_byte_perm_S (w3[2], w3[1], selector); + w4[1] = hc_byte_perm_S (w3[1], w3[0], selector); + w4[0] = hc_byte_perm_S (w3[0], w2[3], selector); + w3[3] = hc_byte_perm_S (w2[3], w2[2], selector); + w3[2] = hc_byte_perm_S (w2[2], w2[1], selector); + w3[1] = hc_byte_perm_S (w2[1], w2[0], selector); + w3[0] = hc_byte_perm_S (w2[0], w1[3], selector); + w2[3] = hc_byte_perm_S (w1[3], w1[2], selector); + w2[2] = hc_byte_perm_S (w1[2], w1[1], selector); + w2[1] = hc_byte_perm_S (w1[1], w1[0], selector); + w2[0] = hc_byte_perm_S (w1[0], w0[3], selector); + w1[3] = hc_byte_perm_S (w0[3], w0[2], selector); + w1[2] = hc_byte_perm_S (w0[2], w0[1], selector); + w1[1] = hc_byte_perm_S (w0[1], w0[0], selector); + w1[0] = hc_byte_perm_S (w0[0], 0, selector); w0[3] = 0; w0[2] = 0; w0[1] = 0; @@ -37980,33 +37980,33 @@ DECLSPEC void switch_buffer_by_offset_8x4_be_S (u32 *w0, u32 *w1, u32 *w2, u32 * break; case 5: - w7[3] = __byte_perm_S (w6[2], w6[1], selector); - w7[2] = __byte_perm_S (w6[1], w6[0], selector); - w7[1] = __byte_perm_S (w6[0], w5[3], selector); - w7[0] = __byte_perm_S (w5[3], w5[2], selector); - w6[3] = __byte_perm_S (w5[2], w5[1], selector); - w6[2] = __byte_perm_S (w5[1], w5[0], selector); - w6[1] = __byte_perm_S (w5[0], w4[3], selector); - w6[0] = __byte_perm_S (w4[3], w4[2], selector); - w5[3] = __byte_perm_S (w4[2], w4[1], selector); - w5[2] = __byte_perm_S (w4[1], w4[0], selector); - w5[1] = __byte_perm_S (w4[0], w3[3], selector); - w5[0] = __byte_perm_S (w3[3], w3[2], selector); - w4[3] = __byte_perm_S (w3[2], w3[1], selector); - w4[2] = __byte_perm_S (w3[1], w3[0], selector); - w4[1] = __byte_perm_S (w3[0], w2[3], selector); - w4[0] = __byte_perm_S (w2[3], w2[2], selector); - w3[3] = __byte_perm_S (w2[2], w2[1], selector); - w3[2] = __byte_perm_S (w2[1], w2[0], selector); - w3[1] = __byte_perm_S (w2[0], w1[3], selector); - w3[0] = __byte_perm_S (w1[3], w1[2], selector); - w2[3] = __byte_perm_S (w1[2], w1[1], selector); - w2[2] = __byte_perm_S (w1[1], w1[0], selector); - w2[1] = __byte_perm_S (w1[0], w0[3], selector); - w2[0] = __byte_perm_S (w0[3], w0[2], selector); - w1[3] = __byte_perm_S (w0[2], w0[1], selector); - w1[2] = __byte_perm_S (w0[1], w0[0], selector); - w1[1] = __byte_perm_S (w0[0], 0, selector); + w7[3] = hc_byte_perm_S (w6[2], w6[1], selector); + w7[2] = hc_byte_perm_S (w6[1], w6[0], selector); + w7[1] = hc_byte_perm_S (w6[0], w5[3], selector); + w7[0] = hc_byte_perm_S (w5[3], w5[2], selector); + w6[3] = hc_byte_perm_S (w5[2], w5[1], selector); + w6[2] = hc_byte_perm_S (w5[1], w5[0], selector); + w6[1] = hc_byte_perm_S (w5[0], w4[3], selector); + w6[0] = hc_byte_perm_S (w4[3], w4[2], selector); + w5[3] = hc_byte_perm_S (w4[2], w4[1], selector); + w5[2] = hc_byte_perm_S (w4[1], w4[0], selector); + w5[1] = hc_byte_perm_S (w4[0], w3[3], selector); + w5[0] = hc_byte_perm_S (w3[3], w3[2], selector); + w4[3] = hc_byte_perm_S (w3[2], w3[1], selector); + w4[2] = hc_byte_perm_S (w3[1], w3[0], selector); + w4[1] = hc_byte_perm_S (w3[0], w2[3], selector); + w4[0] = hc_byte_perm_S (w2[3], w2[2], selector); + w3[3] = hc_byte_perm_S (w2[2], w2[1], selector); + w3[2] = hc_byte_perm_S (w2[1], w2[0], selector); + w3[1] = hc_byte_perm_S (w2[0], w1[3], selector); + w3[0] = hc_byte_perm_S (w1[3], w1[2], selector); + w2[3] = hc_byte_perm_S (w1[2], w1[1], selector); + w2[2] = hc_byte_perm_S (w1[1], w1[0], selector); + w2[1] = hc_byte_perm_S (w1[0], w0[3], selector); + w2[0] = hc_byte_perm_S (w0[3], w0[2], selector); + w1[3] = hc_byte_perm_S (w0[2], w0[1], selector); + w1[2] = hc_byte_perm_S (w0[1], w0[0], selector); + w1[1] = hc_byte_perm_S (w0[0], 0, selector); w1[0] = 0; w0[3] = 0; w0[2] = 0; @@ -38016,32 +38016,32 @@ DECLSPEC void switch_buffer_by_offset_8x4_be_S (u32 *w0, u32 *w1, u32 *w2, u32 * break; case 6: - w7[3] = __byte_perm_S (w6[1], w6[0], selector); - w7[2] = __byte_perm_S (w6[0], w5[3], selector); - w7[1] = __byte_perm_S (w5[3], w5[2], selector); - w7[0] = __byte_perm_S (w5[2], w5[1], selector); - w6[3] = __byte_perm_S (w5[1], w5[0], selector); - w6[2] = __byte_perm_S (w5[0], w4[3], selector); - w6[1] = __byte_perm_S (w4[3], w4[2], selector); - w6[0] = __byte_perm_S (w4[2], w4[1], selector); - w5[3] = __byte_perm_S (w4[1], w4[0], selector); - w5[2] = __byte_perm_S (w4[0], w3[3], selector); - w5[1] = __byte_perm_S (w3[3], w3[2], selector); - w5[0] = __byte_perm_S (w3[2], w3[1], selector); - w4[3] = __byte_perm_S (w3[1], w3[0], selector); - w4[2] = __byte_perm_S (w3[0], w2[3], selector); - w4[1] = __byte_perm_S (w2[3], w2[2], selector); - w4[0] = __byte_perm_S (w2[2], w2[1], selector); - w3[3] = __byte_perm_S (w2[1], w2[0], selector); - w3[2] = __byte_perm_S (w2[0], w1[3], selector); - w3[1] = __byte_perm_S (w1[3], w1[2], selector); - w3[0] = __byte_perm_S (w1[2], w1[1], selector); - w2[3] = __byte_perm_S (w1[1], w1[0], selector); - w2[2] = __byte_perm_S (w1[0], w0[3], selector); - w2[1] = __byte_perm_S (w0[3], w0[2], selector); - w2[0] = __byte_perm_S (w0[2], w0[1], selector); - w1[3] = __byte_perm_S (w0[1], w0[0], selector); - w1[2] = __byte_perm_S (w0[0], 0, selector); + w7[3] = hc_byte_perm_S (w6[1], w6[0], selector); + w7[2] = hc_byte_perm_S (w6[0], w5[3], selector); + w7[1] = hc_byte_perm_S (w5[3], w5[2], selector); + w7[0] = hc_byte_perm_S (w5[2], w5[1], selector); + w6[3] = hc_byte_perm_S (w5[1], w5[0], selector); + w6[2] = hc_byte_perm_S (w5[0], w4[3], selector); + w6[1] = hc_byte_perm_S (w4[3], w4[2], selector); + w6[0] = hc_byte_perm_S (w4[2], w4[1], selector); + w5[3] = hc_byte_perm_S (w4[1], w4[0], selector); + w5[2] = hc_byte_perm_S (w4[0], w3[3], selector); + w5[1] = hc_byte_perm_S (w3[3], w3[2], selector); + w5[0] = hc_byte_perm_S (w3[2], w3[1], selector); + w4[3] = hc_byte_perm_S (w3[1], w3[0], selector); + w4[2] = hc_byte_perm_S (w3[0], w2[3], selector); + w4[1] = hc_byte_perm_S (w2[3], w2[2], selector); + w4[0] = hc_byte_perm_S (w2[2], w2[1], selector); + w3[3] = hc_byte_perm_S (w2[1], w2[0], selector); + w3[2] = hc_byte_perm_S (w2[0], w1[3], selector); + w3[1] = hc_byte_perm_S (w1[3], w1[2], selector); + w3[0] = hc_byte_perm_S (w1[2], w1[1], selector); + w2[3] = hc_byte_perm_S (w1[1], w1[0], selector); + w2[2] = hc_byte_perm_S (w1[0], w0[3], selector); + w2[1] = hc_byte_perm_S (w0[3], w0[2], selector); + w2[0] = hc_byte_perm_S (w0[2], w0[1], selector); + w1[3] = hc_byte_perm_S (w0[1], w0[0], selector); + w1[2] = hc_byte_perm_S (w0[0], 0, selector); w1[1] = 0; w1[0] = 0; w0[3] = 0; @@ -38052,31 +38052,31 @@ DECLSPEC void switch_buffer_by_offset_8x4_be_S (u32 *w0, u32 *w1, u32 *w2, u32 * break; case 7: - w7[3] = __byte_perm_S (w6[0], w5[3], selector); - w7[2] = __byte_perm_S (w5[3], w5[2], selector); - w7[1] = __byte_perm_S (w5[2], w5[1], selector); - w7[0] = __byte_perm_S (w5[1], w5[0], selector); - w6[3] = __byte_perm_S (w5[0], w4[3], selector); - w6[2] = __byte_perm_S (w4[3], w4[2], selector); - w6[1] = __byte_perm_S (w4[2], w4[1], selector); - w6[0] = __byte_perm_S (w4[1], w4[0], selector); - w5[3] = __byte_perm_S (w4[0], w3[3], selector); - w5[2] = __byte_perm_S (w3[3], w3[2], selector); - w5[1] = __byte_perm_S (w3[2], w3[1], selector); - w5[0] = __byte_perm_S (w3[1], w3[0], selector); - w4[3] = __byte_perm_S (w3[0], w2[3], selector); - w4[2] = __byte_perm_S (w2[3], w2[2], selector); - w4[1] = __byte_perm_S (w2[2], w2[1], selector); - w4[0] = __byte_perm_S (w2[1], w2[0], selector); - w3[3] = __byte_perm_S (w2[0], w1[3], selector); - w3[2] = __byte_perm_S (w1[3], w1[2], selector); - w3[1] = __byte_perm_S (w1[2], w1[1], selector); - w3[0] = __byte_perm_S (w1[1], w1[0], selector); - w2[3] = __byte_perm_S (w1[0], w0[3], selector); - w2[2] = __byte_perm_S (w0[3], w0[2], selector); - w2[1] = __byte_perm_S (w0[2], w0[1], selector); - w2[0] = __byte_perm_S (w0[1], w0[0], selector); - w1[3] = __byte_perm_S (w0[0], 0, selector); + w7[3] = hc_byte_perm_S (w6[0], w5[3], selector); + w7[2] = hc_byte_perm_S (w5[3], w5[2], selector); + w7[1] = hc_byte_perm_S (w5[2], w5[1], selector); + w7[0] = hc_byte_perm_S (w5[1], w5[0], selector); + w6[3] = hc_byte_perm_S (w5[0], w4[3], selector); + w6[2] = hc_byte_perm_S (w4[3], w4[2], selector); + w6[1] = hc_byte_perm_S (w4[2], w4[1], selector); + w6[0] = hc_byte_perm_S (w4[1], w4[0], selector); + w5[3] = hc_byte_perm_S (w4[0], w3[3], selector); + w5[2] = hc_byte_perm_S (w3[3], w3[2], selector); + w5[1] = hc_byte_perm_S (w3[2], w3[1], selector); + w5[0] = hc_byte_perm_S (w3[1], w3[0], selector); + w4[3] = hc_byte_perm_S (w3[0], w2[3], selector); + w4[2] = hc_byte_perm_S (w2[3], w2[2], selector); + w4[1] = hc_byte_perm_S (w2[2], w2[1], selector); + w4[0] = hc_byte_perm_S (w2[1], w2[0], selector); + w3[3] = hc_byte_perm_S (w2[0], w1[3], selector); + w3[2] = hc_byte_perm_S (w1[3], w1[2], selector); + w3[1] = hc_byte_perm_S (w1[2], w1[1], selector); + w3[0] = hc_byte_perm_S (w1[1], w1[0], selector); + w2[3] = hc_byte_perm_S (w1[0], w0[3], selector); + w2[2] = hc_byte_perm_S (w0[3], w0[2], selector); + w2[1] = hc_byte_perm_S (w0[2], w0[1], selector); + w2[0] = hc_byte_perm_S (w0[1], w0[0], selector); + w1[3] = hc_byte_perm_S (w0[0], 0, selector); w1[2] = 0; w1[1] = 0; w1[0] = 0; @@ -38088,30 +38088,30 @@ DECLSPEC void switch_buffer_by_offset_8x4_be_S (u32 *w0, u32 *w1, u32 *w2, u32 * break; case 8: - w7[3] = __byte_perm_S (w5[3], w5[2], selector); - w7[2] = __byte_perm_S (w5[2], w5[1], selector); - w7[1] = __byte_perm_S (w5[1], w5[0], selector); - w7[0] = __byte_perm_S (w5[0], w4[3], selector); - w6[3] = __byte_perm_S (w4[3], w4[2], selector); - w6[2] = __byte_perm_S (w4[2], w4[1], selector); - w6[1] = __byte_perm_S (w4[1], w4[0], selector); - w6[0] = __byte_perm_S (w4[0], w3[3], selector); - w5[3] = __byte_perm_S (w3[3], w3[2], selector); - w5[2] = __byte_perm_S (w3[2], w3[1], selector); - w5[1] = __byte_perm_S (w3[1], w3[0], selector); - w5[0] = __byte_perm_S (w3[0], w2[3], selector); - w4[3] = __byte_perm_S (w2[3], w2[2], selector); - w4[2] = __byte_perm_S (w2[2], w2[1], selector); - w4[1] = __byte_perm_S (w2[1], w2[0], selector); - w4[0] = __byte_perm_S (w2[0], w1[3], selector); - w3[3] = __byte_perm_S (w1[3], w1[2], selector); - w3[2] = __byte_perm_S (w1[2], w1[1], selector); - w3[1] = __byte_perm_S (w1[1], w1[0], selector); - w3[0] = __byte_perm_S (w1[0], w0[3], selector); - w2[3] = __byte_perm_S (w0[3], w0[2], selector); - w2[2] = __byte_perm_S (w0[2], w0[1], selector); - w2[1] = __byte_perm_S (w0[1], w0[0], selector); - w2[0] = __byte_perm_S (w0[0], 0, selector); + w7[3] = hc_byte_perm_S (w5[3], w5[2], selector); + w7[2] = hc_byte_perm_S (w5[2], w5[1], selector); + w7[1] = hc_byte_perm_S (w5[1], w5[0], selector); + w7[0] = hc_byte_perm_S (w5[0], w4[3], selector); + w6[3] = hc_byte_perm_S (w4[3], w4[2], selector); + w6[2] = hc_byte_perm_S (w4[2], w4[1], selector); + w6[1] = hc_byte_perm_S (w4[1], w4[0], selector); + w6[0] = hc_byte_perm_S (w4[0], w3[3], selector); + w5[3] = hc_byte_perm_S (w3[3], w3[2], selector); + w5[2] = hc_byte_perm_S (w3[2], w3[1], selector); + w5[1] = hc_byte_perm_S (w3[1], w3[0], selector); + w5[0] = hc_byte_perm_S (w3[0], w2[3], selector); + w4[3] = hc_byte_perm_S (w2[3], w2[2], selector); + w4[2] = hc_byte_perm_S (w2[2], w2[1], selector); + w4[1] = hc_byte_perm_S (w2[1], w2[0], selector); + w4[0] = hc_byte_perm_S (w2[0], w1[3], selector); + w3[3] = hc_byte_perm_S (w1[3], w1[2], selector); + w3[2] = hc_byte_perm_S (w1[2], w1[1], selector); + w3[1] = hc_byte_perm_S (w1[1], w1[0], selector); + w3[0] = hc_byte_perm_S (w1[0], w0[3], selector); + w2[3] = hc_byte_perm_S (w0[3], w0[2], selector); + w2[2] = hc_byte_perm_S (w0[2], w0[1], selector); + w2[1] = hc_byte_perm_S (w0[1], w0[0], selector); + w2[0] = hc_byte_perm_S (w0[0], 0, selector); w1[3] = 0; w1[2] = 0; w1[1] = 0; @@ -38124,29 +38124,29 @@ DECLSPEC void switch_buffer_by_offset_8x4_be_S (u32 *w0, u32 *w1, u32 *w2, u32 * break; case 9: - w7[3] = __byte_perm_S (w5[2], w5[1], selector); - w7[2] = __byte_perm_S (w5[1], w5[0], selector); - w7[1] = __byte_perm_S (w5[0], w4[3], selector); - w7[0] = __byte_perm_S (w4[3], w4[2], selector); - w6[3] = __byte_perm_S (w4[2], w4[1], selector); - w6[2] = __byte_perm_S (w4[1], w4[0], selector); - w6[1] = __byte_perm_S (w4[0], w3[3], selector); - w6[0] = __byte_perm_S (w3[3], w3[2], selector); - w5[3] = __byte_perm_S (w3[2], w3[1], selector); - w5[2] = __byte_perm_S (w3[1], w3[0], selector); - w5[1] = __byte_perm_S (w3[0], w2[3], selector); - w5[0] = __byte_perm_S (w2[3], w2[2], selector); - w4[3] = __byte_perm_S (w2[2], w2[1], selector); - w4[2] = __byte_perm_S (w2[1], w2[0], selector); - w4[1] = __byte_perm_S (w2[0], w1[3], selector); - w4[0] = __byte_perm_S (w1[3], w1[2], selector); - w3[3] = __byte_perm_S (w1[2], w1[1], selector); - w3[2] = __byte_perm_S (w1[1], w1[0], selector); - w3[1] = __byte_perm_S (w1[0], w0[3], selector); - w3[0] = __byte_perm_S (w0[3], w0[2], selector); - w2[3] = __byte_perm_S (w0[2], w0[1], selector); - w2[2] = __byte_perm_S (w0[1], w0[0], selector); - w2[1] = __byte_perm_S (w0[0], 0, selector); + w7[3] = hc_byte_perm_S (w5[2], w5[1], selector); + w7[2] = hc_byte_perm_S (w5[1], w5[0], selector); + w7[1] = hc_byte_perm_S (w5[0], w4[3], selector); + w7[0] = hc_byte_perm_S (w4[3], w4[2], selector); + w6[3] = hc_byte_perm_S (w4[2], w4[1], selector); + w6[2] = hc_byte_perm_S (w4[1], w4[0], selector); + w6[1] = hc_byte_perm_S (w4[0], w3[3], selector); + w6[0] = hc_byte_perm_S (w3[3], w3[2], selector); + w5[3] = hc_byte_perm_S (w3[2], w3[1], selector); + w5[2] = hc_byte_perm_S (w3[1], w3[0], selector); + w5[1] = hc_byte_perm_S (w3[0], w2[3], selector); + w5[0] = hc_byte_perm_S (w2[3], w2[2], selector); + w4[3] = hc_byte_perm_S (w2[2], w2[1], selector); + w4[2] = hc_byte_perm_S (w2[1], w2[0], selector); + w4[1] = hc_byte_perm_S (w2[0], w1[3], selector); + w4[0] = hc_byte_perm_S (w1[3], w1[2], selector); + w3[3] = hc_byte_perm_S (w1[2], w1[1], selector); + w3[2] = hc_byte_perm_S (w1[1], w1[0], selector); + w3[1] = hc_byte_perm_S (w1[0], w0[3], selector); + w3[0] = hc_byte_perm_S (w0[3], w0[2], selector); + w2[3] = hc_byte_perm_S (w0[2], w0[1], selector); + w2[2] = hc_byte_perm_S (w0[1], w0[0], selector); + w2[1] = hc_byte_perm_S (w0[0], 0, selector); w2[0] = 0; w1[3] = 0; w1[2] = 0; @@ -38160,28 +38160,28 @@ DECLSPEC void switch_buffer_by_offset_8x4_be_S (u32 *w0, u32 *w1, u32 *w2, u32 * break; case 10: - w7[3] = __byte_perm_S (w5[1], w5[0], selector); - w7[2] = __byte_perm_S (w5[0], w4[3], selector); - w7[1] = __byte_perm_S (w4[3], w4[2], selector); - w7[0] = __byte_perm_S (w4[2], w4[1], selector); - w6[3] = __byte_perm_S (w4[1], w4[0], selector); - w6[2] = __byte_perm_S (w4[0], w3[3], selector); - w6[1] = __byte_perm_S (w3[3], w3[2], selector); - w6[0] = __byte_perm_S (w3[2], w3[1], selector); - w5[3] = __byte_perm_S (w3[1], w3[0], selector); - w5[2] = __byte_perm_S (w3[0], w2[3], selector); - w5[1] = __byte_perm_S (w2[3], w2[2], selector); - w5[0] = __byte_perm_S (w2[2], w2[1], selector); - w4[3] = __byte_perm_S (w2[1], w2[0], selector); - w4[2] = __byte_perm_S (w2[0], w1[3], selector); - w4[1] = __byte_perm_S (w1[3], w1[2], selector); - w4[0] = __byte_perm_S (w1[2], w1[1], selector); - w3[3] = __byte_perm_S (w1[1], w1[0], selector); - w3[2] = __byte_perm_S (w1[0], w0[3], selector); - w3[1] = __byte_perm_S (w0[3], w0[2], selector); - w3[0] = __byte_perm_S (w0[2], w0[1], selector); - w2[3] = __byte_perm_S (w0[1], w0[0], selector); - w2[2] = __byte_perm_S (w0[0], 0, selector); + w7[3] = hc_byte_perm_S (w5[1], w5[0], selector); + w7[2] = hc_byte_perm_S (w5[0], w4[3], selector); + w7[1] = hc_byte_perm_S (w4[3], w4[2], selector); + w7[0] = hc_byte_perm_S (w4[2], w4[1], selector); + w6[3] = hc_byte_perm_S (w4[1], w4[0], selector); + w6[2] = hc_byte_perm_S (w4[0], w3[3], selector); + w6[1] = hc_byte_perm_S (w3[3], w3[2], selector); + w6[0] = hc_byte_perm_S (w3[2], w3[1], selector); + w5[3] = hc_byte_perm_S (w3[1], w3[0], selector); + w5[2] = hc_byte_perm_S (w3[0], w2[3], selector); + w5[1] = hc_byte_perm_S (w2[3], w2[2], selector); + w5[0] = hc_byte_perm_S (w2[2], w2[1], selector); + w4[3] = hc_byte_perm_S (w2[1], w2[0], selector); + w4[2] = hc_byte_perm_S (w2[0], w1[3], selector); + w4[1] = hc_byte_perm_S (w1[3], w1[2], selector); + w4[0] = hc_byte_perm_S (w1[2], w1[1], selector); + w3[3] = hc_byte_perm_S (w1[1], w1[0], selector); + w3[2] = hc_byte_perm_S (w1[0], w0[3], selector); + w3[1] = hc_byte_perm_S (w0[3], w0[2], selector); + w3[0] = hc_byte_perm_S (w0[2], w0[1], selector); + w2[3] = hc_byte_perm_S (w0[1], w0[0], selector); + w2[2] = hc_byte_perm_S (w0[0], 0, selector); w2[1] = 0; w2[0] = 0; w1[3] = 0; @@ -38196,27 +38196,27 @@ DECLSPEC void switch_buffer_by_offset_8x4_be_S (u32 *w0, u32 *w1, u32 *w2, u32 * break; case 11: - w7[3] = __byte_perm_S (w5[0], w4[3], selector); - w7[2] = __byte_perm_S (w4[3], w4[2], selector); - w7[1] = __byte_perm_S (w4[2], w4[1], selector); - w7[0] = __byte_perm_S (w4[1], w4[0], selector); - w6[3] = __byte_perm_S (w4[0], w3[3], selector); - w6[2] = __byte_perm_S (w3[3], w3[2], selector); - w6[1] = __byte_perm_S (w3[2], w3[1], selector); - w6[0] = __byte_perm_S (w3[1], w3[0], selector); - w5[3] = __byte_perm_S (w3[0], w2[3], selector); - w5[2] = __byte_perm_S (w2[3], w2[2], selector); - w5[1] = __byte_perm_S (w2[2], w2[1], selector); - w5[0] = __byte_perm_S (w2[1], w2[0], selector); - w4[3] = __byte_perm_S (w2[0], w1[3], selector); - w4[2] = __byte_perm_S (w1[3], w1[2], selector); - w4[1] = __byte_perm_S (w1[2], w1[1], selector); - w4[0] = __byte_perm_S (w1[1], w1[0], selector); - w3[3] = __byte_perm_S (w1[0], w0[3], selector); - w3[2] = __byte_perm_S (w0[3], w0[2], selector); - w3[1] = __byte_perm_S (w0[2], w0[1], selector); - w3[0] = __byte_perm_S (w0[1], w0[0], selector); - w2[3] = __byte_perm_S (w0[0], 0, selector); + w7[3] = hc_byte_perm_S (w5[0], w4[3], selector); + w7[2] = hc_byte_perm_S (w4[3], w4[2], selector); + w7[1] = hc_byte_perm_S (w4[2], w4[1], selector); + w7[0] = hc_byte_perm_S (w4[1], w4[0], selector); + w6[3] = hc_byte_perm_S (w4[0], w3[3], selector); + w6[2] = hc_byte_perm_S (w3[3], w3[2], selector); + w6[1] = hc_byte_perm_S (w3[2], w3[1], selector); + w6[0] = hc_byte_perm_S (w3[1], w3[0], selector); + w5[3] = hc_byte_perm_S (w3[0], w2[3], selector); + w5[2] = hc_byte_perm_S (w2[3], w2[2], selector); + w5[1] = hc_byte_perm_S (w2[2], w2[1], selector); + w5[0] = hc_byte_perm_S (w2[1], w2[0], selector); + w4[3] = hc_byte_perm_S (w2[0], w1[3], selector); + w4[2] = hc_byte_perm_S (w1[3], w1[2], selector); + w4[1] = hc_byte_perm_S (w1[2], w1[1], selector); + w4[0] = hc_byte_perm_S (w1[1], w1[0], selector); + w3[3] = hc_byte_perm_S (w1[0], w0[3], selector); + w3[2] = hc_byte_perm_S (w0[3], w0[2], selector); + w3[1] = hc_byte_perm_S (w0[2], w0[1], selector); + w3[0] = hc_byte_perm_S (w0[1], w0[0], selector); + w2[3] = hc_byte_perm_S (w0[0], 0, selector); w2[2] = 0; w2[1] = 0; w2[0] = 0; @@ -38232,26 +38232,26 @@ DECLSPEC void switch_buffer_by_offset_8x4_be_S (u32 *w0, u32 *w1, u32 *w2, u32 * break; case 12: - w7[3] = __byte_perm_S (w4[3], w4[2], selector); - w7[2] = __byte_perm_S (w4[2], w4[1], selector); - w7[1] = __byte_perm_S (w4[1], w4[0], selector); - w7[0] = __byte_perm_S (w4[0], w3[3], selector); - w6[3] = __byte_perm_S (w3[3], w3[2], selector); - w6[2] = __byte_perm_S (w3[2], w3[1], selector); - w6[1] = __byte_perm_S (w3[1], w3[0], selector); - w6[0] = __byte_perm_S (w3[0], w2[3], selector); - w5[3] = __byte_perm_S (w2[3], w2[2], selector); - w5[2] = __byte_perm_S (w2[2], w2[1], selector); - w5[1] = __byte_perm_S (w2[1], w2[0], selector); - w5[0] = __byte_perm_S (w2[0], w1[3], selector); - w4[3] = __byte_perm_S (w1[3], w1[2], selector); - w4[2] = __byte_perm_S (w1[2], w1[1], selector); - w4[1] = __byte_perm_S (w1[1], w1[0], selector); - w4[0] = __byte_perm_S (w1[0], w0[3], selector); - w3[3] = __byte_perm_S (w0[3], w0[2], selector); - w3[2] = __byte_perm_S (w0[2], w0[1], selector); - w3[1] = __byte_perm_S (w0[1], w0[0], selector); - w3[0] = __byte_perm_S (w0[0], 0, selector); + w7[3] = hc_byte_perm_S (w4[3], w4[2], selector); + w7[2] = hc_byte_perm_S (w4[2], w4[1], selector); + w7[1] = hc_byte_perm_S (w4[1], w4[0], selector); + w7[0] = hc_byte_perm_S (w4[0], w3[3], selector); + w6[3] = hc_byte_perm_S (w3[3], w3[2], selector); + w6[2] = hc_byte_perm_S (w3[2], w3[1], selector); + w6[1] = hc_byte_perm_S (w3[1], w3[0], selector); + w6[0] = hc_byte_perm_S (w3[0], w2[3], selector); + w5[3] = hc_byte_perm_S (w2[3], w2[2], selector); + w5[2] = hc_byte_perm_S (w2[2], w2[1], selector); + w5[1] = hc_byte_perm_S (w2[1], w2[0], selector); + w5[0] = hc_byte_perm_S (w2[0], w1[3], selector); + w4[3] = hc_byte_perm_S (w1[3], w1[2], selector); + w4[2] = hc_byte_perm_S (w1[2], w1[1], selector); + w4[1] = hc_byte_perm_S (w1[1], w1[0], selector); + w4[0] = hc_byte_perm_S (w1[0], w0[3], selector); + w3[3] = hc_byte_perm_S (w0[3], w0[2], selector); + w3[2] = hc_byte_perm_S (w0[2], w0[1], selector); + w3[1] = hc_byte_perm_S (w0[1], w0[0], selector); + w3[0] = hc_byte_perm_S (w0[0], 0, selector); w2[3] = 0; w2[2] = 0; w2[1] = 0; @@ -38268,25 +38268,25 @@ DECLSPEC void switch_buffer_by_offset_8x4_be_S (u32 *w0, u32 *w1, u32 *w2, u32 * break; case 13: - w7[3] = __byte_perm_S (w4[2], w4[1], selector); - w7[2] = __byte_perm_S (w4[1], w4[0], selector); - w7[1] = __byte_perm_S (w4[0], w3[3], selector); - w7[0] = __byte_perm_S (w3[3], w3[2], selector); - w6[3] = __byte_perm_S (w3[2], w3[1], selector); - w6[2] = __byte_perm_S (w3[1], w3[0], selector); - w6[1] = __byte_perm_S (w3[0], w2[3], selector); - w6[0] = __byte_perm_S (w2[3], w2[2], selector); - w5[3] = __byte_perm_S (w2[2], w2[1], selector); - w5[2] = __byte_perm_S (w2[1], w2[0], selector); - w5[1] = __byte_perm_S (w2[0], w1[3], selector); - w5[0] = __byte_perm_S (w1[3], w1[2], selector); - w4[3] = __byte_perm_S (w1[2], w1[1], selector); - w4[2] = __byte_perm_S (w1[1], w1[0], selector); - w4[1] = __byte_perm_S (w1[0], w0[3], selector); - w4[0] = __byte_perm_S (w0[3], w0[2], selector); - w3[3] = __byte_perm_S (w0[2], w0[1], selector); - w3[2] = __byte_perm_S (w0[1], w0[0], selector); - w3[1] = __byte_perm_S (w0[0], 0, selector); + w7[3] = hc_byte_perm_S (w4[2], w4[1], selector); + w7[2] = hc_byte_perm_S (w4[1], w4[0], selector); + w7[1] = hc_byte_perm_S (w4[0], w3[3], selector); + w7[0] = hc_byte_perm_S (w3[3], w3[2], selector); + w6[3] = hc_byte_perm_S (w3[2], w3[1], selector); + w6[2] = hc_byte_perm_S (w3[1], w3[0], selector); + w6[1] = hc_byte_perm_S (w3[0], w2[3], selector); + w6[0] = hc_byte_perm_S (w2[3], w2[2], selector); + w5[3] = hc_byte_perm_S (w2[2], w2[1], selector); + w5[2] = hc_byte_perm_S (w2[1], w2[0], selector); + w5[1] = hc_byte_perm_S (w2[0], w1[3], selector); + w5[0] = hc_byte_perm_S (w1[3], w1[2], selector); + w4[3] = hc_byte_perm_S (w1[2], w1[1], selector); + w4[2] = hc_byte_perm_S (w1[1], w1[0], selector); + w4[1] = hc_byte_perm_S (w1[0], w0[3], selector); + w4[0] = hc_byte_perm_S (w0[3], w0[2], selector); + w3[3] = hc_byte_perm_S (w0[2], w0[1], selector); + w3[2] = hc_byte_perm_S (w0[1], w0[0], selector); + w3[1] = hc_byte_perm_S (w0[0], 0, selector); w3[0] = 0; w2[3] = 0; w2[2] = 0; @@ -38304,24 +38304,24 @@ DECLSPEC void switch_buffer_by_offset_8x4_be_S (u32 *w0, u32 *w1, u32 *w2, u32 * break; case 14: - w7[3] = __byte_perm_S (w4[1], w4[0], selector); - w7[2] = __byte_perm_S (w4[0], w3[3], selector); - w7[1] = __byte_perm_S (w3[3], w3[2], selector); - w7[0] = __byte_perm_S (w3[2], w3[1], selector); - w6[3] = __byte_perm_S (w3[1], w3[0], selector); - w6[2] = __byte_perm_S (w3[0], w2[3], selector); - w6[1] = __byte_perm_S (w2[3], w2[2], selector); - w6[0] = __byte_perm_S (w2[2], w2[1], selector); - w5[3] = __byte_perm_S (w2[1], w2[0], selector); - w5[2] = __byte_perm_S (w2[0], w1[3], selector); - w5[1] = __byte_perm_S (w1[3], w1[2], selector); - w5[0] = __byte_perm_S (w1[2], w1[1], selector); - w4[3] = __byte_perm_S (w1[1], w1[0], selector); - w4[2] = __byte_perm_S (w1[0], w0[3], selector); - w4[1] = __byte_perm_S (w0[3], w0[2], selector); - w4[0] = __byte_perm_S (w0[2], w0[1], selector); - w3[3] = __byte_perm_S (w0[1], w0[0], selector); - w3[2] = __byte_perm_S (w0[0], 0, selector); + w7[3] = hc_byte_perm_S (w4[1], w4[0], selector); + w7[2] = hc_byte_perm_S (w4[0], w3[3], selector); + w7[1] = hc_byte_perm_S (w3[3], w3[2], selector); + w7[0] = hc_byte_perm_S (w3[2], w3[1], selector); + w6[3] = hc_byte_perm_S (w3[1], w3[0], selector); + w6[2] = hc_byte_perm_S (w3[0], w2[3], selector); + w6[1] = hc_byte_perm_S (w2[3], w2[2], selector); + w6[0] = hc_byte_perm_S (w2[2], w2[1], selector); + w5[3] = hc_byte_perm_S (w2[1], w2[0], selector); + w5[2] = hc_byte_perm_S (w2[0], w1[3], selector); + w5[1] = hc_byte_perm_S (w1[3], w1[2], selector); + w5[0] = hc_byte_perm_S (w1[2], w1[1], selector); + w4[3] = hc_byte_perm_S (w1[1], w1[0], selector); + w4[2] = hc_byte_perm_S (w1[0], w0[3], selector); + w4[1] = hc_byte_perm_S (w0[3], w0[2], selector); + w4[0] = hc_byte_perm_S (w0[2], w0[1], selector); + w3[3] = hc_byte_perm_S (w0[1], w0[0], selector); + w3[2] = hc_byte_perm_S (w0[0], 0, selector); w3[1] = 0; w3[0] = 0; w2[3] = 0; @@ -38340,23 +38340,23 @@ DECLSPEC void switch_buffer_by_offset_8x4_be_S (u32 *w0, u32 *w1, u32 *w2, u32 * break; case 15: - w7[3] = __byte_perm_S (w4[0], w3[3], selector); - w7[2] = __byte_perm_S (w3[3], w3[2], selector); - w7[1] = __byte_perm_S (w3[2], w3[1], selector); - w7[0] = __byte_perm_S (w3[1], w3[0], selector); - w6[3] = __byte_perm_S (w3[0], w2[3], selector); - w6[2] = __byte_perm_S (w2[3], w2[2], selector); - w6[1] = __byte_perm_S (w2[2], w2[1], selector); - w6[0] = __byte_perm_S (w2[1], w2[0], selector); - w5[3] = __byte_perm_S (w2[0], w1[3], selector); - w5[2] = __byte_perm_S (w1[3], w1[2], selector); - w5[1] = __byte_perm_S (w1[2], w1[1], selector); - w5[0] = __byte_perm_S (w1[1], w1[0], selector); - w4[3] = __byte_perm_S (w1[0], w0[3], selector); - w4[2] = __byte_perm_S (w0[3], w0[2], selector); - w4[1] = __byte_perm_S (w0[2], w0[1], selector); - w4[0] = __byte_perm_S (w0[1], w0[0], selector); - w3[3] = __byte_perm_S (w0[0], 0, selector); + w7[3] = hc_byte_perm_S (w4[0], w3[3], selector); + w7[2] = hc_byte_perm_S (w3[3], w3[2], selector); + w7[1] = hc_byte_perm_S (w3[2], w3[1], selector); + w7[0] = hc_byte_perm_S (w3[1], w3[0], selector); + w6[3] = hc_byte_perm_S (w3[0], w2[3], selector); + w6[2] = hc_byte_perm_S (w2[3], w2[2], selector); + w6[1] = hc_byte_perm_S (w2[2], w2[1], selector); + w6[0] = hc_byte_perm_S (w2[1], w2[0], selector); + w5[3] = hc_byte_perm_S (w2[0], w1[3], selector); + w5[2] = hc_byte_perm_S (w1[3], w1[2], selector); + w5[1] = hc_byte_perm_S (w1[2], w1[1], selector); + w5[0] = hc_byte_perm_S (w1[1], w1[0], selector); + w4[3] = hc_byte_perm_S (w1[0], w0[3], selector); + w4[2] = hc_byte_perm_S (w0[3], w0[2], selector); + w4[1] = hc_byte_perm_S (w0[2], w0[1], selector); + w4[0] = hc_byte_perm_S (w0[1], w0[0], selector); + w3[3] = hc_byte_perm_S (w0[0], 0, selector); w3[2] = 0; w3[1] = 0; w3[0] = 0; @@ -38376,22 +38376,22 @@ DECLSPEC void switch_buffer_by_offset_8x4_be_S (u32 *w0, u32 *w1, u32 *w2, u32 * break; case 16: - w7[3] = __byte_perm_S (w3[3], w3[2], selector); - w7[2] = __byte_perm_S (w3[2], w3[1], selector); - w7[1] = __byte_perm_S (w3[1], w3[0], selector); - w7[0] = __byte_perm_S (w3[0], w2[3], selector); - w6[3] = __byte_perm_S (w2[3], w2[2], selector); - w6[2] = __byte_perm_S (w2[2], w2[1], selector); - w6[1] = __byte_perm_S (w2[1], w2[0], selector); - w6[0] = __byte_perm_S (w2[0], w1[3], selector); - w5[3] = __byte_perm_S (w1[3], w1[2], selector); - w5[2] = __byte_perm_S (w1[2], w1[1], selector); - w5[1] = __byte_perm_S (w1[1], w1[0], selector); - w5[0] = __byte_perm_S (w1[0], w0[3], selector); - w4[3] = __byte_perm_S (w0[3], w0[2], selector); - w4[2] = __byte_perm_S (w0[2], w0[1], selector); - w4[1] = __byte_perm_S (w0[1], w0[0], selector); - w4[0] = __byte_perm_S (w0[0], 0, selector); + w7[3] = hc_byte_perm_S (w3[3], w3[2], selector); + w7[2] = hc_byte_perm_S (w3[2], w3[1], selector); + w7[1] = hc_byte_perm_S (w3[1], w3[0], selector); + w7[0] = hc_byte_perm_S (w3[0], w2[3], selector); + w6[3] = hc_byte_perm_S (w2[3], w2[2], selector); + w6[2] = hc_byte_perm_S (w2[2], w2[1], selector); + w6[1] = hc_byte_perm_S (w2[1], w2[0], selector); + w6[0] = hc_byte_perm_S (w2[0], w1[3], selector); + w5[3] = hc_byte_perm_S (w1[3], w1[2], selector); + w5[2] = hc_byte_perm_S (w1[2], w1[1], selector); + w5[1] = hc_byte_perm_S (w1[1], w1[0], selector); + w5[0] = hc_byte_perm_S (w1[0], w0[3], selector); + w4[3] = hc_byte_perm_S (w0[3], w0[2], selector); + w4[2] = hc_byte_perm_S (w0[2], w0[1], selector); + w4[1] = hc_byte_perm_S (w0[1], w0[0], selector); + w4[0] = hc_byte_perm_S (w0[0], 0, selector); w3[3] = 0; w3[2] = 0; w3[1] = 0; @@ -38412,21 +38412,21 @@ DECLSPEC void switch_buffer_by_offset_8x4_be_S (u32 *w0, u32 *w1, u32 *w2, u32 * break; case 17: - w7[3] = __byte_perm_S (w3[2], w3[1], selector); - w7[2] = __byte_perm_S (w3[1], w3[0], selector); - w7[1] = __byte_perm_S (w3[0], w2[3], selector); - w7[0] = __byte_perm_S (w2[3], w2[2], selector); - w6[3] = __byte_perm_S (w2[2], w2[1], selector); - w6[2] = __byte_perm_S (w2[1], w2[0], selector); - w6[1] = __byte_perm_S (w2[0], w1[3], selector); - w6[0] = __byte_perm_S (w1[3], w1[2], selector); - w5[3] = __byte_perm_S (w1[2], w1[1], selector); - w5[2] = __byte_perm_S (w1[1], w1[0], selector); - w5[1] = __byte_perm_S (w1[0], w0[3], selector); - w5[0] = __byte_perm_S (w0[3], w0[2], selector); - w4[3] = __byte_perm_S (w0[2], w0[1], selector); - w4[2] = __byte_perm_S (w0[1], w0[0], selector); - w4[1] = __byte_perm_S (w0[0], 0, selector); + w7[3] = hc_byte_perm_S (w3[2], w3[1], selector); + w7[2] = hc_byte_perm_S (w3[1], w3[0], selector); + w7[1] = hc_byte_perm_S (w3[0], w2[3], selector); + w7[0] = hc_byte_perm_S (w2[3], w2[2], selector); + w6[3] = hc_byte_perm_S (w2[2], w2[1], selector); + w6[2] = hc_byte_perm_S (w2[1], w2[0], selector); + w6[1] = hc_byte_perm_S (w2[0], w1[3], selector); + w6[0] = hc_byte_perm_S (w1[3], w1[2], selector); + w5[3] = hc_byte_perm_S (w1[2], w1[1], selector); + w5[2] = hc_byte_perm_S (w1[1], w1[0], selector); + w5[1] = hc_byte_perm_S (w1[0], w0[3], selector); + w5[0] = hc_byte_perm_S (w0[3], w0[2], selector); + w4[3] = hc_byte_perm_S (w0[2], w0[1], selector); + w4[2] = hc_byte_perm_S (w0[1], w0[0], selector); + w4[1] = hc_byte_perm_S (w0[0], 0, selector); w4[0] = 0; w3[3] = 0; w3[2] = 0; @@ -38448,20 +38448,20 @@ DECLSPEC void switch_buffer_by_offset_8x4_be_S (u32 *w0, u32 *w1, u32 *w2, u32 * break; case 18: - w7[3] = __byte_perm_S (w3[1], w3[0], selector); - w7[2] = __byte_perm_S (w3[0], w2[3], selector); - w7[1] = __byte_perm_S (w2[3], w2[2], selector); - w7[0] = __byte_perm_S (w2[2], w2[1], selector); - w6[3] = __byte_perm_S (w2[1], w2[0], selector); - w6[2] = __byte_perm_S (w2[0], w1[3], selector); - w6[1] = __byte_perm_S (w1[3], w1[2], selector); - w6[0] = __byte_perm_S (w1[2], w1[1], selector); - w5[3] = __byte_perm_S (w1[1], w1[0], selector); - w5[2] = __byte_perm_S (w1[0], w0[3], selector); - w5[1] = __byte_perm_S (w0[3], w0[2], selector); - w5[0] = __byte_perm_S (w0[2], w0[1], selector); - w4[3] = __byte_perm_S (w0[1], w0[0], selector); - w4[2] = __byte_perm_S (w0[0], 0, selector); + w7[3] = hc_byte_perm_S (w3[1], w3[0], selector); + w7[2] = hc_byte_perm_S (w3[0], w2[3], selector); + w7[1] = hc_byte_perm_S (w2[3], w2[2], selector); + w7[0] = hc_byte_perm_S (w2[2], w2[1], selector); + w6[3] = hc_byte_perm_S (w2[1], w2[0], selector); + w6[2] = hc_byte_perm_S (w2[0], w1[3], selector); + w6[1] = hc_byte_perm_S (w1[3], w1[2], selector); + w6[0] = hc_byte_perm_S (w1[2], w1[1], selector); + w5[3] = hc_byte_perm_S (w1[1], w1[0], selector); + w5[2] = hc_byte_perm_S (w1[0], w0[3], selector); + w5[1] = hc_byte_perm_S (w0[3], w0[2], selector); + w5[0] = hc_byte_perm_S (w0[2], w0[1], selector); + w4[3] = hc_byte_perm_S (w0[1], w0[0], selector); + w4[2] = hc_byte_perm_S (w0[0], 0, selector); w4[1] = 0; w4[0] = 0; w3[3] = 0; @@ -38484,19 +38484,19 @@ DECLSPEC void switch_buffer_by_offset_8x4_be_S (u32 *w0, u32 *w1, u32 *w2, u32 * break; case 19: - w7[3] = __byte_perm_S (w3[0], w2[3], selector); - w7[2] = __byte_perm_S (w2[3], w2[2], selector); - w7[1] = __byte_perm_S (w2[2], w2[1], selector); - w7[0] = __byte_perm_S (w2[1], w2[0], selector); - w6[3] = __byte_perm_S (w2[0], w1[3], selector); - w6[2] = __byte_perm_S (w1[3], w1[2], selector); - w6[1] = __byte_perm_S (w1[2], w1[1], selector); - w6[0] = __byte_perm_S (w1[1], w1[0], selector); - w5[3] = __byte_perm_S (w1[0], w0[3], selector); - w5[2] = __byte_perm_S (w0[3], w0[2], selector); - w5[1] = __byte_perm_S (w0[2], w0[1], selector); - w5[0] = __byte_perm_S (w0[1], w0[0], selector); - w4[3] = __byte_perm_S (w0[0], 0, selector); + w7[3] = hc_byte_perm_S (w3[0], w2[3], selector); + w7[2] = hc_byte_perm_S (w2[3], w2[2], selector); + w7[1] = hc_byte_perm_S (w2[2], w2[1], selector); + w7[0] = hc_byte_perm_S (w2[1], w2[0], selector); + w6[3] = hc_byte_perm_S (w2[0], w1[3], selector); + w6[2] = hc_byte_perm_S (w1[3], w1[2], selector); + w6[1] = hc_byte_perm_S (w1[2], w1[1], selector); + w6[0] = hc_byte_perm_S (w1[1], w1[0], selector); + w5[3] = hc_byte_perm_S (w1[0], w0[3], selector); + w5[2] = hc_byte_perm_S (w0[3], w0[2], selector); + w5[1] = hc_byte_perm_S (w0[2], w0[1], selector); + w5[0] = hc_byte_perm_S (w0[1], w0[0], selector); + w4[3] = hc_byte_perm_S (w0[0], 0, selector); w4[2] = 0; w4[1] = 0; w4[0] = 0; @@ -38520,18 +38520,18 @@ DECLSPEC void switch_buffer_by_offset_8x4_be_S (u32 *w0, u32 *w1, u32 *w2, u32 * break; case 20: - w7[3] = __byte_perm_S (w2[3], w2[2], selector); - w7[2] = __byte_perm_S (w2[2], w2[1], selector); - w7[1] = __byte_perm_S (w2[1], w2[0], selector); - w7[0] = __byte_perm_S (w2[0], w1[3], selector); - w6[3] = __byte_perm_S (w1[3], w1[2], selector); - w6[2] = __byte_perm_S (w1[2], w1[1], selector); - w6[1] = __byte_perm_S (w1[1], w1[0], selector); - w6[0] = __byte_perm_S (w1[0], w0[3], selector); - w5[3] = __byte_perm_S (w0[3], w0[2], selector); - w5[2] = __byte_perm_S (w0[2], w0[1], selector); - w5[1] = __byte_perm_S (w0[1], w0[0], selector); - w5[0] = __byte_perm_S (w0[0], 0, selector); + w7[3] = hc_byte_perm_S (w2[3], w2[2], selector); + w7[2] = hc_byte_perm_S (w2[2], w2[1], selector); + w7[1] = hc_byte_perm_S (w2[1], w2[0], selector); + w7[0] = hc_byte_perm_S (w2[0], w1[3], selector); + w6[3] = hc_byte_perm_S (w1[3], w1[2], selector); + w6[2] = hc_byte_perm_S (w1[2], w1[1], selector); + w6[1] = hc_byte_perm_S (w1[1], w1[0], selector); + w6[0] = hc_byte_perm_S (w1[0], w0[3], selector); + w5[3] = hc_byte_perm_S (w0[3], w0[2], selector); + w5[2] = hc_byte_perm_S (w0[2], w0[1], selector); + w5[1] = hc_byte_perm_S (w0[1], w0[0], selector); + w5[0] = hc_byte_perm_S (w0[0], 0, selector); w4[3] = 0; w4[2] = 0; w4[1] = 0; @@ -38556,17 +38556,17 @@ DECLSPEC void switch_buffer_by_offset_8x4_be_S (u32 *w0, u32 *w1, u32 *w2, u32 * break; case 21: - w7[3] = __byte_perm_S (w2[2], w2[1], selector); - w7[2] = __byte_perm_S (w2[1], w2[0], selector); - w7[1] = __byte_perm_S (w2[0], w1[3], selector); - w7[0] = __byte_perm_S (w1[3], w1[2], selector); - w6[3] = __byte_perm_S (w1[2], w1[1], selector); - w6[2] = __byte_perm_S (w1[1], w1[0], selector); - w6[1] = __byte_perm_S (w1[0], w0[3], selector); - w6[0] = __byte_perm_S (w0[3], w0[2], selector); - w5[3] = __byte_perm_S (w0[2], w0[1], selector); - w5[2] = __byte_perm_S (w0[1], w0[0], selector); - w5[1] = __byte_perm_S (w0[0], 0, selector); + w7[3] = hc_byte_perm_S (w2[2], w2[1], selector); + w7[2] = hc_byte_perm_S (w2[1], w2[0], selector); + w7[1] = hc_byte_perm_S (w2[0], w1[3], selector); + w7[0] = hc_byte_perm_S (w1[3], w1[2], selector); + w6[3] = hc_byte_perm_S (w1[2], w1[1], selector); + w6[2] = hc_byte_perm_S (w1[1], w1[0], selector); + w6[1] = hc_byte_perm_S (w1[0], w0[3], selector); + w6[0] = hc_byte_perm_S (w0[3], w0[2], selector); + w5[3] = hc_byte_perm_S (w0[2], w0[1], selector); + w5[2] = hc_byte_perm_S (w0[1], w0[0], selector); + w5[1] = hc_byte_perm_S (w0[0], 0, selector); w5[0] = 0; w4[3] = 0; w4[2] = 0; @@ -38592,16 +38592,16 @@ DECLSPEC void switch_buffer_by_offset_8x4_be_S (u32 *w0, u32 *w1, u32 *w2, u32 * break; case 22: - w7[3] = __byte_perm_S (w2[1], w2[0], selector); - w7[2] = __byte_perm_S (w2[0], w1[3], selector); - w7[1] = __byte_perm_S (w1[3], w1[2], selector); - w7[0] = __byte_perm_S (w1[2], w1[1], selector); - w6[3] = __byte_perm_S (w1[1], w1[0], selector); - w6[2] = __byte_perm_S (w1[0], w0[3], selector); - w6[1] = __byte_perm_S (w0[3], w0[2], selector); - w6[0] = __byte_perm_S (w0[2], w0[1], selector); - w5[3] = __byte_perm_S (w0[1], w0[0], selector); - w5[2] = __byte_perm_S (w0[0], 0, selector); + w7[3] = hc_byte_perm_S (w2[1], w2[0], selector); + w7[2] = hc_byte_perm_S (w2[0], w1[3], selector); + w7[1] = hc_byte_perm_S (w1[3], w1[2], selector); + w7[0] = hc_byte_perm_S (w1[2], w1[1], selector); + w6[3] = hc_byte_perm_S (w1[1], w1[0], selector); + w6[2] = hc_byte_perm_S (w1[0], w0[3], selector); + w6[1] = hc_byte_perm_S (w0[3], w0[2], selector); + w6[0] = hc_byte_perm_S (w0[2], w0[1], selector); + w5[3] = hc_byte_perm_S (w0[1], w0[0], selector); + w5[2] = hc_byte_perm_S (w0[0], 0, selector); w5[1] = 0; w5[0] = 0; w4[3] = 0; @@ -38628,15 +38628,15 @@ DECLSPEC void switch_buffer_by_offset_8x4_be_S (u32 *w0, u32 *w1, u32 *w2, u32 * break; case 23: - w7[3] = __byte_perm_S (w2[0], w1[3], selector); - w7[2] = __byte_perm_S (w1[3], w1[2], selector); - w7[1] = __byte_perm_S (w1[2], w1[1], selector); - w7[0] = __byte_perm_S (w1[1], w1[0], selector); - w6[3] = __byte_perm_S (w1[0], w0[3], selector); - w6[2] = __byte_perm_S (w0[3], w0[2], selector); - w6[1] = __byte_perm_S (w0[2], w0[1], selector); - w6[0] = __byte_perm_S (w0[1], w0[0], selector); - w5[3] = __byte_perm_S (w0[0], 0, selector); + w7[3] = hc_byte_perm_S (w2[0], w1[3], selector); + w7[2] = hc_byte_perm_S (w1[3], w1[2], selector); + w7[1] = hc_byte_perm_S (w1[2], w1[1], selector); + w7[0] = hc_byte_perm_S (w1[1], w1[0], selector); + w6[3] = hc_byte_perm_S (w1[0], w0[3], selector); + w6[2] = hc_byte_perm_S (w0[3], w0[2], selector); + w6[1] = hc_byte_perm_S (w0[2], w0[1], selector); + w6[0] = hc_byte_perm_S (w0[1], w0[0], selector); + w5[3] = hc_byte_perm_S (w0[0], 0, selector); w5[2] = 0; w5[1] = 0; w5[0] = 0; @@ -38664,14 +38664,14 @@ DECLSPEC void switch_buffer_by_offset_8x4_be_S (u32 *w0, u32 *w1, u32 *w2, u32 * break; case 24: - w7[3] = __byte_perm_S (w1[3], w1[2], selector); - w7[2] = __byte_perm_S (w1[2], w1[1], selector); - w7[1] = __byte_perm_S (w1[1], w1[0], selector); - w7[0] = __byte_perm_S (w1[0], w0[3], selector); - w6[3] = __byte_perm_S (w0[3], w0[2], selector); - w6[2] = __byte_perm_S (w0[2], w0[1], selector); - w6[1] = __byte_perm_S (w0[1], w0[0], selector); - w6[0] = __byte_perm_S (w0[0], 0, selector); + w7[3] = hc_byte_perm_S (w1[3], w1[2], selector); + w7[2] = hc_byte_perm_S (w1[2], w1[1], selector); + w7[1] = hc_byte_perm_S (w1[1], w1[0], selector); + w7[0] = hc_byte_perm_S (w1[0], w0[3], selector); + w6[3] = hc_byte_perm_S (w0[3], w0[2], selector); + w6[2] = hc_byte_perm_S (w0[2], w0[1], selector); + w6[1] = hc_byte_perm_S (w0[1], w0[0], selector); + w6[0] = hc_byte_perm_S (w0[0], 0, selector); w5[3] = 0; w5[2] = 0; w5[1] = 0; @@ -38700,13 +38700,13 @@ DECLSPEC void switch_buffer_by_offset_8x4_be_S (u32 *w0, u32 *w1, u32 *w2, u32 * break; case 25: - w7[3] = __byte_perm_S (w1[2], w1[1], selector); - w7[2] = __byte_perm_S (w1[1], w1[0], selector); - w7[1] = __byte_perm_S (w1[0], w0[3], selector); - w7[0] = __byte_perm_S (w0[3], w0[2], selector); - w6[3] = __byte_perm_S (w0[2], w0[1], selector); - w6[2] = __byte_perm_S (w0[1], w0[0], selector); - w6[1] = __byte_perm_S (w0[0], 0, selector); + w7[3] = hc_byte_perm_S (w1[2], w1[1], selector); + w7[2] = hc_byte_perm_S (w1[1], w1[0], selector); + w7[1] = hc_byte_perm_S (w1[0], w0[3], selector); + w7[0] = hc_byte_perm_S (w0[3], w0[2], selector); + w6[3] = hc_byte_perm_S (w0[2], w0[1], selector); + w6[2] = hc_byte_perm_S (w0[1], w0[0], selector); + w6[1] = hc_byte_perm_S (w0[0], 0, selector); w6[0] = 0; w5[3] = 0; w5[2] = 0; @@ -38736,12 +38736,12 @@ DECLSPEC void switch_buffer_by_offset_8x4_be_S (u32 *w0, u32 *w1, u32 *w2, u32 * break; case 26: - w7[3] = __byte_perm_S (w1[1], w1[0], selector); - w7[2] = __byte_perm_S (w1[0], w0[3], selector); - w7[1] = __byte_perm_S (w0[3], w0[2], selector); - w7[0] = __byte_perm_S (w0[2], w0[1], selector); - w6[3] = __byte_perm_S (w0[1], w0[0], selector); - w6[2] = __byte_perm_S (w0[0], 0, selector); + w7[3] = hc_byte_perm_S (w1[1], w1[0], selector); + w7[2] = hc_byte_perm_S (w1[0], w0[3], selector); + w7[1] = hc_byte_perm_S (w0[3], w0[2], selector); + w7[0] = hc_byte_perm_S (w0[2], w0[1], selector); + w6[3] = hc_byte_perm_S (w0[1], w0[0], selector); + w6[2] = hc_byte_perm_S (w0[0], 0, selector); w6[1] = 0; w6[0] = 0; w5[3] = 0; @@ -38772,11 +38772,11 @@ DECLSPEC void switch_buffer_by_offset_8x4_be_S (u32 *w0, u32 *w1, u32 *w2, u32 * break; case 27: - w7[3] = __byte_perm_S (w1[0], w0[3], selector); - w7[2] = __byte_perm_S (w0[3], w0[2], selector); - w7[1] = __byte_perm_S (w0[2], w0[1], selector); - w7[0] = __byte_perm_S (w0[1], w0[0], selector); - w6[3] = __byte_perm_S (w0[0], 0, selector); + w7[3] = hc_byte_perm_S (w1[0], w0[3], selector); + w7[2] = hc_byte_perm_S (w0[3], w0[2], selector); + w7[1] = hc_byte_perm_S (w0[2], w0[1], selector); + w7[0] = hc_byte_perm_S (w0[1], w0[0], selector); + w6[3] = hc_byte_perm_S (w0[0], 0, selector); w6[2] = 0; w6[1] = 0; w6[0] = 0; @@ -38808,10 +38808,10 @@ DECLSPEC void switch_buffer_by_offset_8x4_be_S (u32 *w0, u32 *w1, u32 *w2, u32 * break; case 28: - w7[3] = __byte_perm_S (w0[3], w0[2], selector); - w7[2] = __byte_perm_S (w0[2], w0[1], selector); - w7[1] = __byte_perm_S (w0[1], w0[0], selector); - w7[0] = __byte_perm_S (w0[0], 0, selector); + w7[3] = hc_byte_perm_S (w0[3], w0[2], selector); + w7[2] = hc_byte_perm_S (w0[2], w0[1], selector); + w7[1] = hc_byte_perm_S (w0[1], w0[0], selector); + w7[0] = hc_byte_perm_S (w0[0], 0, selector); w6[3] = 0; w6[2] = 0; w6[1] = 0; @@ -38844,9 +38844,9 @@ DECLSPEC void switch_buffer_by_offset_8x4_be_S (u32 *w0, u32 *w1, u32 *w2, u32 * break; case 29: - w7[3] = __byte_perm_S (w0[2], w0[1], selector); - w7[2] = __byte_perm_S (w0[1], w0[0], selector); - w7[1] = __byte_perm_S (w0[0], 0, selector); + w7[3] = hc_byte_perm_S (w0[2], w0[1], selector); + w7[2] = hc_byte_perm_S (w0[1], w0[0], selector); + w7[1] = hc_byte_perm_S (w0[0], 0, selector); w7[0] = 0; w6[3] = 0; w6[2] = 0; @@ -38880,8 +38880,8 @@ DECLSPEC void switch_buffer_by_offset_8x4_be_S (u32 *w0, u32 *w1, u32 *w2, u32 * break; case 30: - w7[3] = __byte_perm_S (w0[1], w0[0], selector); - w7[2] = __byte_perm_S (w0[0], 0, selector); + w7[3] = hc_byte_perm_S (w0[1], w0[0], selector); + w7[2] = hc_byte_perm_S (w0[0], 0, selector); w7[1] = 0; w7[0] = 0; w6[3] = 0; @@ -38916,7 +38916,7 @@ DECLSPEC void switch_buffer_by_offset_8x4_be_S (u32 *w0, u32 *w1, u32 *w2, u32 * break; case 31: - w7[3] = __byte_perm_S (w0[0], 0, selector); + w7[3] = hc_byte_perm_S (w0[0], 0, selector); w7[2] = 0; w7[1] = 0; w7[0] = 0; @@ -38962,153 +38962,153 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be_S (u32 *w0, u32 *w1, u32 *w2, switch (offset_switch) { case 0: - c0[0] = amd_bytealign_S (w7[3], 0, offset); - w7[3] = amd_bytealign_S (w7[2], w7[3], offset); - w7[2] = amd_bytealign_S (w7[1], w7[2], offset); - w7[1] = amd_bytealign_S (w7[0], w7[1], offset); - w7[0] = amd_bytealign_S (w6[3], w7[0], offset); - w6[3] = amd_bytealign_S (w6[2], w6[3], offset); - w6[2] = amd_bytealign_S (w6[1], w6[2], offset); - w6[1] = amd_bytealign_S (w6[0], w6[1], offset); - w6[0] = amd_bytealign_S (w5[3], w6[0], offset); - w5[3] = amd_bytealign_S (w5[2], w5[3], offset); - w5[2] = amd_bytealign_S (w5[1], w5[2], offset); - w5[1] = amd_bytealign_S (w5[0], w5[1], offset); - w5[0] = amd_bytealign_S (w4[3], w5[0], offset); - w4[3] = amd_bytealign_S (w4[2], w4[3], offset); - w4[2] = amd_bytealign_S (w4[1], w4[2], offset); - w4[1] = amd_bytealign_S (w4[0], w4[1], offset); - w4[0] = amd_bytealign_S (w3[3], w4[0], offset); - w3[3] = amd_bytealign_S (w3[2], w3[3], offset); - w3[2] = amd_bytealign_S (w3[1], w3[2], offset); - w3[1] = amd_bytealign_S (w3[0], w3[1], offset); - w3[0] = amd_bytealign_S (w2[3], w3[0], offset); - w2[3] = amd_bytealign_S (w2[2], w2[3], offset); - w2[2] = amd_bytealign_S (w2[1], w2[2], offset); - w2[1] = amd_bytealign_S (w2[0], w2[1], offset); - w2[0] = amd_bytealign_S (w1[3], w2[0], offset); - w1[3] = amd_bytealign_S (w1[2], w1[3], offset); - w1[2] = amd_bytealign_S (w1[1], w1[2], offset); - w1[1] = amd_bytealign_S (w1[0], w1[1], offset); - w1[0] = amd_bytealign_S (w0[3], w1[0], offset); - w0[3] = amd_bytealign_S (w0[2], w0[3], offset); - w0[2] = amd_bytealign_S (w0[1], w0[2], offset); - w0[1] = amd_bytealign_S (w0[0], w0[1], offset); - w0[0] = amd_bytealign_S ( 0, w0[0], offset); + c0[0] = hc_bytealign_S (w7[3], 0, offset); + w7[3] = hc_bytealign_S (w7[2], w7[3], offset); + w7[2] = hc_bytealign_S (w7[1], w7[2], offset); + w7[1] = hc_bytealign_S (w7[0], w7[1], offset); + w7[0] = hc_bytealign_S (w6[3], w7[0], offset); + w6[3] = hc_bytealign_S (w6[2], w6[3], offset); + w6[2] = hc_bytealign_S (w6[1], w6[2], offset); + w6[1] = hc_bytealign_S (w6[0], w6[1], offset); + w6[0] = hc_bytealign_S (w5[3], w6[0], offset); + w5[3] = hc_bytealign_S (w5[2], w5[3], offset); + w5[2] = hc_bytealign_S (w5[1], w5[2], offset); + w5[1] = hc_bytealign_S (w5[0], w5[1], offset); + w5[0] = hc_bytealign_S (w4[3], w5[0], offset); + w4[3] = hc_bytealign_S (w4[2], w4[3], offset); + w4[2] = hc_bytealign_S (w4[1], w4[2], offset); + w4[1] = hc_bytealign_S (w4[0], w4[1], offset); + w4[0] = hc_bytealign_S (w3[3], w4[0], offset); + w3[3] = hc_bytealign_S (w3[2], w3[3], offset); + w3[2] = hc_bytealign_S (w3[1], w3[2], offset); + w3[1] = hc_bytealign_S (w3[0], w3[1], offset); + w3[0] = hc_bytealign_S (w2[3], w3[0], offset); + w2[3] = hc_bytealign_S (w2[2], w2[3], offset); + w2[2] = hc_bytealign_S (w2[1], w2[2], offset); + w2[1] = hc_bytealign_S (w2[0], w2[1], offset); + w2[0] = hc_bytealign_S (w1[3], w2[0], offset); + w1[3] = hc_bytealign_S (w1[2], w1[3], offset); + w1[2] = hc_bytealign_S (w1[1], w1[2], offset); + w1[1] = hc_bytealign_S (w1[0], w1[1], offset); + w1[0] = hc_bytealign_S (w0[3], w1[0], offset); + w0[3] = hc_bytealign_S (w0[2], w0[3], offset); + w0[2] = hc_bytealign_S (w0[1], w0[2], offset); + w0[1] = hc_bytealign_S (w0[0], w0[1], offset); + w0[0] = hc_bytealign_S ( 0, w0[0], offset); break; case 1: - c0[1] = amd_bytealign_S (w7[3], 0, offset); - c0[0] = amd_bytealign_S (w7[2], w7[3], offset); - w7[3] = amd_bytealign_S (w7[1], w7[2], offset); - w7[2] = amd_bytealign_S (w7[0], w7[1], offset); - w7[1] = amd_bytealign_S (w6[3], w7[0], offset); - w7[0] = amd_bytealign_S (w6[2], w6[3], offset); - w6[3] = amd_bytealign_S (w6[1], w6[2], offset); - w6[2] = amd_bytealign_S (w6[0], w6[1], offset); - w6[1] = amd_bytealign_S (w5[3], w6[0], offset); - w6[0] = amd_bytealign_S (w5[2], w5[3], offset); - w5[3] = amd_bytealign_S (w5[1], w5[2], offset); - w5[2] = amd_bytealign_S (w5[0], w5[1], offset); - w5[1] = amd_bytealign_S (w4[3], w5[0], offset); - w5[0] = amd_bytealign_S (w4[2], w4[3], offset); - w4[3] = amd_bytealign_S (w4[1], w4[2], offset); - w4[2] = amd_bytealign_S (w4[0], w4[1], offset); - w4[1] = amd_bytealign_S (w3[3], w4[0], offset); - w4[0] = amd_bytealign_S (w3[2], w3[3], offset); - w3[3] = amd_bytealign_S (w3[1], w3[2], offset); - w3[2] = amd_bytealign_S (w3[0], w3[1], offset); - w3[1] = amd_bytealign_S (w2[3], w3[0], offset); - w3[0] = amd_bytealign_S (w2[2], w2[3], offset); - w2[3] = amd_bytealign_S (w2[1], w2[2], offset); - w2[2] = amd_bytealign_S (w2[0], w2[1], offset); - w2[1] = amd_bytealign_S (w1[3], w2[0], offset); - w2[0] = amd_bytealign_S (w1[2], w1[3], offset); - w1[3] = amd_bytealign_S (w1[1], w1[2], offset); - w1[2] = amd_bytealign_S (w1[0], w1[1], offset); - w1[1] = amd_bytealign_S (w0[3], w1[0], offset); - w1[0] = amd_bytealign_S (w0[2], w0[3], offset); - w0[3] = amd_bytealign_S (w0[1], w0[2], offset); - w0[2] = amd_bytealign_S (w0[0], w0[1], offset); - w0[1] = amd_bytealign_S ( 0, w0[0], offset); + c0[1] = hc_bytealign_S (w7[3], 0, offset); + c0[0] = hc_bytealign_S (w7[2], w7[3], offset); + w7[3] = hc_bytealign_S (w7[1], w7[2], offset); + w7[2] = hc_bytealign_S (w7[0], w7[1], offset); + w7[1] = hc_bytealign_S (w6[3], w7[0], offset); + w7[0] = hc_bytealign_S (w6[2], w6[3], offset); + w6[3] = hc_bytealign_S (w6[1], w6[2], offset); + w6[2] = hc_bytealign_S (w6[0], w6[1], offset); + w6[1] = hc_bytealign_S (w5[3], w6[0], offset); + w6[0] = hc_bytealign_S (w5[2], w5[3], offset); + w5[3] = hc_bytealign_S (w5[1], w5[2], offset); + w5[2] = hc_bytealign_S (w5[0], w5[1], offset); + w5[1] = hc_bytealign_S (w4[3], w5[0], offset); + w5[0] = hc_bytealign_S (w4[2], w4[3], offset); + w4[3] = hc_bytealign_S (w4[1], w4[2], offset); + w4[2] = hc_bytealign_S (w4[0], w4[1], offset); + w4[1] = hc_bytealign_S (w3[3], w4[0], offset); + w4[0] = hc_bytealign_S (w3[2], w3[3], offset); + w3[3] = hc_bytealign_S (w3[1], w3[2], offset); + w3[2] = hc_bytealign_S (w3[0], w3[1], offset); + w3[1] = hc_bytealign_S (w2[3], w3[0], offset); + w3[0] = hc_bytealign_S (w2[2], w2[3], offset); + w2[3] = hc_bytealign_S (w2[1], w2[2], offset); + w2[2] = hc_bytealign_S (w2[0], w2[1], offset); + w2[1] = hc_bytealign_S (w1[3], w2[0], offset); + w2[0] = hc_bytealign_S (w1[2], w1[3], offset); + w1[3] = hc_bytealign_S (w1[1], w1[2], offset); + w1[2] = hc_bytealign_S (w1[0], w1[1], offset); + w1[1] = hc_bytealign_S (w0[3], w1[0], offset); + w1[0] = hc_bytealign_S (w0[2], w0[3], offset); + w0[3] = hc_bytealign_S (w0[1], w0[2], offset); + w0[2] = hc_bytealign_S (w0[0], w0[1], offset); + w0[1] = hc_bytealign_S ( 0, w0[0], offset); w0[0] = 0; break; case 2: - c0[2] = amd_bytealign_S (w7[3], 0, offset); - c0[1] = amd_bytealign_S (w7[2], w7[3], offset); - c0[0] = amd_bytealign_S (w7[1], w7[2], offset); - w7[3] = amd_bytealign_S (w7[0], w7[1], offset); - w7[2] = amd_bytealign_S (w6[3], w7[0], offset); - w7[1] = amd_bytealign_S (w6[2], w6[3], offset); - w7[0] = amd_bytealign_S (w6[1], w6[2], offset); - w6[3] = amd_bytealign_S (w6[0], w6[1], offset); - w6[2] = amd_bytealign_S (w5[3], w6[0], offset); - w6[1] = amd_bytealign_S (w5[2], w5[3], offset); - w6[0] = amd_bytealign_S (w5[1], w5[2], offset); - w5[3] = amd_bytealign_S (w5[0], w5[1], offset); - w5[2] = amd_bytealign_S (w4[3], w5[0], offset); - w5[1] = amd_bytealign_S (w4[2], w4[3], offset); - w5[0] = amd_bytealign_S (w4[1], w4[2], offset); - w4[3] = amd_bytealign_S (w4[0], w4[1], offset); - w4[2] = amd_bytealign_S (w3[3], w4[0], offset); - w4[1] = amd_bytealign_S (w3[2], w3[3], offset); - w4[0] = amd_bytealign_S (w3[1], w3[2], offset); - w3[3] = amd_bytealign_S (w3[0], w3[1], offset); - w3[2] = amd_bytealign_S (w2[3], w3[0], offset); - w3[1] = amd_bytealign_S (w2[2], w2[3], offset); - w3[0] = amd_bytealign_S (w2[1], w2[2], offset); - w2[3] = amd_bytealign_S (w2[0], w2[1], offset); - w2[2] = amd_bytealign_S (w1[3], w2[0], offset); - w2[1] = amd_bytealign_S (w1[2], w1[3], offset); - w2[0] = amd_bytealign_S (w1[1], w1[2], offset); - w1[3] = amd_bytealign_S (w1[0], w1[1], offset); - w1[2] = amd_bytealign_S (w0[3], w1[0], offset); - w1[1] = amd_bytealign_S (w0[2], w0[3], offset); - w1[0] = amd_bytealign_S (w0[1], w0[2], offset); - w0[3] = amd_bytealign_S (w0[0], w0[1], offset); - w0[2] = amd_bytealign_S ( 0, w0[0], offset); + c0[2] = hc_bytealign_S (w7[3], 0, offset); + c0[1] = hc_bytealign_S (w7[2], w7[3], offset); + c0[0] = hc_bytealign_S (w7[1], w7[2], offset); + w7[3] = hc_bytealign_S (w7[0], w7[1], offset); + w7[2] = hc_bytealign_S (w6[3], w7[0], offset); + w7[1] = hc_bytealign_S (w6[2], w6[3], offset); + w7[0] = hc_bytealign_S (w6[1], w6[2], offset); + w6[3] = hc_bytealign_S (w6[0], w6[1], offset); + w6[2] = hc_bytealign_S (w5[3], w6[0], offset); + w6[1] = hc_bytealign_S (w5[2], w5[3], offset); + w6[0] = hc_bytealign_S (w5[1], w5[2], offset); + w5[3] = hc_bytealign_S (w5[0], w5[1], offset); + w5[2] = hc_bytealign_S (w4[3], w5[0], offset); + w5[1] = hc_bytealign_S (w4[2], w4[3], offset); + w5[0] = hc_bytealign_S (w4[1], w4[2], offset); + w4[3] = hc_bytealign_S (w4[0], w4[1], offset); + w4[2] = hc_bytealign_S (w3[3], w4[0], offset); + w4[1] = hc_bytealign_S (w3[2], w3[3], offset); + w4[0] = hc_bytealign_S (w3[1], w3[2], offset); + w3[3] = hc_bytealign_S (w3[0], w3[1], offset); + w3[2] = hc_bytealign_S (w2[3], w3[0], offset); + w3[1] = hc_bytealign_S (w2[2], w2[3], offset); + w3[0] = hc_bytealign_S (w2[1], w2[2], offset); + w2[3] = hc_bytealign_S (w2[0], w2[1], offset); + w2[2] = hc_bytealign_S (w1[3], w2[0], offset); + w2[1] = hc_bytealign_S (w1[2], w1[3], offset); + w2[0] = hc_bytealign_S (w1[1], w1[2], offset); + w1[3] = hc_bytealign_S (w1[0], w1[1], offset); + w1[2] = hc_bytealign_S (w0[3], w1[0], offset); + w1[1] = hc_bytealign_S (w0[2], w0[3], offset); + w1[0] = hc_bytealign_S (w0[1], w0[2], offset); + w0[3] = hc_bytealign_S (w0[0], w0[1], offset); + w0[2] = hc_bytealign_S ( 0, w0[0], offset); w0[1] = 0; w0[0] = 0; break; case 3: - c0[3] = amd_bytealign_S (w7[3], 0, offset); - c0[2] = amd_bytealign_S (w7[2], w7[3], offset); - c0[1] = amd_bytealign_S (w7[1], w7[2], offset); - c0[0] = amd_bytealign_S (w7[0], w7[1], offset); - w7[3] = amd_bytealign_S (w6[3], w7[0], offset); - w7[2] = amd_bytealign_S (w6[2], w6[3], offset); - w7[1] = amd_bytealign_S (w6[1], w6[2], offset); - w7[0] = amd_bytealign_S (w6[0], w6[1], offset); - w6[3] = amd_bytealign_S (w5[3], w6[0], offset); - w6[2] = amd_bytealign_S (w5[2], w5[3], offset); - w6[1] = amd_bytealign_S (w5[1], w5[2], offset); - w6[0] = amd_bytealign_S (w5[0], w5[1], offset); - w5[3] = amd_bytealign_S (w4[3], w5[0], offset); - w5[2] = amd_bytealign_S (w4[2], w4[3], offset); - w5[1] = amd_bytealign_S (w4[1], w4[2], offset); - w5[0] = amd_bytealign_S (w4[0], w4[1], offset); - w4[3] = amd_bytealign_S (w3[3], w4[0], offset); - w4[2] = amd_bytealign_S (w3[2], w3[3], offset); - w4[1] = amd_bytealign_S (w3[1], w3[2], offset); - w4[0] = amd_bytealign_S (w3[0], w3[1], offset); - w3[3] = amd_bytealign_S (w2[3], w3[0], offset); - w3[2] = amd_bytealign_S (w2[2], w2[3], offset); - w3[1] = amd_bytealign_S (w2[1], w2[2], offset); - w3[0] = amd_bytealign_S (w2[0], w2[1], offset); - w2[3] = amd_bytealign_S (w1[3], w2[0], offset); - w2[2] = amd_bytealign_S (w1[2], w1[3], offset); - w2[1] = amd_bytealign_S (w1[1], w1[2], offset); - w2[0] = amd_bytealign_S (w1[0], w1[1], offset); - w1[3] = amd_bytealign_S (w0[3], w1[0], offset); - w1[2] = amd_bytealign_S (w0[2], w0[3], offset); - w1[1] = amd_bytealign_S (w0[1], w0[2], offset); - w1[0] = amd_bytealign_S (w0[0], w0[1], offset); - w0[3] = amd_bytealign_S ( 0, w0[0], offset); + c0[3] = hc_bytealign_S (w7[3], 0, offset); + c0[2] = hc_bytealign_S (w7[2], w7[3], offset); + c0[1] = hc_bytealign_S (w7[1], w7[2], offset); + c0[0] = hc_bytealign_S (w7[0], w7[1], offset); + w7[3] = hc_bytealign_S (w6[3], w7[0], offset); + w7[2] = hc_bytealign_S (w6[2], w6[3], offset); + w7[1] = hc_bytealign_S (w6[1], w6[2], offset); + w7[0] = hc_bytealign_S (w6[0], w6[1], offset); + w6[3] = hc_bytealign_S (w5[3], w6[0], offset); + w6[2] = hc_bytealign_S (w5[2], w5[3], offset); + w6[1] = hc_bytealign_S (w5[1], w5[2], offset); + w6[0] = hc_bytealign_S (w5[0], w5[1], offset); + w5[3] = hc_bytealign_S (w4[3], w5[0], offset); + w5[2] = hc_bytealign_S (w4[2], w4[3], offset); + w5[1] = hc_bytealign_S (w4[1], w4[2], offset); + w5[0] = hc_bytealign_S (w4[0], w4[1], offset); + w4[3] = hc_bytealign_S (w3[3], w4[0], offset); + w4[2] = hc_bytealign_S (w3[2], w3[3], offset); + w4[1] = hc_bytealign_S (w3[1], w3[2], offset); + w4[0] = hc_bytealign_S (w3[0], w3[1], offset); + w3[3] = hc_bytealign_S (w2[3], w3[0], offset); + w3[2] = hc_bytealign_S (w2[2], w2[3], offset); + w3[1] = hc_bytealign_S (w2[1], w2[2], offset); + w3[0] = hc_bytealign_S (w2[0], w2[1], offset); + w2[3] = hc_bytealign_S (w1[3], w2[0], offset); + w2[2] = hc_bytealign_S (w1[2], w1[3], offset); + w2[1] = hc_bytealign_S (w1[1], w1[2], offset); + w2[0] = hc_bytealign_S (w1[0], w1[1], offset); + w1[3] = hc_bytealign_S (w0[3], w1[0], offset); + w1[2] = hc_bytealign_S (w0[2], w0[3], offset); + w1[1] = hc_bytealign_S (w0[1], w0[2], offset); + w1[0] = hc_bytealign_S (w0[0], w0[1], offset); + w0[3] = hc_bytealign_S ( 0, w0[0], offset); w0[2] = 0; w0[1] = 0; w0[0] = 0; @@ -39116,39 +39116,39 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be_S (u32 *w0, u32 *w1, u32 *w2, break; case 4: - c1[0] = amd_bytealign_S (w7[3], 0, offset); - c0[3] = amd_bytealign_S (w7[2], w7[3], offset); - c0[2] = amd_bytealign_S (w7[1], w7[2], offset); - c0[1] = amd_bytealign_S (w7[0], w7[1], offset); - c0[0] = amd_bytealign_S (w6[3], w7[0], offset); - w7[3] = amd_bytealign_S (w6[2], w6[3], offset); - w7[2] = amd_bytealign_S (w6[1], w6[2], offset); - w7[1] = amd_bytealign_S (w6[0], w6[1], offset); - w7[0] = amd_bytealign_S (w5[3], w6[0], offset); - w6[3] = amd_bytealign_S (w5[2], w5[3], offset); - w6[2] = amd_bytealign_S (w5[1], w5[2], offset); - w6[1] = amd_bytealign_S (w5[0], w5[1], offset); - w6[0] = amd_bytealign_S (w4[3], w5[0], offset); - w5[3] = amd_bytealign_S (w4[2], w4[3], offset); - w5[2] = amd_bytealign_S (w4[1], w4[2], offset); - w5[1] = amd_bytealign_S (w4[0], w4[1], offset); - w5[0] = amd_bytealign_S (w3[3], w4[0], offset); - w4[3] = amd_bytealign_S (w3[2], w3[3], offset); - w4[2] = amd_bytealign_S (w3[1], w3[2], offset); - w4[1] = amd_bytealign_S (w3[0], w3[1], offset); - w4[0] = amd_bytealign_S (w2[3], w3[0], offset); - w3[3] = amd_bytealign_S (w2[2], w2[3], offset); - w3[2] = amd_bytealign_S (w2[1], w2[2], offset); - w3[1] = amd_bytealign_S (w2[0], w2[1], offset); - w3[0] = amd_bytealign_S (w1[3], w2[0], offset); - w2[3] = amd_bytealign_S (w1[2], w1[3], offset); - w2[2] = amd_bytealign_S (w1[1], w1[2], offset); - w2[1] = amd_bytealign_S (w1[0], w1[1], offset); - w2[0] = amd_bytealign_S (w0[3], w1[0], offset); - w1[3] = amd_bytealign_S (w0[2], w0[3], offset); - w1[2] = amd_bytealign_S (w0[1], w0[2], offset); - w1[1] = amd_bytealign_S (w0[0], w0[1], offset); - w1[0] = amd_bytealign_S ( 0, w0[0], offset); + c1[0] = hc_bytealign_S (w7[3], 0, offset); + c0[3] = hc_bytealign_S (w7[2], w7[3], offset); + c0[2] = hc_bytealign_S (w7[1], w7[2], offset); + c0[1] = hc_bytealign_S (w7[0], w7[1], offset); + c0[0] = hc_bytealign_S (w6[3], w7[0], offset); + w7[3] = hc_bytealign_S (w6[2], w6[3], offset); + w7[2] = hc_bytealign_S (w6[1], w6[2], offset); + w7[1] = hc_bytealign_S (w6[0], w6[1], offset); + w7[0] = hc_bytealign_S (w5[3], w6[0], offset); + w6[3] = hc_bytealign_S (w5[2], w5[3], offset); + w6[2] = hc_bytealign_S (w5[1], w5[2], offset); + w6[1] = hc_bytealign_S (w5[0], w5[1], offset); + w6[0] = hc_bytealign_S (w4[3], w5[0], offset); + w5[3] = hc_bytealign_S (w4[2], w4[3], offset); + w5[2] = hc_bytealign_S (w4[1], w4[2], offset); + w5[1] = hc_bytealign_S (w4[0], w4[1], offset); + w5[0] = hc_bytealign_S (w3[3], w4[0], offset); + w4[3] = hc_bytealign_S (w3[2], w3[3], offset); + w4[2] = hc_bytealign_S (w3[1], w3[2], offset); + w4[1] = hc_bytealign_S (w3[0], w3[1], offset); + w4[0] = hc_bytealign_S (w2[3], w3[0], offset); + w3[3] = hc_bytealign_S (w2[2], w2[3], offset); + w3[2] = hc_bytealign_S (w2[1], w2[2], offset); + w3[1] = hc_bytealign_S (w2[0], w2[1], offset); + w3[0] = hc_bytealign_S (w1[3], w2[0], offset); + w2[3] = hc_bytealign_S (w1[2], w1[3], offset); + w2[2] = hc_bytealign_S (w1[1], w1[2], offset); + w2[1] = hc_bytealign_S (w1[0], w1[1], offset); + w2[0] = hc_bytealign_S (w0[3], w1[0], offset); + w1[3] = hc_bytealign_S (w0[2], w0[3], offset); + w1[2] = hc_bytealign_S (w0[1], w0[2], offset); + w1[1] = hc_bytealign_S (w0[0], w0[1], offset); + w1[0] = hc_bytealign_S ( 0, w0[0], offset); w0[3] = 0; w0[2] = 0; w0[1] = 0; @@ -39157,39 +39157,39 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be_S (u32 *w0, u32 *w1, u32 *w2, break; case 5: - c1[1] = amd_bytealign_S (w7[3], 0, offset); - c1[0] = amd_bytealign_S (w7[2], w7[3], offset); - c0[3] = amd_bytealign_S (w7[1], w7[2], offset); - c0[2] = amd_bytealign_S (w7[0], w7[1], offset); - c0[1] = amd_bytealign_S (w6[3], w7[0], offset); - c0[0] = amd_bytealign_S (w6[2], w6[3], offset); - w7[3] = amd_bytealign_S (w6[1], w6[2], offset); - w7[2] = amd_bytealign_S (w6[0], w6[1], offset); - w7[1] = amd_bytealign_S (w5[3], w6[0], offset); - w7[0] = amd_bytealign_S (w5[2], w5[3], offset); - w6[3] = amd_bytealign_S (w5[1], w5[2], offset); - w6[2] = amd_bytealign_S (w5[0], w5[1], offset); - w6[1] = amd_bytealign_S (w4[3], w5[0], offset); - w6[0] = amd_bytealign_S (w4[2], w4[3], offset); - w5[3] = amd_bytealign_S (w4[1], w4[2], offset); - w5[2] = amd_bytealign_S (w4[0], w4[1], offset); - w5[1] = amd_bytealign_S (w3[3], w4[0], offset); - w5[0] = amd_bytealign_S (w3[2], w3[3], offset); - w4[3] = amd_bytealign_S (w3[1], w3[2], offset); - w4[2] = amd_bytealign_S (w3[0], w3[1], offset); - w4[1] = amd_bytealign_S (w2[3], w3[0], offset); - w4[0] = amd_bytealign_S (w2[2], w2[3], offset); - w3[3] = amd_bytealign_S (w2[1], w2[2], offset); - w3[2] = amd_bytealign_S (w2[0], w2[1], offset); - w3[1] = amd_bytealign_S (w1[3], w2[0], offset); - w3[0] = amd_bytealign_S (w1[2], w1[3], offset); - w2[3] = amd_bytealign_S (w1[1], w1[2], offset); - w2[2] = amd_bytealign_S (w1[0], w1[1], offset); - w2[1] = amd_bytealign_S (w0[3], w1[0], offset); - w2[0] = amd_bytealign_S (w0[2], w0[3], offset); - w1[3] = amd_bytealign_S (w0[1], w0[2], offset); - w1[2] = amd_bytealign_S (w0[0], w0[1], offset); - w1[1] = amd_bytealign_S ( 0, w0[0], offset); + c1[1] = hc_bytealign_S (w7[3], 0, offset); + c1[0] = hc_bytealign_S (w7[2], w7[3], offset); + c0[3] = hc_bytealign_S (w7[1], w7[2], offset); + c0[2] = hc_bytealign_S (w7[0], w7[1], offset); + c0[1] = hc_bytealign_S (w6[3], w7[0], offset); + c0[0] = hc_bytealign_S (w6[2], w6[3], offset); + w7[3] = hc_bytealign_S (w6[1], w6[2], offset); + w7[2] = hc_bytealign_S (w6[0], w6[1], offset); + w7[1] = hc_bytealign_S (w5[3], w6[0], offset); + w7[0] = hc_bytealign_S (w5[2], w5[3], offset); + w6[3] = hc_bytealign_S (w5[1], w5[2], offset); + w6[2] = hc_bytealign_S (w5[0], w5[1], offset); + w6[1] = hc_bytealign_S (w4[3], w5[0], offset); + w6[0] = hc_bytealign_S (w4[2], w4[3], offset); + w5[3] = hc_bytealign_S (w4[1], w4[2], offset); + w5[2] = hc_bytealign_S (w4[0], w4[1], offset); + w5[1] = hc_bytealign_S (w3[3], w4[0], offset); + w5[0] = hc_bytealign_S (w3[2], w3[3], offset); + w4[3] = hc_bytealign_S (w3[1], w3[2], offset); + w4[2] = hc_bytealign_S (w3[0], w3[1], offset); + w4[1] = hc_bytealign_S (w2[3], w3[0], offset); + w4[0] = hc_bytealign_S (w2[2], w2[3], offset); + w3[3] = hc_bytealign_S (w2[1], w2[2], offset); + w3[2] = hc_bytealign_S (w2[0], w2[1], offset); + w3[1] = hc_bytealign_S (w1[3], w2[0], offset); + w3[0] = hc_bytealign_S (w1[2], w1[3], offset); + w2[3] = hc_bytealign_S (w1[1], w1[2], offset); + w2[2] = hc_bytealign_S (w1[0], w1[1], offset); + w2[1] = hc_bytealign_S (w0[3], w1[0], offset); + w2[0] = hc_bytealign_S (w0[2], w0[3], offset); + w1[3] = hc_bytealign_S (w0[1], w0[2], offset); + w1[2] = hc_bytealign_S (w0[0], w0[1], offset); + w1[1] = hc_bytealign_S ( 0, w0[0], offset); w1[0] = 0; w0[3] = 0; w0[2] = 0; @@ -39199,39 +39199,39 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be_S (u32 *w0, u32 *w1, u32 *w2, break; case 6: - c1[2] = amd_bytealign_S (w7[3], 0, offset); - c1[1] = amd_bytealign_S (w7[2], w7[3], offset); - c1[0] = amd_bytealign_S (w7[1], w7[2], offset); - c0[3] = amd_bytealign_S (w7[0], w7[1], offset); - c0[2] = amd_bytealign_S (w6[3], w7[0], offset); - c0[1] = amd_bytealign_S (w6[2], w6[3], offset); - c0[0] = amd_bytealign_S (w6[1], w6[2], offset); - w7[3] = amd_bytealign_S (w6[0], w6[1], offset); - w7[2] = amd_bytealign_S (w5[3], w6[0], offset); - w7[1] = amd_bytealign_S (w5[2], w5[3], offset); - w7[0] = amd_bytealign_S (w5[1], w5[2], offset); - w6[3] = amd_bytealign_S (w5[0], w5[1], offset); - w6[2] = amd_bytealign_S (w4[3], w5[0], offset); - w6[1] = amd_bytealign_S (w4[2], w4[3], offset); - w6[0] = amd_bytealign_S (w4[1], w4[2], offset); - w5[3] = amd_bytealign_S (w4[0], w4[1], offset); - w5[2] = amd_bytealign_S (w3[3], w4[0], offset); - w5[1] = amd_bytealign_S (w3[2], w3[3], offset); - w5[0] = amd_bytealign_S (w3[1], w3[2], offset); - w4[3] = amd_bytealign_S (w3[0], w3[1], offset); - w4[2] = amd_bytealign_S (w2[3], w3[0], offset); - w4[1] = amd_bytealign_S (w2[2], w2[3], offset); - w4[0] = amd_bytealign_S (w2[1], w2[2], offset); - w3[3] = amd_bytealign_S (w2[0], w2[1], offset); - w3[2] = amd_bytealign_S (w1[3], w2[0], offset); - w3[1] = amd_bytealign_S (w1[2], w1[3], offset); - w3[0] = amd_bytealign_S (w1[1], w1[2], offset); - w2[3] = amd_bytealign_S (w1[0], w1[1], offset); - w2[2] = amd_bytealign_S (w0[3], w1[0], offset); - w2[1] = amd_bytealign_S (w0[2], w0[3], offset); - w2[0] = amd_bytealign_S (w0[1], w0[2], offset); - w1[3] = amd_bytealign_S (w0[0], w0[1], offset); - w1[2] = amd_bytealign_S ( 0, w0[0], offset); + c1[2] = hc_bytealign_S (w7[3], 0, offset); + c1[1] = hc_bytealign_S (w7[2], w7[3], offset); + c1[0] = hc_bytealign_S (w7[1], w7[2], offset); + c0[3] = hc_bytealign_S (w7[0], w7[1], offset); + c0[2] = hc_bytealign_S (w6[3], w7[0], offset); + c0[1] = hc_bytealign_S (w6[2], w6[3], offset); + c0[0] = hc_bytealign_S (w6[1], w6[2], offset); + w7[3] = hc_bytealign_S (w6[0], w6[1], offset); + w7[2] = hc_bytealign_S (w5[3], w6[0], offset); + w7[1] = hc_bytealign_S (w5[2], w5[3], offset); + w7[0] = hc_bytealign_S (w5[1], w5[2], offset); + w6[3] = hc_bytealign_S (w5[0], w5[1], offset); + w6[2] = hc_bytealign_S (w4[3], w5[0], offset); + w6[1] = hc_bytealign_S (w4[2], w4[3], offset); + w6[0] = hc_bytealign_S (w4[1], w4[2], offset); + w5[3] = hc_bytealign_S (w4[0], w4[1], offset); + w5[2] = hc_bytealign_S (w3[3], w4[0], offset); + w5[1] = hc_bytealign_S (w3[2], w3[3], offset); + w5[0] = hc_bytealign_S (w3[1], w3[2], offset); + w4[3] = hc_bytealign_S (w3[0], w3[1], offset); + w4[2] = hc_bytealign_S (w2[3], w3[0], offset); + w4[1] = hc_bytealign_S (w2[2], w2[3], offset); + w4[0] = hc_bytealign_S (w2[1], w2[2], offset); + w3[3] = hc_bytealign_S (w2[0], w2[1], offset); + w3[2] = hc_bytealign_S (w1[3], w2[0], offset); + w3[1] = hc_bytealign_S (w1[2], w1[3], offset); + w3[0] = hc_bytealign_S (w1[1], w1[2], offset); + w2[3] = hc_bytealign_S (w1[0], w1[1], offset); + w2[2] = hc_bytealign_S (w0[3], w1[0], offset); + w2[1] = hc_bytealign_S (w0[2], w0[3], offset); + w2[0] = hc_bytealign_S (w0[1], w0[2], offset); + w1[3] = hc_bytealign_S (w0[0], w0[1], offset); + w1[2] = hc_bytealign_S ( 0, w0[0], offset); w1[1] = 0; w1[0] = 0; w0[3] = 0; @@ -39242,39 +39242,39 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be_S (u32 *w0, u32 *w1, u32 *w2, break; case 7: - c1[3] = amd_bytealign_S (w7[3], 0, offset); - c1[2] = amd_bytealign_S (w7[2], w7[3], offset); - c1[1] = amd_bytealign_S (w7[1], w7[2], offset); - c1[0] = amd_bytealign_S (w7[0], w7[1], offset); - c0[3] = amd_bytealign_S (w6[3], w7[0], offset); - c0[2] = amd_bytealign_S (w6[2], w6[3], offset); - c0[1] = amd_bytealign_S (w6[1], w6[2], offset); - c0[0] = amd_bytealign_S (w6[0], w6[1], offset); - w7[3] = amd_bytealign_S (w5[3], w6[0], offset); - w7[2] = amd_bytealign_S (w5[2], w5[3], offset); - w7[1] = amd_bytealign_S (w5[1], w5[2], offset); - w7[0] = amd_bytealign_S (w5[0], w5[1], offset); - w6[3] = amd_bytealign_S (w4[3], w5[0], offset); - w6[2] = amd_bytealign_S (w4[2], w4[3], offset); - w6[1] = amd_bytealign_S (w4[1], w4[2], offset); - w6[0] = amd_bytealign_S (w4[0], w4[1], offset); - w5[3] = amd_bytealign_S (w3[3], w4[0], offset); - w5[2] = amd_bytealign_S (w3[2], w3[3], offset); - w5[1] = amd_bytealign_S (w3[1], w3[2], offset); - w5[0] = amd_bytealign_S (w3[0], w3[1], offset); - w4[3] = amd_bytealign_S (w2[3], w3[0], offset); - w4[2] = amd_bytealign_S (w2[2], w2[3], offset); - w4[1] = amd_bytealign_S (w2[1], w2[2], offset); - w4[0] = amd_bytealign_S (w2[0], w2[1], offset); - w3[3] = amd_bytealign_S (w1[3], w2[0], offset); - w3[2] = amd_bytealign_S (w1[2], w1[3], offset); - w3[1] = amd_bytealign_S (w1[1], w1[2], offset); - w3[0] = amd_bytealign_S (w1[0], w1[1], offset); - w2[3] = amd_bytealign_S (w0[3], w1[0], offset); - w2[2] = amd_bytealign_S (w0[2], w0[3], offset); - w2[1] = amd_bytealign_S (w0[1], w0[2], offset); - w2[0] = amd_bytealign_S (w0[0], w0[1], offset); - w1[3] = amd_bytealign_S ( 0, w0[0], offset); + c1[3] = hc_bytealign_S (w7[3], 0, offset); + c1[2] = hc_bytealign_S (w7[2], w7[3], offset); + c1[1] = hc_bytealign_S (w7[1], w7[2], offset); + c1[0] = hc_bytealign_S (w7[0], w7[1], offset); + c0[3] = hc_bytealign_S (w6[3], w7[0], offset); + c0[2] = hc_bytealign_S (w6[2], w6[3], offset); + c0[1] = hc_bytealign_S (w6[1], w6[2], offset); + c0[0] = hc_bytealign_S (w6[0], w6[1], offset); + w7[3] = hc_bytealign_S (w5[3], w6[0], offset); + w7[2] = hc_bytealign_S (w5[2], w5[3], offset); + w7[1] = hc_bytealign_S (w5[1], w5[2], offset); + w7[0] = hc_bytealign_S (w5[0], w5[1], offset); + w6[3] = hc_bytealign_S (w4[3], w5[0], offset); + w6[2] = hc_bytealign_S (w4[2], w4[3], offset); + w6[1] = hc_bytealign_S (w4[1], w4[2], offset); + w6[0] = hc_bytealign_S (w4[0], w4[1], offset); + w5[3] = hc_bytealign_S (w3[3], w4[0], offset); + w5[2] = hc_bytealign_S (w3[2], w3[3], offset); + w5[1] = hc_bytealign_S (w3[1], w3[2], offset); + w5[0] = hc_bytealign_S (w3[0], w3[1], offset); + w4[3] = hc_bytealign_S (w2[3], w3[0], offset); + w4[2] = hc_bytealign_S (w2[2], w2[3], offset); + w4[1] = hc_bytealign_S (w2[1], w2[2], offset); + w4[0] = hc_bytealign_S (w2[0], w2[1], offset); + w3[3] = hc_bytealign_S (w1[3], w2[0], offset); + w3[2] = hc_bytealign_S (w1[2], w1[3], offset); + w3[1] = hc_bytealign_S (w1[1], w1[2], offset); + w3[0] = hc_bytealign_S (w1[0], w1[1], offset); + w2[3] = hc_bytealign_S (w0[3], w1[0], offset); + w2[2] = hc_bytealign_S (w0[2], w0[3], offset); + w2[1] = hc_bytealign_S (w0[1], w0[2], offset); + w2[0] = hc_bytealign_S (w0[0], w0[1], offset); + w1[3] = hc_bytealign_S ( 0, w0[0], offset); w1[2] = 0; w1[1] = 0; w1[0] = 0; @@ -39286,39 +39286,39 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be_S (u32 *w0, u32 *w1, u32 *w2, break; case 8: - c2[0] = amd_bytealign_S (w7[3], 0, offset); - c1[3] = amd_bytealign_S (w7[2], w7[3], offset); - c1[2] = amd_bytealign_S (w7[1], w7[2], offset); - c1[1] = amd_bytealign_S (w7[0], w7[1], offset); - c1[0] = amd_bytealign_S (w6[3], w7[0], offset); - c0[3] = amd_bytealign_S (w6[2], w6[3], offset); - c0[2] = amd_bytealign_S (w6[1], w6[2], offset); - c0[1] = amd_bytealign_S (w6[0], w6[1], offset); - c0[0] = amd_bytealign_S (w5[3], w6[0], offset); - w7[3] = amd_bytealign_S (w5[2], w5[3], offset); - w7[2] = amd_bytealign_S (w5[1], w5[2], offset); - w7[1] = amd_bytealign_S (w5[0], w5[1], offset); - w7[0] = amd_bytealign_S (w4[3], w5[0], offset); - w6[3] = amd_bytealign_S (w4[2], w4[3], offset); - w6[2] = amd_bytealign_S (w4[1], w4[2], offset); - w6[1] = amd_bytealign_S (w4[0], w4[1], offset); - w6[0] = amd_bytealign_S (w3[3], w4[0], offset); - w5[3] = amd_bytealign_S (w3[2], w3[3], offset); - w5[2] = amd_bytealign_S (w3[1], w3[2], offset); - w5[1] = amd_bytealign_S (w3[0], w3[1], offset); - w5[0] = amd_bytealign_S (w2[3], w3[0], offset); - w4[3] = amd_bytealign_S (w2[2], w2[3], offset); - w4[2] = amd_bytealign_S (w2[1], w2[2], offset); - w4[1] = amd_bytealign_S (w2[0], w2[1], offset); - w4[0] = amd_bytealign_S (w1[3], w2[0], offset); - w3[3] = amd_bytealign_S (w1[2], w1[3], offset); - w3[2] = amd_bytealign_S (w1[1], w1[2], offset); - w3[1] = amd_bytealign_S (w1[0], w1[1], offset); - w3[0] = amd_bytealign_S (w0[3], w1[0], offset); - w2[3] = amd_bytealign_S (w0[2], w0[3], offset); - w2[2] = amd_bytealign_S (w0[1], w0[2], offset); - w2[1] = amd_bytealign_S (w0[0], w0[1], offset); - w2[0] = amd_bytealign_S ( 0, w0[0], offset); + c2[0] = hc_bytealign_S (w7[3], 0, offset); + c1[3] = hc_bytealign_S (w7[2], w7[3], offset); + c1[2] = hc_bytealign_S (w7[1], w7[2], offset); + c1[1] = hc_bytealign_S (w7[0], w7[1], offset); + c1[0] = hc_bytealign_S (w6[3], w7[0], offset); + c0[3] = hc_bytealign_S (w6[2], w6[3], offset); + c0[2] = hc_bytealign_S (w6[1], w6[2], offset); + c0[1] = hc_bytealign_S (w6[0], w6[1], offset); + c0[0] = hc_bytealign_S (w5[3], w6[0], offset); + w7[3] = hc_bytealign_S (w5[2], w5[3], offset); + w7[2] = hc_bytealign_S (w5[1], w5[2], offset); + w7[1] = hc_bytealign_S (w5[0], w5[1], offset); + w7[0] = hc_bytealign_S (w4[3], w5[0], offset); + w6[3] = hc_bytealign_S (w4[2], w4[3], offset); + w6[2] = hc_bytealign_S (w4[1], w4[2], offset); + w6[1] = hc_bytealign_S (w4[0], w4[1], offset); + w6[0] = hc_bytealign_S (w3[3], w4[0], offset); + w5[3] = hc_bytealign_S (w3[2], w3[3], offset); + w5[2] = hc_bytealign_S (w3[1], w3[2], offset); + w5[1] = hc_bytealign_S (w3[0], w3[1], offset); + w5[0] = hc_bytealign_S (w2[3], w3[0], offset); + w4[3] = hc_bytealign_S (w2[2], w2[3], offset); + w4[2] = hc_bytealign_S (w2[1], w2[2], offset); + w4[1] = hc_bytealign_S (w2[0], w2[1], offset); + w4[0] = hc_bytealign_S (w1[3], w2[0], offset); + w3[3] = hc_bytealign_S (w1[2], w1[3], offset); + w3[2] = hc_bytealign_S (w1[1], w1[2], offset); + w3[1] = hc_bytealign_S (w1[0], w1[1], offset); + w3[0] = hc_bytealign_S (w0[3], w1[0], offset); + w2[3] = hc_bytealign_S (w0[2], w0[3], offset); + w2[2] = hc_bytealign_S (w0[1], w0[2], offset); + w2[1] = hc_bytealign_S (w0[0], w0[1], offset); + w2[0] = hc_bytealign_S ( 0, w0[0], offset); w1[3] = 0; w1[2] = 0; w1[1] = 0; @@ -39331,39 +39331,39 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be_S (u32 *w0, u32 *w1, u32 *w2, break; case 9: - c2[1] = amd_bytealign_S (w7[3], 0, offset); - c2[0] = amd_bytealign_S (w7[2], w7[3], offset); - c1[3] = amd_bytealign_S (w7[1], w7[2], offset); - c1[2] = amd_bytealign_S (w7[0], w7[1], offset); - c1[1] = amd_bytealign_S (w6[3], w7[0], offset); - c1[0] = amd_bytealign_S (w6[2], w6[3], offset); - c0[3] = amd_bytealign_S (w6[1], w6[2], offset); - c0[2] = amd_bytealign_S (w6[0], w6[1], offset); - c0[1] = amd_bytealign_S (w5[3], w6[0], offset); - c0[0] = amd_bytealign_S (w5[2], w5[3], offset); - w7[3] = amd_bytealign_S (w5[1], w5[2], offset); - w7[2] = amd_bytealign_S (w5[0], w5[1], offset); - w7[1] = amd_bytealign_S (w4[3], w5[0], offset); - w7[0] = amd_bytealign_S (w4[2], w4[3], offset); - w6[3] = amd_bytealign_S (w4[1], w4[2], offset); - w6[2] = amd_bytealign_S (w4[0], w4[1], offset); - w6[1] = amd_bytealign_S (w3[3], w4[0], offset); - w6[0] = amd_bytealign_S (w3[2], w3[3], offset); - w5[3] = amd_bytealign_S (w3[1], w3[2], offset); - w5[2] = amd_bytealign_S (w3[0], w3[1], offset); - w5[1] = amd_bytealign_S (w2[3], w3[0], offset); - w5[0] = amd_bytealign_S (w2[2], w2[3], offset); - w4[3] = amd_bytealign_S (w2[1], w2[2], offset); - w4[2] = amd_bytealign_S (w2[0], w2[1], offset); - w4[1] = amd_bytealign_S (w1[3], w2[0], offset); - w4[0] = amd_bytealign_S (w1[2], w1[3], offset); - w3[3] = amd_bytealign_S (w1[1], w1[2], offset); - w3[2] = amd_bytealign_S (w1[0], w1[1], offset); - w3[1] = amd_bytealign_S (w0[3], w1[0], offset); - w3[0] = amd_bytealign_S (w0[2], w0[3], offset); - w2[3] = amd_bytealign_S (w0[1], w0[2], offset); - w2[2] = amd_bytealign_S (w0[0], w0[1], offset); - w2[1] = amd_bytealign_S ( 0, w0[0], offset); + c2[1] = hc_bytealign_S (w7[3], 0, offset); + c2[0] = hc_bytealign_S (w7[2], w7[3], offset); + c1[3] = hc_bytealign_S (w7[1], w7[2], offset); + c1[2] = hc_bytealign_S (w7[0], w7[1], offset); + c1[1] = hc_bytealign_S (w6[3], w7[0], offset); + c1[0] = hc_bytealign_S (w6[2], w6[3], offset); + c0[3] = hc_bytealign_S (w6[1], w6[2], offset); + c0[2] = hc_bytealign_S (w6[0], w6[1], offset); + c0[1] = hc_bytealign_S (w5[3], w6[0], offset); + c0[0] = hc_bytealign_S (w5[2], w5[3], offset); + w7[3] = hc_bytealign_S (w5[1], w5[2], offset); + w7[2] = hc_bytealign_S (w5[0], w5[1], offset); + w7[1] = hc_bytealign_S (w4[3], w5[0], offset); + w7[0] = hc_bytealign_S (w4[2], w4[3], offset); + w6[3] = hc_bytealign_S (w4[1], w4[2], offset); + w6[2] = hc_bytealign_S (w4[0], w4[1], offset); + w6[1] = hc_bytealign_S (w3[3], w4[0], offset); + w6[0] = hc_bytealign_S (w3[2], w3[3], offset); + w5[3] = hc_bytealign_S (w3[1], w3[2], offset); + w5[2] = hc_bytealign_S (w3[0], w3[1], offset); + w5[1] = hc_bytealign_S (w2[3], w3[0], offset); + w5[0] = hc_bytealign_S (w2[2], w2[3], offset); + w4[3] = hc_bytealign_S (w2[1], w2[2], offset); + w4[2] = hc_bytealign_S (w2[0], w2[1], offset); + w4[1] = hc_bytealign_S (w1[3], w2[0], offset); + w4[0] = hc_bytealign_S (w1[2], w1[3], offset); + w3[3] = hc_bytealign_S (w1[1], w1[2], offset); + w3[2] = hc_bytealign_S (w1[0], w1[1], offset); + w3[1] = hc_bytealign_S (w0[3], w1[0], offset); + w3[0] = hc_bytealign_S (w0[2], w0[3], offset); + w2[3] = hc_bytealign_S (w0[1], w0[2], offset); + w2[2] = hc_bytealign_S (w0[0], w0[1], offset); + w2[1] = hc_bytealign_S ( 0, w0[0], offset); w2[0] = 0; w1[3] = 0; w1[2] = 0; @@ -39377,39 +39377,39 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be_S (u32 *w0, u32 *w1, u32 *w2, break; case 10: - c2[2] = amd_bytealign_S (w7[3], 0, offset); - c2[1] = amd_bytealign_S (w7[2], w7[3], offset); - c2[0] = amd_bytealign_S (w7[1], w7[2], offset); - c1[3] = amd_bytealign_S (w7[0], w7[1], offset); - c1[2] = amd_bytealign_S (w6[3], w7[0], offset); - c1[1] = amd_bytealign_S (w6[2], w6[3], offset); - c1[0] = amd_bytealign_S (w6[1], w6[2], offset); - c0[3] = amd_bytealign_S (w6[0], w6[1], offset); - c0[2] = amd_bytealign_S (w5[3], w6[0], offset); - c0[1] = amd_bytealign_S (w5[2], w5[3], offset); - c0[0] = amd_bytealign_S (w5[1], w5[2], offset); - w7[3] = amd_bytealign_S (w5[0], w5[1], offset); - w7[2] = amd_bytealign_S (w4[3], w5[0], offset); - w7[1] = amd_bytealign_S (w4[2], w4[3], offset); - w7[0] = amd_bytealign_S (w4[1], w4[2], offset); - w6[3] = amd_bytealign_S (w4[0], w4[1], offset); - w6[2] = amd_bytealign_S (w3[3], w4[0], offset); - w6[1] = amd_bytealign_S (w3[2], w3[3], offset); - w6[0] = amd_bytealign_S (w3[1], w3[2], offset); - w5[3] = amd_bytealign_S (w3[0], w3[1], offset); - w5[2] = amd_bytealign_S (w2[3], w3[0], offset); - w5[1] = amd_bytealign_S (w2[2], w2[3], offset); - w5[0] = amd_bytealign_S (w2[1], w2[2], offset); - w4[3] = amd_bytealign_S (w2[0], w2[1], offset); - w4[2] = amd_bytealign_S (w1[3], w2[0], offset); - w4[1] = amd_bytealign_S (w1[2], w1[3], offset); - w4[0] = amd_bytealign_S (w1[1], w1[2], offset); - w3[3] = amd_bytealign_S (w1[0], w1[1], offset); - w3[2] = amd_bytealign_S (w0[3], w1[0], offset); - w3[1] = amd_bytealign_S (w0[2], w0[3], offset); - w3[0] = amd_bytealign_S (w0[1], w0[2], offset); - w2[3] = amd_bytealign_S (w0[0], w0[1], offset); - w2[2] = amd_bytealign_S ( 0, w0[0], offset); + c2[2] = hc_bytealign_S (w7[3], 0, offset); + c2[1] = hc_bytealign_S (w7[2], w7[3], offset); + c2[0] = hc_bytealign_S (w7[1], w7[2], offset); + c1[3] = hc_bytealign_S (w7[0], w7[1], offset); + c1[2] = hc_bytealign_S (w6[3], w7[0], offset); + c1[1] = hc_bytealign_S (w6[2], w6[3], offset); + c1[0] = hc_bytealign_S (w6[1], w6[2], offset); + c0[3] = hc_bytealign_S (w6[0], w6[1], offset); + c0[2] = hc_bytealign_S (w5[3], w6[0], offset); + c0[1] = hc_bytealign_S (w5[2], w5[3], offset); + c0[0] = hc_bytealign_S (w5[1], w5[2], offset); + w7[3] = hc_bytealign_S (w5[0], w5[1], offset); + w7[2] = hc_bytealign_S (w4[3], w5[0], offset); + w7[1] = hc_bytealign_S (w4[2], w4[3], offset); + w7[0] = hc_bytealign_S (w4[1], w4[2], offset); + w6[3] = hc_bytealign_S (w4[0], w4[1], offset); + w6[2] = hc_bytealign_S (w3[3], w4[0], offset); + w6[1] = hc_bytealign_S (w3[2], w3[3], offset); + w6[0] = hc_bytealign_S (w3[1], w3[2], offset); + w5[3] = hc_bytealign_S (w3[0], w3[1], offset); + w5[2] = hc_bytealign_S (w2[3], w3[0], offset); + w5[1] = hc_bytealign_S (w2[2], w2[3], offset); + w5[0] = hc_bytealign_S (w2[1], w2[2], offset); + w4[3] = hc_bytealign_S (w2[0], w2[1], offset); + w4[2] = hc_bytealign_S (w1[3], w2[0], offset); + w4[1] = hc_bytealign_S (w1[2], w1[3], offset); + w4[0] = hc_bytealign_S (w1[1], w1[2], offset); + w3[3] = hc_bytealign_S (w1[0], w1[1], offset); + w3[2] = hc_bytealign_S (w0[3], w1[0], offset); + w3[1] = hc_bytealign_S (w0[2], w0[3], offset); + w3[0] = hc_bytealign_S (w0[1], w0[2], offset); + w2[3] = hc_bytealign_S (w0[0], w0[1], offset); + w2[2] = hc_bytealign_S ( 0, w0[0], offset); w2[1] = 0; w2[0] = 0; w1[3] = 0; @@ -39424,39 +39424,39 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be_S (u32 *w0, u32 *w1, u32 *w2, break; case 11: - c2[3] = amd_bytealign_S (w7[3], 0, offset); - c2[2] = amd_bytealign_S (w7[2], w7[3], offset); - c2[1] = amd_bytealign_S (w7[1], w7[2], offset); - c2[0] = amd_bytealign_S (w7[0], w7[1], offset); - c1[3] = amd_bytealign_S (w6[3], w7[0], offset); - c1[2] = amd_bytealign_S (w6[2], w6[3], offset); - c1[1] = amd_bytealign_S (w6[1], w6[2], offset); - c1[0] = amd_bytealign_S (w6[0], w6[1], offset); - c0[3] = amd_bytealign_S (w5[3], w6[0], offset); - c0[2] = amd_bytealign_S (w5[2], w5[3], offset); - c0[1] = amd_bytealign_S (w5[1], w5[2], offset); - c0[0] = amd_bytealign_S (w5[0], w5[1], offset); - w7[3] = amd_bytealign_S (w4[3], w5[0], offset); - w7[2] = amd_bytealign_S (w4[2], w4[3], offset); - w7[1] = amd_bytealign_S (w4[1], w4[2], offset); - w7[0] = amd_bytealign_S (w4[0], w4[1], offset); - w6[3] = amd_bytealign_S (w3[3], w4[0], offset); - w6[2] = amd_bytealign_S (w3[2], w3[3], offset); - w6[1] = amd_bytealign_S (w3[1], w3[2], offset); - w6[0] = amd_bytealign_S (w3[0], w3[1], offset); - w5[3] = amd_bytealign_S (w2[3], w3[0], offset); - w5[2] = amd_bytealign_S (w2[2], w2[3], offset); - w5[1] = amd_bytealign_S (w2[1], w2[2], offset); - w5[0] = amd_bytealign_S (w2[0], w2[1], offset); - w4[3] = amd_bytealign_S (w1[3], w2[0], offset); - w4[2] = amd_bytealign_S (w1[2], w1[3], offset); - w4[1] = amd_bytealign_S (w1[1], w1[2], offset); - w4[0] = amd_bytealign_S (w1[0], w1[1], offset); - w3[3] = amd_bytealign_S (w0[3], w1[0], offset); - w3[2] = amd_bytealign_S (w0[2], w0[3], offset); - w3[1] = amd_bytealign_S (w0[1], w0[2], offset); - w3[0] = amd_bytealign_S (w0[0], w0[1], offset); - w2[3] = amd_bytealign_S ( 0, w0[0], offset); + c2[3] = hc_bytealign_S (w7[3], 0, offset); + c2[2] = hc_bytealign_S (w7[2], w7[3], offset); + c2[1] = hc_bytealign_S (w7[1], w7[2], offset); + c2[0] = hc_bytealign_S (w7[0], w7[1], offset); + c1[3] = hc_bytealign_S (w6[3], w7[0], offset); + c1[2] = hc_bytealign_S (w6[2], w6[3], offset); + c1[1] = hc_bytealign_S (w6[1], w6[2], offset); + c1[0] = hc_bytealign_S (w6[0], w6[1], offset); + c0[3] = hc_bytealign_S (w5[3], w6[0], offset); + c0[2] = hc_bytealign_S (w5[2], w5[3], offset); + c0[1] = hc_bytealign_S (w5[1], w5[2], offset); + c0[0] = hc_bytealign_S (w5[0], w5[1], offset); + w7[3] = hc_bytealign_S (w4[3], w5[0], offset); + w7[2] = hc_bytealign_S (w4[2], w4[3], offset); + w7[1] = hc_bytealign_S (w4[1], w4[2], offset); + w7[0] = hc_bytealign_S (w4[0], w4[1], offset); + w6[3] = hc_bytealign_S (w3[3], w4[0], offset); + w6[2] = hc_bytealign_S (w3[2], w3[3], offset); + w6[1] = hc_bytealign_S (w3[1], w3[2], offset); + w6[0] = hc_bytealign_S (w3[0], w3[1], offset); + w5[3] = hc_bytealign_S (w2[3], w3[0], offset); + w5[2] = hc_bytealign_S (w2[2], w2[3], offset); + w5[1] = hc_bytealign_S (w2[1], w2[2], offset); + w5[0] = hc_bytealign_S (w2[0], w2[1], offset); + w4[3] = hc_bytealign_S (w1[3], w2[0], offset); + w4[2] = hc_bytealign_S (w1[2], w1[3], offset); + w4[1] = hc_bytealign_S (w1[1], w1[2], offset); + w4[0] = hc_bytealign_S (w1[0], w1[1], offset); + w3[3] = hc_bytealign_S (w0[3], w1[0], offset); + w3[2] = hc_bytealign_S (w0[2], w0[3], offset); + w3[1] = hc_bytealign_S (w0[1], w0[2], offset); + w3[0] = hc_bytealign_S (w0[0], w0[1], offset); + w2[3] = hc_bytealign_S ( 0, w0[0], offset); w2[2] = 0; w2[1] = 0; w2[0] = 0; @@ -39472,39 +39472,39 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be_S (u32 *w0, u32 *w1, u32 *w2, break; case 12: - c3[0] = amd_bytealign_S (w7[3], 0, offset); - c2[3] = amd_bytealign_S (w7[2], w7[3], offset); - c2[2] = amd_bytealign_S (w7[1], w7[2], offset); - c2[1] = amd_bytealign_S (w7[0], w7[1], offset); - c2[0] = amd_bytealign_S (w6[3], w7[0], offset); - c1[3] = amd_bytealign_S (w6[2], w6[3], offset); - c1[2] = amd_bytealign_S (w6[1], w6[2], offset); - c1[1] = amd_bytealign_S (w6[0], w6[1], offset); - c1[0] = amd_bytealign_S (w5[3], w6[0], offset); - c0[3] = amd_bytealign_S (w5[2], w5[3], offset); - c0[2] = amd_bytealign_S (w5[1], w5[2], offset); - c0[1] = amd_bytealign_S (w5[0], w5[1], offset); - c0[0] = amd_bytealign_S (w4[3], w5[0], offset); - w7[3] = amd_bytealign_S (w4[2], w4[3], offset); - w7[2] = amd_bytealign_S (w4[1], w4[2], offset); - w7[1] = amd_bytealign_S (w4[0], w4[1], offset); - w7[0] = amd_bytealign_S (w3[3], w4[0], offset); - w6[3] = amd_bytealign_S (w3[2], w3[3], offset); - w6[2] = amd_bytealign_S (w3[1], w3[2], offset); - w6[1] = amd_bytealign_S (w3[0], w3[1], offset); - w6[0] = amd_bytealign_S (w2[3], w3[0], offset); - w5[3] = amd_bytealign_S (w2[2], w2[3], offset); - w5[2] = amd_bytealign_S (w2[1], w2[2], offset); - w5[1] = amd_bytealign_S (w2[0], w2[1], offset); - w5[0] = amd_bytealign_S (w1[3], w2[0], offset); - w4[3] = amd_bytealign_S (w1[2], w1[3], offset); - w4[2] = amd_bytealign_S (w1[1], w1[2], offset); - w4[1] = amd_bytealign_S (w1[0], w1[1], offset); - w4[0] = amd_bytealign_S (w0[3], w1[0], offset); - w3[3] = amd_bytealign_S (w0[2], w0[3], offset); - w3[2] = amd_bytealign_S (w0[1], w0[2], offset); - w3[1] = amd_bytealign_S (w0[0], w0[1], offset); - w3[0] = amd_bytealign_S ( 0, w0[0], offset); + c3[0] = hc_bytealign_S (w7[3], 0, offset); + c2[3] = hc_bytealign_S (w7[2], w7[3], offset); + c2[2] = hc_bytealign_S (w7[1], w7[2], offset); + c2[1] = hc_bytealign_S (w7[0], w7[1], offset); + c2[0] = hc_bytealign_S (w6[3], w7[0], offset); + c1[3] = hc_bytealign_S (w6[2], w6[3], offset); + c1[2] = hc_bytealign_S (w6[1], w6[2], offset); + c1[1] = hc_bytealign_S (w6[0], w6[1], offset); + c1[0] = hc_bytealign_S (w5[3], w6[0], offset); + c0[3] = hc_bytealign_S (w5[2], w5[3], offset); + c0[2] = hc_bytealign_S (w5[1], w5[2], offset); + c0[1] = hc_bytealign_S (w5[0], w5[1], offset); + c0[0] = hc_bytealign_S (w4[3], w5[0], offset); + w7[3] = hc_bytealign_S (w4[2], w4[3], offset); + w7[2] = hc_bytealign_S (w4[1], w4[2], offset); + w7[1] = hc_bytealign_S (w4[0], w4[1], offset); + w7[0] = hc_bytealign_S (w3[3], w4[0], offset); + w6[3] = hc_bytealign_S (w3[2], w3[3], offset); + w6[2] = hc_bytealign_S (w3[1], w3[2], offset); + w6[1] = hc_bytealign_S (w3[0], w3[1], offset); + w6[0] = hc_bytealign_S (w2[3], w3[0], offset); + w5[3] = hc_bytealign_S (w2[2], w2[3], offset); + w5[2] = hc_bytealign_S (w2[1], w2[2], offset); + w5[1] = hc_bytealign_S (w2[0], w2[1], offset); + w5[0] = hc_bytealign_S (w1[3], w2[0], offset); + w4[3] = hc_bytealign_S (w1[2], w1[3], offset); + w4[2] = hc_bytealign_S (w1[1], w1[2], offset); + w4[1] = hc_bytealign_S (w1[0], w1[1], offset); + w4[0] = hc_bytealign_S (w0[3], w1[0], offset); + w3[3] = hc_bytealign_S (w0[2], w0[3], offset); + w3[2] = hc_bytealign_S (w0[1], w0[2], offset); + w3[1] = hc_bytealign_S (w0[0], w0[1], offset); + w3[0] = hc_bytealign_S ( 0, w0[0], offset); w2[3] = 0; w2[2] = 0; w2[1] = 0; @@ -39521,39 +39521,39 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be_S (u32 *w0, u32 *w1, u32 *w2, break; case 13: - c3[1] = amd_bytealign_S (w7[3], 0, offset); - c3[0] = amd_bytealign_S (w7[2], w7[3], offset); - c2[3] = amd_bytealign_S (w7[1], w7[2], offset); - c2[2] = amd_bytealign_S (w7[0], w7[1], offset); - c2[1] = amd_bytealign_S (w6[3], w7[0], offset); - c2[0] = amd_bytealign_S (w6[2], w6[3], offset); - c1[3] = amd_bytealign_S (w6[1], w6[2], offset); - c1[2] = amd_bytealign_S (w6[0], w6[1], offset); - c1[1] = amd_bytealign_S (w5[3], w6[0], offset); - c1[0] = amd_bytealign_S (w5[2], w5[3], offset); - c0[3] = amd_bytealign_S (w5[1], w5[2], offset); - c0[2] = amd_bytealign_S (w5[0], w5[1], offset); - c0[1] = amd_bytealign_S (w4[3], w5[0], offset); - c0[0] = amd_bytealign_S (w4[2], w4[3], offset); - w7[3] = amd_bytealign_S (w4[1], w4[2], offset); - w7[2] = amd_bytealign_S (w4[0], w4[1], offset); - w7[1] = amd_bytealign_S (w3[3], w4[0], offset); - w7[0] = amd_bytealign_S (w3[2], w3[3], offset); - w6[3] = amd_bytealign_S (w3[1], w3[2], offset); - w6[2] = amd_bytealign_S (w3[0], w3[1], offset); - w6[1] = amd_bytealign_S (w2[3], w3[0], offset); - w6[0] = amd_bytealign_S (w2[2], w2[3], offset); - w5[3] = amd_bytealign_S (w2[1], w2[2], offset); - w5[2] = amd_bytealign_S (w2[0], w2[1], offset); - w5[1] = amd_bytealign_S (w1[3], w2[0], offset); - w5[0] = amd_bytealign_S (w1[2], w1[3], offset); - w4[3] = amd_bytealign_S (w1[1], w1[2], offset); - w4[2] = amd_bytealign_S (w1[0], w1[1], offset); - w4[1] = amd_bytealign_S (w0[3], w1[0], offset); - w4[0] = amd_bytealign_S (w0[2], w0[3], offset); - w3[3] = amd_bytealign_S (w0[1], w0[2], offset); - w3[2] = amd_bytealign_S (w0[0], w0[1], offset); - w3[1] = amd_bytealign_S ( 0, w0[0], offset); + c3[1] = hc_bytealign_S (w7[3], 0, offset); + c3[0] = hc_bytealign_S (w7[2], w7[3], offset); + c2[3] = hc_bytealign_S (w7[1], w7[2], offset); + c2[2] = hc_bytealign_S (w7[0], w7[1], offset); + c2[1] = hc_bytealign_S (w6[3], w7[0], offset); + c2[0] = hc_bytealign_S (w6[2], w6[3], offset); + c1[3] = hc_bytealign_S (w6[1], w6[2], offset); + c1[2] = hc_bytealign_S (w6[0], w6[1], offset); + c1[1] = hc_bytealign_S (w5[3], w6[0], offset); + c1[0] = hc_bytealign_S (w5[2], w5[3], offset); + c0[3] = hc_bytealign_S (w5[1], w5[2], offset); + c0[2] = hc_bytealign_S (w5[0], w5[1], offset); + c0[1] = hc_bytealign_S (w4[3], w5[0], offset); + c0[0] = hc_bytealign_S (w4[2], w4[3], offset); + w7[3] = hc_bytealign_S (w4[1], w4[2], offset); + w7[2] = hc_bytealign_S (w4[0], w4[1], offset); + w7[1] = hc_bytealign_S (w3[3], w4[0], offset); + w7[0] = hc_bytealign_S (w3[2], w3[3], offset); + w6[3] = hc_bytealign_S (w3[1], w3[2], offset); + w6[2] = hc_bytealign_S (w3[0], w3[1], offset); + w6[1] = hc_bytealign_S (w2[3], w3[0], offset); + w6[0] = hc_bytealign_S (w2[2], w2[3], offset); + w5[3] = hc_bytealign_S (w2[1], w2[2], offset); + w5[2] = hc_bytealign_S (w2[0], w2[1], offset); + w5[1] = hc_bytealign_S (w1[3], w2[0], offset); + w5[0] = hc_bytealign_S (w1[2], w1[3], offset); + w4[3] = hc_bytealign_S (w1[1], w1[2], offset); + w4[2] = hc_bytealign_S (w1[0], w1[1], offset); + w4[1] = hc_bytealign_S (w0[3], w1[0], offset); + w4[0] = hc_bytealign_S (w0[2], w0[3], offset); + w3[3] = hc_bytealign_S (w0[1], w0[2], offset); + w3[2] = hc_bytealign_S (w0[0], w0[1], offset); + w3[1] = hc_bytealign_S ( 0, w0[0], offset); w3[0] = 0; w2[3] = 0; w2[2] = 0; @@ -39571,39 +39571,39 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be_S (u32 *w0, u32 *w1, u32 *w2, break; case 14: - c3[2] = amd_bytealign_S (w7[3], 0, offset); - c3[1] = amd_bytealign_S (w7[2], w7[3], offset); - c3[0] = amd_bytealign_S (w7[1], w7[2], offset); - c2[3] = amd_bytealign_S (w7[0], w7[1], offset); - c2[2] = amd_bytealign_S (w6[3], w7[0], offset); - c2[1] = amd_bytealign_S (w6[2], w6[3], offset); - c2[0] = amd_bytealign_S (w6[1], w6[2], offset); - c1[3] = amd_bytealign_S (w6[0], w6[1], offset); - c1[2] = amd_bytealign_S (w5[3], w6[0], offset); - c1[1] = amd_bytealign_S (w5[2], w5[3], offset); - c1[0] = amd_bytealign_S (w5[1], w5[2], offset); - c0[3] = amd_bytealign_S (w5[0], w5[1], offset); - c0[2] = amd_bytealign_S (w4[3], w5[0], offset); - c0[1] = amd_bytealign_S (w4[2], w4[3], offset); - c0[0] = amd_bytealign_S (w4[1], w4[2], offset); - w7[3] = amd_bytealign_S (w4[0], w4[1], offset); - w7[2] = amd_bytealign_S (w3[3], w4[0], offset); - w7[1] = amd_bytealign_S (w3[2], w3[3], offset); - w7[0] = amd_bytealign_S (w3[1], w3[2], offset); - w6[3] = amd_bytealign_S (w3[0], w3[1], offset); - w6[2] = amd_bytealign_S (w2[3], w3[0], offset); - w6[1] = amd_bytealign_S (w2[2], w2[3], offset); - w6[0] = amd_bytealign_S (w2[1], w2[2], offset); - w5[3] = amd_bytealign_S (w2[0], w2[1], offset); - w5[2] = amd_bytealign_S (w1[3], w2[0], offset); - w5[1] = amd_bytealign_S (w1[2], w1[3], offset); - w5[0] = amd_bytealign_S (w1[1], w1[2], offset); - w4[3] = amd_bytealign_S (w1[0], w1[1], offset); - w4[2] = amd_bytealign_S (w0[3], w1[0], offset); - w4[1] = amd_bytealign_S (w0[2], w0[3], offset); - w4[0] = amd_bytealign_S (w0[1], w0[2], offset); - w3[3] = amd_bytealign_S (w0[0], w0[1], offset); - w3[2] = amd_bytealign_S ( 0, w0[0], offset); + c3[2] = hc_bytealign_S (w7[3], 0, offset); + c3[1] = hc_bytealign_S (w7[2], w7[3], offset); + c3[0] = hc_bytealign_S (w7[1], w7[2], offset); + c2[3] = hc_bytealign_S (w7[0], w7[1], offset); + c2[2] = hc_bytealign_S (w6[3], w7[0], offset); + c2[1] = hc_bytealign_S (w6[2], w6[3], offset); + c2[0] = hc_bytealign_S (w6[1], w6[2], offset); + c1[3] = hc_bytealign_S (w6[0], w6[1], offset); + c1[2] = hc_bytealign_S (w5[3], w6[0], offset); + c1[1] = hc_bytealign_S (w5[2], w5[3], offset); + c1[0] = hc_bytealign_S (w5[1], w5[2], offset); + c0[3] = hc_bytealign_S (w5[0], w5[1], offset); + c0[2] = hc_bytealign_S (w4[3], w5[0], offset); + c0[1] = hc_bytealign_S (w4[2], w4[3], offset); + c0[0] = hc_bytealign_S (w4[1], w4[2], offset); + w7[3] = hc_bytealign_S (w4[0], w4[1], offset); + w7[2] = hc_bytealign_S (w3[3], w4[0], offset); + w7[1] = hc_bytealign_S (w3[2], w3[3], offset); + w7[0] = hc_bytealign_S (w3[1], w3[2], offset); + w6[3] = hc_bytealign_S (w3[0], w3[1], offset); + w6[2] = hc_bytealign_S (w2[3], w3[0], offset); + w6[1] = hc_bytealign_S (w2[2], w2[3], offset); + w6[0] = hc_bytealign_S (w2[1], w2[2], offset); + w5[3] = hc_bytealign_S (w2[0], w2[1], offset); + w5[2] = hc_bytealign_S (w1[3], w2[0], offset); + w5[1] = hc_bytealign_S (w1[2], w1[3], offset); + w5[0] = hc_bytealign_S (w1[1], w1[2], offset); + w4[3] = hc_bytealign_S (w1[0], w1[1], offset); + w4[2] = hc_bytealign_S (w0[3], w1[0], offset); + w4[1] = hc_bytealign_S (w0[2], w0[3], offset); + w4[0] = hc_bytealign_S (w0[1], w0[2], offset); + w3[3] = hc_bytealign_S (w0[0], w0[1], offset); + w3[2] = hc_bytealign_S ( 0, w0[0], offset); w3[1] = 0; w3[0] = 0; w2[3] = 0; @@ -39622,39 +39622,39 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be_S (u32 *w0, u32 *w1, u32 *w2, break; case 15: - c3[3] = amd_bytealign_S (w7[3], 0, offset); - c3[2] = amd_bytealign_S (w7[2], w7[3], offset); - c3[1] = amd_bytealign_S (w7[1], w7[2], offset); - c3[0] = amd_bytealign_S (w7[0], w7[1], offset); - c2[3] = amd_bytealign_S (w6[3], w7[0], offset); - c2[2] = amd_bytealign_S (w6[2], w6[3], offset); - c2[1] = amd_bytealign_S (w6[1], w6[2], offset); - c2[0] = amd_bytealign_S (w6[0], w6[1], offset); - c1[3] = amd_bytealign_S (w5[3], w6[0], offset); - c1[2] = amd_bytealign_S (w5[2], w5[3], offset); - c1[1] = amd_bytealign_S (w5[1], w5[2], offset); - c1[0] = amd_bytealign_S (w5[0], w5[1], offset); - c0[3] = amd_bytealign_S (w4[3], w5[0], offset); - c0[2] = amd_bytealign_S (w4[2], w4[3], offset); - c0[1] = amd_bytealign_S (w4[1], w4[2], offset); - c0[0] = amd_bytealign_S (w4[0], w4[1], offset); - w7[3] = amd_bytealign_S (w3[3], w4[0], offset); - w7[2] = amd_bytealign_S (w3[2], w3[3], offset); - w7[1] = amd_bytealign_S (w3[1], w3[2], offset); - w7[0] = amd_bytealign_S (w3[0], w3[1], offset); - w6[3] = amd_bytealign_S (w2[3], w3[0], offset); - w6[2] = amd_bytealign_S (w2[2], w2[3], offset); - w6[1] = amd_bytealign_S (w2[1], w2[2], offset); - w6[0] = amd_bytealign_S (w2[0], w2[1], offset); - w5[3] = amd_bytealign_S (w1[3], w2[0], offset); - w5[2] = amd_bytealign_S (w1[2], w1[3], offset); - w5[1] = amd_bytealign_S (w1[1], w1[2], offset); - w5[0] = amd_bytealign_S (w1[0], w1[1], offset); - w4[3] = amd_bytealign_S (w0[3], w1[0], offset); - w4[2] = amd_bytealign_S (w0[2], w0[3], offset); - w4[1] = amd_bytealign_S (w0[1], w0[2], offset); - w4[0] = amd_bytealign_S (w0[0], w0[1], offset); - w3[3] = amd_bytealign_S ( 0, w0[0], offset); + c3[3] = hc_bytealign_S (w7[3], 0, offset); + c3[2] = hc_bytealign_S (w7[2], w7[3], offset); + c3[1] = hc_bytealign_S (w7[1], w7[2], offset); + c3[0] = hc_bytealign_S (w7[0], w7[1], offset); + c2[3] = hc_bytealign_S (w6[3], w7[0], offset); + c2[2] = hc_bytealign_S (w6[2], w6[3], offset); + c2[1] = hc_bytealign_S (w6[1], w6[2], offset); + c2[0] = hc_bytealign_S (w6[0], w6[1], offset); + c1[3] = hc_bytealign_S (w5[3], w6[0], offset); + c1[2] = hc_bytealign_S (w5[2], w5[3], offset); + c1[1] = hc_bytealign_S (w5[1], w5[2], offset); + c1[0] = hc_bytealign_S (w5[0], w5[1], offset); + c0[3] = hc_bytealign_S (w4[3], w5[0], offset); + c0[2] = hc_bytealign_S (w4[2], w4[3], offset); + c0[1] = hc_bytealign_S (w4[1], w4[2], offset); + c0[0] = hc_bytealign_S (w4[0], w4[1], offset); + w7[3] = hc_bytealign_S (w3[3], w4[0], offset); + w7[2] = hc_bytealign_S (w3[2], w3[3], offset); + w7[1] = hc_bytealign_S (w3[1], w3[2], offset); + w7[0] = hc_bytealign_S (w3[0], w3[1], offset); + w6[3] = hc_bytealign_S (w2[3], w3[0], offset); + w6[2] = hc_bytealign_S (w2[2], w2[3], offset); + w6[1] = hc_bytealign_S (w2[1], w2[2], offset); + w6[0] = hc_bytealign_S (w2[0], w2[1], offset); + w5[3] = hc_bytealign_S (w1[3], w2[0], offset); + w5[2] = hc_bytealign_S (w1[2], w1[3], offset); + w5[1] = hc_bytealign_S (w1[1], w1[2], offset); + w5[0] = hc_bytealign_S (w1[0], w1[1], offset); + w4[3] = hc_bytealign_S (w0[3], w1[0], offset); + w4[2] = hc_bytealign_S (w0[2], w0[3], offset); + w4[1] = hc_bytealign_S (w0[1], w0[2], offset); + w4[0] = hc_bytealign_S (w0[0], w0[1], offset); + w3[3] = hc_bytealign_S ( 0, w0[0], offset); w3[2] = 0; w3[1] = 0; w3[0] = 0; @@ -39674,39 +39674,39 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be_S (u32 *w0, u32 *w1, u32 *w2, break; case 16: - c4[0] = amd_bytealign_S (w7[3], 0, offset); - c3[3] = amd_bytealign_S (w7[2], w7[3], offset); - c3[2] = amd_bytealign_S (w7[1], w7[2], offset); - c3[1] = amd_bytealign_S (w7[0], w7[1], offset); - c3[0] = amd_bytealign_S (w6[3], w7[0], offset); - c2[3] = amd_bytealign_S (w6[2], w6[3], offset); - c2[2] = amd_bytealign_S (w6[1], w6[2], offset); - c2[1] = amd_bytealign_S (w6[0], w6[1], offset); - c2[0] = amd_bytealign_S (w5[3], w6[0], offset); - c1[3] = amd_bytealign_S (w5[2], w5[3], offset); - c1[2] = amd_bytealign_S (w5[1], w5[2], offset); - c1[1] = amd_bytealign_S (w5[0], w5[1], offset); - c1[0] = amd_bytealign_S (w4[3], w5[0], offset); - c0[3] = amd_bytealign_S (w4[2], w4[3], offset); - c0[2] = amd_bytealign_S (w4[1], w4[2], offset); - c0[1] = amd_bytealign_S (w4[0], w4[1], offset); - c0[0] = amd_bytealign_S (w3[3], w4[0], offset); - w7[3] = amd_bytealign_S (w3[2], w3[3], offset); - w7[2] = amd_bytealign_S (w3[1], w3[2], offset); - w7[1] = amd_bytealign_S (w3[0], w3[1], offset); - w7[0] = amd_bytealign_S (w2[3], w3[0], offset); - w6[3] = amd_bytealign_S (w2[2], w2[3], offset); - w6[2] = amd_bytealign_S (w2[1], w2[2], offset); - w6[1] = amd_bytealign_S (w2[0], w2[1], offset); - w6[0] = amd_bytealign_S (w1[3], w2[0], offset); - w5[3] = amd_bytealign_S (w1[2], w1[3], offset); - w5[2] = amd_bytealign_S (w1[1], w1[2], offset); - w5[1] = amd_bytealign_S (w1[0], w1[1], offset); - w5[0] = amd_bytealign_S (w0[3], w1[0], offset); - w4[3] = amd_bytealign_S (w0[2], w0[3], offset); - w4[2] = amd_bytealign_S (w0[1], w0[2], offset); - w4[1] = amd_bytealign_S (w0[0], w0[1], offset); - w4[0] = amd_bytealign_S ( 0, w0[0], offset); + c4[0] = hc_bytealign_S (w7[3], 0, offset); + c3[3] = hc_bytealign_S (w7[2], w7[3], offset); + c3[2] = hc_bytealign_S (w7[1], w7[2], offset); + c3[1] = hc_bytealign_S (w7[0], w7[1], offset); + c3[0] = hc_bytealign_S (w6[3], w7[0], offset); + c2[3] = hc_bytealign_S (w6[2], w6[3], offset); + c2[2] = hc_bytealign_S (w6[1], w6[2], offset); + c2[1] = hc_bytealign_S (w6[0], w6[1], offset); + c2[0] = hc_bytealign_S (w5[3], w6[0], offset); + c1[3] = hc_bytealign_S (w5[2], w5[3], offset); + c1[2] = hc_bytealign_S (w5[1], w5[2], offset); + c1[1] = hc_bytealign_S (w5[0], w5[1], offset); + c1[0] = hc_bytealign_S (w4[3], w5[0], offset); + c0[3] = hc_bytealign_S (w4[2], w4[3], offset); + c0[2] = hc_bytealign_S (w4[1], w4[2], offset); + c0[1] = hc_bytealign_S (w4[0], w4[1], offset); + c0[0] = hc_bytealign_S (w3[3], w4[0], offset); + w7[3] = hc_bytealign_S (w3[2], w3[3], offset); + w7[2] = hc_bytealign_S (w3[1], w3[2], offset); + w7[1] = hc_bytealign_S (w3[0], w3[1], offset); + w7[0] = hc_bytealign_S (w2[3], w3[0], offset); + w6[3] = hc_bytealign_S (w2[2], w2[3], offset); + w6[2] = hc_bytealign_S (w2[1], w2[2], offset); + w6[1] = hc_bytealign_S (w2[0], w2[1], offset); + w6[0] = hc_bytealign_S (w1[3], w2[0], offset); + w5[3] = hc_bytealign_S (w1[2], w1[3], offset); + w5[2] = hc_bytealign_S (w1[1], w1[2], offset); + w5[1] = hc_bytealign_S (w1[0], w1[1], offset); + w5[0] = hc_bytealign_S (w0[3], w1[0], offset); + w4[3] = hc_bytealign_S (w0[2], w0[3], offset); + w4[2] = hc_bytealign_S (w0[1], w0[2], offset); + w4[1] = hc_bytealign_S (w0[0], w0[1], offset); + w4[0] = hc_bytealign_S ( 0, w0[0], offset); w3[3] = 0; w3[2] = 0; w3[1] = 0; @@ -39727,39 +39727,39 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be_S (u32 *w0, u32 *w1, u32 *w2, break; case 17: - c4[1] = amd_bytealign_S (w7[3], 0, offset); - c4[0] = amd_bytealign_S (w7[2], w7[3], offset); - c3[3] = amd_bytealign_S (w7[1], w7[2], offset); - c3[2] = amd_bytealign_S (w7[0], w7[1], offset); - c3[1] = amd_bytealign_S (w6[3], w7[0], offset); - c3[0] = amd_bytealign_S (w6[2], w6[3], offset); - c2[3] = amd_bytealign_S (w6[1], w6[2], offset); - c2[2] = amd_bytealign_S (w6[0], w6[1], offset); - c2[1] = amd_bytealign_S (w5[3], w6[0], offset); - c2[0] = amd_bytealign_S (w5[2], w5[3], offset); - c1[3] = amd_bytealign_S (w5[1], w5[2], offset); - c1[2] = amd_bytealign_S (w5[0], w5[1], offset); - c1[1] = amd_bytealign_S (w4[3], w5[0], offset); - c1[0] = amd_bytealign_S (w4[2], w4[3], offset); - c0[3] = amd_bytealign_S (w4[1], w4[2], offset); - c0[2] = amd_bytealign_S (w4[0], w4[1], offset); - c0[1] = amd_bytealign_S (w3[3], w4[0], offset); - c0[0] = amd_bytealign_S (w3[2], w3[3], offset); - w7[3] = amd_bytealign_S (w3[1], w3[2], offset); - w7[2] = amd_bytealign_S (w3[0], w3[1], offset); - w7[1] = amd_bytealign_S (w2[3], w3[0], offset); - w7[0] = amd_bytealign_S (w2[2], w2[3], offset); - w6[3] = amd_bytealign_S (w2[1], w2[2], offset); - w6[2] = amd_bytealign_S (w2[0], w2[1], offset); - w6[1] = amd_bytealign_S (w1[3], w2[0], offset); - w6[0] = amd_bytealign_S (w1[2], w1[3], offset); - w5[3] = amd_bytealign_S (w1[1], w1[2], offset); - w5[2] = amd_bytealign_S (w1[0], w1[1], offset); - w5[1] = amd_bytealign_S (w0[3], w1[0], offset); - w5[0] = amd_bytealign_S (w0[2], w0[3], offset); - w4[3] = amd_bytealign_S (w0[1], w0[2], offset); - w4[2] = amd_bytealign_S (w0[0], w0[1], offset); - w4[1] = amd_bytealign_S ( 0, w0[0], offset); + c4[1] = hc_bytealign_S (w7[3], 0, offset); + c4[0] = hc_bytealign_S (w7[2], w7[3], offset); + c3[3] = hc_bytealign_S (w7[1], w7[2], offset); + c3[2] = hc_bytealign_S (w7[0], w7[1], offset); + c3[1] = hc_bytealign_S (w6[3], w7[0], offset); + c3[0] = hc_bytealign_S (w6[2], w6[3], offset); + c2[3] = hc_bytealign_S (w6[1], w6[2], offset); + c2[2] = hc_bytealign_S (w6[0], w6[1], offset); + c2[1] = hc_bytealign_S (w5[3], w6[0], offset); + c2[0] = hc_bytealign_S (w5[2], w5[3], offset); + c1[3] = hc_bytealign_S (w5[1], w5[2], offset); + c1[2] = hc_bytealign_S (w5[0], w5[1], offset); + c1[1] = hc_bytealign_S (w4[3], w5[0], offset); + c1[0] = hc_bytealign_S (w4[2], w4[3], offset); + c0[3] = hc_bytealign_S (w4[1], w4[2], offset); + c0[2] = hc_bytealign_S (w4[0], w4[1], offset); + c0[1] = hc_bytealign_S (w3[3], w4[0], offset); + c0[0] = hc_bytealign_S (w3[2], w3[3], offset); + w7[3] = hc_bytealign_S (w3[1], w3[2], offset); + w7[2] = hc_bytealign_S (w3[0], w3[1], offset); + w7[1] = hc_bytealign_S (w2[3], w3[0], offset); + w7[0] = hc_bytealign_S (w2[2], w2[3], offset); + w6[3] = hc_bytealign_S (w2[1], w2[2], offset); + w6[2] = hc_bytealign_S (w2[0], w2[1], offset); + w6[1] = hc_bytealign_S (w1[3], w2[0], offset); + w6[0] = hc_bytealign_S (w1[2], w1[3], offset); + w5[3] = hc_bytealign_S (w1[1], w1[2], offset); + w5[2] = hc_bytealign_S (w1[0], w1[1], offset); + w5[1] = hc_bytealign_S (w0[3], w1[0], offset); + w5[0] = hc_bytealign_S (w0[2], w0[3], offset); + w4[3] = hc_bytealign_S (w0[1], w0[2], offset); + w4[2] = hc_bytealign_S (w0[0], w0[1], offset); + w4[1] = hc_bytealign_S ( 0, w0[0], offset); w4[0] = 0; w3[3] = 0; w3[2] = 0; @@ -39781,39 +39781,39 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be_S (u32 *w0, u32 *w1, u32 *w2, break; case 18: - c4[2] = amd_bytealign_S (w7[3], 0, offset); - c4[1] = amd_bytealign_S (w7[2], w7[3], offset); - c4[0] = amd_bytealign_S (w7[1], w7[2], offset); - c3[3] = amd_bytealign_S (w7[0], w7[1], offset); - c3[2] = amd_bytealign_S (w6[3], w7[0], offset); - c3[1] = amd_bytealign_S (w6[2], w6[3], offset); - c3[0] = amd_bytealign_S (w6[1], w6[2], offset); - c2[3] = amd_bytealign_S (w6[0], w6[1], offset); - c2[2] = amd_bytealign_S (w5[3], w6[0], offset); - c2[1] = amd_bytealign_S (w5[2], w5[3], offset); - c2[0] = amd_bytealign_S (w5[1], w5[2], offset); - c1[3] = amd_bytealign_S (w5[0], w5[1], offset); - c1[2] = amd_bytealign_S (w4[3], w5[0], offset); - c1[1] = amd_bytealign_S (w4[2], w4[3], offset); - c1[0] = amd_bytealign_S (w4[1], w4[2], offset); - c0[3] = amd_bytealign_S (w4[0], w4[1], offset); - c0[2] = amd_bytealign_S (w3[3], w4[0], offset); - c0[1] = amd_bytealign_S (w3[2], w3[3], offset); - c0[0] = amd_bytealign_S (w3[1], w3[2], offset); - w7[3] = amd_bytealign_S (w3[0], w3[1], offset); - w7[2] = amd_bytealign_S (w2[3], w3[0], offset); - w7[1] = amd_bytealign_S (w2[2], w2[3], offset); - w7[0] = amd_bytealign_S (w2[1], w2[2], offset); - w6[3] = amd_bytealign_S (w2[0], w2[1], offset); - w6[2] = amd_bytealign_S (w1[3], w2[0], offset); - w6[1] = amd_bytealign_S (w1[2], w1[3], offset); - w6[0] = amd_bytealign_S (w1[1], w1[2], offset); - w5[3] = amd_bytealign_S (w1[0], w1[1], offset); - w5[2] = amd_bytealign_S (w0[3], w1[0], offset); - w5[1] = amd_bytealign_S (w0[2], w0[3], offset); - w5[0] = amd_bytealign_S (w0[1], w0[2], offset); - w4[3] = amd_bytealign_S (w0[0], w0[1], offset); - w4[2] = amd_bytealign_S ( 0, w0[0], offset); + c4[2] = hc_bytealign_S (w7[3], 0, offset); + c4[1] = hc_bytealign_S (w7[2], w7[3], offset); + c4[0] = hc_bytealign_S (w7[1], w7[2], offset); + c3[3] = hc_bytealign_S (w7[0], w7[1], offset); + c3[2] = hc_bytealign_S (w6[3], w7[0], offset); + c3[1] = hc_bytealign_S (w6[2], w6[3], offset); + c3[0] = hc_bytealign_S (w6[1], w6[2], offset); + c2[3] = hc_bytealign_S (w6[0], w6[1], offset); + c2[2] = hc_bytealign_S (w5[3], w6[0], offset); + c2[1] = hc_bytealign_S (w5[2], w5[3], offset); + c2[0] = hc_bytealign_S (w5[1], w5[2], offset); + c1[3] = hc_bytealign_S (w5[0], w5[1], offset); + c1[2] = hc_bytealign_S (w4[3], w5[0], offset); + c1[1] = hc_bytealign_S (w4[2], w4[3], offset); + c1[0] = hc_bytealign_S (w4[1], w4[2], offset); + c0[3] = hc_bytealign_S (w4[0], w4[1], offset); + c0[2] = hc_bytealign_S (w3[3], w4[0], offset); + c0[1] = hc_bytealign_S (w3[2], w3[3], offset); + c0[0] = hc_bytealign_S (w3[1], w3[2], offset); + w7[3] = hc_bytealign_S (w3[0], w3[1], offset); + w7[2] = hc_bytealign_S (w2[3], w3[0], offset); + w7[1] = hc_bytealign_S (w2[2], w2[3], offset); + w7[0] = hc_bytealign_S (w2[1], w2[2], offset); + w6[3] = hc_bytealign_S (w2[0], w2[1], offset); + w6[2] = hc_bytealign_S (w1[3], w2[0], offset); + w6[1] = hc_bytealign_S (w1[2], w1[3], offset); + w6[0] = hc_bytealign_S (w1[1], w1[2], offset); + w5[3] = hc_bytealign_S (w1[0], w1[1], offset); + w5[2] = hc_bytealign_S (w0[3], w1[0], offset); + w5[1] = hc_bytealign_S (w0[2], w0[3], offset); + w5[0] = hc_bytealign_S (w0[1], w0[2], offset); + w4[3] = hc_bytealign_S (w0[0], w0[1], offset); + w4[2] = hc_bytealign_S ( 0, w0[0], offset); w4[1] = 0; w4[0] = 0; w3[3] = 0; @@ -39836,39 +39836,39 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be_S (u32 *w0, u32 *w1, u32 *w2, break; case 19: - c4[3] = amd_bytealign_S (w7[3], 0, offset); - c4[2] = amd_bytealign_S (w7[2], w7[3], offset); - c4[1] = amd_bytealign_S (w7[1], w7[2], offset); - c4[0] = amd_bytealign_S (w7[0], w7[1], offset); - c3[3] = amd_bytealign_S (w6[3], w7[0], offset); - c3[2] = amd_bytealign_S (w6[2], w6[3], offset); - c3[1] = amd_bytealign_S (w6[1], w6[2], offset); - c3[0] = amd_bytealign_S (w6[0], w6[1], offset); - c2[3] = amd_bytealign_S (w5[3], w6[0], offset); - c2[2] = amd_bytealign_S (w5[2], w5[3], offset); - c2[1] = amd_bytealign_S (w5[1], w5[2], offset); - c2[0] = amd_bytealign_S (w5[0], w5[1], offset); - c1[3] = amd_bytealign_S (w4[3], w5[0], offset); - c1[2] = amd_bytealign_S (w4[2], w4[3], offset); - c1[1] = amd_bytealign_S (w4[1], w4[2], offset); - c1[0] = amd_bytealign_S (w4[0], w4[1], offset); - c0[3] = amd_bytealign_S (w3[3], w4[0], offset); - c0[2] = amd_bytealign_S (w3[2], w3[3], offset); - c0[1] = amd_bytealign_S (w3[1], w3[2], offset); - c0[0] = amd_bytealign_S (w3[0], w3[1], offset); - w7[3] = amd_bytealign_S (w2[3], w3[0], offset); - w7[2] = amd_bytealign_S (w2[2], w2[3], offset); - w7[1] = amd_bytealign_S (w2[1], w2[2], offset); - w7[0] = amd_bytealign_S (w2[0], w2[1], offset); - w6[3] = amd_bytealign_S (w1[3], w2[0], offset); - w6[2] = amd_bytealign_S (w1[2], w1[3], offset); - w6[1] = amd_bytealign_S (w1[1], w1[2], offset); - w6[0] = amd_bytealign_S (w1[0], w1[1], offset); - w5[3] = amd_bytealign_S (w0[3], w1[0], offset); - w5[2] = amd_bytealign_S (w0[2], w0[3], offset); - w5[1] = amd_bytealign_S (w0[1], w0[2], offset); - w5[0] = amd_bytealign_S (w0[0], w0[1], offset); - w4[3] = amd_bytealign_S ( 0, w0[0], offset); + c4[3] = hc_bytealign_S (w7[3], 0, offset); + c4[2] = hc_bytealign_S (w7[2], w7[3], offset); + c4[1] = hc_bytealign_S (w7[1], w7[2], offset); + c4[0] = hc_bytealign_S (w7[0], w7[1], offset); + c3[3] = hc_bytealign_S (w6[3], w7[0], offset); + c3[2] = hc_bytealign_S (w6[2], w6[3], offset); + c3[1] = hc_bytealign_S (w6[1], w6[2], offset); + c3[0] = hc_bytealign_S (w6[0], w6[1], offset); + c2[3] = hc_bytealign_S (w5[3], w6[0], offset); + c2[2] = hc_bytealign_S (w5[2], w5[3], offset); + c2[1] = hc_bytealign_S (w5[1], w5[2], offset); + c2[0] = hc_bytealign_S (w5[0], w5[1], offset); + c1[3] = hc_bytealign_S (w4[3], w5[0], offset); + c1[2] = hc_bytealign_S (w4[2], w4[3], offset); + c1[1] = hc_bytealign_S (w4[1], w4[2], offset); + c1[0] = hc_bytealign_S (w4[0], w4[1], offset); + c0[3] = hc_bytealign_S (w3[3], w4[0], offset); + c0[2] = hc_bytealign_S (w3[2], w3[3], offset); + c0[1] = hc_bytealign_S (w3[1], w3[2], offset); + c0[0] = hc_bytealign_S (w3[0], w3[1], offset); + w7[3] = hc_bytealign_S (w2[3], w3[0], offset); + w7[2] = hc_bytealign_S (w2[2], w2[3], offset); + w7[1] = hc_bytealign_S (w2[1], w2[2], offset); + w7[0] = hc_bytealign_S (w2[0], w2[1], offset); + w6[3] = hc_bytealign_S (w1[3], w2[0], offset); + w6[2] = hc_bytealign_S (w1[2], w1[3], offset); + w6[1] = hc_bytealign_S (w1[1], w1[2], offset); + w6[0] = hc_bytealign_S (w1[0], w1[1], offset); + w5[3] = hc_bytealign_S (w0[3], w1[0], offset); + w5[2] = hc_bytealign_S (w0[2], w0[3], offset); + w5[1] = hc_bytealign_S (w0[1], w0[2], offset); + w5[0] = hc_bytealign_S (w0[0], w0[1], offset); + w4[3] = hc_bytealign_S ( 0, w0[0], offset); w4[2] = 0; w4[1] = 0; w4[0] = 0; @@ -39892,39 +39892,39 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be_S (u32 *w0, u32 *w1, u32 *w2, break; case 20: - c5[0] = amd_bytealign_S (w7[3], 0, offset); - c4[3] = amd_bytealign_S (w7[2], w7[3], offset); - c4[2] = amd_bytealign_S (w7[1], w7[2], offset); - c4[1] = amd_bytealign_S (w7[0], w7[1], offset); - c4[0] = amd_bytealign_S (w6[3], w7[0], offset); - c3[3] = amd_bytealign_S (w6[2], w6[3], offset); - c3[2] = amd_bytealign_S (w6[1], w6[2], offset); - c3[1] = amd_bytealign_S (w6[0], w6[1], offset); - c3[0] = amd_bytealign_S (w5[3], w6[0], offset); - c2[3] = amd_bytealign_S (w5[2], w5[3], offset); - c2[2] = amd_bytealign_S (w5[1], w5[2], offset); - c2[1] = amd_bytealign_S (w5[0], w5[1], offset); - c2[0] = amd_bytealign_S (w4[3], w5[0], offset); - c1[3] = amd_bytealign_S (w4[2], w4[3], offset); - c1[2] = amd_bytealign_S (w4[1], w4[2], offset); - c1[1] = amd_bytealign_S (w4[0], w4[1], offset); - c1[0] = amd_bytealign_S (w3[3], w4[0], offset); - c0[3] = amd_bytealign_S (w3[2], w3[3], offset); - c0[2] = amd_bytealign_S (w3[1], w3[2], offset); - c0[1] = amd_bytealign_S (w3[0], w3[1], offset); - c0[0] = amd_bytealign_S (w2[3], w3[0], offset); - w7[3] = amd_bytealign_S (w2[2], w2[3], offset); - w7[2] = amd_bytealign_S (w2[1], w2[2], offset); - w7[1] = amd_bytealign_S (w2[0], w2[1], offset); - w7[0] = amd_bytealign_S (w1[3], w2[0], offset); - w6[3] = amd_bytealign_S (w1[2], w1[3], offset); - w6[2] = amd_bytealign_S (w1[1], w1[2], offset); - w6[1] = amd_bytealign_S (w1[0], w1[1], offset); - w6[0] = amd_bytealign_S (w0[3], w1[0], offset); - w5[3] = amd_bytealign_S (w0[2], w0[3], offset); - w5[2] = amd_bytealign_S (w0[1], w0[2], offset); - w5[1] = amd_bytealign_S (w0[0], w0[1], offset); - w5[0] = amd_bytealign_S ( 0, w0[0], offset); + c5[0] = hc_bytealign_S (w7[3], 0, offset); + c4[3] = hc_bytealign_S (w7[2], w7[3], offset); + c4[2] = hc_bytealign_S (w7[1], w7[2], offset); + c4[1] = hc_bytealign_S (w7[0], w7[1], offset); + c4[0] = hc_bytealign_S (w6[3], w7[0], offset); + c3[3] = hc_bytealign_S (w6[2], w6[3], offset); + c3[2] = hc_bytealign_S (w6[1], w6[2], offset); + c3[1] = hc_bytealign_S (w6[0], w6[1], offset); + c3[0] = hc_bytealign_S (w5[3], w6[0], offset); + c2[3] = hc_bytealign_S (w5[2], w5[3], offset); + c2[2] = hc_bytealign_S (w5[1], w5[2], offset); + c2[1] = hc_bytealign_S (w5[0], w5[1], offset); + c2[0] = hc_bytealign_S (w4[3], w5[0], offset); + c1[3] = hc_bytealign_S (w4[2], w4[3], offset); + c1[2] = hc_bytealign_S (w4[1], w4[2], offset); + c1[1] = hc_bytealign_S (w4[0], w4[1], offset); + c1[0] = hc_bytealign_S (w3[3], w4[0], offset); + c0[3] = hc_bytealign_S (w3[2], w3[3], offset); + c0[2] = hc_bytealign_S (w3[1], w3[2], offset); + c0[1] = hc_bytealign_S (w3[0], w3[1], offset); + c0[0] = hc_bytealign_S (w2[3], w3[0], offset); + w7[3] = hc_bytealign_S (w2[2], w2[3], offset); + w7[2] = hc_bytealign_S (w2[1], w2[2], offset); + w7[1] = hc_bytealign_S (w2[0], w2[1], offset); + w7[0] = hc_bytealign_S (w1[3], w2[0], offset); + w6[3] = hc_bytealign_S (w1[2], w1[3], offset); + w6[2] = hc_bytealign_S (w1[1], w1[2], offset); + w6[1] = hc_bytealign_S (w1[0], w1[1], offset); + w6[0] = hc_bytealign_S (w0[3], w1[0], offset); + w5[3] = hc_bytealign_S (w0[2], w0[3], offset); + w5[2] = hc_bytealign_S (w0[1], w0[2], offset); + w5[1] = hc_bytealign_S (w0[0], w0[1], offset); + w5[0] = hc_bytealign_S ( 0, w0[0], offset); w4[3] = 0; w4[2] = 0; w4[1] = 0; @@ -39949,39 +39949,39 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be_S (u32 *w0, u32 *w1, u32 *w2, break; case 21: - c5[1] = amd_bytealign_S (w7[3], 0, offset); - c5[0] = amd_bytealign_S (w7[2], w7[3], offset); - c4[3] = amd_bytealign_S (w7[1], w7[2], offset); - c4[2] = amd_bytealign_S (w7[0], w7[1], offset); - c4[1] = amd_bytealign_S (w6[3], w7[0], offset); - c4[0] = amd_bytealign_S (w6[2], w6[3], offset); - c3[3] = amd_bytealign_S (w6[1], w6[2], offset); - c3[2] = amd_bytealign_S (w6[0], w6[1], offset); - c3[1] = amd_bytealign_S (w5[3], w6[0], offset); - c3[0] = amd_bytealign_S (w5[2], w5[3], offset); - c2[3] = amd_bytealign_S (w5[1], w5[2], offset); - c2[2] = amd_bytealign_S (w5[0], w5[1], offset); - c2[1] = amd_bytealign_S (w4[3], w5[0], offset); - c2[0] = amd_bytealign_S (w4[2], w4[3], offset); - c1[3] = amd_bytealign_S (w4[1], w4[2], offset); - c1[2] = amd_bytealign_S (w4[0], w4[1], offset); - c1[1] = amd_bytealign_S (w3[3], w4[0], offset); - c1[0] = amd_bytealign_S (w3[2], w3[3], offset); - c0[3] = amd_bytealign_S (w3[1], w3[2], offset); - c0[2] = amd_bytealign_S (w3[0], w3[1], offset); - c0[1] = amd_bytealign_S (w2[3], w3[0], offset); - c0[0] = amd_bytealign_S (w2[2], w2[3], offset); - w7[3] = amd_bytealign_S (w2[1], w2[2], offset); - w7[2] = amd_bytealign_S (w2[0], w2[1], offset); - w7[1] = amd_bytealign_S (w1[3], w2[0], offset); - w7[0] = amd_bytealign_S (w1[2], w1[3], offset); - w6[3] = amd_bytealign_S (w1[1], w1[2], offset); - w6[2] = amd_bytealign_S (w1[0], w1[1], offset); - w6[1] = amd_bytealign_S (w0[3], w1[0], offset); - w6[0] = amd_bytealign_S (w0[2], w0[3], offset); - w5[3] = amd_bytealign_S (w0[1], w0[2], offset); - w5[2] = amd_bytealign_S (w0[0], w0[1], offset); - w5[1] = amd_bytealign_S ( 0, w0[0], offset); + c5[1] = hc_bytealign_S (w7[3], 0, offset); + c5[0] = hc_bytealign_S (w7[2], w7[3], offset); + c4[3] = hc_bytealign_S (w7[1], w7[2], offset); + c4[2] = hc_bytealign_S (w7[0], w7[1], offset); + c4[1] = hc_bytealign_S (w6[3], w7[0], offset); + c4[0] = hc_bytealign_S (w6[2], w6[3], offset); + c3[3] = hc_bytealign_S (w6[1], w6[2], offset); + c3[2] = hc_bytealign_S (w6[0], w6[1], offset); + c3[1] = hc_bytealign_S (w5[3], w6[0], offset); + c3[0] = hc_bytealign_S (w5[2], w5[3], offset); + c2[3] = hc_bytealign_S (w5[1], w5[2], offset); + c2[2] = hc_bytealign_S (w5[0], w5[1], offset); + c2[1] = hc_bytealign_S (w4[3], w5[0], offset); + c2[0] = hc_bytealign_S (w4[2], w4[3], offset); + c1[3] = hc_bytealign_S (w4[1], w4[2], offset); + c1[2] = hc_bytealign_S (w4[0], w4[1], offset); + c1[1] = hc_bytealign_S (w3[3], w4[0], offset); + c1[0] = hc_bytealign_S (w3[2], w3[3], offset); + c0[3] = hc_bytealign_S (w3[1], w3[2], offset); + c0[2] = hc_bytealign_S (w3[0], w3[1], offset); + c0[1] = hc_bytealign_S (w2[3], w3[0], offset); + c0[0] = hc_bytealign_S (w2[2], w2[3], offset); + w7[3] = hc_bytealign_S (w2[1], w2[2], offset); + w7[2] = hc_bytealign_S (w2[0], w2[1], offset); + w7[1] = hc_bytealign_S (w1[3], w2[0], offset); + w7[0] = hc_bytealign_S (w1[2], w1[3], offset); + w6[3] = hc_bytealign_S (w1[1], w1[2], offset); + w6[2] = hc_bytealign_S (w1[0], w1[1], offset); + w6[1] = hc_bytealign_S (w0[3], w1[0], offset); + w6[0] = hc_bytealign_S (w0[2], w0[3], offset); + w5[3] = hc_bytealign_S (w0[1], w0[2], offset); + w5[2] = hc_bytealign_S (w0[0], w0[1], offset); + w5[1] = hc_bytealign_S ( 0, w0[0], offset); w5[0] = 0; w4[3] = 0; w4[2] = 0; @@ -40007,39 +40007,39 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be_S (u32 *w0, u32 *w1, u32 *w2, break; case 22: - c5[2] = amd_bytealign_S (w7[3], 0, offset); - c5[1] = amd_bytealign_S (w7[2], w7[3], offset); - c5[0] = amd_bytealign_S (w7[1], w7[2], offset); - c4[3] = amd_bytealign_S (w7[0], w7[1], offset); - c4[2] = amd_bytealign_S (w6[3], w7[0], offset); - c4[1] = amd_bytealign_S (w6[2], w6[3], offset); - c4[0] = amd_bytealign_S (w6[1], w6[2], offset); - c3[3] = amd_bytealign_S (w6[0], w6[1], offset); - c3[2] = amd_bytealign_S (w5[3], w6[0], offset); - c3[1] = amd_bytealign_S (w5[2], w5[3], offset); - c3[0] = amd_bytealign_S (w5[1], w5[2], offset); - c2[3] = amd_bytealign_S (w5[0], w5[1], offset); - c2[2] = amd_bytealign_S (w4[3], w5[0], offset); - c2[1] = amd_bytealign_S (w4[2], w4[3], offset); - c2[0] = amd_bytealign_S (w4[1], w4[2], offset); - c1[3] = amd_bytealign_S (w4[0], w4[1], offset); - c1[2] = amd_bytealign_S (w3[3], w4[0], offset); - c1[1] = amd_bytealign_S (w3[2], w3[3], offset); - c1[0] = amd_bytealign_S (w3[1], w3[2], offset); - c0[3] = amd_bytealign_S (w3[0], w3[1], offset); - c0[2] = amd_bytealign_S (w2[3], w3[0], offset); - c0[1] = amd_bytealign_S (w2[2], w2[3], offset); - c0[0] = amd_bytealign_S (w2[1], w2[2], offset); - w7[3] = amd_bytealign_S (w2[0], w2[1], offset); - w7[2] = amd_bytealign_S (w1[3], w2[0], offset); - w7[1] = amd_bytealign_S (w1[2], w1[3], offset); - w7[0] = amd_bytealign_S (w1[1], w1[2], offset); - w6[3] = amd_bytealign_S (w1[0], w1[1], offset); - w6[2] = amd_bytealign_S (w0[3], w1[0], offset); - w6[1] = amd_bytealign_S (w0[2], w0[3], offset); - w6[0] = amd_bytealign_S (w0[1], w0[2], offset); - w5[3] = amd_bytealign_S (w0[0], w0[1], offset); - w5[2] = amd_bytealign_S ( 0, w0[0], offset); + c5[2] = hc_bytealign_S (w7[3], 0, offset); + c5[1] = hc_bytealign_S (w7[2], w7[3], offset); + c5[0] = hc_bytealign_S (w7[1], w7[2], offset); + c4[3] = hc_bytealign_S (w7[0], w7[1], offset); + c4[2] = hc_bytealign_S (w6[3], w7[0], offset); + c4[1] = hc_bytealign_S (w6[2], w6[3], offset); + c4[0] = hc_bytealign_S (w6[1], w6[2], offset); + c3[3] = hc_bytealign_S (w6[0], w6[1], offset); + c3[2] = hc_bytealign_S (w5[3], w6[0], offset); + c3[1] = hc_bytealign_S (w5[2], w5[3], offset); + c3[0] = hc_bytealign_S (w5[1], w5[2], offset); + c2[3] = hc_bytealign_S (w5[0], w5[1], offset); + c2[2] = hc_bytealign_S (w4[3], w5[0], offset); + c2[1] = hc_bytealign_S (w4[2], w4[3], offset); + c2[0] = hc_bytealign_S (w4[1], w4[2], offset); + c1[3] = hc_bytealign_S (w4[0], w4[1], offset); + c1[2] = hc_bytealign_S (w3[3], w4[0], offset); + c1[1] = hc_bytealign_S (w3[2], w3[3], offset); + c1[0] = hc_bytealign_S (w3[1], w3[2], offset); + c0[3] = hc_bytealign_S (w3[0], w3[1], offset); + c0[2] = hc_bytealign_S (w2[3], w3[0], offset); + c0[1] = hc_bytealign_S (w2[2], w2[3], offset); + c0[0] = hc_bytealign_S (w2[1], w2[2], offset); + w7[3] = hc_bytealign_S (w2[0], w2[1], offset); + w7[2] = hc_bytealign_S (w1[3], w2[0], offset); + w7[1] = hc_bytealign_S (w1[2], w1[3], offset); + w7[0] = hc_bytealign_S (w1[1], w1[2], offset); + w6[3] = hc_bytealign_S (w1[0], w1[1], offset); + w6[2] = hc_bytealign_S (w0[3], w1[0], offset); + w6[1] = hc_bytealign_S (w0[2], w0[3], offset); + w6[0] = hc_bytealign_S (w0[1], w0[2], offset); + w5[3] = hc_bytealign_S (w0[0], w0[1], offset); + w5[2] = hc_bytealign_S ( 0, w0[0], offset); w5[1] = 0; w5[0] = 0; w4[3] = 0; @@ -40066,39 +40066,39 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be_S (u32 *w0, u32 *w1, u32 *w2, break; case 23: - c5[3] = amd_bytealign_S (w7[3], 0, offset); - c5[2] = amd_bytealign_S (w7[2], w7[3], offset); - c5[1] = amd_bytealign_S (w7[1], w7[2], offset); - c5[0] = amd_bytealign_S (w7[0], w7[1], offset); - c4[3] = amd_bytealign_S (w6[3], w7[0], offset); - c4[2] = amd_bytealign_S (w6[2], w6[3], offset); - c4[1] = amd_bytealign_S (w6[1], w6[2], offset); - c4[0] = amd_bytealign_S (w6[0], w6[1], offset); - c3[3] = amd_bytealign_S (w5[3], w6[0], offset); - c3[2] = amd_bytealign_S (w5[2], w5[3], offset); - c3[1] = amd_bytealign_S (w5[1], w5[2], offset); - c3[0] = amd_bytealign_S (w5[0], w5[1], offset); - c2[3] = amd_bytealign_S (w4[3], w5[0], offset); - c2[2] = amd_bytealign_S (w4[2], w4[3], offset); - c2[1] = amd_bytealign_S (w4[1], w4[2], offset); - c2[0] = amd_bytealign_S (w4[0], w4[1], offset); - c1[3] = amd_bytealign_S (w3[3], w4[0], offset); - c1[2] = amd_bytealign_S (w3[2], w3[3], offset); - c1[1] = amd_bytealign_S (w3[1], w3[2], offset); - c1[0] = amd_bytealign_S (w3[0], w3[1], offset); - c0[3] = amd_bytealign_S (w2[3], w3[0], offset); - c0[2] = amd_bytealign_S (w2[2], w2[3], offset); - c0[1] = amd_bytealign_S (w2[1], w2[2], offset); - c0[0] = amd_bytealign_S (w2[0], w2[1], offset); - w7[3] = amd_bytealign_S (w1[3], w2[0], offset); - w7[2] = amd_bytealign_S (w1[2], w1[3], offset); - w7[1] = amd_bytealign_S (w1[1], w1[2], offset); - w7[0] = amd_bytealign_S (w1[0], w1[1], offset); - w6[3] = amd_bytealign_S (w0[3], w1[0], offset); - w6[2] = amd_bytealign_S (w0[2], w0[3], offset); - w6[1] = amd_bytealign_S (w0[1], w0[2], offset); - w6[0] = amd_bytealign_S (w0[0], w0[1], offset); - w5[3] = amd_bytealign_S ( 0, w0[0], offset); + c5[3] = hc_bytealign_S (w7[3], 0, offset); + c5[2] = hc_bytealign_S (w7[2], w7[3], offset); + c5[1] = hc_bytealign_S (w7[1], w7[2], offset); + c5[0] = hc_bytealign_S (w7[0], w7[1], offset); + c4[3] = hc_bytealign_S (w6[3], w7[0], offset); + c4[2] = hc_bytealign_S (w6[2], w6[3], offset); + c4[1] = hc_bytealign_S (w6[1], w6[2], offset); + c4[0] = hc_bytealign_S (w6[0], w6[1], offset); + c3[3] = hc_bytealign_S (w5[3], w6[0], offset); + c3[2] = hc_bytealign_S (w5[2], w5[3], offset); + c3[1] = hc_bytealign_S (w5[1], w5[2], offset); + c3[0] = hc_bytealign_S (w5[0], w5[1], offset); + c2[3] = hc_bytealign_S (w4[3], w5[0], offset); + c2[2] = hc_bytealign_S (w4[2], w4[3], offset); + c2[1] = hc_bytealign_S (w4[1], w4[2], offset); + c2[0] = hc_bytealign_S (w4[0], w4[1], offset); + c1[3] = hc_bytealign_S (w3[3], w4[0], offset); + c1[2] = hc_bytealign_S (w3[2], w3[3], offset); + c1[1] = hc_bytealign_S (w3[1], w3[2], offset); + c1[0] = hc_bytealign_S (w3[0], w3[1], offset); + c0[3] = hc_bytealign_S (w2[3], w3[0], offset); + c0[2] = hc_bytealign_S (w2[2], w2[3], offset); + c0[1] = hc_bytealign_S (w2[1], w2[2], offset); + c0[0] = hc_bytealign_S (w2[0], w2[1], offset); + w7[3] = hc_bytealign_S (w1[3], w2[0], offset); + w7[2] = hc_bytealign_S (w1[2], w1[3], offset); + w7[1] = hc_bytealign_S (w1[1], w1[2], offset); + w7[0] = hc_bytealign_S (w1[0], w1[1], offset); + w6[3] = hc_bytealign_S (w0[3], w1[0], offset); + w6[2] = hc_bytealign_S (w0[2], w0[3], offset); + w6[1] = hc_bytealign_S (w0[1], w0[2], offset); + w6[0] = hc_bytealign_S (w0[0], w0[1], offset); + w5[3] = hc_bytealign_S ( 0, w0[0], offset); w5[2] = 0; w5[1] = 0; w5[0] = 0; @@ -40126,39 +40126,39 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be_S (u32 *w0, u32 *w1, u32 *w2, break; case 24: - c6[0] = amd_bytealign_S (w7[3], 0, offset); - c5[3] = amd_bytealign_S (w7[2], w7[3], offset); - c5[2] = amd_bytealign_S (w7[1], w7[2], offset); - c5[1] = amd_bytealign_S (w7[0], w7[1], offset); - c5[0] = amd_bytealign_S (w6[3], w7[0], offset); - c4[3] = amd_bytealign_S (w6[2], w6[3], offset); - c4[2] = amd_bytealign_S (w6[1], w6[2], offset); - c4[1] = amd_bytealign_S (w6[0], w6[1], offset); - c4[0] = amd_bytealign_S (w5[3], w6[0], offset); - c3[3] = amd_bytealign_S (w5[2], w5[3], offset); - c3[2] = amd_bytealign_S (w5[1], w5[2], offset); - c3[1] = amd_bytealign_S (w5[0], w5[1], offset); - c3[0] = amd_bytealign_S (w4[3], w5[0], offset); - c2[3] = amd_bytealign_S (w4[2], w4[3], offset); - c2[2] = amd_bytealign_S (w4[1], w4[2], offset); - c2[1] = amd_bytealign_S (w4[0], w4[1], offset); - c2[0] = amd_bytealign_S (w3[3], w4[0], offset); - c1[3] = amd_bytealign_S (w3[2], w3[3], offset); - c1[2] = amd_bytealign_S (w3[1], w3[2], offset); - c1[1] = amd_bytealign_S (w3[0], w3[1], offset); - c1[0] = amd_bytealign_S (w2[3], w3[0], offset); - c0[3] = amd_bytealign_S (w2[2], w2[3], offset); - c0[2] = amd_bytealign_S (w2[1], w2[2], offset); - c0[1] = amd_bytealign_S (w2[0], w2[1], offset); - c0[0] = amd_bytealign_S (w1[3], w2[0], offset); - w7[3] = amd_bytealign_S (w1[2], w1[3], offset); - w7[2] = amd_bytealign_S (w1[1], w1[2], offset); - w7[1] = amd_bytealign_S (w1[0], w1[1], offset); - w7[0] = amd_bytealign_S (w0[3], w1[0], offset); - w6[3] = amd_bytealign_S (w0[2], w0[3], offset); - w6[2] = amd_bytealign_S (w0[1], w0[2], offset); - w6[1] = amd_bytealign_S (w0[0], w0[1], offset); - w6[0] = amd_bytealign_S ( 0, w0[0], offset); + c6[0] = hc_bytealign_S (w7[3], 0, offset); + c5[3] = hc_bytealign_S (w7[2], w7[3], offset); + c5[2] = hc_bytealign_S (w7[1], w7[2], offset); + c5[1] = hc_bytealign_S (w7[0], w7[1], offset); + c5[0] = hc_bytealign_S (w6[3], w7[0], offset); + c4[3] = hc_bytealign_S (w6[2], w6[3], offset); + c4[2] = hc_bytealign_S (w6[1], w6[2], offset); + c4[1] = hc_bytealign_S (w6[0], w6[1], offset); + c4[0] = hc_bytealign_S (w5[3], w6[0], offset); + c3[3] = hc_bytealign_S (w5[2], w5[3], offset); + c3[2] = hc_bytealign_S (w5[1], w5[2], offset); + c3[1] = hc_bytealign_S (w5[0], w5[1], offset); + c3[0] = hc_bytealign_S (w4[3], w5[0], offset); + c2[3] = hc_bytealign_S (w4[2], w4[3], offset); + c2[2] = hc_bytealign_S (w4[1], w4[2], offset); + c2[1] = hc_bytealign_S (w4[0], w4[1], offset); + c2[0] = hc_bytealign_S (w3[3], w4[0], offset); + c1[3] = hc_bytealign_S (w3[2], w3[3], offset); + c1[2] = hc_bytealign_S (w3[1], w3[2], offset); + c1[1] = hc_bytealign_S (w3[0], w3[1], offset); + c1[0] = hc_bytealign_S (w2[3], w3[0], offset); + c0[3] = hc_bytealign_S (w2[2], w2[3], offset); + c0[2] = hc_bytealign_S (w2[1], w2[2], offset); + c0[1] = hc_bytealign_S (w2[0], w2[1], offset); + c0[0] = hc_bytealign_S (w1[3], w2[0], offset); + w7[3] = hc_bytealign_S (w1[2], w1[3], offset); + w7[2] = hc_bytealign_S (w1[1], w1[2], offset); + w7[1] = hc_bytealign_S (w1[0], w1[1], offset); + w7[0] = hc_bytealign_S (w0[3], w1[0], offset); + w6[3] = hc_bytealign_S (w0[2], w0[3], offset); + w6[2] = hc_bytealign_S (w0[1], w0[2], offset); + w6[1] = hc_bytealign_S (w0[0], w0[1], offset); + w6[0] = hc_bytealign_S ( 0, w0[0], offset); w5[3] = 0; w5[2] = 0; w5[1] = 0; @@ -40187,39 +40187,39 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be_S (u32 *w0, u32 *w1, u32 *w2, break; case 25: - c6[1] = amd_bytealign_S (w7[3], 0, offset); - c6[0] = amd_bytealign_S (w7[2], w7[3], offset); - c5[3] = amd_bytealign_S (w7[1], w7[2], offset); - c5[2] = amd_bytealign_S (w7[0], w7[1], offset); - c5[1] = amd_bytealign_S (w6[3], w7[0], offset); - c5[0] = amd_bytealign_S (w6[2], w6[3], offset); - c4[3] = amd_bytealign_S (w6[1], w6[2], offset); - c4[2] = amd_bytealign_S (w6[0], w6[1], offset); - c4[1] = amd_bytealign_S (w5[3], w6[0], offset); - c4[0] = amd_bytealign_S (w5[2], w5[3], offset); - c3[3] = amd_bytealign_S (w5[1], w5[2], offset); - c3[2] = amd_bytealign_S (w5[0], w5[1], offset); - c3[1] = amd_bytealign_S (w4[3], w5[0], offset); - c3[0] = amd_bytealign_S (w4[2], w4[3], offset); - c2[3] = amd_bytealign_S (w4[1], w4[2], offset); - c2[2] = amd_bytealign_S (w4[0], w4[1], offset); - c2[1] = amd_bytealign_S (w3[3], w4[0], offset); - c2[0] = amd_bytealign_S (w3[2], w3[3], offset); - c1[3] = amd_bytealign_S (w3[1], w3[2], offset); - c1[2] = amd_bytealign_S (w3[0], w3[1], offset); - c1[1] = amd_bytealign_S (w2[3], w3[0], offset); - c1[0] = amd_bytealign_S (w2[2], w2[3], offset); - c0[3] = amd_bytealign_S (w2[1], w2[2], offset); - c0[2] = amd_bytealign_S (w2[0], w2[1], offset); - c0[1] = amd_bytealign_S (w1[3], w2[0], offset); - c0[0] = amd_bytealign_S (w1[2], w1[3], offset); - w7[3] = amd_bytealign_S (w1[1], w1[2], offset); - w7[2] = amd_bytealign_S (w1[0], w1[1], offset); - w7[1] = amd_bytealign_S (w0[3], w1[0], offset); - w7[0] = amd_bytealign_S (w0[2], w0[3], offset); - w6[3] = amd_bytealign_S (w0[1], w0[2], offset); - w6[2] = amd_bytealign_S (w0[0], w0[1], offset); - w6[1] = amd_bytealign_S ( 0, w0[0], offset); + c6[1] = hc_bytealign_S (w7[3], 0, offset); + c6[0] = hc_bytealign_S (w7[2], w7[3], offset); + c5[3] = hc_bytealign_S (w7[1], w7[2], offset); + c5[2] = hc_bytealign_S (w7[0], w7[1], offset); + c5[1] = hc_bytealign_S (w6[3], w7[0], offset); + c5[0] = hc_bytealign_S (w6[2], w6[3], offset); + c4[3] = hc_bytealign_S (w6[1], w6[2], offset); + c4[2] = hc_bytealign_S (w6[0], w6[1], offset); + c4[1] = hc_bytealign_S (w5[3], w6[0], offset); + c4[0] = hc_bytealign_S (w5[2], w5[3], offset); + c3[3] = hc_bytealign_S (w5[1], w5[2], offset); + c3[2] = hc_bytealign_S (w5[0], w5[1], offset); + c3[1] = hc_bytealign_S (w4[3], w5[0], offset); + c3[0] = hc_bytealign_S (w4[2], w4[3], offset); + c2[3] = hc_bytealign_S (w4[1], w4[2], offset); + c2[2] = hc_bytealign_S (w4[0], w4[1], offset); + c2[1] = hc_bytealign_S (w3[3], w4[0], offset); + c2[0] = hc_bytealign_S (w3[2], w3[3], offset); + c1[3] = hc_bytealign_S (w3[1], w3[2], offset); + c1[2] = hc_bytealign_S (w3[0], w3[1], offset); + c1[1] = hc_bytealign_S (w2[3], w3[0], offset); + c1[0] = hc_bytealign_S (w2[2], w2[3], offset); + c0[3] = hc_bytealign_S (w2[1], w2[2], offset); + c0[2] = hc_bytealign_S (w2[0], w2[1], offset); + c0[1] = hc_bytealign_S (w1[3], w2[0], offset); + c0[0] = hc_bytealign_S (w1[2], w1[3], offset); + w7[3] = hc_bytealign_S (w1[1], w1[2], offset); + w7[2] = hc_bytealign_S (w1[0], w1[1], offset); + w7[1] = hc_bytealign_S (w0[3], w1[0], offset); + w7[0] = hc_bytealign_S (w0[2], w0[3], offset); + w6[3] = hc_bytealign_S (w0[1], w0[2], offset); + w6[2] = hc_bytealign_S (w0[0], w0[1], offset); + w6[1] = hc_bytealign_S ( 0, w0[0], offset); w6[0] = 0; w5[3] = 0; w5[2] = 0; @@ -40249,39 +40249,39 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be_S (u32 *w0, u32 *w1, u32 *w2, break; case 26: - c6[2] = amd_bytealign_S (w7[3], 0, offset); - c6[1] = amd_bytealign_S (w7[2], w7[3], offset); - c6[0] = amd_bytealign_S (w7[1], w7[2], offset); - c5[3] = amd_bytealign_S (w7[0], w7[1], offset); - c5[2] = amd_bytealign_S (w6[3], w7[0], offset); - c5[1] = amd_bytealign_S (w6[2], w6[3], offset); - c5[0] = amd_bytealign_S (w6[1], w6[2], offset); - c4[3] = amd_bytealign_S (w6[0], w6[1], offset); - c4[2] = amd_bytealign_S (w5[3], w6[0], offset); - c4[1] = amd_bytealign_S (w5[2], w5[3], offset); - c4[0] = amd_bytealign_S (w5[1], w5[2], offset); - c3[3] = amd_bytealign_S (w5[0], w5[1], offset); - c3[2] = amd_bytealign_S (w4[3], w5[0], offset); - c3[1] = amd_bytealign_S (w4[2], w4[3], offset); - c3[0] = amd_bytealign_S (w4[1], w4[2], offset); - c2[3] = amd_bytealign_S (w4[0], w4[1], offset); - c2[2] = amd_bytealign_S (w3[3], w4[0], offset); - c2[1] = amd_bytealign_S (w3[2], w3[3], offset); - c2[0] = amd_bytealign_S (w3[1], w3[2], offset); - c1[3] = amd_bytealign_S (w3[0], w3[1], offset); - c1[2] = amd_bytealign_S (w2[3], w3[0], offset); - c1[1] = amd_bytealign_S (w2[2], w2[3], offset); - c1[0] = amd_bytealign_S (w2[1], w2[2], offset); - c0[3] = amd_bytealign_S (w2[0], w2[1], offset); - c0[2] = amd_bytealign_S (w1[3], w2[0], offset); - c0[1] = amd_bytealign_S (w1[2], w1[3], offset); - c0[0] = amd_bytealign_S (w1[1], w1[2], offset); - w7[3] = amd_bytealign_S (w1[0], w1[1], offset); - w7[2] = amd_bytealign_S (w0[3], w1[0], offset); - w7[1] = amd_bytealign_S (w0[2], w0[3], offset); - w7[0] = amd_bytealign_S (w0[1], w0[2], offset); - w6[3] = amd_bytealign_S (w0[0], w0[1], offset); - w6[2] = amd_bytealign_S ( 0, w0[0], offset); + c6[2] = hc_bytealign_S (w7[3], 0, offset); + c6[1] = hc_bytealign_S (w7[2], w7[3], offset); + c6[0] = hc_bytealign_S (w7[1], w7[2], offset); + c5[3] = hc_bytealign_S (w7[0], w7[1], offset); + c5[2] = hc_bytealign_S (w6[3], w7[0], offset); + c5[1] = hc_bytealign_S (w6[2], w6[3], offset); + c5[0] = hc_bytealign_S (w6[1], w6[2], offset); + c4[3] = hc_bytealign_S (w6[0], w6[1], offset); + c4[2] = hc_bytealign_S (w5[3], w6[0], offset); + c4[1] = hc_bytealign_S (w5[2], w5[3], offset); + c4[0] = hc_bytealign_S (w5[1], w5[2], offset); + c3[3] = hc_bytealign_S (w5[0], w5[1], offset); + c3[2] = hc_bytealign_S (w4[3], w5[0], offset); + c3[1] = hc_bytealign_S (w4[2], w4[3], offset); + c3[0] = hc_bytealign_S (w4[1], w4[2], offset); + c2[3] = hc_bytealign_S (w4[0], w4[1], offset); + c2[2] = hc_bytealign_S (w3[3], w4[0], offset); + c2[1] = hc_bytealign_S (w3[2], w3[3], offset); + c2[0] = hc_bytealign_S (w3[1], w3[2], offset); + c1[3] = hc_bytealign_S (w3[0], w3[1], offset); + c1[2] = hc_bytealign_S (w2[3], w3[0], offset); + c1[1] = hc_bytealign_S (w2[2], w2[3], offset); + c1[0] = hc_bytealign_S (w2[1], w2[2], offset); + c0[3] = hc_bytealign_S (w2[0], w2[1], offset); + c0[2] = hc_bytealign_S (w1[3], w2[0], offset); + c0[1] = hc_bytealign_S (w1[2], w1[3], offset); + c0[0] = hc_bytealign_S (w1[1], w1[2], offset); + w7[3] = hc_bytealign_S (w1[0], w1[1], offset); + w7[2] = hc_bytealign_S (w0[3], w1[0], offset); + w7[1] = hc_bytealign_S (w0[2], w0[3], offset); + w7[0] = hc_bytealign_S (w0[1], w0[2], offset); + w6[3] = hc_bytealign_S (w0[0], w0[1], offset); + w6[2] = hc_bytealign_S ( 0, w0[0], offset); w6[1] = 0; w6[0] = 0; w5[3] = 0; @@ -40312,39 +40312,39 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be_S (u32 *w0, u32 *w1, u32 *w2, break; case 27: - c6[3] = amd_bytealign_S (w7[3], 0, offset); - c6[2] = amd_bytealign_S (w7[2], w7[3], offset); - c6[1] = amd_bytealign_S (w7[1], w7[2], offset); - c6[0] = amd_bytealign_S (w7[0], w7[1], offset); - c5[3] = amd_bytealign_S (w6[3], w7[0], offset); - c5[2] = amd_bytealign_S (w6[2], w6[3], offset); - c5[1] = amd_bytealign_S (w6[1], w6[2], offset); - c5[0] = amd_bytealign_S (w6[0], w6[1], offset); - c4[3] = amd_bytealign_S (w5[3], w6[0], offset); - c4[2] = amd_bytealign_S (w5[2], w5[3], offset); - c4[1] = amd_bytealign_S (w5[1], w5[2], offset); - c4[0] = amd_bytealign_S (w5[0], w5[1], offset); - c3[3] = amd_bytealign_S (w4[3], w5[0], offset); - c3[2] = amd_bytealign_S (w4[2], w4[3], offset); - c3[1] = amd_bytealign_S (w4[1], w4[2], offset); - c3[0] = amd_bytealign_S (w4[0], w4[1], offset); - c2[3] = amd_bytealign_S (w3[3], w4[0], offset); - c2[2] = amd_bytealign_S (w3[2], w3[3], offset); - c2[1] = amd_bytealign_S (w3[1], w3[2], offset); - c2[0] = amd_bytealign_S (w3[0], w3[1], offset); - c1[3] = amd_bytealign_S (w2[3], w3[0], offset); - c1[2] = amd_bytealign_S (w2[2], w2[3], offset); - c1[1] = amd_bytealign_S (w2[1], w2[2], offset); - c1[0] = amd_bytealign_S (w2[0], w2[1], offset); - c0[3] = amd_bytealign_S (w1[3], w2[0], offset); - c0[2] = amd_bytealign_S (w1[2], w1[3], offset); - c0[1] = amd_bytealign_S (w1[1], w1[2], offset); - c0[0] = amd_bytealign_S (w1[0], w1[1], offset); - w7[3] = amd_bytealign_S (w0[3], w1[0], offset); - w7[2] = amd_bytealign_S (w0[2], w0[3], offset); - w7[1] = amd_bytealign_S (w0[1], w0[2], offset); - w7[0] = amd_bytealign_S (w0[0], w0[1], offset); - w6[3] = amd_bytealign_S ( 0, w0[0], offset); + c6[3] = hc_bytealign_S (w7[3], 0, offset); + c6[2] = hc_bytealign_S (w7[2], w7[3], offset); + c6[1] = hc_bytealign_S (w7[1], w7[2], offset); + c6[0] = hc_bytealign_S (w7[0], w7[1], offset); + c5[3] = hc_bytealign_S (w6[3], w7[0], offset); + c5[2] = hc_bytealign_S (w6[2], w6[3], offset); + c5[1] = hc_bytealign_S (w6[1], w6[2], offset); + c5[0] = hc_bytealign_S (w6[0], w6[1], offset); + c4[3] = hc_bytealign_S (w5[3], w6[0], offset); + c4[2] = hc_bytealign_S (w5[2], w5[3], offset); + c4[1] = hc_bytealign_S (w5[1], w5[2], offset); + c4[0] = hc_bytealign_S (w5[0], w5[1], offset); + c3[3] = hc_bytealign_S (w4[3], w5[0], offset); + c3[2] = hc_bytealign_S (w4[2], w4[3], offset); + c3[1] = hc_bytealign_S (w4[1], w4[2], offset); + c3[0] = hc_bytealign_S (w4[0], w4[1], offset); + c2[3] = hc_bytealign_S (w3[3], w4[0], offset); + c2[2] = hc_bytealign_S (w3[2], w3[3], offset); + c2[1] = hc_bytealign_S (w3[1], w3[2], offset); + c2[0] = hc_bytealign_S (w3[0], w3[1], offset); + c1[3] = hc_bytealign_S (w2[3], w3[0], offset); + c1[2] = hc_bytealign_S (w2[2], w2[3], offset); + c1[1] = hc_bytealign_S (w2[1], w2[2], offset); + c1[0] = hc_bytealign_S (w2[0], w2[1], offset); + c0[3] = hc_bytealign_S (w1[3], w2[0], offset); + c0[2] = hc_bytealign_S (w1[2], w1[3], offset); + c0[1] = hc_bytealign_S (w1[1], w1[2], offset); + c0[0] = hc_bytealign_S (w1[0], w1[1], offset); + w7[3] = hc_bytealign_S (w0[3], w1[0], offset); + w7[2] = hc_bytealign_S (w0[2], w0[3], offset); + w7[1] = hc_bytealign_S (w0[1], w0[2], offset); + w7[0] = hc_bytealign_S (w0[0], w0[1], offset); + w6[3] = hc_bytealign_S ( 0, w0[0], offset); w6[2] = 0; w6[1] = 0; w6[0] = 0; @@ -40376,39 +40376,39 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be_S (u32 *w0, u32 *w1, u32 *w2, break; case 28: - c7[0] = amd_bytealign_S (w7[3], 0, offset); - c6[3] = amd_bytealign_S (w7[2], w7[3], offset); - c6[2] = amd_bytealign_S (w7[1], w7[2], offset); - c6[1] = amd_bytealign_S (w7[0], w7[1], offset); - c6[0] = amd_bytealign_S (w6[3], w7[0], offset); - c5[3] = amd_bytealign_S (w6[2], w6[3], offset); - c5[2] = amd_bytealign_S (w6[1], w6[2], offset); - c5[1] = amd_bytealign_S (w6[0], w6[1], offset); - c5[0] = amd_bytealign_S (w5[3], w6[0], offset); - c4[3] = amd_bytealign_S (w5[2], w5[3], offset); - c4[2] = amd_bytealign_S (w5[1], w5[2], offset); - c4[1] = amd_bytealign_S (w5[0], w5[1], offset); - c4[0] = amd_bytealign_S (w4[3], w5[0], offset); - c3[3] = amd_bytealign_S (w4[2], w4[3], offset); - c3[2] = amd_bytealign_S (w4[1], w4[2], offset); - c3[1] = amd_bytealign_S (w4[0], w4[1], offset); - c3[0] = amd_bytealign_S (w3[3], w4[0], offset); - c2[3] = amd_bytealign_S (w3[2], w3[3], offset); - c2[2] = amd_bytealign_S (w3[1], w3[2], offset); - c2[1] = amd_bytealign_S (w3[0], w3[1], offset); - c2[0] = amd_bytealign_S (w2[3], w3[0], offset); - c1[3] = amd_bytealign_S (w2[2], w2[3], offset); - c1[2] = amd_bytealign_S (w2[1], w2[2], offset); - c1[1] = amd_bytealign_S (w2[0], w2[1], offset); - c1[0] = amd_bytealign_S (w1[3], w2[0], offset); - c0[3] = amd_bytealign_S (w1[2], w1[3], offset); - c0[2] = amd_bytealign_S (w1[1], w1[2], offset); - c0[1] = amd_bytealign_S (w1[0], w1[1], offset); - c0[0] = amd_bytealign_S (w0[3], w1[0], offset); - w7[3] = amd_bytealign_S (w0[2], w0[3], offset); - w7[2] = amd_bytealign_S (w0[1], w0[2], offset); - w7[1] = amd_bytealign_S (w0[0], w0[1], offset); - w7[0] = amd_bytealign_S ( 0, w0[0], offset); + c7[0] = hc_bytealign_S (w7[3], 0, offset); + c6[3] = hc_bytealign_S (w7[2], w7[3], offset); + c6[2] = hc_bytealign_S (w7[1], w7[2], offset); + c6[1] = hc_bytealign_S (w7[0], w7[1], offset); + c6[0] = hc_bytealign_S (w6[3], w7[0], offset); + c5[3] = hc_bytealign_S (w6[2], w6[3], offset); + c5[2] = hc_bytealign_S (w6[1], w6[2], offset); + c5[1] = hc_bytealign_S (w6[0], w6[1], offset); + c5[0] = hc_bytealign_S (w5[3], w6[0], offset); + c4[3] = hc_bytealign_S (w5[2], w5[3], offset); + c4[2] = hc_bytealign_S (w5[1], w5[2], offset); + c4[1] = hc_bytealign_S (w5[0], w5[1], offset); + c4[0] = hc_bytealign_S (w4[3], w5[0], offset); + c3[3] = hc_bytealign_S (w4[2], w4[3], offset); + c3[2] = hc_bytealign_S (w4[1], w4[2], offset); + c3[1] = hc_bytealign_S (w4[0], w4[1], offset); + c3[0] = hc_bytealign_S (w3[3], w4[0], offset); + c2[3] = hc_bytealign_S (w3[2], w3[3], offset); + c2[2] = hc_bytealign_S (w3[1], w3[2], offset); + c2[1] = hc_bytealign_S (w3[0], w3[1], offset); + c2[0] = hc_bytealign_S (w2[3], w3[0], offset); + c1[3] = hc_bytealign_S (w2[2], w2[3], offset); + c1[2] = hc_bytealign_S (w2[1], w2[2], offset); + c1[1] = hc_bytealign_S (w2[0], w2[1], offset); + c1[0] = hc_bytealign_S (w1[3], w2[0], offset); + c0[3] = hc_bytealign_S (w1[2], w1[3], offset); + c0[2] = hc_bytealign_S (w1[1], w1[2], offset); + c0[1] = hc_bytealign_S (w1[0], w1[1], offset); + c0[0] = hc_bytealign_S (w0[3], w1[0], offset); + w7[3] = hc_bytealign_S (w0[2], w0[3], offset); + w7[2] = hc_bytealign_S (w0[1], w0[2], offset); + w7[1] = hc_bytealign_S (w0[0], w0[1], offset); + w7[0] = hc_bytealign_S ( 0, w0[0], offset); w6[3] = 0; w6[2] = 0; w6[1] = 0; @@ -40441,39 +40441,39 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be_S (u32 *w0, u32 *w1, u32 *w2, break; case 29: - c7[1] = amd_bytealign_S (w7[3], 0, offset); - c7[0] = amd_bytealign_S (w7[2], w7[3], offset); - c6[3] = amd_bytealign_S (w7[1], w7[2], offset); - c6[2] = amd_bytealign_S (w7[0], w7[1], offset); - c6[1] = amd_bytealign_S (w6[3], w7[0], offset); - c6[0] = amd_bytealign_S (w6[2], w6[3], offset); - c5[3] = amd_bytealign_S (w6[1], w6[2], offset); - c5[2] = amd_bytealign_S (w6[0], w6[1], offset); - c5[1] = amd_bytealign_S (w5[3], w6[0], offset); - c5[0] = amd_bytealign_S (w5[2], w5[3], offset); - c4[3] = amd_bytealign_S (w5[1], w5[2], offset); - c4[2] = amd_bytealign_S (w5[0], w5[1], offset); - c4[1] = amd_bytealign_S (w4[3], w5[0], offset); - c4[0] = amd_bytealign_S (w4[2], w4[3], offset); - c3[3] = amd_bytealign_S (w4[1], w4[2], offset); - c3[2] = amd_bytealign_S (w4[0], w4[1], offset); - c3[1] = amd_bytealign_S (w3[3], w4[0], offset); - c3[0] = amd_bytealign_S (w3[2], w3[3], offset); - c2[3] = amd_bytealign_S (w3[1], w3[2], offset); - c2[2] = amd_bytealign_S (w3[0], w3[1], offset); - c2[1] = amd_bytealign_S (w2[3], w3[0], offset); - c2[0] = amd_bytealign_S (w2[2], w2[3], offset); - c1[3] = amd_bytealign_S (w2[1], w2[2], offset); - c1[2] = amd_bytealign_S (w2[0], w2[1], offset); - c1[1] = amd_bytealign_S (w1[3], w2[0], offset); - c1[0] = amd_bytealign_S (w1[2], w1[3], offset); - c0[3] = amd_bytealign_S (w1[1], w1[2], offset); - c0[2] = amd_bytealign_S (w1[0], w1[1], offset); - c0[1] = amd_bytealign_S (w0[3], w1[0], offset); - c0[0] = amd_bytealign_S (w0[2], w0[3], offset); - w7[3] = amd_bytealign_S (w0[1], w0[2], offset); - w7[2] = amd_bytealign_S (w0[0], w0[1], offset); - w7[1] = amd_bytealign_S ( 0, w0[0], offset); + c7[1] = hc_bytealign_S (w7[3], 0, offset); + c7[0] = hc_bytealign_S (w7[2], w7[3], offset); + c6[3] = hc_bytealign_S (w7[1], w7[2], offset); + c6[2] = hc_bytealign_S (w7[0], w7[1], offset); + c6[1] = hc_bytealign_S (w6[3], w7[0], offset); + c6[0] = hc_bytealign_S (w6[2], w6[3], offset); + c5[3] = hc_bytealign_S (w6[1], w6[2], offset); + c5[2] = hc_bytealign_S (w6[0], w6[1], offset); + c5[1] = hc_bytealign_S (w5[3], w6[0], offset); + c5[0] = hc_bytealign_S (w5[2], w5[3], offset); + c4[3] = hc_bytealign_S (w5[1], w5[2], offset); + c4[2] = hc_bytealign_S (w5[0], w5[1], offset); + c4[1] = hc_bytealign_S (w4[3], w5[0], offset); + c4[0] = hc_bytealign_S (w4[2], w4[3], offset); + c3[3] = hc_bytealign_S (w4[1], w4[2], offset); + c3[2] = hc_bytealign_S (w4[0], w4[1], offset); + c3[1] = hc_bytealign_S (w3[3], w4[0], offset); + c3[0] = hc_bytealign_S (w3[2], w3[3], offset); + c2[3] = hc_bytealign_S (w3[1], w3[2], offset); + c2[2] = hc_bytealign_S (w3[0], w3[1], offset); + c2[1] = hc_bytealign_S (w2[3], w3[0], offset); + c2[0] = hc_bytealign_S (w2[2], w2[3], offset); + c1[3] = hc_bytealign_S (w2[1], w2[2], offset); + c1[2] = hc_bytealign_S (w2[0], w2[1], offset); + c1[1] = hc_bytealign_S (w1[3], w2[0], offset); + c1[0] = hc_bytealign_S (w1[2], w1[3], offset); + c0[3] = hc_bytealign_S (w1[1], w1[2], offset); + c0[2] = hc_bytealign_S (w1[0], w1[1], offset); + c0[1] = hc_bytealign_S (w0[3], w1[0], offset); + c0[0] = hc_bytealign_S (w0[2], w0[3], offset); + w7[3] = hc_bytealign_S (w0[1], w0[2], offset); + w7[2] = hc_bytealign_S (w0[0], w0[1], offset); + w7[1] = hc_bytealign_S ( 0, w0[0], offset); w7[0] = 0; w6[3] = 0; w6[2] = 0; @@ -40507,39 +40507,39 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be_S (u32 *w0, u32 *w1, u32 *w2, break; case 30: - c7[2] = amd_bytealign_S (w7[3], 0, offset); - c7[1] = amd_bytealign_S (w7[2], w7[3], offset); - c7[0] = amd_bytealign_S (w7[1], w7[2], offset); - c6[3] = amd_bytealign_S (w7[0], w7[1], offset); - c6[2] = amd_bytealign_S (w6[3], w7[0], offset); - c6[1] = amd_bytealign_S (w6[2], w6[3], offset); - c6[0] = amd_bytealign_S (w6[1], w6[2], offset); - c5[3] = amd_bytealign_S (w6[0], w6[1], offset); - c5[2] = amd_bytealign_S (w5[3], w6[0], offset); - c5[1] = amd_bytealign_S (w5[2], w5[3], offset); - c5[0] = amd_bytealign_S (w5[1], w5[2], offset); - c4[3] = amd_bytealign_S (w5[0], w5[1], offset); - c4[2] = amd_bytealign_S (w4[3], w5[0], offset); - c4[1] = amd_bytealign_S (w4[2], w4[3], offset); - c4[0] = amd_bytealign_S (w4[1], w4[2], offset); - c3[3] = amd_bytealign_S (w4[0], w4[1], offset); - c3[2] = amd_bytealign_S (w3[3], w4[0], offset); - c3[1] = amd_bytealign_S (w3[2], w3[3], offset); - c3[0] = amd_bytealign_S (w3[1], w3[2], offset); - c2[3] = amd_bytealign_S (w3[0], w3[1], offset); - c2[2] = amd_bytealign_S (w2[3], w3[0], offset); - c2[1] = amd_bytealign_S (w2[2], w2[3], offset); - c2[0] = amd_bytealign_S (w2[1], w2[2], offset); - c1[3] = amd_bytealign_S (w2[0], w2[1], offset); - c1[2] = amd_bytealign_S (w1[3], w2[0], offset); - c1[1] = amd_bytealign_S (w1[2], w1[3], offset); - c1[0] = amd_bytealign_S (w1[1], w1[2], offset); - c0[3] = amd_bytealign_S (w1[0], w1[1], offset); - c0[2] = amd_bytealign_S (w0[3], w1[0], offset); - c0[1] = amd_bytealign_S (w0[2], w0[3], offset); - c0[0] = amd_bytealign_S (w0[1], w0[2], offset); - w7[3] = amd_bytealign_S (w0[0], w0[1], offset); - w7[2] = amd_bytealign_S ( 0, w0[0], offset); + c7[2] = hc_bytealign_S (w7[3], 0, offset); + c7[1] = hc_bytealign_S (w7[2], w7[3], offset); + c7[0] = hc_bytealign_S (w7[1], w7[2], offset); + c6[3] = hc_bytealign_S (w7[0], w7[1], offset); + c6[2] = hc_bytealign_S (w6[3], w7[0], offset); + c6[1] = hc_bytealign_S (w6[2], w6[3], offset); + c6[0] = hc_bytealign_S (w6[1], w6[2], offset); + c5[3] = hc_bytealign_S (w6[0], w6[1], offset); + c5[2] = hc_bytealign_S (w5[3], w6[0], offset); + c5[1] = hc_bytealign_S (w5[2], w5[3], offset); + c5[0] = hc_bytealign_S (w5[1], w5[2], offset); + c4[3] = hc_bytealign_S (w5[0], w5[1], offset); + c4[2] = hc_bytealign_S (w4[3], w5[0], offset); + c4[1] = hc_bytealign_S (w4[2], w4[3], offset); + c4[0] = hc_bytealign_S (w4[1], w4[2], offset); + c3[3] = hc_bytealign_S (w4[0], w4[1], offset); + c3[2] = hc_bytealign_S (w3[3], w4[0], offset); + c3[1] = hc_bytealign_S (w3[2], w3[3], offset); + c3[0] = hc_bytealign_S (w3[1], w3[2], offset); + c2[3] = hc_bytealign_S (w3[0], w3[1], offset); + c2[2] = hc_bytealign_S (w2[3], w3[0], offset); + c2[1] = hc_bytealign_S (w2[2], w2[3], offset); + c2[0] = hc_bytealign_S (w2[1], w2[2], offset); + c1[3] = hc_bytealign_S (w2[0], w2[1], offset); + c1[2] = hc_bytealign_S (w1[3], w2[0], offset); + c1[1] = hc_bytealign_S (w1[2], w1[3], offset); + c1[0] = hc_bytealign_S (w1[1], w1[2], offset); + c0[3] = hc_bytealign_S (w1[0], w1[1], offset); + c0[2] = hc_bytealign_S (w0[3], w1[0], offset); + c0[1] = hc_bytealign_S (w0[2], w0[3], offset); + c0[0] = hc_bytealign_S (w0[1], w0[2], offset); + w7[3] = hc_bytealign_S (w0[0], w0[1], offset); + w7[2] = hc_bytealign_S ( 0, w0[0], offset); w7[1] = 0; w7[0] = 0; w6[3] = 0; @@ -40574,39 +40574,39 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be_S (u32 *w0, u32 *w1, u32 *w2, break; case 31: - c7[3] = amd_bytealign_S (w7[3], 0, offset); - c7[2] = amd_bytealign_S (w7[2], w7[3], offset); - c7[1] = amd_bytealign_S (w7[1], w7[2], offset); - c7[0] = amd_bytealign_S (w7[0], w7[1], offset); - c6[3] = amd_bytealign_S (w6[3], w7[0], offset); - c6[2] = amd_bytealign_S (w6[2], w6[3], offset); - c6[1] = amd_bytealign_S (w6[1], w6[2], offset); - c6[0] = amd_bytealign_S (w6[0], w6[1], offset); - c5[3] = amd_bytealign_S (w5[3], w6[0], offset); - c5[2] = amd_bytealign_S (w5[2], w5[3], offset); - c5[1] = amd_bytealign_S (w5[1], w5[2], offset); - c5[0] = amd_bytealign_S (w5[0], w5[1], offset); - c4[3] = amd_bytealign_S (w4[3], w5[0], offset); - c4[2] = amd_bytealign_S (w4[2], w4[3], offset); - c4[1] = amd_bytealign_S (w4[1], w4[2], offset); - c4[0] = amd_bytealign_S (w4[0], w4[1], offset); - c3[3] = amd_bytealign_S (w3[3], w4[0], offset); - c3[2] = amd_bytealign_S (w3[2], w3[3], offset); - c3[1] = amd_bytealign_S (w3[1], w3[2], offset); - c3[0] = amd_bytealign_S (w3[0], w3[1], offset); - c2[3] = amd_bytealign_S (w2[3], w3[0], offset); - c2[2] = amd_bytealign_S (w2[2], w2[3], offset); - c2[1] = amd_bytealign_S (w2[1], w2[2], offset); - c2[0] = amd_bytealign_S (w2[0], w2[1], offset); - c1[3] = amd_bytealign_S (w1[3], w2[0], offset); - c1[2] = amd_bytealign_S (w1[2], w1[3], offset); - c1[1] = amd_bytealign_S (w1[1], w1[2], offset); - c1[0] = amd_bytealign_S (w1[0], w1[1], offset); - c0[3] = amd_bytealign_S (w0[3], w1[0], offset); - c0[2] = amd_bytealign_S (w0[2], w0[3], offset); - c0[1] = amd_bytealign_S (w0[1], w0[2], offset); - c0[0] = amd_bytealign_S (w0[0], w0[1], offset); - w7[3] = amd_bytealign_S ( 0, w0[0], offset); + c7[3] = hc_bytealign_S (w7[3], 0, offset); + c7[2] = hc_bytealign_S (w7[2], w7[3], offset); + c7[1] = hc_bytealign_S (w7[1], w7[2], offset); + c7[0] = hc_bytealign_S (w7[0], w7[1], offset); + c6[3] = hc_bytealign_S (w6[3], w7[0], offset); + c6[2] = hc_bytealign_S (w6[2], w6[3], offset); + c6[1] = hc_bytealign_S (w6[1], w6[2], offset); + c6[0] = hc_bytealign_S (w6[0], w6[1], offset); + c5[3] = hc_bytealign_S (w5[3], w6[0], offset); + c5[2] = hc_bytealign_S (w5[2], w5[3], offset); + c5[1] = hc_bytealign_S (w5[1], w5[2], offset); + c5[0] = hc_bytealign_S (w5[0], w5[1], offset); + c4[3] = hc_bytealign_S (w4[3], w5[0], offset); + c4[2] = hc_bytealign_S (w4[2], w4[3], offset); + c4[1] = hc_bytealign_S (w4[1], w4[2], offset); + c4[0] = hc_bytealign_S (w4[0], w4[1], offset); + c3[3] = hc_bytealign_S (w3[3], w4[0], offset); + c3[2] = hc_bytealign_S (w3[2], w3[3], offset); + c3[1] = hc_bytealign_S (w3[1], w3[2], offset); + c3[0] = hc_bytealign_S (w3[0], w3[1], offset); + c2[3] = hc_bytealign_S (w2[3], w3[0], offset); + c2[2] = hc_bytealign_S (w2[2], w2[3], offset); + c2[1] = hc_bytealign_S (w2[1], w2[2], offset); + c2[0] = hc_bytealign_S (w2[0], w2[1], offset); + c1[3] = hc_bytealign_S (w1[3], w2[0], offset); + c1[2] = hc_bytealign_S (w1[2], w1[3], offset); + c1[1] = hc_bytealign_S (w1[1], w1[2], offset); + c1[0] = hc_bytealign_S (w1[0], w1[1], offset); + c0[3] = hc_bytealign_S (w0[3], w1[0], offset); + c0[2] = hc_bytealign_S (w0[2], w0[3], offset); + c0[1] = hc_bytealign_S (w0[1], w0[2], offset); + c0[0] = hc_bytealign_S (w0[0], w0[1], offset); + w7[3] = hc_bytealign_S ( 0, w0[0], offset); w7[2] = 0; w7[1] = 0; w7[0] = 0; @@ -40656,153 +40656,153 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be_S (u32 *w0, u32 *w1, u32 *w2, switch (offset_switch) { case 0: - c0[0] = __byte_perm_S ( 0, w7[3], selector); - w7[3] = __byte_perm_S (w7[3], w7[2], selector); - w7[2] = __byte_perm_S (w7[2], w7[1], selector); - w7[1] = __byte_perm_S (w7[1], w7[0], selector); - w7[0] = __byte_perm_S (w7[0], w6[3], selector); - w6[3] = __byte_perm_S (w6[3], w6[2], selector); - w6[2] = __byte_perm_S (w6[2], w6[1], selector); - w6[1] = __byte_perm_S (w6[1], w6[0], selector); - w6[0] = __byte_perm_S (w6[0], w5[3], selector); - w5[3] = __byte_perm_S (w5[3], w5[2], selector); - w5[2] = __byte_perm_S (w5[2], w5[1], selector); - w5[1] = __byte_perm_S (w5[1], w5[0], selector); - w5[0] = __byte_perm_S (w5[0], w4[3], selector); - w4[3] = __byte_perm_S (w4[3], w4[2], selector); - w4[2] = __byte_perm_S (w4[2], w4[1], selector); - w4[1] = __byte_perm_S (w4[1], w4[0], selector); - w4[0] = __byte_perm_S (w4[0], w3[3], selector); - w3[3] = __byte_perm_S (w3[3], w3[2], selector); - w3[2] = __byte_perm_S (w3[2], w3[1], selector); - w3[1] = __byte_perm_S (w3[1], w3[0], selector); - w3[0] = __byte_perm_S (w3[0], w2[3], selector); - w2[3] = __byte_perm_S (w2[3], w2[2], selector); - w2[2] = __byte_perm_S (w2[2], w2[1], selector); - w2[1] = __byte_perm_S (w2[1], w2[0], selector); - w2[0] = __byte_perm_S (w2[0], w1[3], selector); - w1[3] = __byte_perm_S (w1[3], w1[2], selector); - w1[2] = __byte_perm_S (w1[2], w1[1], selector); - w1[1] = __byte_perm_S (w1[1], w1[0], selector); - w1[0] = __byte_perm_S (w1[0], w0[3], selector); - w0[3] = __byte_perm_S (w0[3], w0[2], selector); - w0[2] = __byte_perm_S (w0[2], w0[1], selector); - w0[1] = __byte_perm_S (w0[1], w0[0], selector); - w0[0] = __byte_perm_S (w0[0], 0, selector); + c0[0] = hc_byte_perm_S ( 0, w7[3], selector); + w7[3] = hc_byte_perm_S (w7[3], w7[2], selector); + w7[2] = hc_byte_perm_S (w7[2], w7[1], selector); + w7[1] = hc_byte_perm_S (w7[1], w7[0], selector); + w7[0] = hc_byte_perm_S (w7[0], w6[3], selector); + w6[3] = hc_byte_perm_S (w6[3], w6[2], selector); + w6[2] = hc_byte_perm_S (w6[2], w6[1], selector); + w6[1] = hc_byte_perm_S (w6[1], w6[0], selector); + w6[0] = hc_byte_perm_S (w6[0], w5[3], selector); + w5[3] = hc_byte_perm_S (w5[3], w5[2], selector); + w5[2] = hc_byte_perm_S (w5[2], w5[1], selector); + w5[1] = hc_byte_perm_S (w5[1], w5[0], selector); + w5[0] = hc_byte_perm_S (w5[0], w4[3], selector); + w4[3] = hc_byte_perm_S (w4[3], w4[2], selector); + w4[2] = hc_byte_perm_S (w4[2], w4[1], selector); + w4[1] = hc_byte_perm_S (w4[1], w4[0], selector); + w4[0] = hc_byte_perm_S (w4[0], w3[3], selector); + w3[3] = hc_byte_perm_S (w3[3], w3[2], selector); + w3[2] = hc_byte_perm_S (w3[2], w3[1], selector); + w3[1] = hc_byte_perm_S (w3[1], w3[0], selector); + w3[0] = hc_byte_perm_S (w3[0], w2[3], selector); + w2[3] = hc_byte_perm_S (w2[3], w2[2], selector); + w2[2] = hc_byte_perm_S (w2[2], w2[1], selector); + w2[1] = hc_byte_perm_S (w2[1], w2[0], selector); + w2[0] = hc_byte_perm_S (w2[0], w1[3], selector); + w1[3] = hc_byte_perm_S (w1[3], w1[2], selector); + w1[2] = hc_byte_perm_S (w1[2], w1[1], selector); + w1[1] = hc_byte_perm_S (w1[1], w1[0], selector); + w1[0] = hc_byte_perm_S (w1[0], w0[3], selector); + w0[3] = hc_byte_perm_S (w0[3], w0[2], selector); + w0[2] = hc_byte_perm_S (w0[2], w0[1], selector); + w0[1] = hc_byte_perm_S (w0[1], w0[0], selector); + w0[0] = hc_byte_perm_S (w0[0], 0, selector); break; case 1: - c0[1] = __byte_perm_S ( 0, w7[3], selector); - c0[0] = __byte_perm_S (w7[3], w7[2], selector); - w7[3] = __byte_perm_S (w7[2], w7[1], selector); - w7[2] = __byte_perm_S (w7[1], w7[0], selector); - w7[1] = __byte_perm_S (w7[0], w6[3], selector); - w7[0] = __byte_perm_S (w6[3], w6[2], selector); - w6[3] = __byte_perm_S (w6[2], w6[1], selector); - w6[2] = __byte_perm_S (w6[1], w6[0], selector); - w6[1] = __byte_perm_S (w6[0], w5[3], selector); - w6[0] = __byte_perm_S (w5[3], w5[2], selector); - w5[3] = __byte_perm_S (w5[2], w5[1], selector); - w5[2] = __byte_perm_S (w5[1], w5[0], selector); - w5[1] = __byte_perm_S (w5[0], w4[3], selector); - w5[0] = __byte_perm_S (w4[3], w4[2], selector); - w4[3] = __byte_perm_S (w4[2], w4[1], selector); - w4[2] = __byte_perm_S (w4[1], w4[0], selector); - w4[1] = __byte_perm_S (w4[0], w3[3], selector); - w4[0] = __byte_perm_S (w3[3], w3[2], selector); - w3[3] = __byte_perm_S (w3[2], w3[1], selector); - w3[2] = __byte_perm_S (w3[1], w3[0], selector); - w3[1] = __byte_perm_S (w3[0], w2[3], selector); - w3[0] = __byte_perm_S (w2[3], w2[2], selector); - w2[3] = __byte_perm_S (w2[2], w2[1], selector); - w2[2] = __byte_perm_S (w2[1], w2[0], selector); - w2[1] = __byte_perm_S (w2[0], w1[3], selector); - w2[0] = __byte_perm_S (w1[3], w1[2], selector); - w1[3] = __byte_perm_S (w1[2], w1[1], selector); - w1[2] = __byte_perm_S (w1[1], w1[0], selector); - w1[1] = __byte_perm_S (w1[0], w0[3], selector); - w1[0] = __byte_perm_S (w0[3], w0[2], selector); - w0[3] = __byte_perm_S (w0[2], w0[1], selector); - w0[2] = __byte_perm_S (w0[1], w0[0], selector); - w0[1] = __byte_perm_S (w0[0], 0, selector); + c0[1] = hc_byte_perm_S ( 0, w7[3], selector); + c0[0] = hc_byte_perm_S (w7[3], w7[2], selector); + w7[3] = hc_byte_perm_S (w7[2], w7[1], selector); + w7[2] = hc_byte_perm_S (w7[1], w7[0], selector); + w7[1] = hc_byte_perm_S (w7[0], w6[3], selector); + w7[0] = hc_byte_perm_S (w6[3], w6[2], selector); + w6[3] = hc_byte_perm_S (w6[2], w6[1], selector); + w6[2] = hc_byte_perm_S (w6[1], w6[0], selector); + w6[1] = hc_byte_perm_S (w6[0], w5[3], selector); + w6[0] = hc_byte_perm_S (w5[3], w5[2], selector); + w5[3] = hc_byte_perm_S (w5[2], w5[1], selector); + w5[2] = hc_byte_perm_S (w5[1], w5[0], selector); + w5[1] = hc_byte_perm_S (w5[0], w4[3], selector); + w5[0] = hc_byte_perm_S (w4[3], w4[2], selector); + w4[3] = hc_byte_perm_S (w4[2], w4[1], selector); + w4[2] = hc_byte_perm_S (w4[1], w4[0], selector); + w4[1] = hc_byte_perm_S (w4[0], w3[3], selector); + w4[0] = hc_byte_perm_S (w3[3], w3[2], selector); + w3[3] = hc_byte_perm_S (w3[2], w3[1], selector); + w3[2] = hc_byte_perm_S (w3[1], w3[0], selector); + w3[1] = hc_byte_perm_S (w3[0], w2[3], selector); + w3[0] = hc_byte_perm_S (w2[3], w2[2], selector); + w2[3] = hc_byte_perm_S (w2[2], w2[1], selector); + w2[2] = hc_byte_perm_S (w2[1], w2[0], selector); + w2[1] = hc_byte_perm_S (w2[0], w1[3], selector); + w2[0] = hc_byte_perm_S (w1[3], w1[2], selector); + w1[3] = hc_byte_perm_S (w1[2], w1[1], selector); + w1[2] = hc_byte_perm_S (w1[1], w1[0], selector); + w1[1] = hc_byte_perm_S (w1[0], w0[3], selector); + w1[0] = hc_byte_perm_S (w0[3], w0[2], selector); + w0[3] = hc_byte_perm_S (w0[2], w0[1], selector); + w0[2] = hc_byte_perm_S (w0[1], w0[0], selector); + w0[1] = hc_byte_perm_S (w0[0], 0, selector); w0[0] = 0; break; case 2: - c0[2] = __byte_perm_S ( 0, w7[3], selector); - c0[1] = __byte_perm_S (w7[3], w7[2], selector); - c0[0] = __byte_perm_S (w7[2], w7[1], selector); - w7[3] = __byte_perm_S (w7[1], w7[0], selector); - w7[2] = __byte_perm_S (w7[0], w6[3], selector); - w7[1] = __byte_perm_S (w6[3], w6[2], selector); - w7[0] = __byte_perm_S (w6[2], w6[1], selector); - w6[3] = __byte_perm_S (w6[1], w6[0], selector); - w6[2] = __byte_perm_S (w6[0], w5[3], selector); - w6[1] = __byte_perm_S (w5[3], w5[2], selector); - w6[0] = __byte_perm_S (w5[2], w5[1], selector); - w5[3] = __byte_perm_S (w5[1], w5[0], selector); - w5[2] = __byte_perm_S (w5[0], w4[3], selector); - w5[1] = __byte_perm_S (w4[3], w4[2], selector); - w5[0] = __byte_perm_S (w4[2], w4[1], selector); - w4[3] = __byte_perm_S (w4[1], w4[0], selector); - w4[2] = __byte_perm_S (w4[0], w3[3], selector); - w4[1] = __byte_perm_S (w3[3], w3[2], selector); - w4[0] = __byte_perm_S (w3[2], w3[1], selector); - w3[3] = __byte_perm_S (w3[1], w3[0], selector); - w3[2] = __byte_perm_S (w3[0], w2[3], selector); - w3[1] = __byte_perm_S (w2[3], w2[2], selector); - w3[0] = __byte_perm_S (w2[2], w2[1], selector); - w2[3] = __byte_perm_S (w2[1], w2[0], selector); - w2[2] = __byte_perm_S (w2[0], w1[3], selector); - w2[1] = __byte_perm_S (w1[3], w1[2], selector); - w2[0] = __byte_perm_S (w1[2], w1[1], selector); - w1[3] = __byte_perm_S (w1[1], w1[0], selector); - w1[2] = __byte_perm_S (w1[0], w0[3], selector); - w1[1] = __byte_perm_S (w0[3], w0[2], selector); - w1[0] = __byte_perm_S (w0[2], w0[1], selector); - w0[3] = __byte_perm_S (w0[1], w0[0], selector); - w0[2] = __byte_perm_S (w0[0], 0, selector); + c0[2] = hc_byte_perm_S ( 0, w7[3], selector); + c0[1] = hc_byte_perm_S (w7[3], w7[2], selector); + c0[0] = hc_byte_perm_S (w7[2], w7[1], selector); + w7[3] = hc_byte_perm_S (w7[1], w7[0], selector); + w7[2] = hc_byte_perm_S (w7[0], w6[3], selector); + w7[1] = hc_byte_perm_S (w6[3], w6[2], selector); + w7[0] = hc_byte_perm_S (w6[2], w6[1], selector); + w6[3] = hc_byte_perm_S (w6[1], w6[0], selector); + w6[2] = hc_byte_perm_S (w6[0], w5[3], selector); + w6[1] = hc_byte_perm_S (w5[3], w5[2], selector); + w6[0] = hc_byte_perm_S (w5[2], w5[1], selector); + w5[3] = hc_byte_perm_S (w5[1], w5[0], selector); + w5[2] = hc_byte_perm_S (w5[0], w4[3], selector); + w5[1] = hc_byte_perm_S (w4[3], w4[2], selector); + w5[0] = hc_byte_perm_S (w4[2], w4[1], selector); + w4[3] = hc_byte_perm_S (w4[1], w4[0], selector); + w4[2] = hc_byte_perm_S (w4[0], w3[3], selector); + w4[1] = hc_byte_perm_S (w3[3], w3[2], selector); + w4[0] = hc_byte_perm_S (w3[2], w3[1], selector); + w3[3] = hc_byte_perm_S (w3[1], w3[0], selector); + w3[2] = hc_byte_perm_S (w3[0], w2[3], selector); + w3[1] = hc_byte_perm_S (w2[3], w2[2], selector); + w3[0] = hc_byte_perm_S (w2[2], w2[1], selector); + w2[3] = hc_byte_perm_S (w2[1], w2[0], selector); + w2[2] = hc_byte_perm_S (w2[0], w1[3], selector); + w2[1] = hc_byte_perm_S (w1[3], w1[2], selector); + w2[0] = hc_byte_perm_S (w1[2], w1[1], selector); + w1[3] = hc_byte_perm_S (w1[1], w1[0], selector); + w1[2] = hc_byte_perm_S (w1[0], w0[3], selector); + w1[1] = hc_byte_perm_S (w0[3], w0[2], selector); + w1[0] = hc_byte_perm_S (w0[2], w0[1], selector); + w0[3] = hc_byte_perm_S (w0[1], w0[0], selector); + w0[2] = hc_byte_perm_S (w0[0], 0, selector); w0[1] = 0; w0[0] = 0; break; case 3: - c0[3] = __byte_perm_S ( 0, w7[3], selector); - c0[2] = __byte_perm_S (w7[3], w7[2], selector); - c0[1] = __byte_perm_S (w7[2], w7[1], selector); - c0[0] = __byte_perm_S (w7[1], w7[0], selector); - w7[3] = __byte_perm_S (w7[0], w6[3], selector); - w7[2] = __byte_perm_S (w6[3], w6[2], selector); - w7[1] = __byte_perm_S (w6[2], w6[1], selector); - w7[0] = __byte_perm_S (w6[1], w6[0], selector); - w6[3] = __byte_perm_S (w6[0], w5[3], selector); - w6[2] = __byte_perm_S (w5[3], w5[2], selector); - w6[1] = __byte_perm_S (w5[2], w5[1], selector); - w6[0] = __byte_perm_S (w5[1], w5[0], selector); - w5[3] = __byte_perm_S (w5[0], w4[3], selector); - w5[2] = __byte_perm_S (w4[3], w4[2], selector); - w5[1] = __byte_perm_S (w4[2], w4[1], selector); - w5[0] = __byte_perm_S (w4[1], w4[0], selector); - w4[3] = __byte_perm_S (w4[0], w3[3], selector); - w4[2] = __byte_perm_S (w3[3], w3[2], selector); - w4[1] = __byte_perm_S (w3[2], w3[1], selector); - w4[0] = __byte_perm_S (w3[1], w3[0], selector); - w3[3] = __byte_perm_S (w3[0], w2[3], selector); - w3[2] = __byte_perm_S (w2[3], w2[2], selector); - w3[1] = __byte_perm_S (w2[2], w2[1], selector); - w3[0] = __byte_perm_S (w2[1], w2[0], selector); - w2[3] = __byte_perm_S (w2[0], w1[3], selector); - w2[2] = __byte_perm_S (w1[3], w1[2], selector); - w2[1] = __byte_perm_S (w1[2], w1[1], selector); - w2[0] = __byte_perm_S (w1[1], w1[0], selector); - w1[3] = __byte_perm_S (w1[0], w0[3], selector); - w1[2] = __byte_perm_S (w0[3], w0[2], selector); - w1[1] = __byte_perm_S (w0[2], w0[1], selector); - w1[0] = __byte_perm_S (w0[1], w0[0], selector); - w0[3] = __byte_perm_S (w0[0], 0, selector); + c0[3] = hc_byte_perm_S ( 0, w7[3], selector); + c0[2] = hc_byte_perm_S (w7[3], w7[2], selector); + c0[1] = hc_byte_perm_S (w7[2], w7[1], selector); + c0[0] = hc_byte_perm_S (w7[1], w7[0], selector); + w7[3] = hc_byte_perm_S (w7[0], w6[3], selector); + w7[2] = hc_byte_perm_S (w6[3], w6[2], selector); + w7[1] = hc_byte_perm_S (w6[2], w6[1], selector); + w7[0] = hc_byte_perm_S (w6[1], w6[0], selector); + w6[3] = hc_byte_perm_S (w6[0], w5[3], selector); + w6[2] = hc_byte_perm_S (w5[3], w5[2], selector); + w6[1] = hc_byte_perm_S (w5[2], w5[1], selector); + w6[0] = hc_byte_perm_S (w5[1], w5[0], selector); + w5[3] = hc_byte_perm_S (w5[0], w4[3], selector); + w5[2] = hc_byte_perm_S (w4[3], w4[2], selector); + w5[1] = hc_byte_perm_S (w4[2], w4[1], selector); + w5[0] = hc_byte_perm_S (w4[1], w4[0], selector); + w4[3] = hc_byte_perm_S (w4[0], w3[3], selector); + w4[2] = hc_byte_perm_S (w3[3], w3[2], selector); + w4[1] = hc_byte_perm_S (w3[2], w3[1], selector); + w4[0] = hc_byte_perm_S (w3[1], w3[0], selector); + w3[3] = hc_byte_perm_S (w3[0], w2[3], selector); + w3[2] = hc_byte_perm_S (w2[3], w2[2], selector); + w3[1] = hc_byte_perm_S (w2[2], w2[1], selector); + w3[0] = hc_byte_perm_S (w2[1], w2[0], selector); + w2[3] = hc_byte_perm_S (w2[0], w1[3], selector); + w2[2] = hc_byte_perm_S (w1[3], w1[2], selector); + w2[1] = hc_byte_perm_S (w1[2], w1[1], selector); + w2[0] = hc_byte_perm_S (w1[1], w1[0], selector); + w1[3] = hc_byte_perm_S (w1[0], w0[3], selector); + w1[2] = hc_byte_perm_S (w0[3], w0[2], selector); + w1[1] = hc_byte_perm_S (w0[2], w0[1], selector); + w1[0] = hc_byte_perm_S (w0[1], w0[0], selector); + w0[3] = hc_byte_perm_S (w0[0], 0, selector); w0[2] = 0; w0[1] = 0; w0[0] = 0; @@ -40810,39 +40810,39 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be_S (u32 *w0, u32 *w1, u32 *w2, break; case 4: - c1[0] = __byte_perm_S ( 0, w7[3], selector); - c0[3] = __byte_perm_S (w7[3], w7[2], selector); - c0[2] = __byte_perm_S (w7[2], w7[1], selector); - c0[1] = __byte_perm_S (w7[1], w7[0], selector); - c0[0] = __byte_perm_S (w7[0], w6[3], selector); - w7[3] = __byte_perm_S (w6[3], w6[2], selector); - w7[2] = __byte_perm_S (w6[2], w6[1], selector); - w7[1] = __byte_perm_S (w6[1], w6[0], selector); - w7[0] = __byte_perm_S (w6[0], w5[3], selector); - w6[3] = __byte_perm_S (w5[3], w5[2], selector); - w6[2] = __byte_perm_S (w5[2], w5[1], selector); - w6[1] = __byte_perm_S (w5[1], w5[0], selector); - w6[0] = __byte_perm_S (w5[0], w4[3], selector); - w5[3] = __byte_perm_S (w4[3], w4[2], selector); - w5[2] = __byte_perm_S (w4[2], w4[1], selector); - w5[1] = __byte_perm_S (w4[1], w4[0], selector); - w5[0] = __byte_perm_S (w4[0], w3[3], selector); - w4[3] = __byte_perm_S (w3[3], w3[2], selector); - w4[2] = __byte_perm_S (w3[2], w3[1], selector); - w4[1] = __byte_perm_S (w3[1], w3[0], selector); - w4[0] = __byte_perm_S (w3[0], w2[3], selector); - w3[3] = __byte_perm_S (w2[3], w2[2], selector); - w3[2] = __byte_perm_S (w2[2], w2[1], selector); - w3[1] = __byte_perm_S (w2[1], w2[0], selector); - w3[0] = __byte_perm_S (w2[0], w1[3], selector); - w2[3] = __byte_perm_S (w1[3], w1[2], selector); - w2[2] = __byte_perm_S (w1[2], w1[1], selector); - w2[1] = __byte_perm_S (w1[1], w1[0], selector); - w2[0] = __byte_perm_S (w1[0], w0[3], selector); - w1[3] = __byte_perm_S (w0[3], w0[2], selector); - w1[2] = __byte_perm_S (w0[2], w0[1], selector); - w1[1] = __byte_perm_S (w0[1], w0[0], selector); - w1[0] = __byte_perm_S (w0[0], 0, selector); + c1[0] = hc_byte_perm_S ( 0, w7[3], selector); + c0[3] = hc_byte_perm_S (w7[3], w7[2], selector); + c0[2] = hc_byte_perm_S (w7[2], w7[1], selector); + c0[1] = hc_byte_perm_S (w7[1], w7[0], selector); + c0[0] = hc_byte_perm_S (w7[0], w6[3], selector); + w7[3] = hc_byte_perm_S (w6[3], w6[2], selector); + w7[2] = hc_byte_perm_S (w6[2], w6[1], selector); + w7[1] = hc_byte_perm_S (w6[1], w6[0], selector); + w7[0] = hc_byte_perm_S (w6[0], w5[3], selector); + w6[3] = hc_byte_perm_S (w5[3], w5[2], selector); + w6[2] = hc_byte_perm_S (w5[2], w5[1], selector); + w6[1] = hc_byte_perm_S (w5[1], w5[0], selector); + w6[0] = hc_byte_perm_S (w5[0], w4[3], selector); + w5[3] = hc_byte_perm_S (w4[3], w4[2], selector); + w5[2] = hc_byte_perm_S (w4[2], w4[1], selector); + w5[1] = hc_byte_perm_S (w4[1], w4[0], selector); + w5[0] = hc_byte_perm_S (w4[0], w3[3], selector); + w4[3] = hc_byte_perm_S (w3[3], w3[2], selector); + w4[2] = hc_byte_perm_S (w3[2], w3[1], selector); + w4[1] = hc_byte_perm_S (w3[1], w3[0], selector); + w4[0] = hc_byte_perm_S (w3[0], w2[3], selector); + w3[3] = hc_byte_perm_S (w2[3], w2[2], selector); + w3[2] = hc_byte_perm_S (w2[2], w2[1], selector); + w3[1] = hc_byte_perm_S (w2[1], w2[0], selector); + w3[0] = hc_byte_perm_S (w2[0], w1[3], selector); + w2[3] = hc_byte_perm_S (w1[3], w1[2], selector); + w2[2] = hc_byte_perm_S (w1[2], w1[1], selector); + w2[1] = hc_byte_perm_S (w1[1], w1[0], selector); + w2[0] = hc_byte_perm_S (w1[0], w0[3], selector); + w1[3] = hc_byte_perm_S (w0[3], w0[2], selector); + w1[2] = hc_byte_perm_S (w0[2], w0[1], selector); + w1[1] = hc_byte_perm_S (w0[1], w0[0], selector); + w1[0] = hc_byte_perm_S (w0[0], 0, selector); w0[3] = 0; w0[2] = 0; w0[1] = 0; @@ -40851,39 +40851,39 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be_S (u32 *w0, u32 *w1, u32 *w2, break; case 5: - c1[1] = __byte_perm_S ( 0, w7[3], selector); - c1[0] = __byte_perm_S (w7[3], w7[2], selector); - c0[3] = __byte_perm_S (w7[2], w7[1], selector); - c0[2] = __byte_perm_S (w7[1], w7[0], selector); - c0[1] = __byte_perm_S (w7[0], w6[3], selector); - c0[0] = __byte_perm_S (w6[3], w6[2], selector); - w7[3] = __byte_perm_S (w6[2], w6[1], selector); - w7[2] = __byte_perm_S (w6[1], w6[0], selector); - w7[1] = __byte_perm_S (w6[0], w5[3], selector); - w7[0] = __byte_perm_S (w5[3], w5[2], selector); - w6[3] = __byte_perm_S (w5[2], w5[1], selector); - w6[2] = __byte_perm_S (w5[1], w5[0], selector); - w6[1] = __byte_perm_S (w5[0], w4[3], selector); - w6[0] = __byte_perm_S (w4[3], w4[2], selector); - w5[3] = __byte_perm_S (w4[2], w4[1], selector); - w5[2] = __byte_perm_S (w4[1], w4[0], selector); - w5[1] = __byte_perm_S (w4[0], w3[3], selector); - w5[0] = __byte_perm_S (w3[3], w3[2], selector); - w4[3] = __byte_perm_S (w3[2], w3[1], selector); - w4[2] = __byte_perm_S (w3[1], w3[0], selector); - w4[1] = __byte_perm_S (w3[0], w2[3], selector); - w4[0] = __byte_perm_S (w2[3], w2[2], selector); - w3[3] = __byte_perm_S (w2[2], w2[1], selector); - w3[2] = __byte_perm_S (w2[1], w2[0], selector); - w3[1] = __byte_perm_S (w2[0], w1[3], selector); - w3[0] = __byte_perm_S (w1[3], w1[2], selector); - w2[3] = __byte_perm_S (w1[2], w1[1], selector); - w2[2] = __byte_perm_S (w1[1], w1[0], selector); - w2[1] = __byte_perm_S (w1[0], w0[3], selector); - w2[0] = __byte_perm_S (w0[3], w0[2], selector); - w1[3] = __byte_perm_S (w0[2], w0[1], selector); - w1[2] = __byte_perm_S (w0[1], w0[0], selector); - w1[1] = __byte_perm_S (w0[0], 0, selector); + c1[1] = hc_byte_perm_S ( 0, w7[3], selector); + c1[0] = hc_byte_perm_S (w7[3], w7[2], selector); + c0[3] = hc_byte_perm_S (w7[2], w7[1], selector); + c0[2] = hc_byte_perm_S (w7[1], w7[0], selector); + c0[1] = hc_byte_perm_S (w7[0], w6[3], selector); + c0[0] = hc_byte_perm_S (w6[3], w6[2], selector); + w7[3] = hc_byte_perm_S (w6[2], w6[1], selector); + w7[2] = hc_byte_perm_S (w6[1], w6[0], selector); + w7[1] = hc_byte_perm_S (w6[0], w5[3], selector); + w7[0] = hc_byte_perm_S (w5[3], w5[2], selector); + w6[3] = hc_byte_perm_S (w5[2], w5[1], selector); + w6[2] = hc_byte_perm_S (w5[1], w5[0], selector); + w6[1] = hc_byte_perm_S (w5[0], w4[3], selector); + w6[0] = hc_byte_perm_S (w4[3], w4[2], selector); + w5[3] = hc_byte_perm_S (w4[2], w4[1], selector); + w5[2] = hc_byte_perm_S (w4[1], w4[0], selector); + w5[1] = hc_byte_perm_S (w4[0], w3[3], selector); + w5[0] = hc_byte_perm_S (w3[3], w3[2], selector); + w4[3] = hc_byte_perm_S (w3[2], w3[1], selector); + w4[2] = hc_byte_perm_S (w3[1], w3[0], selector); + w4[1] = hc_byte_perm_S (w3[0], w2[3], selector); + w4[0] = hc_byte_perm_S (w2[3], w2[2], selector); + w3[3] = hc_byte_perm_S (w2[2], w2[1], selector); + w3[2] = hc_byte_perm_S (w2[1], w2[0], selector); + w3[1] = hc_byte_perm_S (w2[0], w1[3], selector); + w3[0] = hc_byte_perm_S (w1[3], w1[2], selector); + w2[3] = hc_byte_perm_S (w1[2], w1[1], selector); + w2[2] = hc_byte_perm_S (w1[1], w1[0], selector); + w2[1] = hc_byte_perm_S (w1[0], w0[3], selector); + w2[0] = hc_byte_perm_S (w0[3], w0[2], selector); + w1[3] = hc_byte_perm_S (w0[2], w0[1], selector); + w1[2] = hc_byte_perm_S (w0[1], w0[0], selector); + w1[1] = hc_byte_perm_S (w0[0], 0, selector); w1[0] = 0; w0[3] = 0; w0[2] = 0; @@ -40893,39 +40893,39 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be_S (u32 *w0, u32 *w1, u32 *w2, break; case 6: - c1[2] = __byte_perm_S ( 0, w7[3], selector); - c1[1] = __byte_perm_S (w7[3], w7[2], selector); - c1[0] = __byte_perm_S (w7[2], w7[1], selector); - c0[3] = __byte_perm_S (w7[1], w7[0], selector); - c0[2] = __byte_perm_S (w7[0], w6[3], selector); - c0[1] = __byte_perm_S (w6[3], w6[2], selector); - c0[0] = __byte_perm_S (w6[2], w6[1], selector); - w7[3] = __byte_perm_S (w6[1], w6[0], selector); - w7[2] = __byte_perm_S (w6[0], w5[3], selector); - w7[1] = __byte_perm_S (w5[3], w5[2], selector); - w7[0] = __byte_perm_S (w5[2], w5[1], selector); - w6[3] = __byte_perm_S (w5[1], w5[0], selector); - w6[2] = __byte_perm_S (w5[0], w4[3], selector); - w6[1] = __byte_perm_S (w4[3], w4[2], selector); - w6[0] = __byte_perm_S (w4[2], w4[1], selector); - w5[3] = __byte_perm_S (w4[1], w4[0], selector); - w5[2] = __byte_perm_S (w4[0], w3[3], selector); - w5[1] = __byte_perm_S (w3[3], w3[2], selector); - w5[0] = __byte_perm_S (w3[2], w3[1], selector); - w4[3] = __byte_perm_S (w3[1], w3[0], selector); - w4[2] = __byte_perm_S (w3[0], w2[3], selector); - w4[1] = __byte_perm_S (w2[3], w2[2], selector); - w4[0] = __byte_perm_S (w2[2], w2[1], selector); - w3[3] = __byte_perm_S (w2[1], w2[0], selector); - w3[2] = __byte_perm_S (w2[0], w1[3], selector); - w3[1] = __byte_perm_S (w1[3], w1[2], selector); - w3[0] = __byte_perm_S (w1[2], w1[1], selector); - w2[3] = __byte_perm_S (w1[1], w1[0], selector); - w2[2] = __byte_perm_S (w1[0], w0[3], selector); - w2[1] = __byte_perm_S (w0[3], w0[2], selector); - w2[0] = __byte_perm_S (w0[2], w0[1], selector); - w1[3] = __byte_perm_S (w0[1], w0[0], selector); - w1[2] = __byte_perm_S (w0[0], 0, selector); + c1[2] = hc_byte_perm_S ( 0, w7[3], selector); + c1[1] = hc_byte_perm_S (w7[3], w7[2], selector); + c1[0] = hc_byte_perm_S (w7[2], w7[1], selector); + c0[3] = hc_byte_perm_S (w7[1], w7[0], selector); + c0[2] = hc_byte_perm_S (w7[0], w6[3], selector); + c0[1] = hc_byte_perm_S (w6[3], w6[2], selector); + c0[0] = hc_byte_perm_S (w6[2], w6[1], selector); + w7[3] = hc_byte_perm_S (w6[1], w6[0], selector); + w7[2] = hc_byte_perm_S (w6[0], w5[3], selector); + w7[1] = hc_byte_perm_S (w5[3], w5[2], selector); + w7[0] = hc_byte_perm_S (w5[2], w5[1], selector); + w6[3] = hc_byte_perm_S (w5[1], w5[0], selector); + w6[2] = hc_byte_perm_S (w5[0], w4[3], selector); + w6[1] = hc_byte_perm_S (w4[3], w4[2], selector); + w6[0] = hc_byte_perm_S (w4[2], w4[1], selector); + w5[3] = hc_byte_perm_S (w4[1], w4[0], selector); + w5[2] = hc_byte_perm_S (w4[0], w3[3], selector); + w5[1] = hc_byte_perm_S (w3[3], w3[2], selector); + w5[0] = hc_byte_perm_S (w3[2], w3[1], selector); + w4[3] = hc_byte_perm_S (w3[1], w3[0], selector); + w4[2] = hc_byte_perm_S (w3[0], w2[3], selector); + w4[1] = hc_byte_perm_S (w2[3], w2[2], selector); + w4[0] = hc_byte_perm_S (w2[2], w2[1], selector); + w3[3] = hc_byte_perm_S (w2[1], w2[0], selector); + w3[2] = hc_byte_perm_S (w2[0], w1[3], selector); + w3[1] = hc_byte_perm_S (w1[3], w1[2], selector); + w3[0] = hc_byte_perm_S (w1[2], w1[1], selector); + w2[3] = hc_byte_perm_S (w1[1], w1[0], selector); + w2[2] = hc_byte_perm_S (w1[0], w0[3], selector); + w2[1] = hc_byte_perm_S (w0[3], w0[2], selector); + w2[0] = hc_byte_perm_S (w0[2], w0[1], selector); + w1[3] = hc_byte_perm_S (w0[1], w0[0], selector); + w1[2] = hc_byte_perm_S (w0[0], 0, selector); w1[1] = 0; w1[0] = 0; w0[3] = 0; @@ -40936,39 +40936,39 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be_S (u32 *w0, u32 *w1, u32 *w2, break; case 7: - c1[3] = __byte_perm_S ( 0, w7[3], selector); - c1[2] = __byte_perm_S (w7[3], w7[2], selector); - c1[1] = __byte_perm_S (w7[2], w7[1], selector); - c1[0] = __byte_perm_S (w7[1], w7[0], selector); - c0[3] = __byte_perm_S (w7[0], w6[3], selector); - c0[2] = __byte_perm_S (w6[3], w6[2], selector); - c0[1] = __byte_perm_S (w6[2], w6[1], selector); - c0[0] = __byte_perm_S (w6[1], w6[0], selector); - w7[3] = __byte_perm_S (w6[0], w5[3], selector); - w7[2] = __byte_perm_S (w5[3], w5[2], selector); - w7[1] = __byte_perm_S (w5[2], w5[1], selector); - w7[0] = __byte_perm_S (w5[1], w5[0], selector); - w6[3] = __byte_perm_S (w5[0], w4[3], selector); - w6[2] = __byte_perm_S (w4[3], w4[2], selector); - w6[1] = __byte_perm_S (w4[2], w4[1], selector); - w6[0] = __byte_perm_S (w4[1], w4[0], selector); - w5[3] = __byte_perm_S (w4[0], w3[3], selector); - w5[2] = __byte_perm_S (w3[3], w3[2], selector); - w5[1] = __byte_perm_S (w3[2], w3[1], selector); - w5[0] = __byte_perm_S (w3[1], w3[0], selector); - w4[3] = __byte_perm_S (w3[0], w2[3], selector); - w4[2] = __byte_perm_S (w2[3], w2[2], selector); - w4[1] = __byte_perm_S (w2[2], w2[1], selector); - w4[0] = __byte_perm_S (w2[1], w2[0], selector); - w3[3] = __byte_perm_S (w2[0], w1[3], selector); - w3[2] = __byte_perm_S (w1[3], w1[2], selector); - w3[1] = __byte_perm_S (w1[2], w1[1], selector); - w3[0] = __byte_perm_S (w1[1], w1[0], selector); - w2[3] = __byte_perm_S (w1[0], w0[3], selector); - w2[2] = __byte_perm_S (w0[3], w0[2], selector); - w2[1] = __byte_perm_S (w0[2], w0[1], selector); - w2[0] = __byte_perm_S (w0[1], w0[0], selector); - w1[3] = __byte_perm_S (w0[0], 0, selector); + c1[3] = hc_byte_perm_S ( 0, w7[3], selector); + c1[2] = hc_byte_perm_S (w7[3], w7[2], selector); + c1[1] = hc_byte_perm_S (w7[2], w7[1], selector); + c1[0] = hc_byte_perm_S (w7[1], w7[0], selector); + c0[3] = hc_byte_perm_S (w7[0], w6[3], selector); + c0[2] = hc_byte_perm_S (w6[3], w6[2], selector); + c0[1] = hc_byte_perm_S (w6[2], w6[1], selector); + c0[0] = hc_byte_perm_S (w6[1], w6[0], selector); + w7[3] = hc_byte_perm_S (w6[0], w5[3], selector); + w7[2] = hc_byte_perm_S (w5[3], w5[2], selector); + w7[1] = hc_byte_perm_S (w5[2], w5[1], selector); + w7[0] = hc_byte_perm_S (w5[1], w5[0], selector); + w6[3] = hc_byte_perm_S (w5[0], w4[3], selector); + w6[2] = hc_byte_perm_S (w4[3], w4[2], selector); + w6[1] = hc_byte_perm_S (w4[2], w4[1], selector); + w6[0] = hc_byte_perm_S (w4[1], w4[0], selector); + w5[3] = hc_byte_perm_S (w4[0], w3[3], selector); + w5[2] = hc_byte_perm_S (w3[3], w3[2], selector); + w5[1] = hc_byte_perm_S (w3[2], w3[1], selector); + w5[0] = hc_byte_perm_S (w3[1], w3[0], selector); + w4[3] = hc_byte_perm_S (w3[0], w2[3], selector); + w4[2] = hc_byte_perm_S (w2[3], w2[2], selector); + w4[1] = hc_byte_perm_S (w2[2], w2[1], selector); + w4[0] = hc_byte_perm_S (w2[1], w2[0], selector); + w3[3] = hc_byte_perm_S (w2[0], w1[3], selector); + w3[2] = hc_byte_perm_S (w1[3], w1[2], selector); + w3[1] = hc_byte_perm_S (w1[2], w1[1], selector); + w3[0] = hc_byte_perm_S (w1[1], w1[0], selector); + w2[3] = hc_byte_perm_S (w1[0], w0[3], selector); + w2[2] = hc_byte_perm_S (w0[3], w0[2], selector); + w2[1] = hc_byte_perm_S (w0[2], w0[1], selector); + w2[0] = hc_byte_perm_S (w0[1], w0[0], selector); + w1[3] = hc_byte_perm_S (w0[0], 0, selector); w1[2] = 0; w1[1] = 0; w1[0] = 0; @@ -40980,39 +40980,39 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be_S (u32 *w0, u32 *w1, u32 *w2, break; case 8: - c2[0] = __byte_perm_S ( 0, w7[3], selector); - c1[3] = __byte_perm_S (w7[3], w7[2], selector); - c1[2] = __byte_perm_S (w7[2], w7[1], selector); - c1[1] = __byte_perm_S (w7[1], w7[0], selector); - c1[0] = __byte_perm_S (w7[0], w6[3], selector); - c0[3] = __byte_perm_S (w6[3], w6[2], selector); - c0[2] = __byte_perm_S (w6[2], w6[1], selector); - c0[1] = __byte_perm_S (w6[1], w6[0], selector); - c0[0] = __byte_perm_S (w6[0], w5[3], selector); - w7[3] = __byte_perm_S (w5[3], w5[2], selector); - w7[2] = __byte_perm_S (w5[2], w5[1], selector); - w7[1] = __byte_perm_S (w5[1], w5[0], selector); - w7[0] = __byte_perm_S (w5[0], w4[3], selector); - w6[3] = __byte_perm_S (w4[3], w4[2], selector); - w6[2] = __byte_perm_S (w4[2], w4[1], selector); - w6[1] = __byte_perm_S (w4[1], w4[0], selector); - w6[0] = __byte_perm_S (w4[0], w3[3], selector); - w5[3] = __byte_perm_S (w3[3], w3[2], selector); - w5[2] = __byte_perm_S (w3[2], w3[1], selector); - w5[1] = __byte_perm_S (w3[1], w3[0], selector); - w5[0] = __byte_perm_S (w3[0], w2[3], selector); - w4[3] = __byte_perm_S (w2[3], w2[2], selector); - w4[2] = __byte_perm_S (w2[2], w2[1], selector); - w4[1] = __byte_perm_S (w2[1], w2[0], selector); - w4[0] = __byte_perm_S (w2[0], w1[3], selector); - w3[3] = __byte_perm_S (w1[3], w1[2], selector); - w3[2] = __byte_perm_S (w1[2], w1[1], selector); - w3[1] = __byte_perm_S (w1[1], w1[0], selector); - w3[0] = __byte_perm_S (w1[0], w0[3], selector); - w2[3] = __byte_perm_S (w0[3], w0[2], selector); - w2[2] = __byte_perm_S (w0[2], w0[1], selector); - w2[1] = __byte_perm_S (w0[1], w0[0], selector); - w2[0] = __byte_perm_S (w0[0], 0, selector); + c2[0] = hc_byte_perm_S ( 0, w7[3], selector); + c1[3] = hc_byte_perm_S (w7[3], w7[2], selector); + c1[2] = hc_byte_perm_S (w7[2], w7[1], selector); + c1[1] = hc_byte_perm_S (w7[1], w7[0], selector); + c1[0] = hc_byte_perm_S (w7[0], w6[3], selector); + c0[3] = hc_byte_perm_S (w6[3], w6[2], selector); + c0[2] = hc_byte_perm_S (w6[2], w6[1], selector); + c0[1] = hc_byte_perm_S (w6[1], w6[0], selector); + c0[0] = hc_byte_perm_S (w6[0], w5[3], selector); + w7[3] = hc_byte_perm_S (w5[3], w5[2], selector); + w7[2] = hc_byte_perm_S (w5[2], w5[1], selector); + w7[1] = hc_byte_perm_S (w5[1], w5[0], selector); + w7[0] = hc_byte_perm_S (w5[0], w4[3], selector); + w6[3] = hc_byte_perm_S (w4[3], w4[2], selector); + w6[2] = hc_byte_perm_S (w4[2], w4[1], selector); + w6[1] = hc_byte_perm_S (w4[1], w4[0], selector); + w6[0] = hc_byte_perm_S (w4[0], w3[3], selector); + w5[3] = hc_byte_perm_S (w3[3], w3[2], selector); + w5[2] = hc_byte_perm_S (w3[2], w3[1], selector); + w5[1] = hc_byte_perm_S (w3[1], w3[0], selector); + w5[0] = hc_byte_perm_S (w3[0], w2[3], selector); + w4[3] = hc_byte_perm_S (w2[3], w2[2], selector); + w4[2] = hc_byte_perm_S (w2[2], w2[1], selector); + w4[1] = hc_byte_perm_S (w2[1], w2[0], selector); + w4[0] = hc_byte_perm_S (w2[0], w1[3], selector); + w3[3] = hc_byte_perm_S (w1[3], w1[2], selector); + w3[2] = hc_byte_perm_S (w1[2], w1[1], selector); + w3[1] = hc_byte_perm_S (w1[1], w1[0], selector); + w3[0] = hc_byte_perm_S (w1[0], w0[3], selector); + w2[3] = hc_byte_perm_S (w0[3], w0[2], selector); + w2[2] = hc_byte_perm_S (w0[2], w0[1], selector); + w2[1] = hc_byte_perm_S (w0[1], w0[0], selector); + w2[0] = hc_byte_perm_S (w0[0], 0, selector); w1[3] = 0; w1[2] = 0; w1[1] = 0; @@ -41025,39 +41025,39 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be_S (u32 *w0, u32 *w1, u32 *w2, break; case 9: - c2[1] = __byte_perm_S ( 0, w7[3], selector); - c2[0] = __byte_perm_S (w7[3], w7[2], selector); - c1[3] = __byte_perm_S (w7[2], w7[1], selector); - c1[2] = __byte_perm_S (w7[1], w7[0], selector); - c1[1] = __byte_perm_S (w7[0], w6[3], selector); - c1[0] = __byte_perm_S (w6[3], w6[2], selector); - c0[3] = __byte_perm_S (w6[2], w6[1], selector); - c0[2] = __byte_perm_S (w6[1], w6[0], selector); - c0[1] = __byte_perm_S (w6[0], w5[3], selector); - c0[0] = __byte_perm_S (w5[3], w5[2], selector); - w7[3] = __byte_perm_S (w5[2], w5[1], selector); - w7[2] = __byte_perm_S (w5[1], w5[0], selector); - w7[1] = __byte_perm_S (w5[0], w4[3], selector); - w7[0] = __byte_perm_S (w4[3], w4[2], selector); - w6[3] = __byte_perm_S (w4[2], w4[1], selector); - w6[2] = __byte_perm_S (w4[1], w4[0], selector); - w6[1] = __byte_perm_S (w4[0], w3[3], selector); - w6[0] = __byte_perm_S (w3[3], w3[2], selector); - w5[3] = __byte_perm_S (w3[2], w3[1], selector); - w5[2] = __byte_perm_S (w3[1], w3[0], selector); - w5[1] = __byte_perm_S (w3[0], w2[3], selector); - w5[0] = __byte_perm_S (w2[3], w2[2], selector); - w4[3] = __byte_perm_S (w2[2], w2[1], selector); - w4[2] = __byte_perm_S (w2[1], w2[0], selector); - w4[1] = __byte_perm_S (w2[0], w1[3], selector); - w4[0] = __byte_perm_S (w1[3], w1[2], selector); - w3[3] = __byte_perm_S (w1[2], w1[1], selector); - w3[2] = __byte_perm_S (w1[1], w1[0], selector); - w3[1] = __byte_perm_S (w1[0], w0[3], selector); - w3[0] = __byte_perm_S (w0[3], w0[2], selector); - w2[3] = __byte_perm_S (w0[2], w0[1], selector); - w2[2] = __byte_perm_S (w0[1], w0[0], selector); - w2[1] = __byte_perm_S (w0[0], 0, selector); + c2[1] = hc_byte_perm_S ( 0, w7[3], selector); + c2[0] = hc_byte_perm_S (w7[3], w7[2], selector); + c1[3] = hc_byte_perm_S (w7[2], w7[1], selector); + c1[2] = hc_byte_perm_S (w7[1], w7[0], selector); + c1[1] = hc_byte_perm_S (w7[0], w6[3], selector); + c1[0] = hc_byte_perm_S (w6[3], w6[2], selector); + c0[3] = hc_byte_perm_S (w6[2], w6[1], selector); + c0[2] = hc_byte_perm_S (w6[1], w6[0], selector); + c0[1] = hc_byte_perm_S (w6[0], w5[3], selector); + c0[0] = hc_byte_perm_S (w5[3], w5[2], selector); + w7[3] = hc_byte_perm_S (w5[2], w5[1], selector); + w7[2] = hc_byte_perm_S (w5[1], w5[0], selector); + w7[1] = hc_byte_perm_S (w5[0], w4[3], selector); + w7[0] = hc_byte_perm_S (w4[3], w4[2], selector); + w6[3] = hc_byte_perm_S (w4[2], w4[1], selector); + w6[2] = hc_byte_perm_S (w4[1], w4[0], selector); + w6[1] = hc_byte_perm_S (w4[0], w3[3], selector); + w6[0] = hc_byte_perm_S (w3[3], w3[2], selector); + w5[3] = hc_byte_perm_S (w3[2], w3[1], selector); + w5[2] = hc_byte_perm_S (w3[1], w3[0], selector); + w5[1] = hc_byte_perm_S (w3[0], w2[3], selector); + w5[0] = hc_byte_perm_S (w2[3], w2[2], selector); + w4[3] = hc_byte_perm_S (w2[2], w2[1], selector); + w4[2] = hc_byte_perm_S (w2[1], w2[0], selector); + w4[1] = hc_byte_perm_S (w2[0], w1[3], selector); + w4[0] = hc_byte_perm_S (w1[3], w1[2], selector); + w3[3] = hc_byte_perm_S (w1[2], w1[1], selector); + w3[2] = hc_byte_perm_S (w1[1], w1[0], selector); + w3[1] = hc_byte_perm_S (w1[0], w0[3], selector); + w3[0] = hc_byte_perm_S (w0[3], w0[2], selector); + w2[3] = hc_byte_perm_S (w0[2], w0[1], selector); + w2[2] = hc_byte_perm_S (w0[1], w0[0], selector); + w2[1] = hc_byte_perm_S (w0[0], 0, selector); w2[0] = 0; w1[3] = 0; w1[2] = 0; @@ -41071,39 +41071,39 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be_S (u32 *w0, u32 *w1, u32 *w2, break; case 10: - c2[2] = __byte_perm_S ( 0, w7[3], selector); - c2[1] = __byte_perm_S (w7[3], w7[2], selector); - c2[0] = __byte_perm_S (w7[2], w7[1], selector); - c1[3] = __byte_perm_S (w7[1], w7[0], selector); - c1[2] = __byte_perm_S (w7[0], w6[3], selector); - c1[1] = __byte_perm_S (w6[3], w6[2], selector); - c1[0] = __byte_perm_S (w6[2], w6[1], selector); - c0[3] = __byte_perm_S (w6[1], w6[0], selector); - c0[2] = __byte_perm_S (w6[0], w5[3], selector); - c0[1] = __byte_perm_S (w5[3], w5[2], selector); - c0[0] = __byte_perm_S (w5[2], w5[1], selector); - w7[3] = __byte_perm_S (w5[1], w5[0], selector); - w7[2] = __byte_perm_S (w5[0], w4[3], selector); - w7[1] = __byte_perm_S (w4[3], w4[2], selector); - w7[0] = __byte_perm_S (w4[2], w4[1], selector); - w6[3] = __byte_perm_S (w4[1], w4[0], selector); - w6[2] = __byte_perm_S (w4[0], w3[3], selector); - w6[1] = __byte_perm_S (w3[3], w3[2], selector); - w6[0] = __byte_perm_S (w3[2], w3[1], selector); - w5[3] = __byte_perm_S (w3[1], w3[0], selector); - w5[2] = __byte_perm_S (w3[0], w2[3], selector); - w5[1] = __byte_perm_S (w2[3], w2[2], selector); - w5[0] = __byte_perm_S (w2[2], w2[1], selector); - w4[3] = __byte_perm_S (w2[1], w2[0], selector); - w4[2] = __byte_perm_S (w2[0], w1[3], selector); - w4[1] = __byte_perm_S (w1[3], w1[2], selector); - w4[0] = __byte_perm_S (w1[2], w1[1], selector); - w3[3] = __byte_perm_S (w1[1], w1[0], selector); - w3[2] = __byte_perm_S (w1[0], w0[3], selector); - w3[1] = __byte_perm_S (w0[3], w0[2], selector); - w3[0] = __byte_perm_S (w0[2], w0[1], selector); - w2[3] = __byte_perm_S (w0[1], w0[0], selector); - w2[2] = __byte_perm_S (w0[0], 0, selector); + c2[2] = hc_byte_perm_S ( 0, w7[3], selector); + c2[1] = hc_byte_perm_S (w7[3], w7[2], selector); + c2[0] = hc_byte_perm_S (w7[2], w7[1], selector); + c1[3] = hc_byte_perm_S (w7[1], w7[0], selector); + c1[2] = hc_byte_perm_S (w7[0], w6[3], selector); + c1[1] = hc_byte_perm_S (w6[3], w6[2], selector); + c1[0] = hc_byte_perm_S (w6[2], w6[1], selector); + c0[3] = hc_byte_perm_S (w6[1], w6[0], selector); + c0[2] = hc_byte_perm_S (w6[0], w5[3], selector); + c0[1] = hc_byte_perm_S (w5[3], w5[2], selector); + c0[0] = hc_byte_perm_S (w5[2], w5[1], selector); + w7[3] = hc_byte_perm_S (w5[1], w5[0], selector); + w7[2] = hc_byte_perm_S (w5[0], w4[3], selector); + w7[1] = hc_byte_perm_S (w4[3], w4[2], selector); + w7[0] = hc_byte_perm_S (w4[2], w4[1], selector); + w6[3] = hc_byte_perm_S (w4[1], w4[0], selector); + w6[2] = hc_byte_perm_S (w4[0], w3[3], selector); + w6[1] = hc_byte_perm_S (w3[3], w3[2], selector); + w6[0] = hc_byte_perm_S (w3[2], w3[1], selector); + w5[3] = hc_byte_perm_S (w3[1], w3[0], selector); + w5[2] = hc_byte_perm_S (w3[0], w2[3], selector); + w5[1] = hc_byte_perm_S (w2[3], w2[2], selector); + w5[0] = hc_byte_perm_S (w2[2], w2[1], selector); + w4[3] = hc_byte_perm_S (w2[1], w2[0], selector); + w4[2] = hc_byte_perm_S (w2[0], w1[3], selector); + w4[1] = hc_byte_perm_S (w1[3], w1[2], selector); + w4[0] = hc_byte_perm_S (w1[2], w1[1], selector); + w3[3] = hc_byte_perm_S (w1[1], w1[0], selector); + w3[2] = hc_byte_perm_S (w1[0], w0[3], selector); + w3[1] = hc_byte_perm_S (w0[3], w0[2], selector); + w3[0] = hc_byte_perm_S (w0[2], w0[1], selector); + w2[3] = hc_byte_perm_S (w0[1], w0[0], selector); + w2[2] = hc_byte_perm_S (w0[0], 0, selector); w2[1] = 0; w2[0] = 0; w1[3] = 0; @@ -41118,39 +41118,39 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be_S (u32 *w0, u32 *w1, u32 *w2, break; case 11: - c2[3] = __byte_perm_S ( 0, w7[3], selector); - c2[2] = __byte_perm_S (w7[3], w7[2], selector); - c2[1] = __byte_perm_S (w7[2], w7[1], selector); - c2[0] = __byte_perm_S (w7[1], w7[0], selector); - c1[3] = __byte_perm_S (w7[0], w6[3], selector); - c1[2] = __byte_perm_S (w6[3], w6[2], selector); - c1[1] = __byte_perm_S (w6[2], w6[1], selector); - c1[0] = __byte_perm_S (w6[1], w6[0], selector); - c0[3] = __byte_perm_S (w6[0], w5[3], selector); - c0[2] = __byte_perm_S (w5[3], w5[2], selector); - c0[1] = __byte_perm_S (w5[2], w5[1], selector); - c0[0] = __byte_perm_S (w5[1], w5[0], selector); - w7[3] = __byte_perm_S (w5[0], w4[3], selector); - w7[2] = __byte_perm_S (w4[3], w4[2], selector); - w7[1] = __byte_perm_S (w4[2], w4[1], selector); - w7[0] = __byte_perm_S (w4[1], w4[0], selector); - w6[3] = __byte_perm_S (w4[0], w3[3], selector); - w6[2] = __byte_perm_S (w3[3], w3[2], selector); - w6[1] = __byte_perm_S (w3[2], w3[1], selector); - w6[0] = __byte_perm_S (w3[1], w3[0], selector); - w5[3] = __byte_perm_S (w3[0], w2[3], selector); - w5[2] = __byte_perm_S (w2[3], w2[2], selector); - w5[1] = __byte_perm_S (w2[2], w2[1], selector); - w5[0] = __byte_perm_S (w2[1], w2[0], selector); - w4[3] = __byte_perm_S (w2[0], w1[3], selector); - w4[2] = __byte_perm_S (w1[3], w1[2], selector); - w4[1] = __byte_perm_S (w1[2], w1[1], selector); - w4[0] = __byte_perm_S (w1[1], w1[0], selector); - w3[3] = __byte_perm_S (w1[0], w0[3], selector); - w3[2] = __byte_perm_S (w0[3], w0[2], selector); - w3[1] = __byte_perm_S (w0[2], w0[1], selector); - w3[0] = __byte_perm_S (w0[1], w0[0], selector); - w2[3] = __byte_perm_S (w0[0], 0, selector); + c2[3] = hc_byte_perm_S ( 0, w7[3], selector); + c2[2] = hc_byte_perm_S (w7[3], w7[2], selector); + c2[1] = hc_byte_perm_S (w7[2], w7[1], selector); + c2[0] = hc_byte_perm_S (w7[1], w7[0], selector); + c1[3] = hc_byte_perm_S (w7[0], w6[3], selector); + c1[2] = hc_byte_perm_S (w6[3], w6[2], selector); + c1[1] = hc_byte_perm_S (w6[2], w6[1], selector); + c1[0] = hc_byte_perm_S (w6[1], w6[0], selector); + c0[3] = hc_byte_perm_S (w6[0], w5[3], selector); + c0[2] = hc_byte_perm_S (w5[3], w5[2], selector); + c0[1] = hc_byte_perm_S (w5[2], w5[1], selector); + c0[0] = hc_byte_perm_S (w5[1], w5[0], selector); + w7[3] = hc_byte_perm_S (w5[0], w4[3], selector); + w7[2] = hc_byte_perm_S (w4[3], w4[2], selector); + w7[1] = hc_byte_perm_S (w4[2], w4[1], selector); + w7[0] = hc_byte_perm_S (w4[1], w4[0], selector); + w6[3] = hc_byte_perm_S (w4[0], w3[3], selector); + w6[2] = hc_byte_perm_S (w3[3], w3[2], selector); + w6[1] = hc_byte_perm_S (w3[2], w3[1], selector); + w6[0] = hc_byte_perm_S (w3[1], w3[0], selector); + w5[3] = hc_byte_perm_S (w3[0], w2[3], selector); + w5[2] = hc_byte_perm_S (w2[3], w2[2], selector); + w5[1] = hc_byte_perm_S (w2[2], w2[1], selector); + w5[0] = hc_byte_perm_S (w2[1], w2[0], selector); + w4[3] = hc_byte_perm_S (w2[0], w1[3], selector); + w4[2] = hc_byte_perm_S (w1[3], w1[2], selector); + w4[1] = hc_byte_perm_S (w1[2], w1[1], selector); + w4[0] = hc_byte_perm_S (w1[1], w1[0], selector); + w3[3] = hc_byte_perm_S (w1[0], w0[3], selector); + w3[2] = hc_byte_perm_S (w0[3], w0[2], selector); + w3[1] = hc_byte_perm_S (w0[2], w0[1], selector); + w3[0] = hc_byte_perm_S (w0[1], w0[0], selector); + w2[3] = hc_byte_perm_S (w0[0], 0, selector); w2[2] = 0; w2[1] = 0; w2[0] = 0; @@ -41166,39 +41166,39 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be_S (u32 *w0, u32 *w1, u32 *w2, break; case 12: - c3[0] = __byte_perm_S ( 0, w7[3], selector); - c2[3] = __byte_perm_S (w7[3], w7[2], selector); - c2[2] = __byte_perm_S (w7[2], w7[1], selector); - c2[1] = __byte_perm_S (w7[1], w7[0], selector); - c2[0] = __byte_perm_S (w7[0], w6[3], selector); - c1[3] = __byte_perm_S (w6[3], w6[2], selector); - c1[2] = __byte_perm_S (w6[2], w6[1], selector); - c1[1] = __byte_perm_S (w6[1], w6[0], selector); - c1[0] = __byte_perm_S (w6[0], w5[3], selector); - c0[3] = __byte_perm_S (w5[3], w5[2], selector); - c0[2] = __byte_perm_S (w5[2], w5[1], selector); - c0[1] = __byte_perm_S (w5[1], w5[0], selector); - c0[0] = __byte_perm_S (w5[0], w4[3], selector); - w7[3] = __byte_perm_S (w4[3], w4[2], selector); - w7[2] = __byte_perm_S (w4[2], w4[1], selector); - w7[1] = __byte_perm_S (w4[1], w4[0], selector); - w7[0] = __byte_perm_S (w4[0], w3[3], selector); - w6[3] = __byte_perm_S (w3[3], w3[2], selector); - w6[2] = __byte_perm_S (w3[2], w3[1], selector); - w6[1] = __byte_perm_S (w3[1], w3[0], selector); - w6[0] = __byte_perm_S (w3[0], w2[3], selector); - w5[3] = __byte_perm_S (w2[3], w2[2], selector); - w5[2] = __byte_perm_S (w2[2], w2[1], selector); - w5[1] = __byte_perm_S (w2[1], w2[0], selector); - w5[0] = __byte_perm_S (w2[0], w1[3], selector); - w4[3] = __byte_perm_S (w1[3], w1[2], selector); - w4[2] = __byte_perm_S (w1[2], w1[1], selector); - w4[1] = __byte_perm_S (w1[1], w1[0], selector); - w4[0] = __byte_perm_S (w1[0], w0[3], selector); - w3[3] = __byte_perm_S (w0[3], w0[2], selector); - w3[2] = __byte_perm_S (w0[2], w0[1], selector); - w3[1] = __byte_perm_S (w0[1], w0[0], selector); - w3[0] = __byte_perm_S (w0[0], 0, selector); + c3[0] = hc_byte_perm_S ( 0, w7[3], selector); + c2[3] = hc_byte_perm_S (w7[3], w7[2], selector); + c2[2] = hc_byte_perm_S (w7[2], w7[1], selector); + c2[1] = hc_byte_perm_S (w7[1], w7[0], selector); + c2[0] = hc_byte_perm_S (w7[0], w6[3], selector); + c1[3] = hc_byte_perm_S (w6[3], w6[2], selector); + c1[2] = hc_byte_perm_S (w6[2], w6[1], selector); + c1[1] = hc_byte_perm_S (w6[1], w6[0], selector); + c1[0] = hc_byte_perm_S (w6[0], w5[3], selector); + c0[3] = hc_byte_perm_S (w5[3], w5[2], selector); + c0[2] = hc_byte_perm_S (w5[2], w5[1], selector); + c0[1] = hc_byte_perm_S (w5[1], w5[0], selector); + c0[0] = hc_byte_perm_S (w5[0], w4[3], selector); + w7[3] = hc_byte_perm_S (w4[3], w4[2], selector); + w7[2] = hc_byte_perm_S (w4[2], w4[1], selector); + w7[1] = hc_byte_perm_S (w4[1], w4[0], selector); + w7[0] = hc_byte_perm_S (w4[0], w3[3], selector); + w6[3] = hc_byte_perm_S (w3[3], w3[2], selector); + w6[2] = hc_byte_perm_S (w3[2], w3[1], selector); + w6[1] = hc_byte_perm_S (w3[1], w3[0], selector); + w6[0] = hc_byte_perm_S (w3[0], w2[3], selector); + w5[3] = hc_byte_perm_S (w2[3], w2[2], selector); + w5[2] = hc_byte_perm_S (w2[2], w2[1], selector); + w5[1] = hc_byte_perm_S (w2[1], w2[0], selector); + w5[0] = hc_byte_perm_S (w2[0], w1[3], selector); + w4[3] = hc_byte_perm_S (w1[3], w1[2], selector); + w4[2] = hc_byte_perm_S (w1[2], w1[1], selector); + w4[1] = hc_byte_perm_S (w1[1], w1[0], selector); + w4[0] = hc_byte_perm_S (w1[0], w0[3], selector); + w3[3] = hc_byte_perm_S (w0[3], w0[2], selector); + w3[2] = hc_byte_perm_S (w0[2], w0[1], selector); + w3[1] = hc_byte_perm_S (w0[1], w0[0], selector); + w3[0] = hc_byte_perm_S (w0[0], 0, selector); w2[3] = 0; w2[2] = 0; w2[1] = 0; @@ -41215,39 +41215,39 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be_S (u32 *w0, u32 *w1, u32 *w2, break; case 13: - c3[1] = __byte_perm_S ( 0, w7[3], selector); - c3[0] = __byte_perm_S (w7[3], w7[2], selector); - c2[3] = __byte_perm_S (w7[2], w7[1], selector); - c2[2] = __byte_perm_S (w7[1], w7[0], selector); - c2[1] = __byte_perm_S (w7[0], w6[3], selector); - c2[0] = __byte_perm_S (w6[3], w6[2], selector); - c1[3] = __byte_perm_S (w6[2], w6[1], selector); - c1[2] = __byte_perm_S (w6[1], w6[0], selector); - c1[1] = __byte_perm_S (w6[0], w5[3], selector); - c1[0] = __byte_perm_S (w5[3], w5[2], selector); - c0[3] = __byte_perm_S (w5[2], w5[1], selector); - c0[2] = __byte_perm_S (w5[1], w5[0], selector); - c0[1] = __byte_perm_S (w5[0], w4[3], selector); - c0[0] = __byte_perm_S (w4[3], w4[2], selector); - w7[3] = __byte_perm_S (w4[2], w4[1], selector); - w7[2] = __byte_perm_S (w4[1], w4[0], selector); - w7[1] = __byte_perm_S (w4[0], w3[3], selector); - w7[0] = __byte_perm_S (w3[3], w3[2], selector); - w6[3] = __byte_perm_S (w3[2], w3[1], selector); - w6[2] = __byte_perm_S (w3[1], w3[0], selector); - w6[1] = __byte_perm_S (w3[0], w2[3], selector); - w6[0] = __byte_perm_S (w2[3], w2[2], selector); - w5[3] = __byte_perm_S (w2[2], w2[1], selector); - w5[2] = __byte_perm_S (w2[1], w2[0], selector); - w5[1] = __byte_perm_S (w2[0], w1[3], selector); - w5[0] = __byte_perm_S (w1[3], w1[2], selector); - w4[3] = __byte_perm_S (w1[2], w1[1], selector); - w4[2] = __byte_perm_S (w1[1], w1[0], selector); - w4[1] = __byte_perm_S (w1[0], w0[3], selector); - w4[0] = __byte_perm_S (w0[3], w0[2], selector); - w3[3] = __byte_perm_S (w0[2], w0[1], selector); - w3[2] = __byte_perm_S (w0[1], w0[0], selector); - w3[1] = __byte_perm_S (w0[0], 0, selector); + c3[1] = hc_byte_perm_S ( 0, w7[3], selector); + c3[0] = hc_byte_perm_S (w7[3], w7[2], selector); + c2[3] = hc_byte_perm_S (w7[2], w7[1], selector); + c2[2] = hc_byte_perm_S (w7[1], w7[0], selector); + c2[1] = hc_byte_perm_S (w7[0], w6[3], selector); + c2[0] = hc_byte_perm_S (w6[3], w6[2], selector); + c1[3] = hc_byte_perm_S (w6[2], w6[1], selector); + c1[2] = hc_byte_perm_S (w6[1], w6[0], selector); + c1[1] = hc_byte_perm_S (w6[0], w5[3], selector); + c1[0] = hc_byte_perm_S (w5[3], w5[2], selector); + c0[3] = hc_byte_perm_S (w5[2], w5[1], selector); + c0[2] = hc_byte_perm_S (w5[1], w5[0], selector); + c0[1] = hc_byte_perm_S (w5[0], w4[3], selector); + c0[0] = hc_byte_perm_S (w4[3], w4[2], selector); + w7[3] = hc_byte_perm_S (w4[2], w4[1], selector); + w7[2] = hc_byte_perm_S (w4[1], w4[0], selector); + w7[1] = hc_byte_perm_S (w4[0], w3[3], selector); + w7[0] = hc_byte_perm_S (w3[3], w3[2], selector); + w6[3] = hc_byte_perm_S (w3[2], w3[1], selector); + w6[2] = hc_byte_perm_S (w3[1], w3[0], selector); + w6[1] = hc_byte_perm_S (w3[0], w2[3], selector); + w6[0] = hc_byte_perm_S (w2[3], w2[2], selector); + w5[3] = hc_byte_perm_S (w2[2], w2[1], selector); + w5[2] = hc_byte_perm_S (w2[1], w2[0], selector); + w5[1] = hc_byte_perm_S (w2[0], w1[3], selector); + w5[0] = hc_byte_perm_S (w1[3], w1[2], selector); + w4[3] = hc_byte_perm_S (w1[2], w1[1], selector); + w4[2] = hc_byte_perm_S (w1[1], w1[0], selector); + w4[1] = hc_byte_perm_S (w1[0], w0[3], selector); + w4[0] = hc_byte_perm_S (w0[3], w0[2], selector); + w3[3] = hc_byte_perm_S (w0[2], w0[1], selector); + w3[2] = hc_byte_perm_S (w0[1], w0[0], selector); + w3[1] = hc_byte_perm_S (w0[0], 0, selector); w3[0] = 0; w2[3] = 0; w2[2] = 0; @@ -41265,39 +41265,39 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be_S (u32 *w0, u32 *w1, u32 *w2, break; case 14: - c3[2] = __byte_perm_S ( 0, w7[3], selector); - c3[1] = __byte_perm_S (w7[3], w7[2], selector); - c3[0] = __byte_perm_S (w7[2], w7[1], selector); - c2[3] = __byte_perm_S (w7[1], w7[0], selector); - c2[2] = __byte_perm_S (w7[0], w6[3], selector); - c2[1] = __byte_perm_S (w6[3], w6[2], selector); - c2[0] = __byte_perm_S (w6[2], w6[1], selector); - c1[3] = __byte_perm_S (w6[1], w6[0], selector); - c1[2] = __byte_perm_S (w6[0], w5[3], selector); - c1[1] = __byte_perm_S (w5[3], w5[2], selector); - c1[0] = __byte_perm_S (w5[2], w5[1], selector); - c0[3] = __byte_perm_S (w5[1], w5[0], selector); - c0[2] = __byte_perm_S (w5[0], w4[3], selector); - c0[1] = __byte_perm_S (w4[3], w4[2], selector); - c0[0] = __byte_perm_S (w4[2], w4[1], selector); - w7[3] = __byte_perm_S (w4[1], w4[0], selector); - w7[2] = __byte_perm_S (w4[0], w3[3], selector); - w7[1] = __byte_perm_S (w3[3], w3[2], selector); - w7[0] = __byte_perm_S (w3[2], w3[1], selector); - w6[3] = __byte_perm_S (w3[1], w3[0], selector); - w6[2] = __byte_perm_S (w3[0], w2[3], selector); - w6[1] = __byte_perm_S (w2[3], w2[2], selector); - w6[0] = __byte_perm_S (w2[2], w2[1], selector); - w5[3] = __byte_perm_S (w2[1], w2[0], selector); - w5[2] = __byte_perm_S (w2[0], w1[3], selector); - w5[1] = __byte_perm_S (w1[3], w1[2], selector); - w5[0] = __byte_perm_S (w1[2], w1[1], selector); - w4[3] = __byte_perm_S (w1[1], w1[0], selector); - w4[2] = __byte_perm_S (w1[0], w0[3], selector); - w4[1] = __byte_perm_S (w0[3], w0[2], selector); - w4[0] = __byte_perm_S (w0[2], w0[1], selector); - w3[3] = __byte_perm_S (w0[1], w0[0], selector); - w3[2] = __byte_perm_S (w0[0], 0, selector); + c3[2] = hc_byte_perm_S ( 0, w7[3], selector); + c3[1] = hc_byte_perm_S (w7[3], w7[2], selector); + c3[0] = hc_byte_perm_S (w7[2], w7[1], selector); + c2[3] = hc_byte_perm_S (w7[1], w7[0], selector); + c2[2] = hc_byte_perm_S (w7[0], w6[3], selector); + c2[1] = hc_byte_perm_S (w6[3], w6[2], selector); + c2[0] = hc_byte_perm_S (w6[2], w6[1], selector); + c1[3] = hc_byte_perm_S (w6[1], w6[0], selector); + c1[2] = hc_byte_perm_S (w6[0], w5[3], selector); + c1[1] = hc_byte_perm_S (w5[3], w5[2], selector); + c1[0] = hc_byte_perm_S (w5[2], w5[1], selector); + c0[3] = hc_byte_perm_S (w5[1], w5[0], selector); + c0[2] = hc_byte_perm_S (w5[0], w4[3], selector); + c0[1] = hc_byte_perm_S (w4[3], w4[2], selector); + c0[0] = hc_byte_perm_S (w4[2], w4[1], selector); + w7[3] = hc_byte_perm_S (w4[1], w4[0], selector); + w7[2] = hc_byte_perm_S (w4[0], w3[3], selector); + w7[1] = hc_byte_perm_S (w3[3], w3[2], selector); + w7[0] = hc_byte_perm_S (w3[2], w3[1], selector); + w6[3] = hc_byte_perm_S (w3[1], w3[0], selector); + w6[2] = hc_byte_perm_S (w3[0], w2[3], selector); + w6[1] = hc_byte_perm_S (w2[3], w2[2], selector); + w6[0] = hc_byte_perm_S (w2[2], w2[1], selector); + w5[3] = hc_byte_perm_S (w2[1], w2[0], selector); + w5[2] = hc_byte_perm_S (w2[0], w1[3], selector); + w5[1] = hc_byte_perm_S (w1[3], w1[2], selector); + w5[0] = hc_byte_perm_S (w1[2], w1[1], selector); + w4[3] = hc_byte_perm_S (w1[1], w1[0], selector); + w4[2] = hc_byte_perm_S (w1[0], w0[3], selector); + w4[1] = hc_byte_perm_S (w0[3], w0[2], selector); + w4[0] = hc_byte_perm_S (w0[2], w0[1], selector); + w3[3] = hc_byte_perm_S (w0[1], w0[0], selector); + w3[2] = hc_byte_perm_S (w0[0], 0, selector); w3[1] = 0; w3[0] = 0; w2[3] = 0; @@ -41316,39 +41316,39 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be_S (u32 *w0, u32 *w1, u32 *w2, break; case 15: - c3[3] = __byte_perm_S ( 0, w7[3], selector); - c3[2] = __byte_perm_S (w7[3], w7[2], selector); - c3[1] = __byte_perm_S (w7[2], w7[1], selector); - c3[0] = __byte_perm_S (w7[1], w7[0], selector); - c2[3] = __byte_perm_S (w7[0], w6[3], selector); - c2[2] = __byte_perm_S (w6[3], w6[2], selector); - c2[1] = __byte_perm_S (w6[2], w6[1], selector); - c2[0] = __byte_perm_S (w6[1], w6[0], selector); - c1[3] = __byte_perm_S (w6[0], w5[3], selector); - c1[2] = __byte_perm_S (w5[3], w5[2], selector); - c1[1] = __byte_perm_S (w5[2], w5[1], selector); - c1[0] = __byte_perm_S (w5[1], w5[0], selector); - c0[3] = __byte_perm_S (w5[0], w4[3], selector); - c0[2] = __byte_perm_S (w4[3], w4[2], selector); - c0[1] = __byte_perm_S (w4[2], w4[1], selector); - c0[0] = __byte_perm_S (w4[1], w4[0], selector); - w7[3] = __byte_perm_S (w4[0], w3[3], selector); - w7[2] = __byte_perm_S (w3[3], w3[2], selector); - w7[1] = __byte_perm_S (w3[2], w3[1], selector); - w7[0] = __byte_perm_S (w3[1], w3[0], selector); - w6[3] = __byte_perm_S (w3[0], w2[3], selector); - w6[2] = __byte_perm_S (w2[3], w2[2], selector); - w6[1] = __byte_perm_S (w2[2], w2[1], selector); - w6[0] = __byte_perm_S (w2[1], w2[0], selector); - w5[3] = __byte_perm_S (w2[0], w1[3], selector); - w5[2] = __byte_perm_S (w1[3], w1[2], selector); - w5[1] = __byte_perm_S (w1[2], w1[1], selector); - w5[0] = __byte_perm_S (w1[1], w1[0], selector); - w4[3] = __byte_perm_S (w1[0], w0[3], selector); - w4[2] = __byte_perm_S (w0[3], w0[2], selector); - w4[1] = __byte_perm_S (w0[2], w0[1], selector); - w4[0] = __byte_perm_S (w0[1], w0[0], selector); - w3[3] = __byte_perm_S (w0[0], 0, selector); + c3[3] = hc_byte_perm_S ( 0, w7[3], selector); + c3[2] = hc_byte_perm_S (w7[3], w7[2], selector); + c3[1] = hc_byte_perm_S (w7[2], w7[1], selector); + c3[0] = hc_byte_perm_S (w7[1], w7[0], selector); + c2[3] = hc_byte_perm_S (w7[0], w6[3], selector); + c2[2] = hc_byte_perm_S (w6[3], w6[2], selector); + c2[1] = hc_byte_perm_S (w6[2], w6[1], selector); + c2[0] = hc_byte_perm_S (w6[1], w6[0], selector); + c1[3] = hc_byte_perm_S (w6[0], w5[3], selector); + c1[2] = hc_byte_perm_S (w5[3], w5[2], selector); + c1[1] = hc_byte_perm_S (w5[2], w5[1], selector); + c1[0] = hc_byte_perm_S (w5[1], w5[0], selector); + c0[3] = hc_byte_perm_S (w5[0], w4[3], selector); + c0[2] = hc_byte_perm_S (w4[3], w4[2], selector); + c0[1] = hc_byte_perm_S (w4[2], w4[1], selector); + c0[0] = hc_byte_perm_S (w4[1], w4[0], selector); + w7[3] = hc_byte_perm_S (w4[0], w3[3], selector); + w7[2] = hc_byte_perm_S (w3[3], w3[2], selector); + w7[1] = hc_byte_perm_S (w3[2], w3[1], selector); + w7[0] = hc_byte_perm_S (w3[1], w3[0], selector); + w6[3] = hc_byte_perm_S (w3[0], w2[3], selector); + w6[2] = hc_byte_perm_S (w2[3], w2[2], selector); + w6[1] = hc_byte_perm_S (w2[2], w2[1], selector); + w6[0] = hc_byte_perm_S (w2[1], w2[0], selector); + w5[3] = hc_byte_perm_S (w2[0], w1[3], selector); + w5[2] = hc_byte_perm_S (w1[3], w1[2], selector); + w5[1] = hc_byte_perm_S (w1[2], w1[1], selector); + w5[0] = hc_byte_perm_S (w1[1], w1[0], selector); + w4[3] = hc_byte_perm_S (w1[0], w0[3], selector); + w4[2] = hc_byte_perm_S (w0[3], w0[2], selector); + w4[1] = hc_byte_perm_S (w0[2], w0[1], selector); + w4[0] = hc_byte_perm_S (w0[1], w0[0], selector); + w3[3] = hc_byte_perm_S (w0[0], 0, selector); w3[2] = 0; w3[1] = 0; w3[0] = 0; @@ -41368,39 +41368,39 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be_S (u32 *w0, u32 *w1, u32 *w2, break; case 16: - c4[0] = __byte_perm_S ( 0, w7[3], selector); - c3[3] = __byte_perm_S (w7[3], w7[2], selector); - c3[2] = __byte_perm_S (w7[2], w7[1], selector); - c3[1] = __byte_perm_S (w7[1], w7[0], selector); - c3[0] = __byte_perm_S (w7[0], w6[3], selector); - c2[3] = __byte_perm_S (w6[3], w6[2], selector); - c2[2] = __byte_perm_S (w6[2], w6[1], selector); - c2[1] = __byte_perm_S (w6[1], w6[0], selector); - c2[0] = __byte_perm_S (w6[0], w5[3], selector); - c1[3] = __byte_perm_S (w5[3], w5[2], selector); - c1[2] = __byte_perm_S (w5[2], w5[1], selector); - c1[1] = __byte_perm_S (w5[1], w5[0], selector); - c1[0] = __byte_perm_S (w5[0], w4[3], selector); - c0[3] = __byte_perm_S (w4[3], w4[2], selector); - c0[2] = __byte_perm_S (w4[2], w4[1], selector); - c0[1] = __byte_perm_S (w4[1], w4[0], selector); - c0[0] = __byte_perm_S (w4[0], w3[3], selector); - w7[3] = __byte_perm_S (w3[3], w3[2], selector); - w7[2] = __byte_perm_S (w3[2], w3[1], selector); - w7[1] = __byte_perm_S (w3[1], w3[0], selector); - w7[0] = __byte_perm_S (w3[0], w2[3], selector); - w6[3] = __byte_perm_S (w2[3], w2[2], selector); - w6[2] = __byte_perm_S (w2[2], w2[1], selector); - w6[1] = __byte_perm_S (w2[1], w2[0], selector); - w6[0] = __byte_perm_S (w2[0], w1[3], selector); - w5[3] = __byte_perm_S (w1[3], w1[2], selector); - w5[2] = __byte_perm_S (w1[2], w1[1], selector); - w5[1] = __byte_perm_S (w1[1], w1[0], selector); - w5[0] = __byte_perm_S (w1[0], w0[3], selector); - w4[3] = __byte_perm_S (w0[3], w0[2], selector); - w4[2] = __byte_perm_S (w0[2], w0[1], selector); - w4[1] = __byte_perm_S (w0[1], w0[0], selector); - w4[0] = __byte_perm_S (w0[0], 0, selector); + c4[0] = hc_byte_perm_S ( 0, w7[3], selector); + c3[3] = hc_byte_perm_S (w7[3], w7[2], selector); + c3[2] = hc_byte_perm_S (w7[2], w7[1], selector); + c3[1] = hc_byte_perm_S (w7[1], w7[0], selector); + c3[0] = hc_byte_perm_S (w7[0], w6[3], selector); + c2[3] = hc_byte_perm_S (w6[3], w6[2], selector); + c2[2] = hc_byte_perm_S (w6[2], w6[1], selector); + c2[1] = hc_byte_perm_S (w6[1], w6[0], selector); + c2[0] = hc_byte_perm_S (w6[0], w5[3], selector); + c1[3] = hc_byte_perm_S (w5[3], w5[2], selector); + c1[2] = hc_byte_perm_S (w5[2], w5[1], selector); + c1[1] = hc_byte_perm_S (w5[1], w5[0], selector); + c1[0] = hc_byte_perm_S (w5[0], w4[3], selector); + c0[3] = hc_byte_perm_S (w4[3], w4[2], selector); + c0[2] = hc_byte_perm_S (w4[2], w4[1], selector); + c0[1] = hc_byte_perm_S (w4[1], w4[0], selector); + c0[0] = hc_byte_perm_S (w4[0], w3[3], selector); + w7[3] = hc_byte_perm_S (w3[3], w3[2], selector); + w7[2] = hc_byte_perm_S (w3[2], w3[1], selector); + w7[1] = hc_byte_perm_S (w3[1], w3[0], selector); + w7[0] = hc_byte_perm_S (w3[0], w2[3], selector); + w6[3] = hc_byte_perm_S (w2[3], w2[2], selector); + w6[2] = hc_byte_perm_S (w2[2], w2[1], selector); + w6[1] = hc_byte_perm_S (w2[1], w2[0], selector); + w6[0] = hc_byte_perm_S (w2[0], w1[3], selector); + w5[3] = hc_byte_perm_S (w1[3], w1[2], selector); + w5[2] = hc_byte_perm_S (w1[2], w1[1], selector); + w5[1] = hc_byte_perm_S (w1[1], w1[0], selector); + w5[0] = hc_byte_perm_S (w1[0], w0[3], selector); + w4[3] = hc_byte_perm_S (w0[3], w0[2], selector); + w4[2] = hc_byte_perm_S (w0[2], w0[1], selector); + w4[1] = hc_byte_perm_S (w0[1], w0[0], selector); + w4[0] = hc_byte_perm_S (w0[0], 0, selector); w3[3] = 0; w3[2] = 0; w3[1] = 0; @@ -41421,39 +41421,39 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be_S (u32 *w0, u32 *w1, u32 *w2, break; case 17: - c4[1] = __byte_perm_S ( 0, w7[3], selector); - c4[0] = __byte_perm_S (w7[3], w7[2], selector); - c3[3] = __byte_perm_S (w7[2], w7[1], selector); - c3[2] = __byte_perm_S (w7[1], w7[0], selector); - c3[1] = __byte_perm_S (w7[0], w6[3], selector); - c3[0] = __byte_perm_S (w6[3], w6[2], selector); - c2[3] = __byte_perm_S (w6[2], w6[1], selector); - c2[2] = __byte_perm_S (w6[1], w6[0], selector); - c2[1] = __byte_perm_S (w6[0], w5[3], selector); - c2[0] = __byte_perm_S (w5[3], w5[2], selector); - c1[3] = __byte_perm_S (w5[2], w5[1], selector); - c1[2] = __byte_perm_S (w5[1], w5[0], selector); - c1[1] = __byte_perm_S (w5[0], w4[3], selector); - c1[0] = __byte_perm_S (w4[3], w4[2], selector); - c0[3] = __byte_perm_S (w4[2], w4[1], selector); - c0[2] = __byte_perm_S (w4[1], w4[0], selector); - c0[1] = __byte_perm_S (w4[0], w3[3], selector); - c0[0] = __byte_perm_S (w3[3], w3[2], selector); - w7[3] = __byte_perm_S (w3[2], w3[1], selector); - w7[2] = __byte_perm_S (w3[1], w3[0], selector); - w7[1] = __byte_perm_S (w3[0], w2[3], selector); - w7[0] = __byte_perm_S (w2[3], w2[2], selector); - w6[3] = __byte_perm_S (w2[2], w2[1], selector); - w6[2] = __byte_perm_S (w2[1], w2[0], selector); - w6[1] = __byte_perm_S (w2[0], w1[3], selector); - w6[0] = __byte_perm_S (w1[3], w1[2], selector); - w5[3] = __byte_perm_S (w1[2], w1[1], selector); - w5[2] = __byte_perm_S (w1[1], w1[0], selector); - w5[1] = __byte_perm_S (w1[0], w0[3], selector); - w5[0] = __byte_perm_S (w0[3], w0[2], selector); - w4[3] = __byte_perm_S (w0[2], w0[1], selector); - w4[2] = __byte_perm_S (w0[1], w0[0], selector); - w4[1] = __byte_perm_S (w0[0], 0, selector); + c4[1] = hc_byte_perm_S ( 0, w7[3], selector); + c4[0] = hc_byte_perm_S (w7[3], w7[2], selector); + c3[3] = hc_byte_perm_S (w7[2], w7[1], selector); + c3[2] = hc_byte_perm_S (w7[1], w7[0], selector); + c3[1] = hc_byte_perm_S (w7[0], w6[3], selector); + c3[0] = hc_byte_perm_S (w6[3], w6[2], selector); + c2[3] = hc_byte_perm_S (w6[2], w6[1], selector); + c2[2] = hc_byte_perm_S (w6[1], w6[0], selector); + c2[1] = hc_byte_perm_S (w6[0], w5[3], selector); + c2[0] = hc_byte_perm_S (w5[3], w5[2], selector); + c1[3] = hc_byte_perm_S (w5[2], w5[1], selector); + c1[2] = hc_byte_perm_S (w5[1], w5[0], selector); + c1[1] = hc_byte_perm_S (w5[0], w4[3], selector); + c1[0] = hc_byte_perm_S (w4[3], w4[2], selector); + c0[3] = hc_byte_perm_S (w4[2], w4[1], selector); + c0[2] = hc_byte_perm_S (w4[1], w4[0], selector); + c0[1] = hc_byte_perm_S (w4[0], w3[3], selector); + c0[0] = hc_byte_perm_S (w3[3], w3[2], selector); + w7[3] = hc_byte_perm_S (w3[2], w3[1], selector); + w7[2] = hc_byte_perm_S (w3[1], w3[0], selector); + w7[1] = hc_byte_perm_S (w3[0], w2[3], selector); + w7[0] = hc_byte_perm_S (w2[3], w2[2], selector); + w6[3] = hc_byte_perm_S (w2[2], w2[1], selector); + w6[2] = hc_byte_perm_S (w2[1], w2[0], selector); + w6[1] = hc_byte_perm_S (w2[0], w1[3], selector); + w6[0] = hc_byte_perm_S (w1[3], w1[2], selector); + w5[3] = hc_byte_perm_S (w1[2], w1[1], selector); + w5[2] = hc_byte_perm_S (w1[1], w1[0], selector); + w5[1] = hc_byte_perm_S (w1[0], w0[3], selector); + w5[0] = hc_byte_perm_S (w0[3], w0[2], selector); + w4[3] = hc_byte_perm_S (w0[2], w0[1], selector); + w4[2] = hc_byte_perm_S (w0[1], w0[0], selector); + w4[1] = hc_byte_perm_S (w0[0], 0, selector); w4[0] = 0; w3[3] = 0; w3[2] = 0; @@ -41475,39 +41475,39 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be_S (u32 *w0, u32 *w1, u32 *w2, break; case 18: - c4[2] = __byte_perm_S ( 0, w7[3], selector); - c4[1] = __byte_perm_S (w7[3], w7[2], selector); - c4[0] = __byte_perm_S (w7[2], w7[1], selector); - c3[3] = __byte_perm_S (w7[1], w7[0], selector); - c3[2] = __byte_perm_S (w7[0], w6[3], selector); - c3[1] = __byte_perm_S (w6[3], w6[2], selector); - c3[0] = __byte_perm_S (w6[2], w6[1], selector); - c2[3] = __byte_perm_S (w6[1], w6[0], selector); - c2[2] = __byte_perm_S (w6[0], w5[3], selector); - c2[1] = __byte_perm_S (w5[3], w5[2], selector); - c2[0] = __byte_perm_S (w5[2], w5[1], selector); - c1[3] = __byte_perm_S (w5[1], w5[0], selector); - c1[2] = __byte_perm_S (w5[0], w4[3], selector); - c1[1] = __byte_perm_S (w4[3], w4[2], selector); - c1[0] = __byte_perm_S (w4[2], w4[1], selector); - c0[3] = __byte_perm_S (w4[1], w4[0], selector); - c0[2] = __byte_perm_S (w4[0], w3[3], selector); - c0[1] = __byte_perm_S (w3[3], w3[2], selector); - c0[0] = __byte_perm_S (w3[2], w3[1], selector); - w7[3] = __byte_perm_S (w3[1], w3[0], selector); - w7[2] = __byte_perm_S (w3[0], w2[3], selector); - w7[1] = __byte_perm_S (w2[3], w2[2], selector); - w7[0] = __byte_perm_S (w2[2], w2[1], selector); - w6[3] = __byte_perm_S (w2[1], w2[0], selector); - w6[2] = __byte_perm_S (w2[0], w1[3], selector); - w6[1] = __byte_perm_S (w1[3], w1[2], selector); - w6[0] = __byte_perm_S (w1[2], w1[1], selector); - w5[3] = __byte_perm_S (w1[1], w1[0], selector); - w5[2] = __byte_perm_S (w1[0], w0[3], selector); - w5[1] = __byte_perm_S (w0[3], w0[2], selector); - w5[0] = __byte_perm_S (w0[2], w0[1], selector); - w4[3] = __byte_perm_S (w0[1], w0[0], selector); - w4[2] = __byte_perm_S (w0[0], 0, selector); + c4[2] = hc_byte_perm_S ( 0, w7[3], selector); + c4[1] = hc_byte_perm_S (w7[3], w7[2], selector); + c4[0] = hc_byte_perm_S (w7[2], w7[1], selector); + c3[3] = hc_byte_perm_S (w7[1], w7[0], selector); + c3[2] = hc_byte_perm_S (w7[0], w6[3], selector); + c3[1] = hc_byte_perm_S (w6[3], w6[2], selector); + c3[0] = hc_byte_perm_S (w6[2], w6[1], selector); + c2[3] = hc_byte_perm_S (w6[1], w6[0], selector); + c2[2] = hc_byte_perm_S (w6[0], w5[3], selector); + c2[1] = hc_byte_perm_S (w5[3], w5[2], selector); + c2[0] = hc_byte_perm_S (w5[2], w5[1], selector); + c1[3] = hc_byte_perm_S (w5[1], w5[0], selector); + c1[2] = hc_byte_perm_S (w5[0], w4[3], selector); + c1[1] = hc_byte_perm_S (w4[3], w4[2], selector); + c1[0] = hc_byte_perm_S (w4[2], w4[1], selector); + c0[3] = hc_byte_perm_S (w4[1], w4[0], selector); + c0[2] = hc_byte_perm_S (w4[0], w3[3], selector); + c0[1] = hc_byte_perm_S (w3[3], w3[2], selector); + c0[0] = hc_byte_perm_S (w3[2], w3[1], selector); + w7[3] = hc_byte_perm_S (w3[1], w3[0], selector); + w7[2] = hc_byte_perm_S (w3[0], w2[3], selector); + w7[1] = hc_byte_perm_S (w2[3], w2[2], selector); + w7[0] = hc_byte_perm_S (w2[2], w2[1], selector); + w6[3] = hc_byte_perm_S (w2[1], w2[0], selector); + w6[2] = hc_byte_perm_S (w2[0], w1[3], selector); + w6[1] = hc_byte_perm_S (w1[3], w1[2], selector); + w6[0] = hc_byte_perm_S (w1[2], w1[1], selector); + w5[3] = hc_byte_perm_S (w1[1], w1[0], selector); + w5[2] = hc_byte_perm_S (w1[0], w0[3], selector); + w5[1] = hc_byte_perm_S (w0[3], w0[2], selector); + w5[0] = hc_byte_perm_S (w0[2], w0[1], selector); + w4[3] = hc_byte_perm_S (w0[1], w0[0], selector); + w4[2] = hc_byte_perm_S (w0[0], 0, selector); w4[1] = 0; w4[0] = 0; w3[3] = 0; @@ -41530,39 +41530,39 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be_S (u32 *w0, u32 *w1, u32 *w2, break; case 19: - c4[3] = __byte_perm_S ( 0, w7[3], selector); - c4[2] = __byte_perm_S (w7[3], w7[2], selector); - c4[1] = __byte_perm_S (w7[2], w7[1], selector); - c4[0] = __byte_perm_S (w7[1], w7[0], selector); - c3[3] = __byte_perm_S (w7[0], w6[3], selector); - c3[2] = __byte_perm_S (w6[3], w6[2], selector); - c3[1] = __byte_perm_S (w6[2], w6[1], selector); - c3[0] = __byte_perm_S (w6[1], w6[0], selector); - c2[3] = __byte_perm_S (w6[0], w5[3], selector); - c2[2] = __byte_perm_S (w5[3], w5[2], selector); - c2[1] = __byte_perm_S (w5[2], w5[1], selector); - c2[0] = __byte_perm_S (w5[1], w5[0], selector); - c1[3] = __byte_perm_S (w5[0], w4[3], selector); - c1[2] = __byte_perm_S (w4[3], w4[2], selector); - c1[1] = __byte_perm_S (w4[2], w4[1], selector); - c1[0] = __byte_perm_S (w4[1], w4[0], selector); - c0[3] = __byte_perm_S (w4[0], w3[3], selector); - c0[2] = __byte_perm_S (w3[3], w3[2], selector); - c0[1] = __byte_perm_S (w3[2], w3[1], selector); - c0[0] = __byte_perm_S (w3[1], w3[0], selector); - w7[3] = __byte_perm_S (w3[0], w2[3], selector); - w7[2] = __byte_perm_S (w2[3], w2[2], selector); - w7[1] = __byte_perm_S (w2[2], w2[1], selector); - w7[0] = __byte_perm_S (w2[1], w2[0], selector); - w6[3] = __byte_perm_S (w2[0], w1[3], selector); - w6[2] = __byte_perm_S (w1[3], w1[2], selector); - w6[1] = __byte_perm_S (w1[2], w1[1], selector); - w6[0] = __byte_perm_S (w1[1], w1[0], selector); - w5[3] = __byte_perm_S (w1[0], w0[3], selector); - w5[2] = __byte_perm_S (w0[3], w0[2], selector); - w5[1] = __byte_perm_S (w0[2], w0[1], selector); - w5[0] = __byte_perm_S (w0[1], w0[0], selector); - w4[3] = __byte_perm_S (w0[0], 0, selector); + c4[3] = hc_byte_perm_S ( 0, w7[3], selector); + c4[2] = hc_byte_perm_S (w7[3], w7[2], selector); + c4[1] = hc_byte_perm_S (w7[2], w7[1], selector); + c4[0] = hc_byte_perm_S (w7[1], w7[0], selector); + c3[3] = hc_byte_perm_S (w7[0], w6[3], selector); + c3[2] = hc_byte_perm_S (w6[3], w6[2], selector); + c3[1] = hc_byte_perm_S (w6[2], w6[1], selector); + c3[0] = hc_byte_perm_S (w6[1], w6[0], selector); + c2[3] = hc_byte_perm_S (w6[0], w5[3], selector); + c2[2] = hc_byte_perm_S (w5[3], w5[2], selector); + c2[1] = hc_byte_perm_S (w5[2], w5[1], selector); + c2[0] = hc_byte_perm_S (w5[1], w5[0], selector); + c1[3] = hc_byte_perm_S (w5[0], w4[3], selector); + c1[2] = hc_byte_perm_S (w4[3], w4[2], selector); + c1[1] = hc_byte_perm_S (w4[2], w4[1], selector); + c1[0] = hc_byte_perm_S (w4[1], w4[0], selector); + c0[3] = hc_byte_perm_S (w4[0], w3[3], selector); + c0[2] = hc_byte_perm_S (w3[3], w3[2], selector); + c0[1] = hc_byte_perm_S (w3[2], w3[1], selector); + c0[0] = hc_byte_perm_S (w3[1], w3[0], selector); + w7[3] = hc_byte_perm_S (w3[0], w2[3], selector); + w7[2] = hc_byte_perm_S (w2[3], w2[2], selector); + w7[1] = hc_byte_perm_S (w2[2], w2[1], selector); + w7[0] = hc_byte_perm_S (w2[1], w2[0], selector); + w6[3] = hc_byte_perm_S (w2[0], w1[3], selector); + w6[2] = hc_byte_perm_S (w1[3], w1[2], selector); + w6[1] = hc_byte_perm_S (w1[2], w1[1], selector); + w6[0] = hc_byte_perm_S (w1[1], w1[0], selector); + w5[3] = hc_byte_perm_S (w1[0], w0[3], selector); + w5[2] = hc_byte_perm_S (w0[3], w0[2], selector); + w5[1] = hc_byte_perm_S (w0[2], w0[1], selector); + w5[0] = hc_byte_perm_S (w0[1], w0[0], selector); + w4[3] = hc_byte_perm_S (w0[0], 0, selector); w4[2] = 0; w4[1] = 0; w4[0] = 0; @@ -41586,39 +41586,39 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be_S (u32 *w0, u32 *w1, u32 *w2, break; case 20: - c5[0] = __byte_perm_S ( 0, w7[3], selector); - c4[3] = __byte_perm_S (w7[3], w7[2], selector); - c4[2] = __byte_perm_S (w7[2], w7[1], selector); - c4[1] = __byte_perm_S (w7[1], w7[0], selector); - c4[0] = __byte_perm_S (w7[0], w6[3], selector); - c3[3] = __byte_perm_S (w6[3], w6[2], selector); - c3[2] = __byte_perm_S (w6[2], w6[1], selector); - c3[1] = __byte_perm_S (w6[1], w6[0], selector); - c3[0] = __byte_perm_S (w6[0], w5[3], selector); - c2[3] = __byte_perm_S (w5[3], w5[2], selector); - c2[2] = __byte_perm_S (w5[2], w5[1], selector); - c2[1] = __byte_perm_S (w5[1], w5[0], selector); - c2[0] = __byte_perm_S (w5[0], w4[3], selector); - c1[3] = __byte_perm_S (w4[3], w4[2], selector); - c1[2] = __byte_perm_S (w4[2], w4[1], selector); - c1[1] = __byte_perm_S (w4[1], w4[0], selector); - c1[0] = __byte_perm_S (w4[0], w3[3], selector); - c0[3] = __byte_perm_S (w3[3], w3[2], selector); - c0[2] = __byte_perm_S (w3[2], w3[1], selector); - c0[1] = __byte_perm_S (w3[1], w3[0], selector); - c0[0] = __byte_perm_S (w3[0], w2[3], selector); - w7[3] = __byte_perm_S (w2[3], w2[2], selector); - w7[2] = __byte_perm_S (w2[2], w2[1], selector); - w7[1] = __byte_perm_S (w2[1], w2[0], selector); - w7[0] = __byte_perm_S (w2[0], w1[3], selector); - w6[3] = __byte_perm_S (w1[3], w1[2], selector); - w6[2] = __byte_perm_S (w1[2], w1[1], selector); - w6[1] = __byte_perm_S (w1[1], w1[0], selector); - w6[0] = __byte_perm_S (w1[0], w0[3], selector); - w5[3] = __byte_perm_S (w0[3], w0[2], selector); - w5[2] = __byte_perm_S (w0[2], w0[1], selector); - w5[1] = __byte_perm_S (w0[1], w0[0], selector); - w5[0] = __byte_perm_S (w0[0], 0, selector); + c5[0] = hc_byte_perm_S ( 0, w7[3], selector); + c4[3] = hc_byte_perm_S (w7[3], w7[2], selector); + c4[2] = hc_byte_perm_S (w7[2], w7[1], selector); + c4[1] = hc_byte_perm_S (w7[1], w7[0], selector); + c4[0] = hc_byte_perm_S (w7[0], w6[3], selector); + c3[3] = hc_byte_perm_S (w6[3], w6[2], selector); + c3[2] = hc_byte_perm_S (w6[2], w6[1], selector); + c3[1] = hc_byte_perm_S (w6[1], w6[0], selector); + c3[0] = hc_byte_perm_S (w6[0], w5[3], selector); + c2[3] = hc_byte_perm_S (w5[3], w5[2], selector); + c2[2] = hc_byte_perm_S (w5[2], w5[1], selector); + c2[1] = hc_byte_perm_S (w5[1], w5[0], selector); + c2[0] = hc_byte_perm_S (w5[0], w4[3], selector); + c1[3] = hc_byte_perm_S (w4[3], w4[2], selector); + c1[2] = hc_byte_perm_S (w4[2], w4[1], selector); + c1[1] = hc_byte_perm_S (w4[1], w4[0], selector); + c1[0] = hc_byte_perm_S (w4[0], w3[3], selector); + c0[3] = hc_byte_perm_S (w3[3], w3[2], selector); + c0[2] = hc_byte_perm_S (w3[2], w3[1], selector); + c0[1] = hc_byte_perm_S (w3[1], w3[0], selector); + c0[0] = hc_byte_perm_S (w3[0], w2[3], selector); + w7[3] = hc_byte_perm_S (w2[3], w2[2], selector); + w7[2] = hc_byte_perm_S (w2[2], w2[1], selector); + w7[1] = hc_byte_perm_S (w2[1], w2[0], selector); + w7[0] = hc_byte_perm_S (w2[0], w1[3], selector); + w6[3] = hc_byte_perm_S (w1[3], w1[2], selector); + w6[2] = hc_byte_perm_S (w1[2], w1[1], selector); + w6[1] = hc_byte_perm_S (w1[1], w1[0], selector); + w6[0] = hc_byte_perm_S (w1[0], w0[3], selector); + w5[3] = hc_byte_perm_S (w0[3], w0[2], selector); + w5[2] = hc_byte_perm_S (w0[2], w0[1], selector); + w5[1] = hc_byte_perm_S (w0[1], w0[0], selector); + w5[0] = hc_byte_perm_S (w0[0], 0, selector); w4[3] = 0; w4[2] = 0; w4[1] = 0; @@ -41643,39 +41643,39 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be_S (u32 *w0, u32 *w1, u32 *w2, break; case 21: - c5[1] = __byte_perm_S ( 0, w7[3], selector); - c5[0] = __byte_perm_S (w7[3], w7[2], selector); - c4[3] = __byte_perm_S (w7[2], w7[1], selector); - c4[2] = __byte_perm_S (w7[1], w7[0], selector); - c4[1] = __byte_perm_S (w7[0], w6[3], selector); - c4[0] = __byte_perm_S (w6[3], w6[2], selector); - c3[3] = __byte_perm_S (w6[2], w6[1], selector); - c3[2] = __byte_perm_S (w6[1], w6[0], selector); - c3[1] = __byte_perm_S (w6[0], w5[3], selector); - c3[0] = __byte_perm_S (w5[3], w5[2], selector); - c2[3] = __byte_perm_S (w5[2], w5[1], selector); - c2[2] = __byte_perm_S (w5[1], w5[0], selector); - c2[1] = __byte_perm_S (w5[0], w4[3], selector); - c2[0] = __byte_perm_S (w4[3], w4[2], selector); - c1[3] = __byte_perm_S (w4[2], w4[1], selector); - c1[2] = __byte_perm_S (w4[1], w4[0], selector); - c1[1] = __byte_perm_S (w4[0], w3[3], selector); - c1[0] = __byte_perm_S (w3[3], w3[2], selector); - c0[3] = __byte_perm_S (w3[2], w3[1], selector); - c0[2] = __byte_perm_S (w3[1], w3[0], selector); - c0[1] = __byte_perm_S (w3[0], w2[3], selector); - c0[0] = __byte_perm_S (w2[3], w2[2], selector); - w7[3] = __byte_perm_S (w2[2], w2[1], selector); - w7[2] = __byte_perm_S (w2[1], w2[0], selector); - w7[1] = __byte_perm_S (w2[0], w1[3], selector); - w7[0] = __byte_perm_S (w1[3], w1[2], selector); - w6[3] = __byte_perm_S (w1[2], w1[1], selector); - w6[2] = __byte_perm_S (w1[1], w1[0], selector); - w6[1] = __byte_perm_S (w1[0], w0[3], selector); - w6[0] = __byte_perm_S (w0[3], w0[2], selector); - w5[3] = __byte_perm_S (w0[2], w0[1], selector); - w5[2] = __byte_perm_S (w0[1], w0[0], selector); - w5[1] = __byte_perm_S (w0[0], 0, selector); + c5[1] = hc_byte_perm_S ( 0, w7[3], selector); + c5[0] = hc_byte_perm_S (w7[3], w7[2], selector); + c4[3] = hc_byte_perm_S (w7[2], w7[1], selector); + c4[2] = hc_byte_perm_S (w7[1], w7[0], selector); + c4[1] = hc_byte_perm_S (w7[0], w6[3], selector); + c4[0] = hc_byte_perm_S (w6[3], w6[2], selector); + c3[3] = hc_byte_perm_S (w6[2], w6[1], selector); + c3[2] = hc_byte_perm_S (w6[1], w6[0], selector); + c3[1] = hc_byte_perm_S (w6[0], w5[3], selector); + c3[0] = hc_byte_perm_S (w5[3], w5[2], selector); + c2[3] = hc_byte_perm_S (w5[2], w5[1], selector); + c2[2] = hc_byte_perm_S (w5[1], w5[0], selector); + c2[1] = hc_byte_perm_S (w5[0], w4[3], selector); + c2[0] = hc_byte_perm_S (w4[3], w4[2], selector); + c1[3] = hc_byte_perm_S (w4[2], w4[1], selector); + c1[2] = hc_byte_perm_S (w4[1], w4[0], selector); + c1[1] = hc_byte_perm_S (w4[0], w3[3], selector); + c1[0] = hc_byte_perm_S (w3[3], w3[2], selector); + c0[3] = hc_byte_perm_S (w3[2], w3[1], selector); + c0[2] = hc_byte_perm_S (w3[1], w3[0], selector); + c0[1] = hc_byte_perm_S (w3[0], w2[3], selector); + c0[0] = hc_byte_perm_S (w2[3], w2[2], selector); + w7[3] = hc_byte_perm_S (w2[2], w2[1], selector); + w7[2] = hc_byte_perm_S (w2[1], w2[0], selector); + w7[1] = hc_byte_perm_S (w2[0], w1[3], selector); + w7[0] = hc_byte_perm_S (w1[3], w1[2], selector); + w6[3] = hc_byte_perm_S (w1[2], w1[1], selector); + w6[2] = hc_byte_perm_S (w1[1], w1[0], selector); + w6[1] = hc_byte_perm_S (w1[0], w0[3], selector); + w6[0] = hc_byte_perm_S (w0[3], w0[2], selector); + w5[3] = hc_byte_perm_S (w0[2], w0[1], selector); + w5[2] = hc_byte_perm_S (w0[1], w0[0], selector); + w5[1] = hc_byte_perm_S (w0[0], 0, selector); w5[0] = 0; w4[3] = 0; w4[2] = 0; @@ -41701,39 +41701,39 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be_S (u32 *w0, u32 *w1, u32 *w2, break; case 22: - c5[2] = __byte_perm_S ( 0, w7[3], selector); - c5[1] = __byte_perm_S (w7[3], w7[2], selector); - c5[0] = __byte_perm_S (w7[2], w7[1], selector); - c4[3] = __byte_perm_S (w7[1], w7[0], selector); - c4[2] = __byte_perm_S (w7[0], w6[3], selector); - c4[1] = __byte_perm_S (w6[3], w6[2], selector); - c4[0] = __byte_perm_S (w6[2], w6[1], selector); - c3[3] = __byte_perm_S (w6[1], w6[0], selector); - c3[2] = __byte_perm_S (w6[0], w5[3], selector); - c3[1] = __byte_perm_S (w5[3], w5[2], selector); - c3[0] = __byte_perm_S (w5[2], w5[1], selector); - c2[3] = __byte_perm_S (w5[1], w5[0], selector); - c2[2] = __byte_perm_S (w5[0], w4[3], selector); - c2[1] = __byte_perm_S (w4[3], w4[2], selector); - c2[0] = __byte_perm_S (w4[2], w4[1], selector); - c1[3] = __byte_perm_S (w4[1], w4[0], selector); - c1[2] = __byte_perm_S (w4[0], w3[3], selector); - c1[1] = __byte_perm_S (w3[3], w3[2], selector); - c1[0] = __byte_perm_S (w3[2], w3[1], selector); - c0[3] = __byte_perm_S (w3[1], w3[0], selector); - c0[2] = __byte_perm_S (w3[0], w2[3], selector); - c0[1] = __byte_perm_S (w2[3], w2[2], selector); - c0[0] = __byte_perm_S (w2[2], w2[1], selector); - w7[3] = __byte_perm_S (w2[1], w2[0], selector); - w7[2] = __byte_perm_S (w2[0], w1[3], selector); - w7[1] = __byte_perm_S (w1[3], w1[2], selector); - w7[0] = __byte_perm_S (w1[2], w1[1], selector); - w6[3] = __byte_perm_S (w1[1], w1[0], selector); - w6[2] = __byte_perm_S (w1[0], w0[3], selector); - w6[1] = __byte_perm_S (w0[3], w0[2], selector); - w6[0] = __byte_perm_S (w0[2], w0[1], selector); - w5[3] = __byte_perm_S (w0[1], w0[0], selector); - w5[2] = __byte_perm_S (w0[0], 0, selector); + c5[2] = hc_byte_perm_S ( 0, w7[3], selector); + c5[1] = hc_byte_perm_S (w7[3], w7[2], selector); + c5[0] = hc_byte_perm_S (w7[2], w7[1], selector); + c4[3] = hc_byte_perm_S (w7[1], w7[0], selector); + c4[2] = hc_byte_perm_S (w7[0], w6[3], selector); + c4[1] = hc_byte_perm_S (w6[3], w6[2], selector); + c4[0] = hc_byte_perm_S (w6[2], w6[1], selector); + c3[3] = hc_byte_perm_S (w6[1], w6[0], selector); + c3[2] = hc_byte_perm_S (w6[0], w5[3], selector); + c3[1] = hc_byte_perm_S (w5[3], w5[2], selector); + c3[0] = hc_byte_perm_S (w5[2], w5[1], selector); + c2[3] = hc_byte_perm_S (w5[1], w5[0], selector); + c2[2] = hc_byte_perm_S (w5[0], w4[3], selector); + c2[1] = hc_byte_perm_S (w4[3], w4[2], selector); + c2[0] = hc_byte_perm_S (w4[2], w4[1], selector); + c1[3] = hc_byte_perm_S (w4[1], w4[0], selector); + c1[2] = hc_byte_perm_S (w4[0], w3[3], selector); + c1[1] = hc_byte_perm_S (w3[3], w3[2], selector); + c1[0] = hc_byte_perm_S (w3[2], w3[1], selector); + c0[3] = hc_byte_perm_S (w3[1], w3[0], selector); + c0[2] = hc_byte_perm_S (w3[0], w2[3], selector); + c0[1] = hc_byte_perm_S (w2[3], w2[2], selector); + c0[0] = hc_byte_perm_S (w2[2], w2[1], selector); + w7[3] = hc_byte_perm_S (w2[1], w2[0], selector); + w7[2] = hc_byte_perm_S (w2[0], w1[3], selector); + w7[1] = hc_byte_perm_S (w1[3], w1[2], selector); + w7[0] = hc_byte_perm_S (w1[2], w1[1], selector); + w6[3] = hc_byte_perm_S (w1[1], w1[0], selector); + w6[2] = hc_byte_perm_S (w1[0], w0[3], selector); + w6[1] = hc_byte_perm_S (w0[3], w0[2], selector); + w6[0] = hc_byte_perm_S (w0[2], w0[1], selector); + w5[3] = hc_byte_perm_S (w0[1], w0[0], selector); + w5[2] = hc_byte_perm_S (w0[0], 0, selector); w5[1] = 0; w5[0] = 0; w4[3] = 0; @@ -41760,39 +41760,39 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be_S (u32 *w0, u32 *w1, u32 *w2, break; case 23: - c5[3] = __byte_perm_S ( 0, w7[3], selector); - c5[2] = __byte_perm_S (w7[3], w7[2], selector); - c5[1] = __byte_perm_S (w7[2], w7[1], selector); - c5[0] = __byte_perm_S (w7[1], w7[0], selector); - c4[3] = __byte_perm_S (w7[0], w6[3], selector); - c4[2] = __byte_perm_S (w6[3], w6[2], selector); - c4[1] = __byte_perm_S (w6[2], w6[1], selector); - c4[0] = __byte_perm_S (w6[1], w6[0], selector); - c3[3] = __byte_perm_S (w6[0], w5[3], selector); - c3[2] = __byte_perm_S (w5[3], w5[2], selector); - c3[1] = __byte_perm_S (w5[2], w5[1], selector); - c3[0] = __byte_perm_S (w5[1], w5[0], selector); - c2[3] = __byte_perm_S (w5[0], w4[3], selector); - c2[2] = __byte_perm_S (w4[3], w4[2], selector); - c2[1] = __byte_perm_S (w4[2], w4[1], selector); - c2[0] = __byte_perm_S (w4[1], w4[0], selector); - c1[3] = __byte_perm_S (w4[0], w3[3], selector); - c1[2] = __byte_perm_S (w3[3], w3[2], selector); - c1[1] = __byte_perm_S (w3[2], w3[1], selector); - c1[0] = __byte_perm_S (w3[1], w3[0], selector); - c0[3] = __byte_perm_S (w3[0], w2[3], selector); - c0[2] = __byte_perm_S (w2[3], w2[2], selector); - c0[1] = __byte_perm_S (w2[2], w2[1], selector); - c0[0] = __byte_perm_S (w2[1], w2[0], selector); - w7[3] = __byte_perm_S (w2[0], w1[3], selector); - w7[2] = __byte_perm_S (w1[3], w1[2], selector); - w7[1] = __byte_perm_S (w1[2], w1[1], selector); - w7[0] = __byte_perm_S (w1[1], w1[0], selector); - w6[3] = __byte_perm_S (w1[0], w0[3], selector); - w6[2] = __byte_perm_S (w0[3], w0[2], selector); - w6[1] = __byte_perm_S (w0[2], w0[1], selector); - w6[0] = __byte_perm_S (w0[1], w0[0], selector); - w5[3] = __byte_perm_S (w0[0], 0, selector); + c5[3] = hc_byte_perm_S ( 0, w7[3], selector); + c5[2] = hc_byte_perm_S (w7[3], w7[2], selector); + c5[1] = hc_byte_perm_S (w7[2], w7[1], selector); + c5[0] = hc_byte_perm_S (w7[1], w7[0], selector); + c4[3] = hc_byte_perm_S (w7[0], w6[3], selector); + c4[2] = hc_byte_perm_S (w6[3], w6[2], selector); + c4[1] = hc_byte_perm_S (w6[2], w6[1], selector); + c4[0] = hc_byte_perm_S (w6[1], w6[0], selector); + c3[3] = hc_byte_perm_S (w6[0], w5[3], selector); + c3[2] = hc_byte_perm_S (w5[3], w5[2], selector); + c3[1] = hc_byte_perm_S (w5[2], w5[1], selector); + c3[0] = hc_byte_perm_S (w5[1], w5[0], selector); + c2[3] = hc_byte_perm_S (w5[0], w4[3], selector); + c2[2] = hc_byte_perm_S (w4[3], w4[2], selector); + c2[1] = hc_byte_perm_S (w4[2], w4[1], selector); + c2[0] = hc_byte_perm_S (w4[1], w4[0], selector); + c1[3] = hc_byte_perm_S (w4[0], w3[3], selector); + c1[2] = hc_byte_perm_S (w3[3], w3[2], selector); + c1[1] = hc_byte_perm_S (w3[2], w3[1], selector); + c1[0] = hc_byte_perm_S (w3[1], w3[0], selector); + c0[3] = hc_byte_perm_S (w3[0], w2[3], selector); + c0[2] = hc_byte_perm_S (w2[3], w2[2], selector); + c0[1] = hc_byte_perm_S (w2[2], w2[1], selector); + c0[0] = hc_byte_perm_S (w2[1], w2[0], selector); + w7[3] = hc_byte_perm_S (w2[0], w1[3], selector); + w7[2] = hc_byte_perm_S (w1[3], w1[2], selector); + w7[1] = hc_byte_perm_S (w1[2], w1[1], selector); + w7[0] = hc_byte_perm_S (w1[1], w1[0], selector); + w6[3] = hc_byte_perm_S (w1[0], w0[3], selector); + w6[2] = hc_byte_perm_S (w0[3], w0[2], selector); + w6[1] = hc_byte_perm_S (w0[2], w0[1], selector); + w6[0] = hc_byte_perm_S (w0[1], w0[0], selector); + w5[3] = hc_byte_perm_S (w0[0], 0, selector); w5[2] = 0; w5[1] = 0; w5[0] = 0; @@ -41820,39 +41820,39 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be_S (u32 *w0, u32 *w1, u32 *w2, break; case 24: - c6[0] = __byte_perm_S ( 0, w7[3], selector); - c5[3] = __byte_perm_S (w7[3], w7[2], selector); - c5[2] = __byte_perm_S (w7[2], w7[1], selector); - c5[1] = __byte_perm_S (w7[1], w7[0], selector); - c5[0] = __byte_perm_S (w7[0], w6[3], selector); - c4[3] = __byte_perm_S (w6[3], w6[2], selector); - c4[2] = __byte_perm_S (w6[2], w6[1], selector); - c4[1] = __byte_perm_S (w6[1], w6[0], selector); - c4[0] = __byte_perm_S (w6[0], w5[3], selector); - c3[3] = __byte_perm_S (w5[3], w5[2], selector); - c3[2] = __byte_perm_S (w5[2], w5[1], selector); - c3[1] = __byte_perm_S (w5[1], w5[0], selector); - c3[0] = __byte_perm_S (w5[0], w4[3], selector); - c2[3] = __byte_perm_S (w4[3], w4[2], selector); - c2[2] = __byte_perm_S (w4[2], w4[1], selector); - c2[1] = __byte_perm_S (w4[1], w4[0], selector); - c2[0] = __byte_perm_S (w4[0], w3[3], selector); - c1[3] = __byte_perm_S (w3[3], w3[2], selector); - c1[2] = __byte_perm_S (w3[2], w3[1], selector); - c1[1] = __byte_perm_S (w3[1], w3[0], selector); - c1[0] = __byte_perm_S (w3[0], w2[3], selector); - c0[3] = __byte_perm_S (w2[3], w2[2], selector); - c0[2] = __byte_perm_S (w2[2], w2[1], selector); - c0[1] = __byte_perm_S (w2[1], w2[0], selector); - c0[0] = __byte_perm_S (w2[0], w1[3], selector); - w7[3] = __byte_perm_S (w1[3], w1[2], selector); - w7[2] = __byte_perm_S (w1[2], w1[1], selector); - w7[1] = __byte_perm_S (w1[1], w1[0], selector); - w7[0] = __byte_perm_S (w1[0], w0[3], selector); - w6[3] = __byte_perm_S (w0[3], w0[2], selector); - w6[2] = __byte_perm_S (w0[2], w0[1], selector); - w6[1] = __byte_perm_S (w0[1], w0[0], selector); - w6[0] = __byte_perm_S (w0[0], 0, selector); + c6[0] = hc_byte_perm_S ( 0, w7[3], selector); + c5[3] = hc_byte_perm_S (w7[3], w7[2], selector); + c5[2] = hc_byte_perm_S (w7[2], w7[1], selector); + c5[1] = hc_byte_perm_S (w7[1], w7[0], selector); + c5[0] = hc_byte_perm_S (w7[0], w6[3], selector); + c4[3] = hc_byte_perm_S (w6[3], w6[2], selector); + c4[2] = hc_byte_perm_S (w6[2], w6[1], selector); + c4[1] = hc_byte_perm_S (w6[1], w6[0], selector); + c4[0] = hc_byte_perm_S (w6[0], w5[3], selector); + c3[3] = hc_byte_perm_S (w5[3], w5[2], selector); + c3[2] = hc_byte_perm_S (w5[2], w5[1], selector); + c3[1] = hc_byte_perm_S (w5[1], w5[0], selector); + c3[0] = hc_byte_perm_S (w5[0], w4[3], selector); + c2[3] = hc_byte_perm_S (w4[3], w4[2], selector); + c2[2] = hc_byte_perm_S (w4[2], w4[1], selector); + c2[1] = hc_byte_perm_S (w4[1], w4[0], selector); + c2[0] = hc_byte_perm_S (w4[0], w3[3], selector); + c1[3] = hc_byte_perm_S (w3[3], w3[2], selector); + c1[2] = hc_byte_perm_S (w3[2], w3[1], selector); + c1[1] = hc_byte_perm_S (w3[1], w3[0], selector); + c1[0] = hc_byte_perm_S (w3[0], w2[3], selector); + c0[3] = hc_byte_perm_S (w2[3], w2[2], selector); + c0[2] = hc_byte_perm_S (w2[2], w2[1], selector); + c0[1] = hc_byte_perm_S (w2[1], w2[0], selector); + c0[0] = hc_byte_perm_S (w2[0], w1[3], selector); + w7[3] = hc_byte_perm_S (w1[3], w1[2], selector); + w7[2] = hc_byte_perm_S (w1[2], w1[1], selector); + w7[1] = hc_byte_perm_S (w1[1], w1[0], selector); + w7[0] = hc_byte_perm_S (w1[0], w0[3], selector); + w6[3] = hc_byte_perm_S (w0[3], w0[2], selector); + w6[2] = hc_byte_perm_S (w0[2], w0[1], selector); + w6[1] = hc_byte_perm_S (w0[1], w0[0], selector); + w6[0] = hc_byte_perm_S (w0[0], 0, selector); w5[3] = 0; w5[2] = 0; w5[1] = 0; @@ -41881,39 +41881,39 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be_S (u32 *w0, u32 *w1, u32 *w2, break; case 25: - c6[1] = __byte_perm_S ( 0, w7[3], selector); - c6[0] = __byte_perm_S (w7[3], w7[2], selector); - c5[3] = __byte_perm_S (w7[2], w7[1], selector); - c5[2] = __byte_perm_S (w7[1], w7[0], selector); - c5[1] = __byte_perm_S (w7[0], w6[3], selector); - c5[0] = __byte_perm_S (w6[3], w6[2], selector); - c4[3] = __byte_perm_S (w6[2], w6[1], selector); - c4[2] = __byte_perm_S (w6[1], w6[0], selector); - c4[1] = __byte_perm_S (w6[0], w5[3], selector); - c4[0] = __byte_perm_S (w5[3], w5[2], selector); - c3[3] = __byte_perm_S (w5[2], w5[1], selector); - c3[2] = __byte_perm_S (w5[1], w5[0], selector); - c3[1] = __byte_perm_S (w5[0], w4[3], selector); - c3[0] = __byte_perm_S (w4[3], w4[2], selector); - c2[3] = __byte_perm_S (w4[2], w4[1], selector); - c2[2] = __byte_perm_S (w4[1], w4[0], selector); - c2[1] = __byte_perm_S (w4[0], w3[3], selector); - c2[0] = __byte_perm_S (w3[3], w3[2], selector); - c1[3] = __byte_perm_S (w3[2], w3[1], selector); - c1[2] = __byte_perm_S (w3[1], w3[0], selector); - c1[1] = __byte_perm_S (w3[0], w2[3], selector); - c1[0] = __byte_perm_S (w2[3], w2[2], selector); - c0[3] = __byte_perm_S (w2[2], w2[1], selector); - c0[2] = __byte_perm_S (w2[1], w2[0], selector); - c0[1] = __byte_perm_S (w2[0], w1[3], selector); - c0[0] = __byte_perm_S (w1[3], w1[2], selector); - w7[3] = __byte_perm_S (w1[2], w1[1], selector); - w7[2] = __byte_perm_S (w1[1], w1[0], selector); - w7[1] = __byte_perm_S (w1[0], w0[3], selector); - w7[0] = __byte_perm_S (w0[3], w0[2], selector); - w6[3] = __byte_perm_S (w0[2], w0[1], selector); - w6[2] = __byte_perm_S (w0[1], w0[0], selector); - w6[1] = __byte_perm_S (w0[0], 0, selector); + c6[1] = hc_byte_perm_S ( 0, w7[3], selector); + c6[0] = hc_byte_perm_S (w7[3], w7[2], selector); + c5[3] = hc_byte_perm_S (w7[2], w7[1], selector); + c5[2] = hc_byte_perm_S (w7[1], w7[0], selector); + c5[1] = hc_byte_perm_S (w7[0], w6[3], selector); + c5[0] = hc_byte_perm_S (w6[3], w6[2], selector); + c4[3] = hc_byte_perm_S (w6[2], w6[1], selector); + c4[2] = hc_byte_perm_S (w6[1], w6[0], selector); + c4[1] = hc_byte_perm_S (w6[0], w5[3], selector); + c4[0] = hc_byte_perm_S (w5[3], w5[2], selector); + c3[3] = hc_byte_perm_S (w5[2], w5[1], selector); + c3[2] = hc_byte_perm_S (w5[1], w5[0], selector); + c3[1] = hc_byte_perm_S (w5[0], w4[3], selector); + c3[0] = hc_byte_perm_S (w4[3], w4[2], selector); + c2[3] = hc_byte_perm_S (w4[2], w4[1], selector); + c2[2] = hc_byte_perm_S (w4[1], w4[0], selector); + c2[1] = hc_byte_perm_S (w4[0], w3[3], selector); + c2[0] = hc_byte_perm_S (w3[3], w3[2], selector); + c1[3] = hc_byte_perm_S (w3[2], w3[1], selector); + c1[2] = hc_byte_perm_S (w3[1], w3[0], selector); + c1[1] = hc_byte_perm_S (w3[0], w2[3], selector); + c1[0] = hc_byte_perm_S (w2[3], w2[2], selector); + c0[3] = hc_byte_perm_S (w2[2], w2[1], selector); + c0[2] = hc_byte_perm_S (w2[1], w2[0], selector); + c0[1] = hc_byte_perm_S (w2[0], w1[3], selector); + c0[0] = hc_byte_perm_S (w1[3], w1[2], selector); + w7[3] = hc_byte_perm_S (w1[2], w1[1], selector); + w7[2] = hc_byte_perm_S (w1[1], w1[0], selector); + w7[1] = hc_byte_perm_S (w1[0], w0[3], selector); + w7[0] = hc_byte_perm_S (w0[3], w0[2], selector); + w6[3] = hc_byte_perm_S (w0[2], w0[1], selector); + w6[2] = hc_byte_perm_S (w0[1], w0[0], selector); + w6[1] = hc_byte_perm_S (w0[0], 0, selector); w6[0] = 0; w5[3] = 0; w5[2] = 0; @@ -41943,39 +41943,39 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be_S (u32 *w0, u32 *w1, u32 *w2, break; case 26: - c6[2] = __byte_perm_S ( 0, w7[3], selector); - c6[1] = __byte_perm_S (w7[3], w7[2], selector); - c6[0] = __byte_perm_S (w7[2], w7[1], selector); - c5[3] = __byte_perm_S (w7[1], w7[0], selector); - c5[2] = __byte_perm_S (w7[0], w6[3], selector); - c5[1] = __byte_perm_S (w6[3], w6[2], selector); - c5[0] = __byte_perm_S (w6[2], w6[1], selector); - c4[3] = __byte_perm_S (w6[1], w6[0], selector); - c4[2] = __byte_perm_S (w6[0], w5[3], selector); - c4[1] = __byte_perm_S (w5[3], w5[2], selector); - c4[0] = __byte_perm_S (w5[2], w5[1], selector); - c3[3] = __byte_perm_S (w5[1], w5[0], selector); - c3[2] = __byte_perm_S (w5[0], w4[3], selector); - c3[1] = __byte_perm_S (w4[3], w4[2], selector); - c3[0] = __byte_perm_S (w4[2], w4[1], selector); - c2[3] = __byte_perm_S (w4[1], w4[0], selector); - c2[2] = __byte_perm_S (w4[0], w3[3], selector); - c2[1] = __byte_perm_S (w3[3], w3[2], selector); - c2[0] = __byte_perm_S (w3[2], w3[1], selector); - c1[3] = __byte_perm_S (w3[1], w3[0], selector); - c1[2] = __byte_perm_S (w3[0], w2[3], selector); - c1[1] = __byte_perm_S (w2[3], w2[2], selector); - c1[0] = __byte_perm_S (w2[2], w2[1], selector); - c0[3] = __byte_perm_S (w2[1], w2[0], selector); - c0[2] = __byte_perm_S (w2[0], w1[3], selector); - c0[1] = __byte_perm_S (w1[3], w1[2], selector); - c0[0] = __byte_perm_S (w1[2], w1[1], selector); - w7[3] = __byte_perm_S (w1[1], w1[0], selector); - w7[2] = __byte_perm_S (w1[0], w0[3], selector); - w7[1] = __byte_perm_S (w0[3], w0[2], selector); - w7[0] = __byte_perm_S (w0[2], w0[1], selector); - w6[3] = __byte_perm_S (w0[1], w0[0], selector); - w6[2] = __byte_perm_S (w0[0], 0, selector); + c6[2] = hc_byte_perm_S ( 0, w7[3], selector); + c6[1] = hc_byte_perm_S (w7[3], w7[2], selector); + c6[0] = hc_byte_perm_S (w7[2], w7[1], selector); + c5[3] = hc_byte_perm_S (w7[1], w7[0], selector); + c5[2] = hc_byte_perm_S (w7[0], w6[3], selector); + c5[1] = hc_byte_perm_S (w6[3], w6[2], selector); + c5[0] = hc_byte_perm_S (w6[2], w6[1], selector); + c4[3] = hc_byte_perm_S (w6[1], w6[0], selector); + c4[2] = hc_byte_perm_S (w6[0], w5[3], selector); + c4[1] = hc_byte_perm_S (w5[3], w5[2], selector); + c4[0] = hc_byte_perm_S (w5[2], w5[1], selector); + c3[3] = hc_byte_perm_S (w5[1], w5[0], selector); + c3[2] = hc_byte_perm_S (w5[0], w4[3], selector); + c3[1] = hc_byte_perm_S (w4[3], w4[2], selector); + c3[0] = hc_byte_perm_S (w4[2], w4[1], selector); + c2[3] = hc_byte_perm_S (w4[1], w4[0], selector); + c2[2] = hc_byte_perm_S (w4[0], w3[3], selector); + c2[1] = hc_byte_perm_S (w3[3], w3[2], selector); + c2[0] = hc_byte_perm_S (w3[2], w3[1], selector); + c1[3] = hc_byte_perm_S (w3[1], w3[0], selector); + c1[2] = hc_byte_perm_S (w3[0], w2[3], selector); + c1[1] = hc_byte_perm_S (w2[3], w2[2], selector); + c1[0] = hc_byte_perm_S (w2[2], w2[1], selector); + c0[3] = hc_byte_perm_S (w2[1], w2[0], selector); + c0[2] = hc_byte_perm_S (w2[0], w1[3], selector); + c0[1] = hc_byte_perm_S (w1[3], w1[2], selector); + c0[0] = hc_byte_perm_S (w1[2], w1[1], selector); + w7[3] = hc_byte_perm_S (w1[1], w1[0], selector); + w7[2] = hc_byte_perm_S (w1[0], w0[3], selector); + w7[1] = hc_byte_perm_S (w0[3], w0[2], selector); + w7[0] = hc_byte_perm_S (w0[2], w0[1], selector); + w6[3] = hc_byte_perm_S (w0[1], w0[0], selector); + w6[2] = hc_byte_perm_S (w0[0], 0, selector); w6[1] = 0; w6[0] = 0; w5[3] = 0; @@ -42006,39 +42006,39 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be_S (u32 *w0, u32 *w1, u32 *w2, break; case 27: - c6[3] = __byte_perm_S ( 0, w7[3], selector); - c6[2] = __byte_perm_S (w7[3], w7[2], selector); - c6[1] = __byte_perm_S (w7[2], w7[1], selector); - c6[0] = __byte_perm_S (w7[1], w7[0], selector); - c5[3] = __byte_perm_S (w7[0], w6[3], selector); - c5[2] = __byte_perm_S (w6[3], w6[2], selector); - c5[1] = __byte_perm_S (w6[2], w6[1], selector); - c5[0] = __byte_perm_S (w6[1], w6[0], selector); - c4[3] = __byte_perm_S (w6[0], w5[3], selector); - c4[2] = __byte_perm_S (w5[3], w5[2], selector); - c4[1] = __byte_perm_S (w5[2], w5[1], selector); - c4[0] = __byte_perm_S (w5[1], w5[0], selector); - c3[3] = __byte_perm_S (w5[0], w4[3], selector); - c3[2] = __byte_perm_S (w4[3], w4[2], selector); - c3[1] = __byte_perm_S (w4[2], w4[1], selector); - c3[0] = __byte_perm_S (w4[1], w4[0], selector); - c2[3] = __byte_perm_S (w4[0], w3[3], selector); - c2[2] = __byte_perm_S (w3[3], w3[2], selector); - c2[1] = __byte_perm_S (w3[2], w3[1], selector); - c2[0] = __byte_perm_S (w3[1], w3[0], selector); - c1[3] = __byte_perm_S (w3[0], w2[3], selector); - c1[2] = __byte_perm_S (w2[3], w2[2], selector); - c1[1] = __byte_perm_S (w2[2], w2[1], selector); - c1[0] = __byte_perm_S (w2[1], w2[0], selector); - c0[3] = __byte_perm_S (w2[0], w1[3], selector); - c0[2] = __byte_perm_S (w1[3], w1[2], selector); - c0[1] = __byte_perm_S (w1[2], w1[1], selector); - c0[0] = __byte_perm_S (w1[1], w1[0], selector); - w7[3] = __byte_perm_S (w1[0], w0[3], selector); - w7[2] = __byte_perm_S (w0[3], w0[2], selector); - w7[1] = __byte_perm_S (w0[2], w0[1], selector); - w7[0] = __byte_perm_S (w0[1], w0[0], selector); - w6[3] = __byte_perm_S (w0[0], 0, selector); + c6[3] = hc_byte_perm_S ( 0, w7[3], selector); + c6[2] = hc_byte_perm_S (w7[3], w7[2], selector); + c6[1] = hc_byte_perm_S (w7[2], w7[1], selector); + c6[0] = hc_byte_perm_S (w7[1], w7[0], selector); + c5[3] = hc_byte_perm_S (w7[0], w6[3], selector); + c5[2] = hc_byte_perm_S (w6[3], w6[2], selector); + c5[1] = hc_byte_perm_S (w6[2], w6[1], selector); + c5[0] = hc_byte_perm_S (w6[1], w6[0], selector); + c4[3] = hc_byte_perm_S (w6[0], w5[3], selector); + c4[2] = hc_byte_perm_S (w5[3], w5[2], selector); + c4[1] = hc_byte_perm_S (w5[2], w5[1], selector); + c4[0] = hc_byte_perm_S (w5[1], w5[0], selector); + c3[3] = hc_byte_perm_S (w5[0], w4[3], selector); + c3[2] = hc_byte_perm_S (w4[3], w4[2], selector); + c3[1] = hc_byte_perm_S (w4[2], w4[1], selector); + c3[0] = hc_byte_perm_S (w4[1], w4[0], selector); + c2[3] = hc_byte_perm_S (w4[0], w3[3], selector); + c2[2] = hc_byte_perm_S (w3[3], w3[2], selector); + c2[1] = hc_byte_perm_S (w3[2], w3[1], selector); + c2[0] = hc_byte_perm_S (w3[1], w3[0], selector); + c1[3] = hc_byte_perm_S (w3[0], w2[3], selector); + c1[2] = hc_byte_perm_S (w2[3], w2[2], selector); + c1[1] = hc_byte_perm_S (w2[2], w2[1], selector); + c1[0] = hc_byte_perm_S (w2[1], w2[0], selector); + c0[3] = hc_byte_perm_S (w2[0], w1[3], selector); + c0[2] = hc_byte_perm_S (w1[3], w1[2], selector); + c0[1] = hc_byte_perm_S (w1[2], w1[1], selector); + c0[0] = hc_byte_perm_S (w1[1], w1[0], selector); + w7[3] = hc_byte_perm_S (w1[0], w0[3], selector); + w7[2] = hc_byte_perm_S (w0[3], w0[2], selector); + w7[1] = hc_byte_perm_S (w0[2], w0[1], selector); + w7[0] = hc_byte_perm_S (w0[1], w0[0], selector); + w6[3] = hc_byte_perm_S (w0[0], 0, selector); w6[2] = 0; w6[1] = 0; w6[0] = 0; @@ -42070,39 +42070,39 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be_S (u32 *w0, u32 *w1, u32 *w2, break; case 28: - c7[0] = __byte_perm_S ( 0, w7[3], selector); - c6[3] = __byte_perm_S (w7[3], w7[2], selector); - c6[2] = __byte_perm_S (w7[2], w7[1], selector); - c6[1] = __byte_perm_S (w7[1], w7[0], selector); - c6[0] = __byte_perm_S (w7[0], w6[3], selector); - c5[3] = __byte_perm_S (w6[3], w6[2], selector); - c5[2] = __byte_perm_S (w6[2], w6[1], selector); - c5[1] = __byte_perm_S (w6[1], w6[0], selector); - c5[0] = __byte_perm_S (w6[0], w5[3], selector); - c4[3] = __byte_perm_S (w5[3], w5[2], selector); - c4[2] = __byte_perm_S (w5[2], w5[1], selector); - c4[1] = __byte_perm_S (w5[1], w5[0], selector); - c4[0] = __byte_perm_S (w5[0], w4[3], selector); - c3[3] = __byte_perm_S (w4[3], w4[2], selector); - c3[2] = __byte_perm_S (w4[2], w4[1], selector); - c3[1] = __byte_perm_S (w4[1], w4[0], selector); - c3[0] = __byte_perm_S (w4[0], w3[3], selector); - c2[3] = __byte_perm_S (w3[3], w3[2], selector); - c2[2] = __byte_perm_S (w3[2], w3[1], selector); - c2[1] = __byte_perm_S (w3[1], w3[0], selector); - c2[0] = __byte_perm_S (w3[0], w2[3], selector); - c1[3] = __byte_perm_S (w2[3], w2[2], selector); - c1[2] = __byte_perm_S (w2[2], w2[1], selector); - c1[1] = __byte_perm_S (w2[1], w2[0], selector); - c1[0] = __byte_perm_S (w2[0], w1[3], selector); - c0[3] = __byte_perm_S (w1[3], w1[2], selector); - c0[2] = __byte_perm_S (w1[2], w1[1], selector); - c0[1] = __byte_perm_S (w1[1], w1[0], selector); - c0[0] = __byte_perm_S (w1[0], w0[3], selector); - w7[3] = __byte_perm_S (w0[3], w0[2], selector); - w7[2] = __byte_perm_S (w0[2], w0[1], selector); - w7[1] = __byte_perm_S (w0[1], w0[0], selector); - w7[0] = __byte_perm_S (w0[0], 0, selector); + c7[0] = hc_byte_perm_S ( 0, w7[3], selector); + c6[3] = hc_byte_perm_S (w7[3], w7[2], selector); + c6[2] = hc_byte_perm_S (w7[2], w7[1], selector); + c6[1] = hc_byte_perm_S (w7[1], w7[0], selector); + c6[0] = hc_byte_perm_S (w7[0], w6[3], selector); + c5[3] = hc_byte_perm_S (w6[3], w6[2], selector); + c5[2] = hc_byte_perm_S (w6[2], w6[1], selector); + c5[1] = hc_byte_perm_S (w6[1], w6[0], selector); + c5[0] = hc_byte_perm_S (w6[0], w5[3], selector); + c4[3] = hc_byte_perm_S (w5[3], w5[2], selector); + c4[2] = hc_byte_perm_S (w5[2], w5[1], selector); + c4[1] = hc_byte_perm_S (w5[1], w5[0], selector); + c4[0] = hc_byte_perm_S (w5[0], w4[3], selector); + c3[3] = hc_byte_perm_S (w4[3], w4[2], selector); + c3[2] = hc_byte_perm_S (w4[2], w4[1], selector); + c3[1] = hc_byte_perm_S (w4[1], w4[0], selector); + c3[0] = hc_byte_perm_S (w4[0], w3[3], selector); + c2[3] = hc_byte_perm_S (w3[3], w3[2], selector); + c2[2] = hc_byte_perm_S (w3[2], w3[1], selector); + c2[1] = hc_byte_perm_S (w3[1], w3[0], selector); + c2[0] = hc_byte_perm_S (w3[0], w2[3], selector); + c1[3] = hc_byte_perm_S (w2[3], w2[2], selector); + c1[2] = hc_byte_perm_S (w2[2], w2[1], selector); + c1[1] = hc_byte_perm_S (w2[1], w2[0], selector); + c1[0] = hc_byte_perm_S (w2[0], w1[3], selector); + c0[3] = hc_byte_perm_S (w1[3], w1[2], selector); + c0[2] = hc_byte_perm_S (w1[2], w1[1], selector); + c0[1] = hc_byte_perm_S (w1[1], w1[0], selector); + c0[0] = hc_byte_perm_S (w1[0], w0[3], selector); + w7[3] = hc_byte_perm_S (w0[3], w0[2], selector); + w7[2] = hc_byte_perm_S (w0[2], w0[1], selector); + w7[1] = hc_byte_perm_S (w0[1], w0[0], selector); + w7[0] = hc_byte_perm_S (w0[0], 0, selector); w6[3] = 0; w6[2] = 0; w6[1] = 0; @@ -42135,39 +42135,39 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be_S (u32 *w0, u32 *w1, u32 *w2, break; case 29: - c7[1] = __byte_perm_S ( 0, w7[3], selector); - c7[0] = __byte_perm_S (w7[3], w7[2], selector); - c6[3] = __byte_perm_S (w7[2], w7[1], selector); - c6[2] = __byte_perm_S (w7[1], w7[0], selector); - c6[1] = __byte_perm_S (w7[0], w6[3], selector); - c6[0] = __byte_perm_S (w6[3], w6[2], selector); - c5[3] = __byte_perm_S (w6[2], w6[1], selector); - c5[2] = __byte_perm_S (w6[1], w6[0], selector); - c5[1] = __byte_perm_S (w6[0], w5[3], selector); - c5[0] = __byte_perm_S (w5[3], w5[2], selector); - c4[3] = __byte_perm_S (w5[2], w5[1], selector); - c4[2] = __byte_perm_S (w5[1], w5[0], selector); - c4[1] = __byte_perm_S (w5[0], w4[3], selector); - c4[0] = __byte_perm_S (w4[3], w4[2], selector); - c3[3] = __byte_perm_S (w4[2], w4[1], selector); - c3[2] = __byte_perm_S (w4[1], w4[0], selector); - c3[1] = __byte_perm_S (w4[0], w3[3], selector); - c3[0] = __byte_perm_S (w3[3], w3[2], selector); - c2[3] = __byte_perm_S (w3[2], w3[1], selector); - c2[2] = __byte_perm_S (w3[1], w3[0], selector); - c2[1] = __byte_perm_S (w3[0], w2[3], selector); - c2[0] = __byte_perm_S (w2[3], w2[2], selector); - c1[3] = __byte_perm_S (w2[2], w2[1], selector); - c1[2] = __byte_perm_S (w2[1], w2[0], selector); - c1[1] = __byte_perm_S (w2[0], w1[3], selector); - c1[0] = __byte_perm_S (w1[3], w1[2], selector); - c0[3] = __byte_perm_S (w1[2], w1[1], selector); - c0[2] = __byte_perm_S (w1[1], w1[0], selector); - c0[1] = __byte_perm_S (w1[0], w0[3], selector); - c0[0] = __byte_perm_S (w0[3], w0[2], selector); - w7[3] = __byte_perm_S (w0[2], w0[1], selector); - w7[2] = __byte_perm_S (w0[1], w0[0], selector); - w7[1] = __byte_perm_S (w0[0], 0, selector); + c7[1] = hc_byte_perm_S ( 0, w7[3], selector); + c7[0] = hc_byte_perm_S (w7[3], w7[2], selector); + c6[3] = hc_byte_perm_S (w7[2], w7[1], selector); + c6[2] = hc_byte_perm_S (w7[1], w7[0], selector); + c6[1] = hc_byte_perm_S (w7[0], w6[3], selector); + c6[0] = hc_byte_perm_S (w6[3], w6[2], selector); + c5[3] = hc_byte_perm_S (w6[2], w6[1], selector); + c5[2] = hc_byte_perm_S (w6[1], w6[0], selector); + c5[1] = hc_byte_perm_S (w6[0], w5[3], selector); + c5[0] = hc_byte_perm_S (w5[3], w5[2], selector); + c4[3] = hc_byte_perm_S (w5[2], w5[1], selector); + c4[2] = hc_byte_perm_S (w5[1], w5[0], selector); + c4[1] = hc_byte_perm_S (w5[0], w4[3], selector); + c4[0] = hc_byte_perm_S (w4[3], w4[2], selector); + c3[3] = hc_byte_perm_S (w4[2], w4[1], selector); + c3[2] = hc_byte_perm_S (w4[1], w4[0], selector); + c3[1] = hc_byte_perm_S (w4[0], w3[3], selector); + c3[0] = hc_byte_perm_S (w3[3], w3[2], selector); + c2[3] = hc_byte_perm_S (w3[2], w3[1], selector); + c2[2] = hc_byte_perm_S (w3[1], w3[0], selector); + c2[1] = hc_byte_perm_S (w3[0], w2[3], selector); + c2[0] = hc_byte_perm_S (w2[3], w2[2], selector); + c1[3] = hc_byte_perm_S (w2[2], w2[1], selector); + c1[2] = hc_byte_perm_S (w2[1], w2[0], selector); + c1[1] = hc_byte_perm_S (w2[0], w1[3], selector); + c1[0] = hc_byte_perm_S (w1[3], w1[2], selector); + c0[3] = hc_byte_perm_S (w1[2], w1[1], selector); + c0[2] = hc_byte_perm_S (w1[1], w1[0], selector); + c0[1] = hc_byte_perm_S (w1[0], w0[3], selector); + c0[0] = hc_byte_perm_S (w0[3], w0[2], selector); + w7[3] = hc_byte_perm_S (w0[2], w0[1], selector); + w7[2] = hc_byte_perm_S (w0[1], w0[0], selector); + w7[1] = hc_byte_perm_S (w0[0], 0, selector); w7[0] = 0; w6[3] = 0; w6[2] = 0; @@ -42201,39 +42201,39 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be_S (u32 *w0, u32 *w1, u32 *w2, break; case 30: - c7[2] = __byte_perm_S ( 0, w7[3], selector); - c7[1] = __byte_perm_S (w7[3], w7[2], selector); - c7[0] = __byte_perm_S (w7[2], w7[1], selector); - c6[3] = __byte_perm_S (w7[1], w7[0], selector); - c6[2] = __byte_perm_S (w7[0], w6[3], selector); - c6[1] = __byte_perm_S (w6[3], w6[2], selector); - c6[0] = __byte_perm_S (w6[2], w6[1], selector); - c5[3] = __byte_perm_S (w6[1], w6[0], selector); - c5[2] = __byte_perm_S (w6[0], w5[3], selector); - c5[1] = __byte_perm_S (w5[3], w5[2], selector); - c5[0] = __byte_perm_S (w5[2], w5[1], selector); - c4[3] = __byte_perm_S (w5[1], w5[0], selector); - c4[2] = __byte_perm_S (w5[0], w4[3], selector); - c4[1] = __byte_perm_S (w4[3], w4[2], selector); - c4[0] = __byte_perm_S (w4[2], w4[1], selector); - c3[3] = __byte_perm_S (w4[1], w4[0], selector); - c3[2] = __byte_perm_S (w4[0], w3[3], selector); - c3[1] = __byte_perm_S (w3[3], w3[2], selector); - c3[0] = __byte_perm_S (w3[2], w3[1], selector); - c2[3] = __byte_perm_S (w3[1], w3[0], selector); - c2[2] = __byte_perm_S (w3[0], w2[3], selector); - c2[1] = __byte_perm_S (w2[3], w2[2], selector); - c2[0] = __byte_perm_S (w2[2], w2[1], selector); - c1[3] = __byte_perm_S (w2[1], w2[0], selector); - c1[2] = __byte_perm_S (w2[0], w1[3], selector); - c1[1] = __byte_perm_S (w1[3], w1[2], selector); - c1[0] = __byte_perm_S (w1[2], w1[1], selector); - c0[3] = __byte_perm_S (w1[1], w1[0], selector); - c0[2] = __byte_perm_S (w1[0], w0[3], selector); - c0[1] = __byte_perm_S (w0[3], w0[2], selector); - c0[0] = __byte_perm_S (w0[2], w0[1], selector); - w7[3] = __byte_perm_S (w0[1], w0[0], selector); - w7[2] = __byte_perm_S (w0[0], 0, selector); + c7[2] = hc_byte_perm_S ( 0, w7[3], selector); + c7[1] = hc_byte_perm_S (w7[3], w7[2], selector); + c7[0] = hc_byte_perm_S (w7[2], w7[1], selector); + c6[3] = hc_byte_perm_S (w7[1], w7[0], selector); + c6[2] = hc_byte_perm_S (w7[0], w6[3], selector); + c6[1] = hc_byte_perm_S (w6[3], w6[2], selector); + c6[0] = hc_byte_perm_S (w6[2], w6[1], selector); + c5[3] = hc_byte_perm_S (w6[1], w6[0], selector); + c5[2] = hc_byte_perm_S (w6[0], w5[3], selector); + c5[1] = hc_byte_perm_S (w5[3], w5[2], selector); + c5[0] = hc_byte_perm_S (w5[2], w5[1], selector); + c4[3] = hc_byte_perm_S (w5[1], w5[0], selector); + c4[2] = hc_byte_perm_S (w5[0], w4[3], selector); + c4[1] = hc_byte_perm_S (w4[3], w4[2], selector); + c4[0] = hc_byte_perm_S (w4[2], w4[1], selector); + c3[3] = hc_byte_perm_S (w4[1], w4[0], selector); + c3[2] = hc_byte_perm_S (w4[0], w3[3], selector); + c3[1] = hc_byte_perm_S (w3[3], w3[2], selector); + c3[0] = hc_byte_perm_S (w3[2], w3[1], selector); + c2[3] = hc_byte_perm_S (w3[1], w3[0], selector); + c2[2] = hc_byte_perm_S (w3[0], w2[3], selector); + c2[1] = hc_byte_perm_S (w2[3], w2[2], selector); + c2[0] = hc_byte_perm_S (w2[2], w2[1], selector); + c1[3] = hc_byte_perm_S (w2[1], w2[0], selector); + c1[2] = hc_byte_perm_S (w2[0], w1[3], selector); + c1[1] = hc_byte_perm_S (w1[3], w1[2], selector); + c1[0] = hc_byte_perm_S (w1[2], w1[1], selector); + c0[3] = hc_byte_perm_S (w1[1], w1[0], selector); + c0[2] = hc_byte_perm_S (w1[0], w0[3], selector); + c0[1] = hc_byte_perm_S (w0[3], w0[2], selector); + c0[0] = hc_byte_perm_S (w0[2], w0[1], selector); + w7[3] = hc_byte_perm_S (w0[1], w0[0], selector); + w7[2] = hc_byte_perm_S (w0[0], 0, selector); w7[1] = 0; w7[0] = 0; w6[3] = 0; @@ -42268,39 +42268,39 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be_S (u32 *w0, u32 *w1, u32 *w2, break; case 31: - c7[3] = __byte_perm_S ( 0, w7[3], selector); - c7[2] = __byte_perm_S (w7[3], w7[2], selector); - c7[1] = __byte_perm_S (w7[2], w7[1], selector); - c7[0] = __byte_perm_S (w7[1], w7[0], selector); - c6[3] = __byte_perm_S (w7[0], w6[3], selector); - c6[2] = __byte_perm_S (w6[3], w6[2], selector); - c6[1] = __byte_perm_S (w6[2], w6[1], selector); - c6[0] = __byte_perm_S (w6[1], w6[0], selector); - c5[3] = __byte_perm_S (w6[0], w5[3], selector); - c5[2] = __byte_perm_S (w5[3], w5[2], selector); - c5[1] = __byte_perm_S (w5[2], w5[1], selector); - c5[0] = __byte_perm_S (w5[1], w5[0], selector); - c4[3] = __byte_perm_S (w5[0], w4[3], selector); - c4[2] = __byte_perm_S (w4[3], w4[2], selector); - c4[1] = __byte_perm_S (w4[2], w4[1], selector); - c4[0] = __byte_perm_S (w4[1], w4[0], selector); - c3[3] = __byte_perm_S (w4[0], w3[3], selector); - c3[2] = __byte_perm_S (w3[3], w3[2], selector); - c3[1] = __byte_perm_S (w3[2], w3[1], selector); - c3[0] = __byte_perm_S (w3[1], w3[0], selector); - c2[3] = __byte_perm_S (w3[0], w2[3], selector); - c2[2] = __byte_perm_S (w2[3], w2[2], selector); - c2[1] = __byte_perm_S (w2[2], w2[1], selector); - c2[0] = __byte_perm_S (w2[1], w2[0], selector); - c1[3] = __byte_perm_S (w2[0], w1[3], selector); - c1[2] = __byte_perm_S (w1[3], w1[2], selector); - c1[1] = __byte_perm_S (w1[2], w1[1], selector); - c1[0] = __byte_perm_S (w1[1], w1[0], selector); - c0[3] = __byte_perm_S (w1[0], w0[3], selector); - c0[2] = __byte_perm_S (w0[3], w0[2], selector); - c0[1] = __byte_perm_S (w0[2], w0[1], selector); - c0[0] = __byte_perm_S (w0[1], w0[0], selector); - w7[3] = __byte_perm_S (w0[0], 0, selector); + c7[3] = hc_byte_perm_S ( 0, w7[3], selector); + c7[2] = hc_byte_perm_S (w7[3], w7[2], selector); + c7[1] = hc_byte_perm_S (w7[2], w7[1], selector); + c7[0] = hc_byte_perm_S (w7[1], w7[0], selector); + c6[3] = hc_byte_perm_S (w7[0], w6[3], selector); + c6[2] = hc_byte_perm_S (w6[3], w6[2], selector); + c6[1] = hc_byte_perm_S (w6[2], w6[1], selector); + c6[0] = hc_byte_perm_S (w6[1], w6[0], selector); + c5[3] = hc_byte_perm_S (w6[0], w5[3], selector); + c5[2] = hc_byte_perm_S (w5[3], w5[2], selector); + c5[1] = hc_byte_perm_S (w5[2], w5[1], selector); + c5[0] = hc_byte_perm_S (w5[1], w5[0], selector); + c4[3] = hc_byte_perm_S (w5[0], w4[3], selector); + c4[2] = hc_byte_perm_S (w4[3], w4[2], selector); + c4[1] = hc_byte_perm_S (w4[2], w4[1], selector); + c4[0] = hc_byte_perm_S (w4[1], w4[0], selector); + c3[3] = hc_byte_perm_S (w4[0], w3[3], selector); + c3[2] = hc_byte_perm_S (w3[3], w3[2], selector); + c3[1] = hc_byte_perm_S (w3[2], w3[1], selector); + c3[0] = hc_byte_perm_S (w3[1], w3[0], selector); + c2[3] = hc_byte_perm_S (w3[0], w2[3], selector); + c2[2] = hc_byte_perm_S (w2[3], w2[2], selector); + c2[1] = hc_byte_perm_S (w2[2], w2[1], selector); + c2[0] = hc_byte_perm_S (w2[1], w2[0], selector); + c1[3] = hc_byte_perm_S (w2[0], w1[3], selector); + c1[2] = hc_byte_perm_S (w1[3], w1[2], selector); + c1[1] = hc_byte_perm_S (w1[2], w1[1], selector); + c1[0] = hc_byte_perm_S (w1[1], w1[0], selector); + c0[3] = hc_byte_perm_S (w1[0], w0[3], selector); + c0[2] = hc_byte_perm_S (w0[3], w0[2], selector); + c0[1] = hc_byte_perm_S (w0[2], w0[1], selector); + c0[0] = hc_byte_perm_S (w0[1], w0[0], selector); + w7[3] = hc_byte_perm_S (w0[0], 0, selector); w7[2] = 0; w7[1] = 0; w7[0] = 0; @@ -42354,271 +42354,271 @@ DECLSPEC void switch_buffer_by_offset_1x64_le_S (u32 *w, const u32 offset) switch (offset_switch) { case 0: - w[63] = amd_bytealign_S (w[62], w[63], offset); - w[62] = amd_bytealign_S (w[61], w[62], offset); - w[61] = amd_bytealign_S (w[60], w[61], offset); - w[60] = amd_bytealign_S (w[59], w[60], offset); - w[59] = amd_bytealign_S (w[58], w[59], offset); - w[58] = amd_bytealign_S (w[57], w[58], offset); - w[57] = amd_bytealign_S (w[56], w[57], offset); - w[56] = amd_bytealign_S (w[55], w[56], offset); - w[55] = amd_bytealign_S (w[54], w[55], offset); - w[54] = amd_bytealign_S (w[53], w[54], offset); - w[53] = amd_bytealign_S (w[52], w[53], offset); - w[52] = amd_bytealign_S (w[51], w[52], offset); - w[51] = amd_bytealign_S (w[50], w[51], offset); - w[50] = amd_bytealign_S (w[49], w[50], offset); - w[49] = amd_bytealign_S (w[48], w[49], offset); - w[48] = amd_bytealign_S (w[47], w[48], offset); - w[47] = amd_bytealign_S (w[46], w[47], offset); - w[46] = amd_bytealign_S (w[45], w[46], offset); - w[45] = amd_bytealign_S (w[44], w[45], offset); - w[44] = amd_bytealign_S (w[43], w[44], offset); - w[43] = amd_bytealign_S (w[42], w[43], offset); - w[42] = amd_bytealign_S (w[41], w[42], offset); - w[41] = amd_bytealign_S (w[40], w[41], offset); - w[40] = amd_bytealign_S (w[39], w[40], offset); - w[39] = amd_bytealign_S (w[38], w[39], offset); - w[38] = amd_bytealign_S (w[37], w[38], offset); - w[37] = amd_bytealign_S (w[36], w[37], offset); - w[36] = amd_bytealign_S (w[35], w[36], offset); - w[35] = amd_bytealign_S (w[34], w[35], offset); - w[34] = amd_bytealign_S (w[33], w[34], offset); - w[33] = amd_bytealign_S (w[32], w[33], offset); - w[32] = amd_bytealign_S (w[31], w[32], offset); - w[31] = amd_bytealign_S (w[30], w[31], offset); - w[30] = amd_bytealign_S (w[29], w[30], offset); - w[29] = amd_bytealign_S (w[28], w[29], offset); - w[28] = amd_bytealign_S (w[27], w[28], offset); - w[27] = amd_bytealign_S (w[26], w[27], offset); - w[26] = amd_bytealign_S (w[25], w[26], offset); - w[25] = amd_bytealign_S (w[24], w[25], offset); - w[24] = amd_bytealign_S (w[23], w[24], offset); - w[23] = amd_bytealign_S (w[22], w[23], offset); - w[22] = amd_bytealign_S (w[21], w[22], offset); - w[21] = amd_bytealign_S (w[20], w[21], offset); - w[20] = amd_bytealign_S (w[19], w[20], offset); - w[19] = amd_bytealign_S (w[18], w[19], offset); - w[18] = amd_bytealign_S (w[17], w[18], offset); - w[17] = amd_bytealign_S (w[16], w[17], offset); - w[16] = amd_bytealign_S (w[15], w[16], offset); - w[15] = amd_bytealign_S (w[14], w[15], offset); - w[14] = amd_bytealign_S (w[13], w[14], offset); - w[13] = amd_bytealign_S (w[12], w[13], offset); - w[12] = amd_bytealign_S (w[11], w[12], offset); - w[11] = amd_bytealign_S (w[10], w[11], offset); - w[10] = amd_bytealign_S (w[ 9], w[10], offset); - w[ 9] = amd_bytealign_S (w[ 8], w[ 9], offset); - w[ 8] = amd_bytealign_S (w[ 7], w[ 8], offset); - w[ 7] = amd_bytealign_S (w[ 6], w[ 7], offset); - w[ 6] = amd_bytealign_S (w[ 5], w[ 6], offset); - w[ 5] = amd_bytealign_S (w[ 4], w[ 5], offset); - w[ 4] = amd_bytealign_S (w[ 3], w[ 4], offset); - w[ 3] = amd_bytealign_S (w[ 2], w[ 3], offset); - w[ 2] = amd_bytealign_S (w[ 1], w[ 2], offset); - w[ 1] = amd_bytealign_S (w[ 0], w[ 1], offset); - w[ 0] = amd_bytealign_S ( 0, w[ 0], offset); + w[63] = hc_bytealign_S (w[62], w[63], offset); + w[62] = hc_bytealign_S (w[61], w[62], offset); + w[61] = hc_bytealign_S (w[60], w[61], offset); + w[60] = hc_bytealign_S (w[59], w[60], offset); + w[59] = hc_bytealign_S (w[58], w[59], offset); + w[58] = hc_bytealign_S (w[57], w[58], offset); + w[57] = hc_bytealign_S (w[56], w[57], offset); + w[56] = hc_bytealign_S (w[55], w[56], offset); + w[55] = hc_bytealign_S (w[54], w[55], offset); + w[54] = hc_bytealign_S (w[53], w[54], offset); + w[53] = hc_bytealign_S (w[52], w[53], offset); + w[52] = hc_bytealign_S (w[51], w[52], offset); + w[51] = hc_bytealign_S (w[50], w[51], offset); + w[50] = hc_bytealign_S (w[49], w[50], offset); + w[49] = hc_bytealign_S (w[48], w[49], offset); + w[48] = hc_bytealign_S (w[47], w[48], offset); + w[47] = hc_bytealign_S (w[46], w[47], offset); + w[46] = hc_bytealign_S (w[45], w[46], offset); + w[45] = hc_bytealign_S (w[44], w[45], offset); + w[44] = hc_bytealign_S (w[43], w[44], offset); + w[43] = hc_bytealign_S (w[42], w[43], offset); + w[42] = hc_bytealign_S (w[41], w[42], offset); + w[41] = hc_bytealign_S (w[40], w[41], offset); + w[40] = hc_bytealign_S (w[39], w[40], offset); + w[39] = hc_bytealign_S (w[38], w[39], offset); + w[38] = hc_bytealign_S (w[37], w[38], offset); + w[37] = hc_bytealign_S (w[36], w[37], offset); + w[36] = hc_bytealign_S (w[35], w[36], offset); + w[35] = hc_bytealign_S (w[34], w[35], offset); + w[34] = hc_bytealign_S (w[33], w[34], offset); + w[33] = hc_bytealign_S (w[32], w[33], offset); + w[32] = hc_bytealign_S (w[31], w[32], offset); + w[31] = hc_bytealign_S (w[30], w[31], offset); + w[30] = hc_bytealign_S (w[29], w[30], offset); + w[29] = hc_bytealign_S (w[28], w[29], offset); + w[28] = hc_bytealign_S (w[27], w[28], offset); + w[27] = hc_bytealign_S (w[26], w[27], offset); + w[26] = hc_bytealign_S (w[25], w[26], offset); + w[25] = hc_bytealign_S (w[24], w[25], offset); + w[24] = hc_bytealign_S (w[23], w[24], offset); + w[23] = hc_bytealign_S (w[22], w[23], offset); + w[22] = hc_bytealign_S (w[21], w[22], offset); + w[21] = hc_bytealign_S (w[20], w[21], offset); + w[20] = hc_bytealign_S (w[19], w[20], offset); + w[19] = hc_bytealign_S (w[18], w[19], offset); + w[18] = hc_bytealign_S (w[17], w[18], offset); + w[17] = hc_bytealign_S (w[16], w[17], offset); + w[16] = hc_bytealign_S (w[15], w[16], offset); + w[15] = hc_bytealign_S (w[14], w[15], offset); + w[14] = hc_bytealign_S (w[13], w[14], offset); + w[13] = hc_bytealign_S (w[12], w[13], offset); + w[12] = hc_bytealign_S (w[11], w[12], offset); + w[11] = hc_bytealign_S (w[10], w[11], offset); + w[10] = hc_bytealign_S (w[ 9], w[10], offset); + w[ 9] = hc_bytealign_S (w[ 8], w[ 9], offset); + w[ 8] = hc_bytealign_S (w[ 7], w[ 8], offset); + w[ 7] = hc_bytealign_S (w[ 6], w[ 7], offset); + w[ 6] = hc_bytealign_S (w[ 5], w[ 6], offset); + w[ 5] = hc_bytealign_S (w[ 4], w[ 5], offset); + w[ 4] = hc_bytealign_S (w[ 3], w[ 4], offset); + w[ 3] = hc_bytealign_S (w[ 2], w[ 3], offset); + w[ 2] = hc_bytealign_S (w[ 1], w[ 2], offset); + w[ 1] = hc_bytealign_S (w[ 0], w[ 1], offset); + w[ 0] = hc_bytealign_S ( 0, w[ 0], offset); break; case 1: - w[63] = amd_bytealign_S (w[61], w[62], offset); - w[62] = amd_bytealign_S (w[60], w[61], offset); - w[61] = amd_bytealign_S (w[59], w[60], offset); - w[60] = amd_bytealign_S (w[58], w[59], offset); - w[59] = amd_bytealign_S (w[57], w[58], offset); - w[58] = amd_bytealign_S (w[56], w[57], offset); - w[57] = amd_bytealign_S (w[55], w[56], offset); - w[56] = amd_bytealign_S (w[54], w[55], offset); - w[55] = amd_bytealign_S (w[53], w[54], offset); - w[54] = amd_bytealign_S (w[52], w[53], offset); - w[53] = amd_bytealign_S (w[51], w[52], offset); - w[52] = amd_bytealign_S (w[50], w[51], offset); - w[51] = amd_bytealign_S (w[49], w[50], offset); - w[50] = amd_bytealign_S (w[48], w[49], offset); - w[49] = amd_bytealign_S (w[47], w[48], offset); - w[48] = amd_bytealign_S (w[46], w[47], offset); - w[47] = amd_bytealign_S (w[45], w[46], offset); - w[46] = amd_bytealign_S (w[44], w[45], offset); - w[45] = amd_bytealign_S (w[43], w[44], offset); - w[44] = amd_bytealign_S (w[42], w[43], offset); - w[43] = amd_bytealign_S (w[41], w[42], offset); - w[42] = amd_bytealign_S (w[40], w[41], offset); - w[41] = amd_bytealign_S (w[39], w[40], offset); - w[40] = amd_bytealign_S (w[38], w[39], offset); - w[39] = amd_bytealign_S (w[37], w[38], offset); - w[38] = amd_bytealign_S (w[36], w[37], offset); - w[37] = amd_bytealign_S (w[35], w[36], offset); - w[36] = amd_bytealign_S (w[34], w[35], offset); - w[35] = amd_bytealign_S (w[33], w[34], offset); - w[34] = amd_bytealign_S (w[32], w[33], offset); - w[33] = amd_bytealign_S (w[31], w[32], offset); - w[32] = amd_bytealign_S (w[30], w[31], offset); - w[31] = amd_bytealign_S (w[29], w[30], offset); - w[30] = amd_bytealign_S (w[28], w[29], offset); - w[29] = amd_bytealign_S (w[27], w[28], offset); - w[28] = amd_bytealign_S (w[26], w[27], offset); - w[27] = amd_bytealign_S (w[25], w[26], offset); - w[26] = amd_bytealign_S (w[24], w[25], offset); - w[25] = amd_bytealign_S (w[23], w[24], offset); - w[24] = amd_bytealign_S (w[22], w[23], offset); - w[23] = amd_bytealign_S (w[21], w[22], offset); - w[22] = amd_bytealign_S (w[20], w[21], offset); - w[21] = amd_bytealign_S (w[19], w[20], offset); - w[20] = amd_bytealign_S (w[18], w[19], offset); - w[19] = amd_bytealign_S (w[17], w[18], offset); - w[18] = amd_bytealign_S (w[16], w[17], offset); - w[17] = amd_bytealign_S (w[15], w[16], offset); - w[16] = amd_bytealign_S (w[14], w[15], offset); - w[15] = amd_bytealign_S (w[13], w[14], offset); - w[14] = amd_bytealign_S (w[12], w[13], offset); - w[13] = amd_bytealign_S (w[11], w[12], offset); - w[12] = amd_bytealign_S (w[10], w[11], offset); - w[11] = amd_bytealign_S (w[ 9], w[10], offset); - w[10] = amd_bytealign_S (w[ 8], w[ 9], offset); - w[ 9] = amd_bytealign_S (w[ 7], w[ 8], offset); - w[ 8] = amd_bytealign_S (w[ 6], w[ 7], offset); - w[ 7] = amd_bytealign_S (w[ 5], w[ 6], offset); - w[ 6] = amd_bytealign_S (w[ 4], w[ 5], offset); - w[ 5] = amd_bytealign_S (w[ 3], w[ 4], offset); - w[ 4] = amd_bytealign_S (w[ 2], w[ 3], offset); - w[ 3] = amd_bytealign_S (w[ 1], w[ 2], offset); - w[ 2] = amd_bytealign_S (w[ 0], w[ 1], offset); - w[ 1] = amd_bytealign_S ( 0, w[ 0], offset); + w[63] = hc_bytealign_S (w[61], w[62], offset); + w[62] = hc_bytealign_S (w[60], w[61], offset); + w[61] = hc_bytealign_S (w[59], w[60], offset); + w[60] = hc_bytealign_S (w[58], w[59], offset); + w[59] = hc_bytealign_S (w[57], w[58], offset); + w[58] = hc_bytealign_S (w[56], w[57], offset); + w[57] = hc_bytealign_S (w[55], w[56], offset); + w[56] = hc_bytealign_S (w[54], w[55], offset); + w[55] = hc_bytealign_S (w[53], w[54], offset); + w[54] = hc_bytealign_S (w[52], w[53], offset); + w[53] = hc_bytealign_S (w[51], w[52], offset); + w[52] = hc_bytealign_S (w[50], w[51], offset); + w[51] = hc_bytealign_S (w[49], w[50], offset); + w[50] = hc_bytealign_S (w[48], w[49], offset); + w[49] = hc_bytealign_S (w[47], w[48], offset); + w[48] = hc_bytealign_S (w[46], w[47], offset); + w[47] = hc_bytealign_S (w[45], w[46], offset); + w[46] = hc_bytealign_S (w[44], w[45], offset); + w[45] = hc_bytealign_S (w[43], w[44], offset); + w[44] = hc_bytealign_S (w[42], w[43], offset); + w[43] = hc_bytealign_S (w[41], w[42], offset); + w[42] = hc_bytealign_S (w[40], w[41], offset); + w[41] = hc_bytealign_S (w[39], w[40], offset); + w[40] = hc_bytealign_S (w[38], w[39], offset); + w[39] = hc_bytealign_S (w[37], w[38], offset); + w[38] = hc_bytealign_S (w[36], w[37], offset); + w[37] = hc_bytealign_S (w[35], w[36], offset); + w[36] = hc_bytealign_S (w[34], w[35], offset); + w[35] = hc_bytealign_S (w[33], w[34], offset); + w[34] = hc_bytealign_S (w[32], w[33], offset); + w[33] = hc_bytealign_S (w[31], w[32], offset); + w[32] = hc_bytealign_S (w[30], w[31], offset); + w[31] = hc_bytealign_S (w[29], w[30], offset); + w[30] = hc_bytealign_S (w[28], w[29], offset); + w[29] = hc_bytealign_S (w[27], w[28], offset); + w[28] = hc_bytealign_S (w[26], w[27], offset); + w[27] = hc_bytealign_S (w[25], w[26], offset); + w[26] = hc_bytealign_S (w[24], w[25], offset); + w[25] = hc_bytealign_S (w[23], w[24], offset); + w[24] = hc_bytealign_S (w[22], w[23], offset); + w[23] = hc_bytealign_S (w[21], w[22], offset); + w[22] = hc_bytealign_S (w[20], w[21], offset); + w[21] = hc_bytealign_S (w[19], w[20], offset); + w[20] = hc_bytealign_S (w[18], w[19], offset); + w[19] = hc_bytealign_S (w[17], w[18], offset); + w[18] = hc_bytealign_S (w[16], w[17], offset); + w[17] = hc_bytealign_S (w[15], w[16], offset); + w[16] = hc_bytealign_S (w[14], w[15], offset); + w[15] = hc_bytealign_S (w[13], w[14], offset); + w[14] = hc_bytealign_S (w[12], w[13], offset); + w[13] = hc_bytealign_S (w[11], w[12], offset); + w[12] = hc_bytealign_S (w[10], w[11], offset); + w[11] = hc_bytealign_S (w[ 9], w[10], offset); + w[10] = hc_bytealign_S (w[ 8], w[ 9], offset); + w[ 9] = hc_bytealign_S (w[ 7], w[ 8], offset); + w[ 8] = hc_bytealign_S (w[ 6], w[ 7], offset); + w[ 7] = hc_bytealign_S (w[ 5], w[ 6], offset); + w[ 6] = hc_bytealign_S (w[ 4], w[ 5], offset); + w[ 5] = hc_bytealign_S (w[ 3], w[ 4], offset); + w[ 4] = hc_bytealign_S (w[ 2], w[ 3], offset); + w[ 3] = hc_bytealign_S (w[ 1], w[ 2], offset); + w[ 2] = hc_bytealign_S (w[ 0], w[ 1], offset); + w[ 1] = hc_bytealign_S ( 0, w[ 0], offset); w[ 0] = 0; break; case 2: - w[63] = amd_bytealign_S (w[60], w[61], offset); - w[62] = amd_bytealign_S (w[59], w[60], offset); - w[61] = amd_bytealign_S (w[58], w[59], offset); - w[60] = amd_bytealign_S (w[57], w[58], offset); - w[59] = amd_bytealign_S (w[56], w[57], offset); - w[58] = amd_bytealign_S (w[55], w[56], offset); - w[57] = amd_bytealign_S (w[54], w[55], offset); - w[56] = amd_bytealign_S (w[53], w[54], offset); - w[55] = amd_bytealign_S (w[52], w[53], offset); - w[54] = amd_bytealign_S (w[51], w[52], offset); - w[53] = amd_bytealign_S (w[50], w[51], offset); - w[52] = amd_bytealign_S (w[49], w[50], offset); - w[51] = amd_bytealign_S (w[48], w[49], offset); - w[50] = amd_bytealign_S (w[47], w[48], offset); - w[49] = amd_bytealign_S (w[46], w[47], offset); - w[48] = amd_bytealign_S (w[45], w[46], offset); - w[47] = amd_bytealign_S (w[44], w[45], offset); - w[46] = amd_bytealign_S (w[43], w[44], offset); - w[45] = amd_bytealign_S (w[42], w[43], offset); - w[44] = amd_bytealign_S (w[41], w[42], offset); - w[43] = amd_bytealign_S (w[40], w[41], offset); - w[42] = amd_bytealign_S (w[39], w[40], offset); - w[41] = amd_bytealign_S (w[38], w[39], offset); - w[40] = amd_bytealign_S (w[37], w[38], offset); - w[39] = amd_bytealign_S (w[36], w[37], offset); - w[38] = amd_bytealign_S (w[35], w[36], offset); - w[37] = amd_bytealign_S (w[34], w[35], offset); - w[36] = amd_bytealign_S (w[33], w[34], offset); - w[35] = amd_bytealign_S (w[32], w[33], offset); - w[34] = amd_bytealign_S (w[31], w[32], offset); - w[33] = amd_bytealign_S (w[30], w[31], offset); - w[32] = amd_bytealign_S (w[29], w[30], offset); - w[31] = amd_bytealign_S (w[28], w[29], offset); - w[30] = amd_bytealign_S (w[27], w[28], offset); - w[29] = amd_bytealign_S (w[26], w[27], offset); - w[28] = amd_bytealign_S (w[25], w[26], offset); - w[27] = amd_bytealign_S (w[24], w[25], offset); - w[26] = amd_bytealign_S (w[23], w[24], offset); - w[25] = amd_bytealign_S (w[22], w[23], offset); - w[24] = amd_bytealign_S (w[21], w[22], offset); - w[23] = amd_bytealign_S (w[20], w[21], offset); - w[22] = amd_bytealign_S (w[19], w[20], offset); - w[21] = amd_bytealign_S (w[18], w[19], offset); - w[20] = amd_bytealign_S (w[17], w[18], offset); - w[19] = amd_bytealign_S (w[16], w[17], offset); - w[18] = amd_bytealign_S (w[15], w[16], offset); - w[17] = amd_bytealign_S (w[14], w[15], offset); - w[16] = amd_bytealign_S (w[13], w[14], offset); - w[15] = amd_bytealign_S (w[12], w[13], offset); - w[14] = amd_bytealign_S (w[11], w[12], offset); - w[13] = amd_bytealign_S (w[10], w[11], offset); - w[12] = amd_bytealign_S (w[ 9], w[10], offset); - w[11] = amd_bytealign_S (w[ 8], w[ 9], offset); - w[10] = amd_bytealign_S (w[ 7], w[ 8], offset); - w[ 9] = amd_bytealign_S (w[ 6], w[ 7], offset); - w[ 8] = amd_bytealign_S (w[ 5], w[ 6], offset); - w[ 7] = amd_bytealign_S (w[ 4], w[ 5], offset); - w[ 6] = amd_bytealign_S (w[ 3], w[ 4], offset); - w[ 5] = amd_bytealign_S (w[ 2], w[ 3], offset); - w[ 4] = amd_bytealign_S (w[ 1], w[ 2], offset); - w[ 3] = amd_bytealign_S (w[ 0], w[ 1], offset); - w[ 2] = amd_bytealign_S ( 0, w[ 0], offset); + w[63] = hc_bytealign_S (w[60], w[61], offset); + w[62] = hc_bytealign_S (w[59], w[60], offset); + w[61] = hc_bytealign_S (w[58], w[59], offset); + w[60] = hc_bytealign_S (w[57], w[58], offset); + w[59] = hc_bytealign_S (w[56], w[57], offset); + w[58] = hc_bytealign_S (w[55], w[56], offset); + w[57] = hc_bytealign_S (w[54], w[55], offset); + w[56] = hc_bytealign_S (w[53], w[54], offset); + w[55] = hc_bytealign_S (w[52], w[53], offset); + w[54] = hc_bytealign_S (w[51], w[52], offset); + w[53] = hc_bytealign_S (w[50], w[51], offset); + w[52] = hc_bytealign_S (w[49], w[50], offset); + w[51] = hc_bytealign_S (w[48], w[49], offset); + w[50] = hc_bytealign_S (w[47], w[48], offset); + w[49] = hc_bytealign_S (w[46], w[47], offset); + w[48] = hc_bytealign_S (w[45], w[46], offset); + w[47] = hc_bytealign_S (w[44], w[45], offset); + w[46] = hc_bytealign_S (w[43], w[44], offset); + w[45] = hc_bytealign_S (w[42], w[43], offset); + w[44] = hc_bytealign_S (w[41], w[42], offset); + w[43] = hc_bytealign_S (w[40], w[41], offset); + w[42] = hc_bytealign_S (w[39], w[40], offset); + w[41] = hc_bytealign_S (w[38], w[39], offset); + w[40] = hc_bytealign_S (w[37], w[38], offset); + w[39] = hc_bytealign_S (w[36], w[37], offset); + w[38] = hc_bytealign_S (w[35], w[36], offset); + w[37] = hc_bytealign_S (w[34], w[35], offset); + w[36] = hc_bytealign_S (w[33], w[34], offset); + w[35] = hc_bytealign_S (w[32], w[33], offset); + w[34] = hc_bytealign_S (w[31], w[32], offset); + w[33] = hc_bytealign_S (w[30], w[31], offset); + w[32] = hc_bytealign_S (w[29], w[30], offset); + w[31] = hc_bytealign_S (w[28], w[29], offset); + w[30] = hc_bytealign_S (w[27], w[28], offset); + w[29] = hc_bytealign_S (w[26], w[27], offset); + w[28] = hc_bytealign_S (w[25], w[26], offset); + w[27] = hc_bytealign_S (w[24], w[25], offset); + w[26] = hc_bytealign_S (w[23], w[24], offset); + w[25] = hc_bytealign_S (w[22], w[23], offset); + w[24] = hc_bytealign_S (w[21], w[22], offset); + w[23] = hc_bytealign_S (w[20], w[21], offset); + w[22] = hc_bytealign_S (w[19], w[20], offset); + w[21] = hc_bytealign_S (w[18], w[19], offset); + w[20] = hc_bytealign_S (w[17], w[18], offset); + w[19] = hc_bytealign_S (w[16], w[17], offset); + w[18] = hc_bytealign_S (w[15], w[16], offset); + w[17] = hc_bytealign_S (w[14], w[15], offset); + w[16] = hc_bytealign_S (w[13], w[14], offset); + w[15] = hc_bytealign_S (w[12], w[13], offset); + w[14] = hc_bytealign_S (w[11], w[12], offset); + w[13] = hc_bytealign_S (w[10], w[11], offset); + w[12] = hc_bytealign_S (w[ 9], w[10], offset); + w[11] = hc_bytealign_S (w[ 8], w[ 9], offset); + w[10] = hc_bytealign_S (w[ 7], w[ 8], offset); + w[ 9] = hc_bytealign_S (w[ 6], w[ 7], offset); + w[ 8] = hc_bytealign_S (w[ 5], w[ 6], offset); + w[ 7] = hc_bytealign_S (w[ 4], w[ 5], offset); + w[ 6] = hc_bytealign_S (w[ 3], w[ 4], offset); + w[ 5] = hc_bytealign_S (w[ 2], w[ 3], offset); + w[ 4] = hc_bytealign_S (w[ 1], w[ 2], offset); + w[ 3] = hc_bytealign_S (w[ 0], w[ 1], offset); + w[ 2] = hc_bytealign_S ( 0, w[ 0], offset); w[ 1] = 0; w[ 0] = 0; break; case 3: - w[63] = amd_bytealign_S (w[59], w[60], offset); - w[62] = amd_bytealign_S (w[58], w[59], offset); - w[61] = amd_bytealign_S (w[57], w[58], offset); - w[60] = amd_bytealign_S (w[56], w[57], offset); - w[59] = amd_bytealign_S (w[55], w[56], offset); - w[58] = amd_bytealign_S (w[54], w[55], offset); - w[57] = amd_bytealign_S (w[53], w[54], offset); - w[56] = amd_bytealign_S (w[52], w[53], offset); - w[55] = amd_bytealign_S (w[51], w[52], offset); - w[54] = amd_bytealign_S (w[50], w[51], offset); - w[53] = amd_bytealign_S (w[49], w[50], offset); - w[52] = amd_bytealign_S (w[48], w[49], offset); - w[51] = amd_bytealign_S (w[47], w[48], offset); - w[50] = amd_bytealign_S (w[46], w[47], offset); - w[49] = amd_bytealign_S (w[45], w[46], offset); - w[48] = amd_bytealign_S (w[44], w[45], offset); - w[47] = amd_bytealign_S (w[43], w[44], offset); - w[46] = amd_bytealign_S (w[42], w[43], offset); - w[45] = amd_bytealign_S (w[41], w[42], offset); - w[44] = amd_bytealign_S (w[40], w[41], offset); - w[43] = amd_bytealign_S (w[39], w[40], offset); - w[42] = amd_bytealign_S (w[38], w[39], offset); - w[41] = amd_bytealign_S (w[37], w[38], offset); - w[40] = amd_bytealign_S (w[36], w[37], offset); - w[39] = amd_bytealign_S (w[35], w[36], offset); - w[38] = amd_bytealign_S (w[34], w[35], offset); - w[37] = amd_bytealign_S (w[33], w[34], offset); - w[36] = amd_bytealign_S (w[32], w[33], offset); - w[35] = amd_bytealign_S (w[31], w[32], offset); - w[34] = amd_bytealign_S (w[30], w[31], offset); - w[33] = amd_bytealign_S (w[29], w[30], offset); - w[32] = amd_bytealign_S (w[28], w[29], offset); - w[31] = amd_bytealign_S (w[27], w[28], offset); - w[30] = amd_bytealign_S (w[26], w[27], offset); - w[29] = amd_bytealign_S (w[25], w[26], offset); - w[28] = amd_bytealign_S (w[24], w[25], offset); - w[27] = amd_bytealign_S (w[23], w[24], offset); - w[26] = amd_bytealign_S (w[22], w[23], offset); - w[25] = amd_bytealign_S (w[21], w[22], offset); - w[24] = amd_bytealign_S (w[20], w[21], offset); - w[23] = amd_bytealign_S (w[19], w[20], offset); - w[22] = amd_bytealign_S (w[18], w[19], offset); - w[21] = amd_bytealign_S (w[17], w[18], offset); - w[20] = amd_bytealign_S (w[16], w[17], offset); - w[19] = amd_bytealign_S (w[15], w[16], offset); - w[18] = amd_bytealign_S (w[14], w[15], offset); - w[17] = amd_bytealign_S (w[13], w[14], offset); - w[16] = amd_bytealign_S (w[12], w[13], offset); - w[15] = amd_bytealign_S (w[11], w[12], offset); - w[14] = amd_bytealign_S (w[10], w[11], offset); - w[13] = amd_bytealign_S (w[ 9], w[10], offset); - w[12] = amd_bytealign_S (w[ 8], w[ 9], offset); - w[11] = amd_bytealign_S (w[ 7], w[ 8], offset); - w[10] = amd_bytealign_S (w[ 6], w[ 7], offset); - w[ 9] = amd_bytealign_S (w[ 5], w[ 6], offset); - w[ 8] = amd_bytealign_S (w[ 4], w[ 5], offset); - w[ 7] = amd_bytealign_S (w[ 3], w[ 4], offset); - w[ 6] = amd_bytealign_S (w[ 2], w[ 3], offset); - w[ 5] = amd_bytealign_S (w[ 1], w[ 2], offset); - w[ 4] = amd_bytealign_S (w[ 0], w[ 1], offset); - w[ 3] = amd_bytealign_S ( 0, w[ 0], offset); + w[63] = hc_bytealign_S (w[59], w[60], offset); + w[62] = hc_bytealign_S (w[58], w[59], offset); + w[61] = hc_bytealign_S (w[57], w[58], offset); + w[60] = hc_bytealign_S (w[56], w[57], offset); + w[59] = hc_bytealign_S (w[55], w[56], offset); + w[58] = hc_bytealign_S (w[54], w[55], offset); + w[57] = hc_bytealign_S (w[53], w[54], offset); + w[56] = hc_bytealign_S (w[52], w[53], offset); + w[55] = hc_bytealign_S (w[51], w[52], offset); + w[54] = hc_bytealign_S (w[50], w[51], offset); + w[53] = hc_bytealign_S (w[49], w[50], offset); + w[52] = hc_bytealign_S (w[48], w[49], offset); + w[51] = hc_bytealign_S (w[47], w[48], offset); + w[50] = hc_bytealign_S (w[46], w[47], offset); + w[49] = hc_bytealign_S (w[45], w[46], offset); + w[48] = hc_bytealign_S (w[44], w[45], offset); + w[47] = hc_bytealign_S (w[43], w[44], offset); + w[46] = hc_bytealign_S (w[42], w[43], offset); + w[45] = hc_bytealign_S (w[41], w[42], offset); + w[44] = hc_bytealign_S (w[40], w[41], offset); + w[43] = hc_bytealign_S (w[39], w[40], offset); + w[42] = hc_bytealign_S (w[38], w[39], offset); + w[41] = hc_bytealign_S (w[37], w[38], offset); + w[40] = hc_bytealign_S (w[36], w[37], offset); + w[39] = hc_bytealign_S (w[35], w[36], offset); + w[38] = hc_bytealign_S (w[34], w[35], offset); + w[37] = hc_bytealign_S (w[33], w[34], offset); + w[36] = hc_bytealign_S (w[32], w[33], offset); + w[35] = hc_bytealign_S (w[31], w[32], offset); + w[34] = hc_bytealign_S (w[30], w[31], offset); + w[33] = hc_bytealign_S (w[29], w[30], offset); + w[32] = hc_bytealign_S (w[28], w[29], offset); + w[31] = hc_bytealign_S (w[27], w[28], offset); + w[30] = hc_bytealign_S (w[26], w[27], offset); + w[29] = hc_bytealign_S (w[25], w[26], offset); + w[28] = hc_bytealign_S (w[24], w[25], offset); + w[27] = hc_bytealign_S (w[23], w[24], offset); + w[26] = hc_bytealign_S (w[22], w[23], offset); + w[25] = hc_bytealign_S (w[21], w[22], offset); + w[24] = hc_bytealign_S (w[20], w[21], offset); + w[23] = hc_bytealign_S (w[19], w[20], offset); + w[22] = hc_bytealign_S (w[18], w[19], offset); + w[21] = hc_bytealign_S (w[17], w[18], offset); + w[20] = hc_bytealign_S (w[16], w[17], offset); + w[19] = hc_bytealign_S (w[15], w[16], offset); + w[18] = hc_bytealign_S (w[14], w[15], offset); + w[17] = hc_bytealign_S (w[13], w[14], offset); + w[16] = hc_bytealign_S (w[12], w[13], offset); + w[15] = hc_bytealign_S (w[11], w[12], offset); + w[14] = hc_bytealign_S (w[10], w[11], offset); + w[13] = hc_bytealign_S (w[ 9], w[10], offset); + w[12] = hc_bytealign_S (w[ 8], w[ 9], offset); + w[11] = hc_bytealign_S (w[ 7], w[ 8], offset); + w[10] = hc_bytealign_S (w[ 6], w[ 7], offset); + w[ 9] = hc_bytealign_S (w[ 5], w[ 6], offset); + w[ 8] = hc_bytealign_S (w[ 4], w[ 5], offset); + w[ 7] = hc_bytealign_S (w[ 3], w[ 4], offset); + w[ 6] = hc_bytealign_S (w[ 2], w[ 3], offset); + w[ 5] = hc_bytealign_S (w[ 1], w[ 2], offset); + w[ 4] = hc_bytealign_S (w[ 0], w[ 1], offset); + w[ 3] = hc_bytealign_S ( 0, w[ 0], offset); w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; @@ -42626,66 +42626,66 @@ DECLSPEC void switch_buffer_by_offset_1x64_le_S (u32 *w, const u32 offset) break; case 4: - w[63] = amd_bytealign_S (w[58], w[59], offset); - w[62] = amd_bytealign_S (w[57], w[58], offset); - w[61] = amd_bytealign_S (w[56], w[57], offset); - w[60] = amd_bytealign_S (w[55], w[56], offset); - w[59] = amd_bytealign_S (w[54], w[55], offset); - w[58] = amd_bytealign_S (w[53], w[54], offset); - w[57] = amd_bytealign_S (w[52], w[53], offset); - w[56] = amd_bytealign_S (w[51], w[52], offset); - w[55] = amd_bytealign_S (w[50], w[51], offset); - w[54] = amd_bytealign_S (w[49], w[50], offset); - w[53] = amd_bytealign_S (w[48], w[49], offset); - w[52] = amd_bytealign_S (w[47], w[48], offset); - w[51] = amd_bytealign_S (w[46], w[47], offset); - w[50] = amd_bytealign_S (w[45], w[46], offset); - w[49] = amd_bytealign_S (w[44], w[45], offset); - w[48] = amd_bytealign_S (w[43], w[44], offset); - w[47] = amd_bytealign_S (w[42], w[43], offset); - w[46] = amd_bytealign_S (w[41], w[42], offset); - w[45] = amd_bytealign_S (w[40], w[41], offset); - w[44] = amd_bytealign_S (w[39], w[40], offset); - w[43] = amd_bytealign_S (w[38], w[39], offset); - w[42] = amd_bytealign_S (w[37], w[38], offset); - w[41] = amd_bytealign_S (w[36], w[37], offset); - w[40] = amd_bytealign_S (w[35], w[36], offset); - w[39] = amd_bytealign_S (w[34], w[35], offset); - w[38] = amd_bytealign_S (w[33], w[34], offset); - w[37] = amd_bytealign_S (w[32], w[33], offset); - w[36] = amd_bytealign_S (w[31], w[32], offset); - w[35] = amd_bytealign_S (w[30], w[31], offset); - w[34] = amd_bytealign_S (w[29], w[30], offset); - w[33] = amd_bytealign_S (w[28], w[29], offset); - w[32] = amd_bytealign_S (w[27], w[28], offset); - w[31] = amd_bytealign_S (w[26], w[27], offset); - w[30] = amd_bytealign_S (w[25], w[26], offset); - w[29] = amd_bytealign_S (w[24], w[25], offset); - w[28] = amd_bytealign_S (w[23], w[24], offset); - w[27] = amd_bytealign_S (w[22], w[23], offset); - w[26] = amd_bytealign_S (w[21], w[22], offset); - w[25] = amd_bytealign_S (w[20], w[21], offset); - w[24] = amd_bytealign_S (w[19], w[20], offset); - w[23] = amd_bytealign_S (w[18], w[19], offset); - w[22] = amd_bytealign_S (w[17], w[18], offset); - w[21] = amd_bytealign_S (w[16], w[17], offset); - w[20] = amd_bytealign_S (w[15], w[16], offset); - w[19] = amd_bytealign_S (w[14], w[15], offset); - w[18] = amd_bytealign_S (w[13], w[14], offset); - w[17] = amd_bytealign_S (w[12], w[13], offset); - w[16] = amd_bytealign_S (w[11], w[12], offset); - w[15] = amd_bytealign_S (w[10], w[11], offset); - w[14] = amd_bytealign_S (w[ 9], w[10], offset); - w[13] = amd_bytealign_S (w[ 8], w[ 9], offset); - w[12] = amd_bytealign_S (w[ 7], w[ 8], offset); - w[11] = amd_bytealign_S (w[ 6], w[ 7], offset); - w[10] = amd_bytealign_S (w[ 5], w[ 6], offset); - w[ 9] = amd_bytealign_S (w[ 4], w[ 5], offset); - w[ 8] = amd_bytealign_S (w[ 3], w[ 4], offset); - w[ 7] = amd_bytealign_S (w[ 2], w[ 3], offset); - w[ 6] = amd_bytealign_S (w[ 1], w[ 2], offset); - w[ 5] = amd_bytealign_S (w[ 0], w[ 1], offset); - w[ 4] = amd_bytealign_S ( 0, w[ 0], offset); + w[63] = hc_bytealign_S (w[58], w[59], offset); + w[62] = hc_bytealign_S (w[57], w[58], offset); + w[61] = hc_bytealign_S (w[56], w[57], offset); + w[60] = hc_bytealign_S (w[55], w[56], offset); + w[59] = hc_bytealign_S (w[54], w[55], offset); + w[58] = hc_bytealign_S (w[53], w[54], offset); + w[57] = hc_bytealign_S (w[52], w[53], offset); + w[56] = hc_bytealign_S (w[51], w[52], offset); + w[55] = hc_bytealign_S (w[50], w[51], offset); + w[54] = hc_bytealign_S (w[49], w[50], offset); + w[53] = hc_bytealign_S (w[48], w[49], offset); + w[52] = hc_bytealign_S (w[47], w[48], offset); + w[51] = hc_bytealign_S (w[46], w[47], offset); + w[50] = hc_bytealign_S (w[45], w[46], offset); + w[49] = hc_bytealign_S (w[44], w[45], offset); + w[48] = hc_bytealign_S (w[43], w[44], offset); + w[47] = hc_bytealign_S (w[42], w[43], offset); + w[46] = hc_bytealign_S (w[41], w[42], offset); + w[45] = hc_bytealign_S (w[40], w[41], offset); + w[44] = hc_bytealign_S (w[39], w[40], offset); + w[43] = hc_bytealign_S (w[38], w[39], offset); + w[42] = hc_bytealign_S (w[37], w[38], offset); + w[41] = hc_bytealign_S (w[36], w[37], offset); + w[40] = hc_bytealign_S (w[35], w[36], offset); + w[39] = hc_bytealign_S (w[34], w[35], offset); + w[38] = hc_bytealign_S (w[33], w[34], offset); + w[37] = hc_bytealign_S (w[32], w[33], offset); + w[36] = hc_bytealign_S (w[31], w[32], offset); + w[35] = hc_bytealign_S (w[30], w[31], offset); + w[34] = hc_bytealign_S (w[29], w[30], offset); + w[33] = hc_bytealign_S (w[28], w[29], offset); + w[32] = hc_bytealign_S (w[27], w[28], offset); + w[31] = hc_bytealign_S (w[26], w[27], offset); + w[30] = hc_bytealign_S (w[25], w[26], offset); + w[29] = hc_bytealign_S (w[24], w[25], offset); + w[28] = hc_bytealign_S (w[23], w[24], offset); + w[27] = hc_bytealign_S (w[22], w[23], offset); + w[26] = hc_bytealign_S (w[21], w[22], offset); + w[25] = hc_bytealign_S (w[20], w[21], offset); + w[24] = hc_bytealign_S (w[19], w[20], offset); + w[23] = hc_bytealign_S (w[18], w[19], offset); + w[22] = hc_bytealign_S (w[17], w[18], offset); + w[21] = hc_bytealign_S (w[16], w[17], offset); + w[20] = hc_bytealign_S (w[15], w[16], offset); + w[19] = hc_bytealign_S (w[14], w[15], offset); + w[18] = hc_bytealign_S (w[13], w[14], offset); + w[17] = hc_bytealign_S (w[12], w[13], offset); + w[16] = hc_bytealign_S (w[11], w[12], offset); + w[15] = hc_bytealign_S (w[10], w[11], offset); + w[14] = hc_bytealign_S (w[ 9], w[10], offset); + w[13] = hc_bytealign_S (w[ 8], w[ 9], offset); + w[12] = hc_bytealign_S (w[ 7], w[ 8], offset); + w[11] = hc_bytealign_S (w[ 6], w[ 7], offset); + w[10] = hc_bytealign_S (w[ 5], w[ 6], offset); + w[ 9] = hc_bytealign_S (w[ 4], w[ 5], offset); + w[ 8] = hc_bytealign_S (w[ 3], w[ 4], offset); + w[ 7] = hc_bytealign_S (w[ 2], w[ 3], offset); + w[ 6] = hc_bytealign_S (w[ 1], w[ 2], offset); + w[ 5] = hc_bytealign_S (w[ 0], w[ 1], offset); + w[ 4] = hc_bytealign_S ( 0, w[ 0], offset); w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; @@ -42694,65 +42694,65 @@ DECLSPEC void switch_buffer_by_offset_1x64_le_S (u32 *w, const u32 offset) break; case 5: - w[63] = amd_bytealign_S (w[57], w[58], offset); - w[62] = amd_bytealign_S (w[56], w[57], offset); - w[61] = amd_bytealign_S (w[55], w[56], offset); - w[60] = amd_bytealign_S (w[54], w[55], offset); - w[59] = amd_bytealign_S (w[53], w[54], offset); - w[58] = amd_bytealign_S (w[52], w[53], offset); - w[57] = amd_bytealign_S (w[51], w[52], offset); - w[56] = amd_bytealign_S (w[50], w[51], offset); - w[55] = amd_bytealign_S (w[49], w[50], offset); - w[54] = amd_bytealign_S (w[48], w[49], offset); - w[53] = amd_bytealign_S (w[47], w[48], offset); - w[52] = amd_bytealign_S (w[46], w[47], offset); - w[51] = amd_bytealign_S (w[45], w[46], offset); - w[50] = amd_bytealign_S (w[44], w[45], offset); - w[49] = amd_bytealign_S (w[43], w[44], offset); - w[48] = amd_bytealign_S (w[42], w[43], offset); - w[47] = amd_bytealign_S (w[41], w[42], offset); - w[46] = amd_bytealign_S (w[40], w[41], offset); - w[45] = amd_bytealign_S (w[39], w[40], offset); - w[44] = amd_bytealign_S (w[38], w[39], offset); - w[43] = amd_bytealign_S (w[37], w[38], offset); - w[42] = amd_bytealign_S (w[36], w[37], offset); - w[41] = amd_bytealign_S (w[35], w[36], offset); - w[40] = amd_bytealign_S (w[34], w[35], offset); - w[39] = amd_bytealign_S (w[33], w[34], offset); - w[38] = amd_bytealign_S (w[32], w[33], offset); - w[37] = amd_bytealign_S (w[31], w[32], offset); - w[36] = amd_bytealign_S (w[30], w[31], offset); - w[35] = amd_bytealign_S (w[29], w[30], offset); - w[34] = amd_bytealign_S (w[28], w[29], offset); - w[33] = amd_bytealign_S (w[27], w[28], offset); - w[32] = amd_bytealign_S (w[26], w[27], offset); - w[31] = amd_bytealign_S (w[25], w[26], offset); - w[30] = amd_bytealign_S (w[24], w[25], offset); - w[29] = amd_bytealign_S (w[23], w[24], offset); - w[28] = amd_bytealign_S (w[22], w[23], offset); - w[27] = amd_bytealign_S (w[21], w[22], offset); - w[26] = amd_bytealign_S (w[20], w[21], offset); - w[25] = amd_bytealign_S (w[19], w[20], offset); - w[24] = amd_bytealign_S (w[18], w[19], offset); - w[23] = amd_bytealign_S (w[17], w[18], offset); - w[22] = amd_bytealign_S (w[16], w[17], offset); - w[21] = amd_bytealign_S (w[15], w[16], offset); - w[20] = amd_bytealign_S (w[14], w[15], offset); - w[19] = amd_bytealign_S (w[13], w[14], offset); - w[18] = amd_bytealign_S (w[12], w[13], offset); - w[17] = amd_bytealign_S (w[11], w[12], offset); - w[16] = amd_bytealign_S (w[10], w[11], offset); - w[15] = amd_bytealign_S (w[ 9], w[10], offset); - w[14] = amd_bytealign_S (w[ 8], w[ 9], offset); - w[13] = amd_bytealign_S (w[ 7], w[ 8], offset); - w[12] = amd_bytealign_S (w[ 6], w[ 7], offset); - w[11] = amd_bytealign_S (w[ 5], w[ 6], offset); - w[10] = amd_bytealign_S (w[ 4], w[ 5], offset); - w[ 9] = amd_bytealign_S (w[ 3], w[ 4], offset); - w[ 8] = amd_bytealign_S (w[ 2], w[ 3], offset); - w[ 7] = amd_bytealign_S (w[ 1], w[ 2], offset); - w[ 6] = amd_bytealign_S (w[ 0], w[ 1], offset); - w[ 5] = amd_bytealign_S ( 0, w[ 0], offset); + w[63] = hc_bytealign_S (w[57], w[58], offset); + w[62] = hc_bytealign_S (w[56], w[57], offset); + w[61] = hc_bytealign_S (w[55], w[56], offset); + w[60] = hc_bytealign_S (w[54], w[55], offset); + w[59] = hc_bytealign_S (w[53], w[54], offset); + w[58] = hc_bytealign_S (w[52], w[53], offset); + w[57] = hc_bytealign_S (w[51], w[52], offset); + w[56] = hc_bytealign_S (w[50], w[51], offset); + w[55] = hc_bytealign_S (w[49], w[50], offset); + w[54] = hc_bytealign_S (w[48], w[49], offset); + w[53] = hc_bytealign_S (w[47], w[48], offset); + w[52] = hc_bytealign_S (w[46], w[47], offset); + w[51] = hc_bytealign_S (w[45], w[46], offset); + w[50] = hc_bytealign_S (w[44], w[45], offset); + w[49] = hc_bytealign_S (w[43], w[44], offset); + w[48] = hc_bytealign_S (w[42], w[43], offset); + w[47] = hc_bytealign_S (w[41], w[42], offset); + w[46] = hc_bytealign_S (w[40], w[41], offset); + w[45] = hc_bytealign_S (w[39], w[40], offset); + w[44] = hc_bytealign_S (w[38], w[39], offset); + w[43] = hc_bytealign_S (w[37], w[38], offset); + w[42] = hc_bytealign_S (w[36], w[37], offset); + w[41] = hc_bytealign_S (w[35], w[36], offset); + w[40] = hc_bytealign_S (w[34], w[35], offset); + w[39] = hc_bytealign_S (w[33], w[34], offset); + w[38] = hc_bytealign_S (w[32], w[33], offset); + w[37] = hc_bytealign_S (w[31], w[32], offset); + w[36] = hc_bytealign_S (w[30], w[31], offset); + w[35] = hc_bytealign_S (w[29], w[30], offset); + w[34] = hc_bytealign_S (w[28], w[29], offset); + w[33] = hc_bytealign_S (w[27], w[28], offset); + w[32] = hc_bytealign_S (w[26], w[27], offset); + w[31] = hc_bytealign_S (w[25], w[26], offset); + w[30] = hc_bytealign_S (w[24], w[25], offset); + w[29] = hc_bytealign_S (w[23], w[24], offset); + w[28] = hc_bytealign_S (w[22], w[23], offset); + w[27] = hc_bytealign_S (w[21], w[22], offset); + w[26] = hc_bytealign_S (w[20], w[21], offset); + w[25] = hc_bytealign_S (w[19], w[20], offset); + w[24] = hc_bytealign_S (w[18], w[19], offset); + w[23] = hc_bytealign_S (w[17], w[18], offset); + w[22] = hc_bytealign_S (w[16], w[17], offset); + w[21] = hc_bytealign_S (w[15], w[16], offset); + w[20] = hc_bytealign_S (w[14], w[15], offset); + w[19] = hc_bytealign_S (w[13], w[14], offset); + w[18] = hc_bytealign_S (w[12], w[13], offset); + w[17] = hc_bytealign_S (w[11], w[12], offset); + w[16] = hc_bytealign_S (w[10], w[11], offset); + w[15] = hc_bytealign_S (w[ 9], w[10], offset); + w[14] = hc_bytealign_S (w[ 8], w[ 9], offset); + w[13] = hc_bytealign_S (w[ 7], w[ 8], offset); + w[12] = hc_bytealign_S (w[ 6], w[ 7], offset); + w[11] = hc_bytealign_S (w[ 5], w[ 6], offset); + w[10] = hc_bytealign_S (w[ 4], w[ 5], offset); + w[ 9] = hc_bytealign_S (w[ 3], w[ 4], offset); + w[ 8] = hc_bytealign_S (w[ 2], w[ 3], offset); + w[ 7] = hc_bytealign_S (w[ 1], w[ 2], offset); + w[ 6] = hc_bytealign_S (w[ 0], w[ 1], offset); + w[ 5] = hc_bytealign_S ( 0, w[ 0], offset); w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; @@ -42762,64 +42762,64 @@ DECLSPEC void switch_buffer_by_offset_1x64_le_S (u32 *w, const u32 offset) break; case 6: - w[63] = amd_bytealign_S (w[56], w[57], offset); - w[62] = amd_bytealign_S (w[55], w[56], offset); - w[61] = amd_bytealign_S (w[54], w[55], offset); - w[60] = amd_bytealign_S (w[53], w[54], offset); - w[59] = amd_bytealign_S (w[52], w[53], offset); - w[58] = amd_bytealign_S (w[51], w[52], offset); - w[57] = amd_bytealign_S (w[50], w[51], offset); - w[56] = amd_bytealign_S (w[49], w[50], offset); - w[55] = amd_bytealign_S (w[48], w[49], offset); - w[54] = amd_bytealign_S (w[47], w[48], offset); - w[53] = amd_bytealign_S (w[46], w[47], offset); - w[52] = amd_bytealign_S (w[45], w[46], offset); - w[51] = amd_bytealign_S (w[44], w[45], offset); - w[50] = amd_bytealign_S (w[43], w[44], offset); - w[49] = amd_bytealign_S (w[42], w[43], offset); - w[48] = amd_bytealign_S (w[41], w[42], offset); - w[47] = amd_bytealign_S (w[40], w[41], offset); - w[46] = amd_bytealign_S (w[39], w[40], offset); - w[45] = amd_bytealign_S (w[38], w[39], offset); - w[44] = amd_bytealign_S (w[37], w[38], offset); - w[43] = amd_bytealign_S (w[36], w[37], offset); - w[42] = amd_bytealign_S (w[35], w[36], offset); - w[41] = amd_bytealign_S (w[34], w[35], offset); - w[40] = amd_bytealign_S (w[33], w[34], offset); - w[39] = amd_bytealign_S (w[32], w[33], offset); - w[38] = amd_bytealign_S (w[31], w[32], offset); - w[37] = amd_bytealign_S (w[30], w[31], offset); - w[36] = amd_bytealign_S (w[29], w[30], offset); - w[35] = amd_bytealign_S (w[28], w[29], offset); - w[34] = amd_bytealign_S (w[27], w[28], offset); - w[33] = amd_bytealign_S (w[26], w[27], offset); - w[32] = amd_bytealign_S (w[25], w[26], offset); - w[31] = amd_bytealign_S (w[24], w[25], offset); - w[30] = amd_bytealign_S (w[23], w[24], offset); - w[29] = amd_bytealign_S (w[22], w[23], offset); - w[28] = amd_bytealign_S (w[21], w[22], offset); - w[27] = amd_bytealign_S (w[20], w[21], offset); - w[26] = amd_bytealign_S (w[19], w[20], offset); - w[25] = amd_bytealign_S (w[18], w[19], offset); - w[24] = amd_bytealign_S (w[17], w[18], offset); - w[23] = amd_bytealign_S (w[16], w[17], offset); - w[22] = amd_bytealign_S (w[15], w[16], offset); - w[21] = amd_bytealign_S (w[14], w[15], offset); - w[20] = amd_bytealign_S (w[13], w[14], offset); - w[19] = amd_bytealign_S (w[12], w[13], offset); - w[18] = amd_bytealign_S (w[11], w[12], offset); - w[17] = amd_bytealign_S (w[10], w[11], offset); - w[16] = amd_bytealign_S (w[ 9], w[10], offset); - w[15] = amd_bytealign_S (w[ 8], w[ 9], offset); - w[14] = amd_bytealign_S (w[ 7], w[ 8], offset); - w[13] = amd_bytealign_S (w[ 6], w[ 7], offset); - w[12] = amd_bytealign_S (w[ 5], w[ 6], offset); - w[11] = amd_bytealign_S (w[ 4], w[ 5], offset); - w[10] = amd_bytealign_S (w[ 3], w[ 4], offset); - w[ 9] = amd_bytealign_S (w[ 2], w[ 3], offset); - w[ 8] = amd_bytealign_S (w[ 1], w[ 2], offset); - w[ 7] = amd_bytealign_S (w[ 0], w[ 1], offset); - w[ 6] = amd_bytealign_S ( 0, w[ 0], offset); + w[63] = hc_bytealign_S (w[56], w[57], offset); + w[62] = hc_bytealign_S (w[55], w[56], offset); + w[61] = hc_bytealign_S (w[54], w[55], offset); + w[60] = hc_bytealign_S (w[53], w[54], offset); + w[59] = hc_bytealign_S (w[52], w[53], offset); + w[58] = hc_bytealign_S (w[51], w[52], offset); + w[57] = hc_bytealign_S (w[50], w[51], offset); + w[56] = hc_bytealign_S (w[49], w[50], offset); + w[55] = hc_bytealign_S (w[48], w[49], offset); + w[54] = hc_bytealign_S (w[47], w[48], offset); + w[53] = hc_bytealign_S (w[46], w[47], offset); + w[52] = hc_bytealign_S (w[45], w[46], offset); + w[51] = hc_bytealign_S (w[44], w[45], offset); + w[50] = hc_bytealign_S (w[43], w[44], offset); + w[49] = hc_bytealign_S (w[42], w[43], offset); + w[48] = hc_bytealign_S (w[41], w[42], offset); + w[47] = hc_bytealign_S (w[40], w[41], offset); + w[46] = hc_bytealign_S (w[39], w[40], offset); + w[45] = hc_bytealign_S (w[38], w[39], offset); + w[44] = hc_bytealign_S (w[37], w[38], offset); + w[43] = hc_bytealign_S (w[36], w[37], offset); + w[42] = hc_bytealign_S (w[35], w[36], offset); + w[41] = hc_bytealign_S (w[34], w[35], offset); + w[40] = hc_bytealign_S (w[33], w[34], offset); + w[39] = hc_bytealign_S (w[32], w[33], offset); + w[38] = hc_bytealign_S (w[31], w[32], offset); + w[37] = hc_bytealign_S (w[30], w[31], offset); + w[36] = hc_bytealign_S (w[29], w[30], offset); + w[35] = hc_bytealign_S (w[28], w[29], offset); + w[34] = hc_bytealign_S (w[27], w[28], offset); + w[33] = hc_bytealign_S (w[26], w[27], offset); + w[32] = hc_bytealign_S (w[25], w[26], offset); + w[31] = hc_bytealign_S (w[24], w[25], offset); + w[30] = hc_bytealign_S (w[23], w[24], offset); + w[29] = hc_bytealign_S (w[22], w[23], offset); + w[28] = hc_bytealign_S (w[21], w[22], offset); + w[27] = hc_bytealign_S (w[20], w[21], offset); + w[26] = hc_bytealign_S (w[19], w[20], offset); + w[25] = hc_bytealign_S (w[18], w[19], offset); + w[24] = hc_bytealign_S (w[17], w[18], offset); + w[23] = hc_bytealign_S (w[16], w[17], offset); + w[22] = hc_bytealign_S (w[15], w[16], offset); + w[21] = hc_bytealign_S (w[14], w[15], offset); + w[20] = hc_bytealign_S (w[13], w[14], offset); + w[19] = hc_bytealign_S (w[12], w[13], offset); + w[18] = hc_bytealign_S (w[11], w[12], offset); + w[17] = hc_bytealign_S (w[10], w[11], offset); + w[16] = hc_bytealign_S (w[ 9], w[10], offset); + w[15] = hc_bytealign_S (w[ 8], w[ 9], offset); + w[14] = hc_bytealign_S (w[ 7], w[ 8], offset); + w[13] = hc_bytealign_S (w[ 6], w[ 7], offset); + w[12] = hc_bytealign_S (w[ 5], w[ 6], offset); + w[11] = hc_bytealign_S (w[ 4], w[ 5], offset); + w[10] = hc_bytealign_S (w[ 3], w[ 4], offset); + w[ 9] = hc_bytealign_S (w[ 2], w[ 3], offset); + w[ 8] = hc_bytealign_S (w[ 1], w[ 2], offset); + w[ 7] = hc_bytealign_S (w[ 0], w[ 1], offset); + w[ 6] = hc_bytealign_S ( 0, w[ 0], offset); w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; @@ -42830,63 +42830,63 @@ DECLSPEC void switch_buffer_by_offset_1x64_le_S (u32 *w, const u32 offset) break; case 7: - w[63] = amd_bytealign_S (w[55], w[56], offset); - w[62] = amd_bytealign_S (w[54], w[55], offset); - w[61] = amd_bytealign_S (w[53], w[54], offset); - w[60] = amd_bytealign_S (w[52], w[53], offset); - w[59] = amd_bytealign_S (w[51], w[52], offset); - w[58] = amd_bytealign_S (w[50], w[51], offset); - w[57] = amd_bytealign_S (w[49], w[50], offset); - w[56] = amd_bytealign_S (w[48], w[49], offset); - w[55] = amd_bytealign_S (w[47], w[48], offset); - w[54] = amd_bytealign_S (w[46], w[47], offset); - w[53] = amd_bytealign_S (w[45], w[46], offset); - w[52] = amd_bytealign_S (w[44], w[45], offset); - w[51] = amd_bytealign_S (w[43], w[44], offset); - w[50] = amd_bytealign_S (w[42], w[43], offset); - w[49] = amd_bytealign_S (w[41], w[42], offset); - w[48] = amd_bytealign_S (w[40], w[41], offset); - w[47] = amd_bytealign_S (w[39], w[40], offset); - w[46] = amd_bytealign_S (w[38], w[39], offset); - w[45] = amd_bytealign_S (w[37], w[38], offset); - w[44] = amd_bytealign_S (w[36], w[37], offset); - w[43] = amd_bytealign_S (w[35], w[36], offset); - w[42] = amd_bytealign_S (w[34], w[35], offset); - w[41] = amd_bytealign_S (w[33], w[34], offset); - w[40] = amd_bytealign_S (w[32], w[33], offset); - w[39] = amd_bytealign_S (w[31], w[32], offset); - w[38] = amd_bytealign_S (w[30], w[31], offset); - w[37] = amd_bytealign_S (w[29], w[30], offset); - w[36] = amd_bytealign_S (w[28], w[29], offset); - w[35] = amd_bytealign_S (w[27], w[28], offset); - w[34] = amd_bytealign_S (w[26], w[27], offset); - w[33] = amd_bytealign_S (w[25], w[26], offset); - w[32] = amd_bytealign_S (w[24], w[25], offset); - w[31] = amd_bytealign_S (w[23], w[24], offset); - w[30] = amd_bytealign_S (w[22], w[23], offset); - w[29] = amd_bytealign_S (w[21], w[22], offset); - w[28] = amd_bytealign_S (w[20], w[21], offset); - w[27] = amd_bytealign_S (w[19], w[20], offset); - w[26] = amd_bytealign_S (w[18], w[19], offset); - w[25] = amd_bytealign_S (w[17], w[18], offset); - w[24] = amd_bytealign_S (w[16], w[17], offset); - w[23] = amd_bytealign_S (w[15], w[16], offset); - w[22] = amd_bytealign_S (w[14], w[15], offset); - w[21] = amd_bytealign_S (w[13], w[14], offset); - w[20] = amd_bytealign_S (w[12], w[13], offset); - w[19] = amd_bytealign_S (w[11], w[12], offset); - w[18] = amd_bytealign_S (w[10], w[11], offset); - w[17] = amd_bytealign_S (w[ 9], w[10], offset); - w[16] = amd_bytealign_S (w[ 8], w[ 9], offset); - w[15] = amd_bytealign_S (w[ 7], w[ 8], offset); - w[14] = amd_bytealign_S (w[ 6], w[ 7], offset); - w[13] = amd_bytealign_S (w[ 5], w[ 6], offset); - w[12] = amd_bytealign_S (w[ 4], w[ 5], offset); - w[11] = amd_bytealign_S (w[ 3], w[ 4], offset); - w[10] = amd_bytealign_S (w[ 2], w[ 3], offset); - w[ 9] = amd_bytealign_S (w[ 1], w[ 2], offset); - w[ 8] = amd_bytealign_S (w[ 0], w[ 1], offset); - w[ 7] = amd_bytealign_S ( 0, w[ 0], offset); + w[63] = hc_bytealign_S (w[55], w[56], offset); + w[62] = hc_bytealign_S (w[54], w[55], offset); + w[61] = hc_bytealign_S (w[53], w[54], offset); + w[60] = hc_bytealign_S (w[52], w[53], offset); + w[59] = hc_bytealign_S (w[51], w[52], offset); + w[58] = hc_bytealign_S (w[50], w[51], offset); + w[57] = hc_bytealign_S (w[49], w[50], offset); + w[56] = hc_bytealign_S (w[48], w[49], offset); + w[55] = hc_bytealign_S (w[47], w[48], offset); + w[54] = hc_bytealign_S (w[46], w[47], offset); + w[53] = hc_bytealign_S (w[45], w[46], offset); + w[52] = hc_bytealign_S (w[44], w[45], offset); + w[51] = hc_bytealign_S (w[43], w[44], offset); + w[50] = hc_bytealign_S (w[42], w[43], offset); + w[49] = hc_bytealign_S (w[41], w[42], offset); + w[48] = hc_bytealign_S (w[40], w[41], offset); + w[47] = hc_bytealign_S (w[39], w[40], offset); + w[46] = hc_bytealign_S (w[38], w[39], offset); + w[45] = hc_bytealign_S (w[37], w[38], offset); + w[44] = hc_bytealign_S (w[36], w[37], offset); + w[43] = hc_bytealign_S (w[35], w[36], offset); + w[42] = hc_bytealign_S (w[34], w[35], offset); + w[41] = hc_bytealign_S (w[33], w[34], offset); + w[40] = hc_bytealign_S (w[32], w[33], offset); + w[39] = hc_bytealign_S (w[31], w[32], offset); + w[38] = hc_bytealign_S (w[30], w[31], offset); + w[37] = hc_bytealign_S (w[29], w[30], offset); + w[36] = hc_bytealign_S (w[28], w[29], offset); + w[35] = hc_bytealign_S (w[27], w[28], offset); + w[34] = hc_bytealign_S (w[26], w[27], offset); + w[33] = hc_bytealign_S (w[25], w[26], offset); + w[32] = hc_bytealign_S (w[24], w[25], offset); + w[31] = hc_bytealign_S (w[23], w[24], offset); + w[30] = hc_bytealign_S (w[22], w[23], offset); + w[29] = hc_bytealign_S (w[21], w[22], offset); + w[28] = hc_bytealign_S (w[20], w[21], offset); + w[27] = hc_bytealign_S (w[19], w[20], offset); + w[26] = hc_bytealign_S (w[18], w[19], offset); + w[25] = hc_bytealign_S (w[17], w[18], offset); + w[24] = hc_bytealign_S (w[16], w[17], offset); + w[23] = hc_bytealign_S (w[15], w[16], offset); + w[22] = hc_bytealign_S (w[14], w[15], offset); + w[21] = hc_bytealign_S (w[13], w[14], offset); + w[20] = hc_bytealign_S (w[12], w[13], offset); + w[19] = hc_bytealign_S (w[11], w[12], offset); + w[18] = hc_bytealign_S (w[10], w[11], offset); + w[17] = hc_bytealign_S (w[ 9], w[10], offset); + w[16] = hc_bytealign_S (w[ 8], w[ 9], offset); + w[15] = hc_bytealign_S (w[ 7], w[ 8], offset); + w[14] = hc_bytealign_S (w[ 6], w[ 7], offset); + w[13] = hc_bytealign_S (w[ 5], w[ 6], offset); + w[12] = hc_bytealign_S (w[ 4], w[ 5], offset); + w[11] = hc_bytealign_S (w[ 3], w[ 4], offset); + w[10] = hc_bytealign_S (w[ 2], w[ 3], offset); + w[ 9] = hc_bytealign_S (w[ 1], w[ 2], offset); + w[ 8] = hc_bytealign_S (w[ 0], w[ 1], offset); + w[ 7] = hc_bytealign_S ( 0, w[ 0], offset); w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; @@ -42898,62 +42898,62 @@ DECLSPEC void switch_buffer_by_offset_1x64_le_S (u32 *w, const u32 offset) break; case 8: - w[63] = amd_bytealign_S (w[54], w[55], offset); - w[62] = amd_bytealign_S (w[53], w[54], offset); - w[61] = amd_bytealign_S (w[52], w[53], offset); - w[60] = amd_bytealign_S (w[51], w[52], offset); - w[59] = amd_bytealign_S (w[50], w[51], offset); - w[58] = amd_bytealign_S (w[49], w[50], offset); - w[57] = amd_bytealign_S (w[48], w[49], offset); - w[56] = amd_bytealign_S (w[47], w[48], offset); - w[55] = amd_bytealign_S (w[46], w[47], offset); - w[54] = amd_bytealign_S (w[45], w[46], offset); - w[53] = amd_bytealign_S (w[44], w[45], offset); - w[52] = amd_bytealign_S (w[43], w[44], offset); - w[51] = amd_bytealign_S (w[42], w[43], offset); - w[50] = amd_bytealign_S (w[41], w[42], offset); - w[49] = amd_bytealign_S (w[40], w[41], offset); - w[48] = amd_bytealign_S (w[39], w[40], offset); - w[47] = amd_bytealign_S (w[38], w[39], offset); - w[46] = amd_bytealign_S (w[37], w[38], offset); - w[45] = amd_bytealign_S (w[36], w[37], offset); - w[44] = amd_bytealign_S (w[35], w[36], offset); - w[43] = amd_bytealign_S (w[34], w[35], offset); - w[42] = amd_bytealign_S (w[33], w[34], offset); - w[41] = amd_bytealign_S (w[32], w[33], offset); - w[40] = amd_bytealign_S (w[31], w[32], offset); - w[39] = amd_bytealign_S (w[30], w[31], offset); - w[38] = amd_bytealign_S (w[29], w[30], offset); - w[37] = amd_bytealign_S (w[28], w[29], offset); - w[36] = amd_bytealign_S (w[27], w[28], offset); - w[35] = amd_bytealign_S (w[26], w[27], offset); - w[34] = amd_bytealign_S (w[25], w[26], offset); - w[33] = amd_bytealign_S (w[24], w[25], offset); - w[32] = amd_bytealign_S (w[23], w[24], offset); - w[31] = amd_bytealign_S (w[22], w[23], offset); - w[30] = amd_bytealign_S (w[21], w[22], offset); - w[29] = amd_bytealign_S (w[20], w[21], offset); - w[28] = amd_bytealign_S (w[19], w[20], offset); - w[27] = amd_bytealign_S (w[18], w[19], offset); - w[26] = amd_bytealign_S (w[17], w[18], offset); - w[25] = amd_bytealign_S (w[16], w[17], offset); - w[24] = amd_bytealign_S (w[15], w[16], offset); - w[23] = amd_bytealign_S (w[14], w[15], offset); - w[22] = amd_bytealign_S (w[13], w[14], offset); - w[21] = amd_bytealign_S (w[12], w[13], offset); - w[20] = amd_bytealign_S (w[11], w[12], offset); - w[19] = amd_bytealign_S (w[10], w[11], offset); - w[18] = amd_bytealign_S (w[ 9], w[10], offset); - w[17] = amd_bytealign_S (w[ 8], w[ 9], offset); - w[16] = amd_bytealign_S (w[ 7], w[ 8], offset); - w[15] = amd_bytealign_S (w[ 6], w[ 7], offset); - w[14] = amd_bytealign_S (w[ 5], w[ 6], offset); - w[13] = amd_bytealign_S (w[ 4], w[ 5], offset); - w[12] = amd_bytealign_S (w[ 3], w[ 4], offset); - w[11] = amd_bytealign_S (w[ 2], w[ 3], offset); - w[10] = amd_bytealign_S (w[ 1], w[ 2], offset); - w[ 9] = amd_bytealign_S (w[ 0], w[ 1], offset); - w[ 8] = amd_bytealign_S ( 0, w[ 0], offset); + w[63] = hc_bytealign_S (w[54], w[55], offset); + w[62] = hc_bytealign_S (w[53], w[54], offset); + w[61] = hc_bytealign_S (w[52], w[53], offset); + w[60] = hc_bytealign_S (w[51], w[52], offset); + w[59] = hc_bytealign_S (w[50], w[51], offset); + w[58] = hc_bytealign_S (w[49], w[50], offset); + w[57] = hc_bytealign_S (w[48], w[49], offset); + w[56] = hc_bytealign_S (w[47], w[48], offset); + w[55] = hc_bytealign_S (w[46], w[47], offset); + w[54] = hc_bytealign_S (w[45], w[46], offset); + w[53] = hc_bytealign_S (w[44], w[45], offset); + w[52] = hc_bytealign_S (w[43], w[44], offset); + w[51] = hc_bytealign_S (w[42], w[43], offset); + w[50] = hc_bytealign_S (w[41], w[42], offset); + w[49] = hc_bytealign_S (w[40], w[41], offset); + w[48] = hc_bytealign_S (w[39], w[40], offset); + w[47] = hc_bytealign_S (w[38], w[39], offset); + w[46] = hc_bytealign_S (w[37], w[38], offset); + w[45] = hc_bytealign_S (w[36], w[37], offset); + w[44] = hc_bytealign_S (w[35], w[36], offset); + w[43] = hc_bytealign_S (w[34], w[35], offset); + w[42] = hc_bytealign_S (w[33], w[34], offset); + w[41] = hc_bytealign_S (w[32], w[33], offset); + w[40] = hc_bytealign_S (w[31], w[32], offset); + w[39] = hc_bytealign_S (w[30], w[31], offset); + w[38] = hc_bytealign_S (w[29], w[30], offset); + w[37] = hc_bytealign_S (w[28], w[29], offset); + w[36] = hc_bytealign_S (w[27], w[28], offset); + w[35] = hc_bytealign_S (w[26], w[27], offset); + w[34] = hc_bytealign_S (w[25], w[26], offset); + w[33] = hc_bytealign_S (w[24], w[25], offset); + w[32] = hc_bytealign_S (w[23], w[24], offset); + w[31] = hc_bytealign_S (w[22], w[23], offset); + w[30] = hc_bytealign_S (w[21], w[22], offset); + w[29] = hc_bytealign_S (w[20], w[21], offset); + w[28] = hc_bytealign_S (w[19], w[20], offset); + w[27] = hc_bytealign_S (w[18], w[19], offset); + w[26] = hc_bytealign_S (w[17], w[18], offset); + w[25] = hc_bytealign_S (w[16], w[17], offset); + w[24] = hc_bytealign_S (w[15], w[16], offset); + w[23] = hc_bytealign_S (w[14], w[15], offset); + w[22] = hc_bytealign_S (w[13], w[14], offset); + w[21] = hc_bytealign_S (w[12], w[13], offset); + w[20] = hc_bytealign_S (w[11], w[12], offset); + w[19] = hc_bytealign_S (w[10], w[11], offset); + w[18] = hc_bytealign_S (w[ 9], w[10], offset); + w[17] = hc_bytealign_S (w[ 8], w[ 9], offset); + w[16] = hc_bytealign_S (w[ 7], w[ 8], offset); + w[15] = hc_bytealign_S (w[ 6], w[ 7], offset); + w[14] = hc_bytealign_S (w[ 5], w[ 6], offset); + w[13] = hc_bytealign_S (w[ 4], w[ 5], offset); + w[12] = hc_bytealign_S (w[ 3], w[ 4], offset); + w[11] = hc_bytealign_S (w[ 2], w[ 3], offset); + w[10] = hc_bytealign_S (w[ 1], w[ 2], offset); + w[ 9] = hc_bytealign_S (w[ 0], w[ 1], offset); + w[ 8] = hc_bytealign_S ( 0, w[ 0], offset); w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; @@ -42966,61 +42966,61 @@ DECLSPEC void switch_buffer_by_offset_1x64_le_S (u32 *w, const u32 offset) break; case 9: - w[63] = amd_bytealign_S (w[53], w[54], offset); - w[62] = amd_bytealign_S (w[52], w[53], offset); - w[61] = amd_bytealign_S (w[51], w[52], offset); - w[60] = amd_bytealign_S (w[50], w[51], offset); - w[59] = amd_bytealign_S (w[49], w[50], offset); - w[58] = amd_bytealign_S (w[48], w[49], offset); - w[57] = amd_bytealign_S (w[47], w[48], offset); - w[56] = amd_bytealign_S (w[46], w[47], offset); - w[55] = amd_bytealign_S (w[45], w[46], offset); - w[54] = amd_bytealign_S (w[44], w[45], offset); - w[53] = amd_bytealign_S (w[43], w[44], offset); - w[52] = amd_bytealign_S (w[42], w[43], offset); - w[51] = amd_bytealign_S (w[41], w[42], offset); - w[50] = amd_bytealign_S (w[40], w[41], offset); - w[49] = amd_bytealign_S (w[39], w[40], offset); - w[48] = amd_bytealign_S (w[38], w[39], offset); - w[47] = amd_bytealign_S (w[37], w[38], offset); - w[46] = amd_bytealign_S (w[36], w[37], offset); - w[45] = amd_bytealign_S (w[35], w[36], offset); - w[44] = amd_bytealign_S (w[34], w[35], offset); - w[43] = amd_bytealign_S (w[33], w[34], offset); - w[42] = amd_bytealign_S (w[32], w[33], offset); - w[41] = amd_bytealign_S (w[31], w[32], offset); - w[40] = amd_bytealign_S (w[30], w[31], offset); - w[39] = amd_bytealign_S (w[29], w[30], offset); - w[38] = amd_bytealign_S (w[28], w[29], offset); - w[37] = amd_bytealign_S (w[27], w[28], offset); - w[36] = amd_bytealign_S (w[26], w[27], offset); - w[35] = amd_bytealign_S (w[25], w[26], offset); - w[34] = amd_bytealign_S (w[24], w[25], offset); - w[33] = amd_bytealign_S (w[23], w[24], offset); - w[32] = amd_bytealign_S (w[22], w[23], offset); - w[31] = amd_bytealign_S (w[21], w[22], offset); - w[30] = amd_bytealign_S (w[20], w[21], offset); - w[29] = amd_bytealign_S (w[19], w[20], offset); - w[28] = amd_bytealign_S (w[18], w[19], offset); - w[27] = amd_bytealign_S (w[17], w[18], offset); - w[26] = amd_bytealign_S (w[16], w[17], offset); - w[25] = amd_bytealign_S (w[15], w[16], offset); - w[24] = amd_bytealign_S (w[14], w[15], offset); - w[23] = amd_bytealign_S (w[13], w[14], offset); - w[22] = amd_bytealign_S (w[12], w[13], offset); - w[21] = amd_bytealign_S (w[11], w[12], offset); - w[20] = amd_bytealign_S (w[10], w[11], offset); - w[19] = amd_bytealign_S (w[ 9], w[10], offset); - w[18] = amd_bytealign_S (w[ 8], w[ 9], offset); - w[17] = amd_bytealign_S (w[ 7], w[ 8], offset); - w[16] = amd_bytealign_S (w[ 6], w[ 7], offset); - w[15] = amd_bytealign_S (w[ 5], w[ 6], offset); - w[14] = amd_bytealign_S (w[ 4], w[ 5], offset); - w[13] = amd_bytealign_S (w[ 3], w[ 4], offset); - w[12] = amd_bytealign_S (w[ 2], w[ 3], offset); - w[11] = amd_bytealign_S (w[ 1], w[ 2], offset); - w[10] = amd_bytealign_S (w[ 0], w[ 1], offset); - w[ 9] = amd_bytealign_S ( 0, w[ 0], offset); + w[63] = hc_bytealign_S (w[53], w[54], offset); + w[62] = hc_bytealign_S (w[52], w[53], offset); + w[61] = hc_bytealign_S (w[51], w[52], offset); + w[60] = hc_bytealign_S (w[50], w[51], offset); + w[59] = hc_bytealign_S (w[49], w[50], offset); + w[58] = hc_bytealign_S (w[48], w[49], offset); + w[57] = hc_bytealign_S (w[47], w[48], offset); + w[56] = hc_bytealign_S (w[46], w[47], offset); + w[55] = hc_bytealign_S (w[45], w[46], offset); + w[54] = hc_bytealign_S (w[44], w[45], offset); + w[53] = hc_bytealign_S (w[43], w[44], offset); + w[52] = hc_bytealign_S (w[42], w[43], offset); + w[51] = hc_bytealign_S (w[41], w[42], offset); + w[50] = hc_bytealign_S (w[40], w[41], offset); + w[49] = hc_bytealign_S (w[39], w[40], offset); + w[48] = hc_bytealign_S (w[38], w[39], offset); + w[47] = hc_bytealign_S (w[37], w[38], offset); + w[46] = hc_bytealign_S (w[36], w[37], offset); + w[45] = hc_bytealign_S (w[35], w[36], offset); + w[44] = hc_bytealign_S (w[34], w[35], offset); + w[43] = hc_bytealign_S (w[33], w[34], offset); + w[42] = hc_bytealign_S (w[32], w[33], offset); + w[41] = hc_bytealign_S (w[31], w[32], offset); + w[40] = hc_bytealign_S (w[30], w[31], offset); + w[39] = hc_bytealign_S (w[29], w[30], offset); + w[38] = hc_bytealign_S (w[28], w[29], offset); + w[37] = hc_bytealign_S (w[27], w[28], offset); + w[36] = hc_bytealign_S (w[26], w[27], offset); + w[35] = hc_bytealign_S (w[25], w[26], offset); + w[34] = hc_bytealign_S (w[24], w[25], offset); + w[33] = hc_bytealign_S (w[23], w[24], offset); + w[32] = hc_bytealign_S (w[22], w[23], offset); + w[31] = hc_bytealign_S (w[21], w[22], offset); + w[30] = hc_bytealign_S (w[20], w[21], offset); + w[29] = hc_bytealign_S (w[19], w[20], offset); + w[28] = hc_bytealign_S (w[18], w[19], offset); + w[27] = hc_bytealign_S (w[17], w[18], offset); + w[26] = hc_bytealign_S (w[16], w[17], offset); + w[25] = hc_bytealign_S (w[15], w[16], offset); + w[24] = hc_bytealign_S (w[14], w[15], offset); + w[23] = hc_bytealign_S (w[13], w[14], offset); + w[22] = hc_bytealign_S (w[12], w[13], offset); + w[21] = hc_bytealign_S (w[11], w[12], offset); + w[20] = hc_bytealign_S (w[10], w[11], offset); + w[19] = hc_bytealign_S (w[ 9], w[10], offset); + w[18] = hc_bytealign_S (w[ 8], w[ 9], offset); + w[17] = hc_bytealign_S (w[ 7], w[ 8], offset); + w[16] = hc_bytealign_S (w[ 6], w[ 7], offset); + w[15] = hc_bytealign_S (w[ 5], w[ 6], offset); + w[14] = hc_bytealign_S (w[ 4], w[ 5], offset); + w[13] = hc_bytealign_S (w[ 3], w[ 4], offset); + w[12] = hc_bytealign_S (w[ 2], w[ 3], offset); + w[11] = hc_bytealign_S (w[ 1], w[ 2], offset); + w[10] = hc_bytealign_S (w[ 0], w[ 1], offset); + w[ 9] = hc_bytealign_S ( 0, w[ 0], offset); w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; @@ -43034,60 +43034,60 @@ DECLSPEC void switch_buffer_by_offset_1x64_le_S (u32 *w, const u32 offset) break; case 10: - w[63] = amd_bytealign_S (w[52], w[53], offset); - w[62] = amd_bytealign_S (w[51], w[52], offset); - w[61] = amd_bytealign_S (w[50], w[51], offset); - w[60] = amd_bytealign_S (w[49], w[50], offset); - w[59] = amd_bytealign_S (w[48], w[49], offset); - w[58] = amd_bytealign_S (w[47], w[48], offset); - w[57] = amd_bytealign_S (w[46], w[47], offset); - w[56] = amd_bytealign_S (w[45], w[46], offset); - w[55] = amd_bytealign_S (w[44], w[45], offset); - w[54] = amd_bytealign_S (w[43], w[44], offset); - w[53] = amd_bytealign_S (w[42], w[43], offset); - w[52] = amd_bytealign_S (w[41], w[42], offset); - w[51] = amd_bytealign_S (w[40], w[41], offset); - w[50] = amd_bytealign_S (w[39], w[40], offset); - w[49] = amd_bytealign_S (w[38], w[39], offset); - w[48] = amd_bytealign_S (w[37], w[38], offset); - w[47] = amd_bytealign_S (w[36], w[37], offset); - w[46] = amd_bytealign_S (w[35], w[36], offset); - w[45] = amd_bytealign_S (w[34], w[35], offset); - w[44] = amd_bytealign_S (w[33], w[34], offset); - w[43] = amd_bytealign_S (w[32], w[33], offset); - w[42] = amd_bytealign_S (w[31], w[32], offset); - w[41] = amd_bytealign_S (w[30], w[31], offset); - w[40] = amd_bytealign_S (w[29], w[30], offset); - w[39] = amd_bytealign_S (w[28], w[29], offset); - w[38] = amd_bytealign_S (w[27], w[28], offset); - w[37] = amd_bytealign_S (w[26], w[27], offset); - w[36] = amd_bytealign_S (w[25], w[26], offset); - w[35] = amd_bytealign_S (w[24], w[25], offset); - w[34] = amd_bytealign_S (w[23], w[24], offset); - w[33] = amd_bytealign_S (w[22], w[23], offset); - w[32] = amd_bytealign_S (w[21], w[22], offset); - w[31] = amd_bytealign_S (w[20], w[21], offset); - w[30] = amd_bytealign_S (w[19], w[20], offset); - w[29] = amd_bytealign_S (w[18], w[19], offset); - w[28] = amd_bytealign_S (w[17], w[18], offset); - w[27] = amd_bytealign_S (w[16], w[17], offset); - w[26] = amd_bytealign_S (w[15], w[16], offset); - w[25] = amd_bytealign_S (w[14], w[15], offset); - w[24] = amd_bytealign_S (w[13], w[14], offset); - w[23] = amd_bytealign_S (w[12], w[13], offset); - w[22] = amd_bytealign_S (w[11], w[12], offset); - w[21] = amd_bytealign_S (w[10], w[11], offset); - w[20] = amd_bytealign_S (w[ 9], w[10], offset); - w[19] = amd_bytealign_S (w[ 8], w[ 9], offset); - w[18] = amd_bytealign_S (w[ 7], w[ 8], offset); - w[17] = amd_bytealign_S (w[ 6], w[ 7], offset); - w[16] = amd_bytealign_S (w[ 5], w[ 6], offset); - w[15] = amd_bytealign_S (w[ 4], w[ 5], offset); - w[14] = amd_bytealign_S (w[ 3], w[ 4], offset); - w[13] = amd_bytealign_S (w[ 2], w[ 3], offset); - w[12] = amd_bytealign_S (w[ 1], w[ 2], offset); - w[11] = amd_bytealign_S (w[ 0], w[ 1], offset); - w[10] = amd_bytealign_S ( 0, w[ 0], offset); + w[63] = hc_bytealign_S (w[52], w[53], offset); + w[62] = hc_bytealign_S (w[51], w[52], offset); + w[61] = hc_bytealign_S (w[50], w[51], offset); + w[60] = hc_bytealign_S (w[49], w[50], offset); + w[59] = hc_bytealign_S (w[48], w[49], offset); + w[58] = hc_bytealign_S (w[47], w[48], offset); + w[57] = hc_bytealign_S (w[46], w[47], offset); + w[56] = hc_bytealign_S (w[45], w[46], offset); + w[55] = hc_bytealign_S (w[44], w[45], offset); + w[54] = hc_bytealign_S (w[43], w[44], offset); + w[53] = hc_bytealign_S (w[42], w[43], offset); + w[52] = hc_bytealign_S (w[41], w[42], offset); + w[51] = hc_bytealign_S (w[40], w[41], offset); + w[50] = hc_bytealign_S (w[39], w[40], offset); + w[49] = hc_bytealign_S (w[38], w[39], offset); + w[48] = hc_bytealign_S (w[37], w[38], offset); + w[47] = hc_bytealign_S (w[36], w[37], offset); + w[46] = hc_bytealign_S (w[35], w[36], offset); + w[45] = hc_bytealign_S (w[34], w[35], offset); + w[44] = hc_bytealign_S (w[33], w[34], offset); + w[43] = hc_bytealign_S (w[32], w[33], offset); + w[42] = hc_bytealign_S (w[31], w[32], offset); + w[41] = hc_bytealign_S (w[30], w[31], offset); + w[40] = hc_bytealign_S (w[29], w[30], offset); + w[39] = hc_bytealign_S (w[28], w[29], offset); + w[38] = hc_bytealign_S (w[27], w[28], offset); + w[37] = hc_bytealign_S (w[26], w[27], offset); + w[36] = hc_bytealign_S (w[25], w[26], offset); + w[35] = hc_bytealign_S (w[24], w[25], offset); + w[34] = hc_bytealign_S (w[23], w[24], offset); + w[33] = hc_bytealign_S (w[22], w[23], offset); + w[32] = hc_bytealign_S (w[21], w[22], offset); + w[31] = hc_bytealign_S (w[20], w[21], offset); + w[30] = hc_bytealign_S (w[19], w[20], offset); + w[29] = hc_bytealign_S (w[18], w[19], offset); + w[28] = hc_bytealign_S (w[17], w[18], offset); + w[27] = hc_bytealign_S (w[16], w[17], offset); + w[26] = hc_bytealign_S (w[15], w[16], offset); + w[25] = hc_bytealign_S (w[14], w[15], offset); + w[24] = hc_bytealign_S (w[13], w[14], offset); + w[23] = hc_bytealign_S (w[12], w[13], offset); + w[22] = hc_bytealign_S (w[11], w[12], offset); + w[21] = hc_bytealign_S (w[10], w[11], offset); + w[20] = hc_bytealign_S (w[ 9], w[10], offset); + w[19] = hc_bytealign_S (w[ 8], w[ 9], offset); + w[18] = hc_bytealign_S (w[ 7], w[ 8], offset); + w[17] = hc_bytealign_S (w[ 6], w[ 7], offset); + w[16] = hc_bytealign_S (w[ 5], w[ 6], offset); + w[15] = hc_bytealign_S (w[ 4], w[ 5], offset); + w[14] = hc_bytealign_S (w[ 3], w[ 4], offset); + w[13] = hc_bytealign_S (w[ 2], w[ 3], offset); + w[12] = hc_bytealign_S (w[ 1], w[ 2], offset); + w[11] = hc_bytealign_S (w[ 0], w[ 1], offset); + w[10] = hc_bytealign_S ( 0, w[ 0], offset); w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; @@ -43102,59 +43102,59 @@ DECLSPEC void switch_buffer_by_offset_1x64_le_S (u32 *w, const u32 offset) break; case 11: - w[63] = amd_bytealign_S (w[51], w[52], offset); - w[62] = amd_bytealign_S (w[50], w[51], offset); - w[61] = amd_bytealign_S (w[49], w[50], offset); - w[60] = amd_bytealign_S (w[48], w[49], offset); - w[59] = amd_bytealign_S (w[47], w[48], offset); - w[58] = amd_bytealign_S (w[46], w[47], offset); - w[57] = amd_bytealign_S (w[45], w[46], offset); - w[56] = amd_bytealign_S (w[44], w[45], offset); - w[55] = amd_bytealign_S (w[43], w[44], offset); - w[54] = amd_bytealign_S (w[42], w[43], offset); - w[53] = amd_bytealign_S (w[41], w[42], offset); - w[52] = amd_bytealign_S (w[40], w[41], offset); - w[51] = amd_bytealign_S (w[39], w[40], offset); - w[50] = amd_bytealign_S (w[38], w[39], offset); - w[49] = amd_bytealign_S (w[37], w[38], offset); - w[48] = amd_bytealign_S (w[36], w[37], offset); - w[47] = amd_bytealign_S (w[35], w[36], offset); - w[46] = amd_bytealign_S (w[34], w[35], offset); - w[45] = amd_bytealign_S (w[33], w[34], offset); - w[44] = amd_bytealign_S (w[32], w[33], offset); - w[43] = amd_bytealign_S (w[31], w[32], offset); - w[42] = amd_bytealign_S (w[30], w[31], offset); - w[41] = amd_bytealign_S (w[29], w[30], offset); - w[40] = amd_bytealign_S (w[28], w[29], offset); - w[39] = amd_bytealign_S (w[27], w[28], offset); - w[38] = amd_bytealign_S (w[26], w[27], offset); - w[37] = amd_bytealign_S (w[25], w[26], offset); - w[36] = amd_bytealign_S (w[24], w[25], offset); - w[35] = amd_bytealign_S (w[23], w[24], offset); - w[34] = amd_bytealign_S (w[22], w[23], offset); - w[33] = amd_bytealign_S (w[21], w[22], offset); - w[32] = amd_bytealign_S (w[20], w[21], offset); - w[31] = amd_bytealign_S (w[19], w[20], offset); - w[30] = amd_bytealign_S (w[18], w[19], offset); - w[29] = amd_bytealign_S (w[17], w[18], offset); - w[28] = amd_bytealign_S (w[16], w[17], offset); - w[27] = amd_bytealign_S (w[15], w[16], offset); - w[26] = amd_bytealign_S (w[14], w[15], offset); - w[25] = amd_bytealign_S (w[13], w[14], offset); - w[24] = amd_bytealign_S (w[12], w[13], offset); - w[23] = amd_bytealign_S (w[11], w[12], offset); - w[22] = amd_bytealign_S (w[10], w[11], offset); - w[21] = amd_bytealign_S (w[ 9], w[10], offset); - w[20] = amd_bytealign_S (w[ 8], w[ 9], offset); - w[19] = amd_bytealign_S (w[ 7], w[ 8], offset); - w[18] = amd_bytealign_S (w[ 6], w[ 7], offset); - w[17] = amd_bytealign_S (w[ 5], w[ 6], offset); - w[16] = amd_bytealign_S (w[ 4], w[ 5], offset); - w[15] = amd_bytealign_S (w[ 3], w[ 4], offset); - w[14] = amd_bytealign_S (w[ 2], w[ 3], offset); - w[13] = amd_bytealign_S (w[ 1], w[ 2], offset); - w[12] = amd_bytealign_S (w[ 0], w[ 1], offset); - w[11] = amd_bytealign_S ( 0, w[ 0], offset); + w[63] = hc_bytealign_S (w[51], w[52], offset); + w[62] = hc_bytealign_S (w[50], w[51], offset); + w[61] = hc_bytealign_S (w[49], w[50], offset); + w[60] = hc_bytealign_S (w[48], w[49], offset); + w[59] = hc_bytealign_S (w[47], w[48], offset); + w[58] = hc_bytealign_S (w[46], w[47], offset); + w[57] = hc_bytealign_S (w[45], w[46], offset); + w[56] = hc_bytealign_S (w[44], w[45], offset); + w[55] = hc_bytealign_S (w[43], w[44], offset); + w[54] = hc_bytealign_S (w[42], w[43], offset); + w[53] = hc_bytealign_S (w[41], w[42], offset); + w[52] = hc_bytealign_S (w[40], w[41], offset); + w[51] = hc_bytealign_S (w[39], w[40], offset); + w[50] = hc_bytealign_S (w[38], w[39], offset); + w[49] = hc_bytealign_S (w[37], w[38], offset); + w[48] = hc_bytealign_S (w[36], w[37], offset); + w[47] = hc_bytealign_S (w[35], w[36], offset); + w[46] = hc_bytealign_S (w[34], w[35], offset); + w[45] = hc_bytealign_S (w[33], w[34], offset); + w[44] = hc_bytealign_S (w[32], w[33], offset); + w[43] = hc_bytealign_S (w[31], w[32], offset); + w[42] = hc_bytealign_S (w[30], w[31], offset); + w[41] = hc_bytealign_S (w[29], w[30], offset); + w[40] = hc_bytealign_S (w[28], w[29], offset); + w[39] = hc_bytealign_S (w[27], w[28], offset); + w[38] = hc_bytealign_S (w[26], w[27], offset); + w[37] = hc_bytealign_S (w[25], w[26], offset); + w[36] = hc_bytealign_S (w[24], w[25], offset); + w[35] = hc_bytealign_S (w[23], w[24], offset); + w[34] = hc_bytealign_S (w[22], w[23], offset); + w[33] = hc_bytealign_S (w[21], w[22], offset); + w[32] = hc_bytealign_S (w[20], w[21], offset); + w[31] = hc_bytealign_S (w[19], w[20], offset); + w[30] = hc_bytealign_S (w[18], w[19], offset); + w[29] = hc_bytealign_S (w[17], w[18], offset); + w[28] = hc_bytealign_S (w[16], w[17], offset); + w[27] = hc_bytealign_S (w[15], w[16], offset); + w[26] = hc_bytealign_S (w[14], w[15], offset); + w[25] = hc_bytealign_S (w[13], w[14], offset); + w[24] = hc_bytealign_S (w[12], w[13], offset); + w[23] = hc_bytealign_S (w[11], w[12], offset); + w[22] = hc_bytealign_S (w[10], w[11], offset); + w[21] = hc_bytealign_S (w[ 9], w[10], offset); + w[20] = hc_bytealign_S (w[ 8], w[ 9], offset); + w[19] = hc_bytealign_S (w[ 7], w[ 8], offset); + w[18] = hc_bytealign_S (w[ 6], w[ 7], offset); + w[17] = hc_bytealign_S (w[ 5], w[ 6], offset); + w[16] = hc_bytealign_S (w[ 4], w[ 5], offset); + w[15] = hc_bytealign_S (w[ 3], w[ 4], offset); + w[14] = hc_bytealign_S (w[ 2], w[ 3], offset); + w[13] = hc_bytealign_S (w[ 1], w[ 2], offset); + w[12] = hc_bytealign_S (w[ 0], w[ 1], offset); + w[11] = hc_bytealign_S ( 0, w[ 0], offset); w[10] = 0; w[ 9] = 0; w[ 8] = 0; @@ -43170,58 +43170,58 @@ DECLSPEC void switch_buffer_by_offset_1x64_le_S (u32 *w, const u32 offset) break; case 12: - w[63] = amd_bytealign_S (w[50], w[51], offset); - w[62] = amd_bytealign_S (w[49], w[50], offset); - w[61] = amd_bytealign_S (w[48], w[49], offset); - w[60] = amd_bytealign_S (w[47], w[48], offset); - w[59] = amd_bytealign_S (w[46], w[47], offset); - w[58] = amd_bytealign_S (w[45], w[46], offset); - w[57] = amd_bytealign_S (w[44], w[45], offset); - w[56] = amd_bytealign_S (w[43], w[44], offset); - w[55] = amd_bytealign_S (w[42], w[43], offset); - w[54] = amd_bytealign_S (w[41], w[42], offset); - w[53] = amd_bytealign_S (w[40], w[41], offset); - w[52] = amd_bytealign_S (w[39], w[40], offset); - w[51] = amd_bytealign_S (w[38], w[39], offset); - w[50] = amd_bytealign_S (w[37], w[38], offset); - w[49] = amd_bytealign_S (w[36], w[37], offset); - w[48] = amd_bytealign_S (w[35], w[36], offset); - w[47] = amd_bytealign_S (w[34], w[35], offset); - w[46] = amd_bytealign_S (w[33], w[34], offset); - w[45] = amd_bytealign_S (w[32], w[33], offset); - w[44] = amd_bytealign_S (w[31], w[32], offset); - w[43] = amd_bytealign_S (w[30], w[31], offset); - w[42] = amd_bytealign_S (w[29], w[30], offset); - w[41] = amd_bytealign_S (w[28], w[29], offset); - w[40] = amd_bytealign_S (w[27], w[28], offset); - w[39] = amd_bytealign_S (w[26], w[27], offset); - w[38] = amd_bytealign_S (w[25], w[26], offset); - w[37] = amd_bytealign_S (w[24], w[25], offset); - w[36] = amd_bytealign_S (w[23], w[24], offset); - w[35] = amd_bytealign_S (w[22], w[23], offset); - w[34] = amd_bytealign_S (w[21], w[22], offset); - w[33] = amd_bytealign_S (w[20], w[21], offset); - w[32] = amd_bytealign_S (w[19], w[20], offset); - w[31] = amd_bytealign_S (w[18], w[19], offset); - w[30] = amd_bytealign_S (w[17], w[18], offset); - w[29] = amd_bytealign_S (w[16], w[17], offset); - w[28] = amd_bytealign_S (w[15], w[16], offset); - w[27] = amd_bytealign_S (w[14], w[15], offset); - w[26] = amd_bytealign_S (w[13], w[14], offset); - w[25] = amd_bytealign_S (w[12], w[13], offset); - w[24] = amd_bytealign_S (w[11], w[12], offset); - w[23] = amd_bytealign_S (w[10], w[11], offset); - w[22] = amd_bytealign_S (w[ 9], w[10], offset); - w[21] = amd_bytealign_S (w[ 8], w[ 9], offset); - w[20] = amd_bytealign_S (w[ 7], w[ 8], offset); - w[19] = amd_bytealign_S (w[ 6], w[ 7], offset); - w[18] = amd_bytealign_S (w[ 5], w[ 6], offset); - w[17] = amd_bytealign_S (w[ 4], w[ 5], offset); - w[16] = amd_bytealign_S (w[ 3], w[ 4], offset); - w[15] = amd_bytealign_S (w[ 2], w[ 3], offset); - w[14] = amd_bytealign_S (w[ 1], w[ 2], offset); - w[13] = amd_bytealign_S (w[ 0], w[ 1], offset); - w[12] = amd_bytealign_S ( 0, w[ 0], offset); + w[63] = hc_bytealign_S (w[50], w[51], offset); + w[62] = hc_bytealign_S (w[49], w[50], offset); + w[61] = hc_bytealign_S (w[48], w[49], offset); + w[60] = hc_bytealign_S (w[47], w[48], offset); + w[59] = hc_bytealign_S (w[46], w[47], offset); + w[58] = hc_bytealign_S (w[45], w[46], offset); + w[57] = hc_bytealign_S (w[44], w[45], offset); + w[56] = hc_bytealign_S (w[43], w[44], offset); + w[55] = hc_bytealign_S (w[42], w[43], offset); + w[54] = hc_bytealign_S (w[41], w[42], offset); + w[53] = hc_bytealign_S (w[40], w[41], offset); + w[52] = hc_bytealign_S (w[39], w[40], offset); + w[51] = hc_bytealign_S (w[38], w[39], offset); + w[50] = hc_bytealign_S (w[37], w[38], offset); + w[49] = hc_bytealign_S (w[36], w[37], offset); + w[48] = hc_bytealign_S (w[35], w[36], offset); + w[47] = hc_bytealign_S (w[34], w[35], offset); + w[46] = hc_bytealign_S (w[33], w[34], offset); + w[45] = hc_bytealign_S (w[32], w[33], offset); + w[44] = hc_bytealign_S (w[31], w[32], offset); + w[43] = hc_bytealign_S (w[30], w[31], offset); + w[42] = hc_bytealign_S (w[29], w[30], offset); + w[41] = hc_bytealign_S (w[28], w[29], offset); + w[40] = hc_bytealign_S (w[27], w[28], offset); + w[39] = hc_bytealign_S (w[26], w[27], offset); + w[38] = hc_bytealign_S (w[25], w[26], offset); + w[37] = hc_bytealign_S (w[24], w[25], offset); + w[36] = hc_bytealign_S (w[23], w[24], offset); + w[35] = hc_bytealign_S (w[22], w[23], offset); + w[34] = hc_bytealign_S (w[21], w[22], offset); + w[33] = hc_bytealign_S (w[20], w[21], offset); + w[32] = hc_bytealign_S (w[19], w[20], offset); + w[31] = hc_bytealign_S (w[18], w[19], offset); + w[30] = hc_bytealign_S (w[17], w[18], offset); + w[29] = hc_bytealign_S (w[16], w[17], offset); + w[28] = hc_bytealign_S (w[15], w[16], offset); + w[27] = hc_bytealign_S (w[14], w[15], offset); + w[26] = hc_bytealign_S (w[13], w[14], offset); + w[25] = hc_bytealign_S (w[12], w[13], offset); + w[24] = hc_bytealign_S (w[11], w[12], offset); + w[23] = hc_bytealign_S (w[10], w[11], offset); + w[22] = hc_bytealign_S (w[ 9], w[10], offset); + w[21] = hc_bytealign_S (w[ 8], w[ 9], offset); + w[20] = hc_bytealign_S (w[ 7], w[ 8], offset); + w[19] = hc_bytealign_S (w[ 6], w[ 7], offset); + w[18] = hc_bytealign_S (w[ 5], w[ 6], offset); + w[17] = hc_bytealign_S (w[ 4], w[ 5], offset); + w[16] = hc_bytealign_S (w[ 3], w[ 4], offset); + w[15] = hc_bytealign_S (w[ 2], w[ 3], offset); + w[14] = hc_bytealign_S (w[ 1], w[ 2], offset); + w[13] = hc_bytealign_S (w[ 0], w[ 1], offset); + w[12] = hc_bytealign_S ( 0, w[ 0], offset); w[11] = 0; w[10] = 0; w[ 9] = 0; @@ -43238,57 +43238,57 @@ DECLSPEC void switch_buffer_by_offset_1x64_le_S (u32 *w, const u32 offset) break; case 13: - w[63] = amd_bytealign_S (w[49], w[50], offset); - w[62] = amd_bytealign_S (w[48], w[49], offset); - w[61] = amd_bytealign_S (w[47], w[48], offset); - w[60] = amd_bytealign_S (w[46], w[47], offset); - w[59] = amd_bytealign_S (w[45], w[46], offset); - w[58] = amd_bytealign_S (w[44], w[45], offset); - w[57] = amd_bytealign_S (w[43], w[44], offset); - w[56] = amd_bytealign_S (w[42], w[43], offset); - w[55] = amd_bytealign_S (w[41], w[42], offset); - w[54] = amd_bytealign_S (w[40], w[41], offset); - w[53] = amd_bytealign_S (w[39], w[40], offset); - w[52] = amd_bytealign_S (w[38], w[39], offset); - w[51] = amd_bytealign_S (w[37], w[38], offset); - w[50] = amd_bytealign_S (w[36], w[37], offset); - w[49] = amd_bytealign_S (w[35], w[36], offset); - w[48] = amd_bytealign_S (w[34], w[35], offset); - w[47] = amd_bytealign_S (w[33], w[34], offset); - w[46] = amd_bytealign_S (w[32], w[33], offset); - w[45] = amd_bytealign_S (w[31], w[32], offset); - w[44] = amd_bytealign_S (w[30], w[31], offset); - w[43] = amd_bytealign_S (w[29], w[30], offset); - w[42] = amd_bytealign_S (w[28], w[29], offset); - w[41] = amd_bytealign_S (w[27], w[28], offset); - w[40] = amd_bytealign_S (w[26], w[27], offset); - w[39] = amd_bytealign_S (w[25], w[26], offset); - w[38] = amd_bytealign_S (w[24], w[25], offset); - w[37] = amd_bytealign_S (w[23], w[24], offset); - w[36] = amd_bytealign_S (w[22], w[23], offset); - w[35] = amd_bytealign_S (w[21], w[22], offset); - w[34] = amd_bytealign_S (w[20], w[21], offset); - w[33] = amd_bytealign_S (w[19], w[20], offset); - w[32] = amd_bytealign_S (w[18], w[19], offset); - w[31] = amd_bytealign_S (w[17], w[18], offset); - w[30] = amd_bytealign_S (w[16], w[17], offset); - w[29] = amd_bytealign_S (w[15], w[16], offset); - w[28] = amd_bytealign_S (w[14], w[15], offset); - w[27] = amd_bytealign_S (w[13], w[14], offset); - w[26] = amd_bytealign_S (w[12], w[13], offset); - w[25] = amd_bytealign_S (w[11], w[12], offset); - w[24] = amd_bytealign_S (w[10], w[11], offset); - w[23] = amd_bytealign_S (w[ 9], w[10], offset); - w[22] = amd_bytealign_S (w[ 8], w[ 9], offset); - w[21] = amd_bytealign_S (w[ 7], w[ 8], offset); - w[20] = amd_bytealign_S (w[ 6], w[ 7], offset); - w[19] = amd_bytealign_S (w[ 5], w[ 6], offset); - w[18] = amd_bytealign_S (w[ 4], w[ 5], offset); - w[17] = amd_bytealign_S (w[ 3], w[ 4], offset); - w[16] = amd_bytealign_S (w[ 2], w[ 3], offset); - w[15] = amd_bytealign_S (w[ 1], w[ 2], offset); - w[14] = amd_bytealign_S (w[ 0], w[ 1], offset); - w[13] = amd_bytealign_S ( 0, w[ 0], offset); + w[63] = hc_bytealign_S (w[49], w[50], offset); + w[62] = hc_bytealign_S (w[48], w[49], offset); + w[61] = hc_bytealign_S (w[47], w[48], offset); + w[60] = hc_bytealign_S (w[46], w[47], offset); + w[59] = hc_bytealign_S (w[45], w[46], offset); + w[58] = hc_bytealign_S (w[44], w[45], offset); + w[57] = hc_bytealign_S (w[43], w[44], offset); + w[56] = hc_bytealign_S (w[42], w[43], offset); + w[55] = hc_bytealign_S (w[41], w[42], offset); + w[54] = hc_bytealign_S (w[40], w[41], offset); + w[53] = hc_bytealign_S (w[39], w[40], offset); + w[52] = hc_bytealign_S (w[38], w[39], offset); + w[51] = hc_bytealign_S (w[37], w[38], offset); + w[50] = hc_bytealign_S (w[36], w[37], offset); + w[49] = hc_bytealign_S (w[35], w[36], offset); + w[48] = hc_bytealign_S (w[34], w[35], offset); + w[47] = hc_bytealign_S (w[33], w[34], offset); + w[46] = hc_bytealign_S (w[32], w[33], offset); + w[45] = hc_bytealign_S (w[31], w[32], offset); + w[44] = hc_bytealign_S (w[30], w[31], offset); + w[43] = hc_bytealign_S (w[29], w[30], offset); + w[42] = hc_bytealign_S (w[28], w[29], offset); + w[41] = hc_bytealign_S (w[27], w[28], offset); + w[40] = hc_bytealign_S (w[26], w[27], offset); + w[39] = hc_bytealign_S (w[25], w[26], offset); + w[38] = hc_bytealign_S (w[24], w[25], offset); + w[37] = hc_bytealign_S (w[23], w[24], offset); + w[36] = hc_bytealign_S (w[22], w[23], offset); + w[35] = hc_bytealign_S (w[21], w[22], offset); + w[34] = hc_bytealign_S (w[20], w[21], offset); + w[33] = hc_bytealign_S (w[19], w[20], offset); + w[32] = hc_bytealign_S (w[18], w[19], offset); + w[31] = hc_bytealign_S (w[17], w[18], offset); + w[30] = hc_bytealign_S (w[16], w[17], offset); + w[29] = hc_bytealign_S (w[15], w[16], offset); + w[28] = hc_bytealign_S (w[14], w[15], offset); + w[27] = hc_bytealign_S (w[13], w[14], offset); + w[26] = hc_bytealign_S (w[12], w[13], offset); + w[25] = hc_bytealign_S (w[11], w[12], offset); + w[24] = hc_bytealign_S (w[10], w[11], offset); + w[23] = hc_bytealign_S (w[ 9], w[10], offset); + w[22] = hc_bytealign_S (w[ 8], w[ 9], offset); + w[21] = hc_bytealign_S (w[ 7], w[ 8], offset); + w[20] = hc_bytealign_S (w[ 6], w[ 7], offset); + w[19] = hc_bytealign_S (w[ 5], w[ 6], offset); + w[18] = hc_bytealign_S (w[ 4], w[ 5], offset); + w[17] = hc_bytealign_S (w[ 3], w[ 4], offset); + w[16] = hc_bytealign_S (w[ 2], w[ 3], offset); + w[15] = hc_bytealign_S (w[ 1], w[ 2], offset); + w[14] = hc_bytealign_S (w[ 0], w[ 1], offset); + w[13] = hc_bytealign_S ( 0, w[ 0], offset); w[12] = 0; w[11] = 0; w[10] = 0; @@ -43306,56 +43306,56 @@ DECLSPEC void switch_buffer_by_offset_1x64_le_S (u32 *w, const u32 offset) break; case 14: - w[63] = amd_bytealign_S (w[48], w[49], offset); - w[62] = amd_bytealign_S (w[47], w[48], offset); - w[61] = amd_bytealign_S (w[46], w[47], offset); - w[60] = amd_bytealign_S (w[45], w[46], offset); - w[59] = amd_bytealign_S (w[44], w[45], offset); - w[58] = amd_bytealign_S (w[43], w[44], offset); - w[57] = amd_bytealign_S (w[42], w[43], offset); - w[56] = amd_bytealign_S (w[41], w[42], offset); - w[55] = amd_bytealign_S (w[40], w[41], offset); - w[54] = amd_bytealign_S (w[39], w[40], offset); - w[53] = amd_bytealign_S (w[38], w[39], offset); - w[52] = amd_bytealign_S (w[37], w[38], offset); - w[51] = amd_bytealign_S (w[36], w[37], offset); - w[50] = amd_bytealign_S (w[35], w[36], offset); - w[49] = amd_bytealign_S (w[34], w[35], offset); - w[48] = amd_bytealign_S (w[33], w[34], offset); - w[47] = amd_bytealign_S (w[32], w[33], offset); - w[46] = amd_bytealign_S (w[31], w[32], offset); - w[45] = amd_bytealign_S (w[30], w[31], offset); - w[44] = amd_bytealign_S (w[29], w[30], offset); - w[43] = amd_bytealign_S (w[28], w[29], offset); - w[42] = amd_bytealign_S (w[27], w[28], offset); - w[41] = amd_bytealign_S (w[26], w[27], offset); - w[40] = amd_bytealign_S (w[25], w[26], offset); - w[39] = amd_bytealign_S (w[24], w[25], offset); - w[38] = amd_bytealign_S (w[23], w[24], offset); - w[37] = amd_bytealign_S (w[22], w[23], offset); - w[36] = amd_bytealign_S (w[21], w[22], offset); - w[35] = amd_bytealign_S (w[20], w[21], offset); - w[34] = amd_bytealign_S (w[19], w[20], offset); - w[33] = amd_bytealign_S (w[18], w[19], offset); - w[32] = amd_bytealign_S (w[17], w[18], offset); - w[31] = amd_bytealign_S (w[16], w[17], offset); - w[30] = amd_bytealign_S (w[15], w[16], offset); - w[29] = amd_bytealign_S (w[14], w[15], offset); - w[28] = amd_bytealign_S (w[13], w[14], offset); - w[27] = amd_bytealign_S (w[12], w[13], offset); - w[26] = amd_bytealign_S (w[11], w[12], offset); - w[25] = amd_bytealign_S (w[10], w[11], offset); - w[24] = amd_bytealign_S (w[ 9], w[10], offset); - w[23] = amd_bytealign_S (w[ 8], w[ 9], offset); - w[22] = amd_bytealign_S (w[ 7], w[ 8], offset); - w[21] = amd_bytealign_S (w[ 6], w[ 7], offset); - w[20] = amd_bytealign_S (w[ 5], w[ 6], offset); - w[19] = amd_bytealign_S (w[ 4], w[ 5], offset); - w[18] = amd_bytealign_S (w[ 3], w[ 4], offset); - w[17] = amd_bytealign_S (w[ 2], w[ 3], offset); - w[16] = amd_bytealign_S (w[ 1], w[ 2], offset); - w[15] = amd_bytealign_S (w[ 0], w[ 1], offset); - w[14] = amd_bytealign_S ( 0, w[ 0], offset); + w[63] = hc_bytealign_S (w[48], w[49], offset); + w[62] = hc_bytealign_S (w[47], w[48], offset); + w[61] = hc_bytealign_S (w[46], w[47], offset); + w[60] = hc_bytealign_S (w[45], w[46], offset); + w[59] = hc_bytealign_S (w[44], w[45], offset); + w[58] = hc_bytealign_S (w[43], w[44], offset); + w[57] = hc_bytealign_S (w[42], w[43], offset); + w[56] = hc_bytealign_S (w[41], w[42], offset); + w[55] = hc_bytealign_S (w[40], w[41], offset); + w[54] = hc_bytealign_S (w[39], w[40], offset); + w[53] = hc_bytealign_S (w[38], w[39], offset); + w[52] = hc_bytealign_S (w[37], w[38], offset); + w[51] = hc_bytealign_S (w[36], w[37], offset); + w[50] = hc_bytealign_S (w[35], w[36], offset); + w[49] = hc_bytealign_S (w[34], w[35], offset); + w[48] = hc_bytealign_S (w[33], w[34], offset); + w[47] = hc_bytealign_S (w[32], w[33], offset); + w[46] = hc_bytealign_S (w[31], w[32], offset); + w[45] = hc_bytealign_S (w[30], w[31], offset); + w[44] = hc_bytealign_S (w[29], w[30], offset); + w[43] = hc_bytealign_S (w[28], w[29], offset); + w[42] = hc_bytealign_S (w[27], w[28], offset); + w[41] = hc_bytealign_S (w[26], w[27], offset); + w[40] = hc_bytealign_S (w[25], w[26], offset); + w[39] = hc_bytealign_S (w[24], w[25], offset); + w[38] = hc_bytealign_S (w[23], w[24], offset); + w[37] = hc_bytealign_S (w[22], w[23], offset); + w[36] = hc_bytealign_S (w[21], w[22], offset); + w[35] = hc_bytealign_S (w[20], w[21], offset); + w[34] = hc_bytealign_S (w[19], w[20], offset); + w[33] = hc_bytealign_S (w[18], w[19], offset); + w[32] = hc_bytealign_S (w[17], w[18], offset); + w[31] = hc_bytealign_S (w[16], w[17], offset); + w[30] = hc_bytealign_S (w[15], w[16], offset); + w[29] = hc_bytealign_S (w[14], w[15], offset); + w[28] = hc_bytealign_S (w[13], w[14], offset); + w[27] = hc_bytealign_S (w[12], w[13], offset); + w[26] = hc_bytealign_S (w[11], w[12], offset); + w[25] = hc_bytealign_S (w[10], w[11], offset); + w[24] = hc_bytealign_S (w[ 9], w[10], offset); + w[23] = hc_bytealign_S (w[ 8], w[ 9], offset); + w[22] = hc_bytealign_S (w[ 7], w[ 8], offset); + w[21] = hc_bytealign_S (w[ 6], w[ 7], offset); + w[20] = hc_bytealign_S (w[ 5], w[ 6], offset); + w[19] = hc_bytealign_S (w[ 4], w[ 5], offset); + w[18] = hc_bytealign_S (w[ 3], w[ 4], offset); + w[17] = hc_bytealign_S (w[ 2], w[ 3], offset); + w[16] = hc_bytealign_S (w[ 1], w[ 2], offset); + w[15] = hc_bytealign_S (w[ 0], w[ 1], offset); + w[14] = hc_bytealign_S ( 0, w[ 0], offset); w[13] = 0; w[12] = 0; w[11] = 0; @@ -43374,55 +43374,55 @@ DECLSPEC void switch_buffer_by_offset_1x64_le_S (u32 *w, const u32 offset) break; case 15: - w[63] = amd_bytealign_S (w[47], w[48], offset); - w[62] = amd_bytealign_S (w[46], w[47], offset); - w[61] = amd_bytealign_S (w[45], w[46], offset); - w[60] = amd_bytealign_S (w[44], w[45], offset); - w[59] = amd_bytealign_S (w[43], w[44], offset); - w[58] = amd_bytealign_S (w[42], w[43], offset); - w[57] = amd_bytealign_S (w[41], w[42], offset); - w[56] = amd_bytealign_S (w[40], w[41], offset); - w[55] = amd_bytealign_S (w[39], w[40], offset); - w[54] = amd_bytealign_S (w[38], w[39], offset); - w[53] = amd_bytealign_S (w[37], w[38], offset); - w[52] = amd_bytealign_S (w[36], w[37], offset); - w[51] = amd_bytealign_S (w[35], w[36], offset); - w[50] = amd_bytealign_S (w[34], w[35], offset); - w[49] = amd_bytealign_S (w[33], w[34], offset); - w[48] = amd_bytealign_S (w[32], w[33], offset); - w[47] = amd_bytealign_S (w[31], w[32], offset); - w[46] = amd_bytealign_S (w[30], w[31], offset); - w[45] = amd_bytealign_S (w[29], w[30], offset); - w[44] = amd_bytealign_S (w[28], w[29], offset); - w[43] = amd_bytealign_S (w[27], w[28], offset); - w[42] = amd_bytealign_S (w[26], w[27], offset); - w[41] = amd_bytealign_S (w[25], w[26], offset); - w[40] = amd_bytealign_S (w[24], w[25], offset); - w[39] = amd_bytealign_S (w[23], w[24], offset); - w[38] = amd_bytealign_S (w[22], w[23], offset); - w[37] = amd_bytealign_S (w[21], w[22], offset); - w[36] = amd_bytealign_S (w[20], w[21], offset); - w[35] = amd_bytealign_S (w[19], w[20], offset); - w[34] = amd_bytealign_S (w[18], w[19], offset); - w[33] = amd_bytealign_S (w[17], w[18], offset); - w[32] = amd_bytealign_S (w[16], w[17], offset); - w[31] = amd_bytealign_S (w[15], w[16], offset); - w[30] = amd_bytealign_S (w[14], w[15], offset); - w[29] = amd_bytealign_S (w[13], w[14], offset); - w[28] = amd_bytealign_S (w[12], w[13], offset); - w[27] = amd_bytealign_S (w[11], w[12], offset); - w[26] = amd_bytealign_S (w[10], w[11], offset); - w[25] = amd_bytealign_S (w[ 9], w[10], offset); - w[24] = amd_bytealign_S (w[ 8], w[ 9], offset); - w[23] = amd_bytealign_S (w[ 7], w[ 8], offset); - w[22] = amd_bytealign_S (w[ 6], w[ 7], offset); - w[21] = amd_bytealign_S (w[ 5], w[ 6], offset); - w[20] = amd_bytealign_S (w[ 4], w[ 5], offset); - w[19] = amd_bytealign_S (w[ 3], w[ 4], offset); - w[18] = amd_bytealign_S (w[ 2], w[ 3], offset); - w[17] = amd_bytealign_S (w[ 1], w[ 2], offset); - w[16] = amd_bytealign_S (w[ 0], w[ 1], offset); - w[15] = amd_bytealign_S ( 0, w[ 0], offset); + w[63] = hc_bytealign_S (w[47], w[48], offset); + w[62] = hc_bytealign_S (w[46], w[47], offset); + w[61] = hc_bytealign_S (w[45], w[46], offset); + w[60] = hc_bytealign_S (w[44], w[45], offset); + w[59] = hc_bytealign_S (w[43], w[44], offset); + w[58] = hc_bytealign_S (w[42], w[43], offset); + w[57] = hc_bytealign_S (w[41], w[42], offset); + w[56] = hc_bytealign_S (w[40], w[41], offset); + w[55] = hc_bytealign_S (w[39], w[40], offset); + w[54] = hc_bytealign_S (w[38], w[39], offset); + w[53] = hc_bytealign_S (w[37], w[38], offset); + w[52] = hc_bytealign_S (w[36], w[37], offset); + w[51] = hc_bytealign_S (w[35], w[36], offset); + w[50] = hc_bytealign_S (w[34], w[35], offset); + w[49] = hc_bytealign_S (w[33], w[34], offset); + w[48] = hc_bytealign_S (w[32], w[33], offset); + w[47] = hc_bytealign_S (w[31], w[32], offset); + w[46] = hc_bytealign_S (w[30], w[31], offset); + w[45] = hc_bytealign_S (w[29], w[30], offset); + w[44] = hc_bytealign_S (w[28], w[29], offset); + w[43] = hc_bytealign_S (w[27], w[28], offset); + w[42] = hc_bytealign_S (w[26], w[27], offset); + w[41] = hc_bytealign_S (w[25], w[26], offset); + w[40] = hc_bytealign_S (w[24], w[25], offset); + w[39] = hc_bytealign_S (w[23], w[24], offset); + w[38] = hc_bytealign_S (w[22], w[23], offset); + w[37] = hc_bytealign_S (w[21], w[22], offset); + w[36] = hc_bytealign_S (w[20], w[21], offset); + w[35] = hc_bytealign_S (w[19], w[20], offset); + w[34] = hc_bytealign_S (w[18], w[19], offset); + w[33] = hc_bytealign_S (w[17], w[18], offset); + w[32] = hc_bytealign_S (w[16], w[17], offset); + w[31] = hc_bytealign_S (w[15], w[16], offset); + w[30] = hc_bytealign_S (w[14], w[15], offset); + w[29] = hc_bytealign_S (w[13], w[14], offset); + w[28] = hc_bytealign_S (w[12], w[13], offset); + w[27] = hc_bytealign_S (w[11], w[12], offset); + w[26] = hc_bytealign_S (w[10], w[11], offset); + w[25] = hc_bytealign_S (w[ 9], w[10], offset); + w[24] = hc_bytealign_S (w[ 8], w[ 9], offset); + w[23] = hc_bytealign_S (w[ 7], w[ 8], offset); + w[22] = hc_bytealign_S (w[ 6], w[ 7], offset); + w[21] = hc_bytealign_S (w[ 5], w[ 6], offset); + w[20] = hc_bytealign_S (w[ 4], w[ 5], offset); + w[19] = hc_bytealign_S (w[ 3], w[ 4], offset); + w[18] = hc_bytealign_S (w[ 2], w[ 3], offset); + w[17] = hc_bytealign_S (w[ 1], w[ 2], offset); + w[16] = hc_bytealign_S (w[ 0], w[ 1], offset); + w[15] = hc_bytealign_S ( 0, w[ 0], offset); w[14] = 0; w[13] = 0; w[12] = 0; @@ -43442,54 +43442,54 @@ DECLSPEC void switch_buffer_by_offset_1x64_le_S (u32 *w, const u32 offset) break; case 16: - w[63] = amd_bytealign_S (w[46], w[47], offset); - w[62] = amd_bytealign_S (w[45], w[46], offset); - w[61] = amd_bytealign_S (w[44], w[45], offset); - w[60] = amd_bytealign_S (w[43], w[44], offset); - w[59] = amd_bytealign_S (w[42], w[43], offset); - w[58] = amd_bytealign_S (w[41], w[42], offset); - w[57] = amd_bytealign_S (w[40], w[41], offset); - w[56] = amd_bytealign_S (w[39], w[40], offset); - w[55] = amd_bytealign_S (w[38], w[39], offset); - w[54] = amd_bytealign_S (w[37], w[38], offset); - w[53] = amd_bytealign_S (w[36], w[37], offset); - w[52] = amd_bytealign_S (w[35], w[36], offset); - w[51] = amd_bytealign_S (w[34], w[35], offset); - w[50] = amd_bytealign_S (w[33], w[34], offset); - w[49] = amd_bytealign_S (w[32], w[33], offset); - w[48] = amd_bytealign_S (w[31], w[32], offset); - w[47] = amd_bytealign_S (w[30], w[31], offset); - w[46] = amd_bytealign_S (w[29], w[30], offset); - w[45] = amd_bytealign_S (w[28], w[29], offset); - w[44] = amd_bytealign_S (w[27], w[28], offset); - w[43] = amd_bytealign_S (w[26], w[27], offset); - w[42] = amd_bytealign_S (w[25], w[26], offset); - w[41] = amd_bytealign_S (w[24], w[25], offset); - w[40] = amd_bytealign_S (w[23], w[24], offset); - w[39] = amd_bytealign_S (w[22], w[23], offset); - w[38] = amd_bytealign_S (w[21], w[22], offset); - w[37] = amd_bytealign_S (w[20], w[21], offset); - w[36] = amd_bytealign_S (w[19], w[20], offset); - w[35] = amd_bytealign_S (w[18], w[19], offset); - w[34] = amd_bytealign_S (w[17], w[18], offset); - w[33] = amd_bytealign_S (w[16], w[17], offset); - w[32] = amd_bytealign_S (w[15], w[16], offset); - w[31] = amd_bytealign_S (w[14], w[15], offset); - w[30] = amd_bytealign_S (w[13], w[14], offset); - w[29] = amd_bytealign_S (w[12], w[13], offset); - w[28] = amd_bytealign_S (w[11], w[12], offset); - w[27] = amd_bytealign_S (w[10], w[11], offset); - w[26] = amd_bytealign_S (w[ 9], w[10], offset); - w[25] = amd_bytealign_S (w[ 8], w[ 9], offset); - w[24] = amd_bytealign_S (w[ 7], w[ 8], offset); - w[23] = amd_bytealign_S (w[ 6], w[ 7], offset); - w[22] = amd_bytealign_S (w[ 5], w[ 6], offset); - w[21] = amd_bytealign_S (w[ 4], w[ 5], offset); - w[20] = amd_bytealign_S (w[ 3], w[ 4], offset); - w[19] = amd_bytealign_S (w[ 2], w[ 3], offset); - w[18] = amd_bytealign_S (w[ 1], w[ 2], offset); - w[17] = amd_bytealign_S (w[ 0], w[ 1], offset); - w[16] = amd_bytealign_S ( 0, w[ 0], offset); + w[63] = hc_bytealign_S (w[46], w[47], offset); + w[62] = hc_bytealign_S (w[45], w[46], offset); + w[61] = hc_bytealign_S (w[44], w[45], offset); + w[60] = hc_bytealign_S (w[43], w[44], offset); + w[59] = hc_bytealign_S (w[42], w[43], offset); + w[58] = hc_bytealign_S (w[41], w[42], offset); + w[57] = hc_bytealign_S (w[40], w[41], offset); + w[56] = hc_bytealign_S (w[39], w[40], offset); + w[55] = hc_bytealign_S (w[38], w[39], offset); + w[54] = hc_bytealign_S (w[37], w[38], offset); + w[53] = hc_bytealign_S (w[36], w[37], offset); + w[52] = hc_bytealign_S (w[35], w[36], offset); + w[51] = hc_bytealign_S (w[34], w[35], offset); + w[50] = hc_bytealign_S (w[33], w[34], offset); + w[49] = hc_bytealign_S (w[32], w[33], offset); + w[48] = hc_bytealign_S (w[31], w[32], offset); + w[47] = hc_bytealign_S (w[30], w[31], offset); + w[46] = hc_bytealign_S (w[29], w[30], offset); + w[45] = hc_bytealign_S (w[28], w[29], offset); + w[44] = hc_bytealign_S (w[27], w[28], offset); + w[43] = hc_bytealign_S (w[26], w[27], offset); + w[42] = hc_bytealign_S (w[25], w[26], offset); + w[41] = hc_bytealign_S (w[24], w[25], offset); + w[40] = hc_bytealign_S (w[23], w[24], offset); + w[39] = hc_bytealign_S (w[22], w[23], offset); + w[38] = hc_bytealign_S (w[21], w[22], offset); + w[37] = hc_bytealign_S (w[20], w[21], offset); + w[36] = hc_bytealign_S (w[19], w[20], offset); + w[35] = hc_bytealign_S (w[18], w[19], offset); + w[34] = hc_bytealign_S (w[17], w[18], offset); + w[33] = hc_bytealign_S (w[16], w[17], offset); + w[32] = hc_bytealign_S (w[15], w[16], offset); + w[31] = hc_bytealign_S (w[14], w[15], offset); + w[30] = hc_bytealign_S (w[13], w[14], offset); + w[29] = hc_bytealign_S (w[12], w[13], offset); + w[28] = hc_bytealign_S (w[11], w[12], offset); + w[27] = hc_bytealign_S (w[10], w[11], offset); + w[26] = hc_bytealign_S (w[ 9], w[10], offset); + w[25] = hc_bytealign_S (w[ 8], w[ 9], offset); + w[24] = hc_bytealign_S (w[ 7], w[ 8], offset); + w[23] = hc_bytealign_S (w[ 6], w[ 7], offset); + w[22] = hc_bytealign_S (w[ 5], w[ 6], offset); + w[21] = hc_bytealign_S (w[ 4], w[ 5], offset); + w[20] = hc_bytealign_S (w[ 3], w[ 4], offset); + w[19] = hc_bytealign_S (w[ 2], w[ 3], offset); + w[18] = hc_bytealign_S (w[ 1], w[ 2], offset); + w[17] = hc_bytealign_S (w[ 0], w[ 1], offset); + w[16] = hc_bytealign_S ( 0, w[ 0], offset); w[15] = 0; w[14] = 0; w[13] = 0; @@ -43510,53 +43510,53 @@ DECLSPEC void switch_buffer_by_offset_1x64_le_S (u32 *w, const u32 offset) break; case 17: - w[63] = amd_bytealign_S (w[45], w[46], offset); - w[62] = amd_bytealign_S (w[44], w[45], offset); - w[61] = amd_bytealign_S (w[43], w[44], offset); - w[60] = amd_bytealign_S (w[42], w[43], offset); - w[59] = amd_bytealign_S (w[41], w[42], offset); - w[58] = amd_bytealign_S (w[40], w[41], offset); - w[57] = amd_bytealign_S (w[39], w[40], offset); - w[56] = amd_bytealign_S (w[38], w[39], offset); - w[55] = amd_bytealign_S (w[37], w[38], offset); - w[54] = amd_bytealign_S (w[36], w[37], offset); - w[53] = amd_bytealign_S (w[35], w[36], offset); - w[52] = amd_bytealign_S (w[34], w[35], offset); - w[51] = amd_bytealign_S (w[33], w[34], offset); - w[50] = amd_bytealign_S (w[32], w[33], offset); - w[49] = amd_bytealign_S (w[31], w[32], offset); - w[48] = amd_bytealign_S (w[30], w[31], offset); - w[47] = amd_bytealign_S (w[29], w[30], offset); - w[46] = amd_bytealign_S (w[28], w[29], offset); - w[45] = amd_bytealign_S (w[27], w[28], offset); - w[44] = amd_bytealign_S (w[26], w[27], offset); - w[43] = amd_bytealign_S (w[25], w[26], offset); - w[42] = amd_bytealign_S (w[24], w[25], offset); - w[41] = amd_bytealign_S (w[23], w[24], offset); - w[40] = amd_bytealign_S (w[22], w[23], offset); - w[39] = amd_bytealign_S (w[21], w[22], offset); - w[38] = amd_bytealign_S (w[20], w[21], offset); - w[37] = amd_bytealign_S (w[19], w[20], offset); - w[36] = amd_bytealign_S (w[18], w[19], offset); - w[35] = amd_bytealign_S (w[17], w[18], offset); - w[34] = amd_bytealign_S (w[16], w[17], offset); - w[33] = amd_bytealign_S (w[15], w[16], offset); - w[32] = amd_bytealign_S (w[14], w[15], offset); - w[31] = amd_bytealign_S (w[13], w[14], offset); - w[30] = amd_bytealign_S (w[12], w[13], offset); - w[29] = amd_bytealign_S (w[11], w[12], offset); - w[28] = amd_bytealign_S (w[10], w[11], offset); - w[27] = amd_bytealign_S (w[ 9], w[10], offset); - w[26] = amd_bytealign_S (w[ 8], w[ 9], offset); - w[25] = amd_bytealign_S (w[ 7], w[ 8], offset); - w[24] = amd_bytealign_S (w[ 6], w[ 7], offset); - w[23] = amd_bytealign_S (w[ 5], w[ 6], offset); - w[22] = amd_bytealign_S (w[ 4], w[ 5], offset); - w[21] = amd_bytealign_S (w[ 3], w[ 4], offset); - w[20] = amd_bytealign_S (w[ 2], w[ 3], offset); - w[19] = amd_bytealign_S (w[ 1], w[ 2], offset); - w[18] = amd_bytealign_S (w[ 0], w[ 1], offset); - w[17] = amd_bytealign_S ( 0, w[ 0], offset); + w[63] = hc_bytealign_S (w[45], w[46], offset); + w[62] = hc_bytealign_S (w[44], w[45], offset); + w[61] = hc_bytealign_S (w[43], w[44], offset); + w[60] = hc_bytealign_S (w[42], w[43], offset); + w[59] = hc_bytealign_S (w[41], w[42], offset); + w[58] = hc_bytealign_S (w[40], w[41], offset); + w[57] = hc_bytealign_S (w[39], w[40], offset); + w[56] = hc_bytealign_S (w[38], w[39], offset); + w[55] = hc_bytealign_S (w[37], w[38], offset); + w[54] = hc_bytealign_S (w[36], w[37], offset); + w[53] = hc_bytealign_S (w[35], w[36], offset); + w[52] = hc_bytealign_S (w[34], w[35], offset); + w[51] = hc_bytealign_S (w[33], w[34], offset); + w[50] = hc_bytealign_S (w[32], w[33], offset); + w[49] = hc_bytealign_S (w[31], w[32], offset); + w[48] = hc_bytealign_S (w[30], w[31], offset); + w[47] = hc_bytealign_S (w[29], w[30], offset); + w[46] = hc_bytealign_S (w[28], w[29], offset); + w[45] = hc_bytealign_S (w[27], w[28], offset); + w[44] = hc_bytealign_S (w[26], w[27], offset); + w[43] = hc_bytealign_S (w[25], w[26], offset); + w[42] = hc_bytealign_S (w[24], w[25], offset); + w[41] = hc_bytealign_S (w[23], w[24], offset); + w[40] = hc_bytealign_S (w[22], w[23], offset); + w[39] = hc_bytealign_S (w[21], w[22], offset); + w[38] = hc_bytealign_S (w[20], w[21], offset); + w[37] = hc_bytealign_S (w[19], w[20], offset); + w[36] = hc_bytealign_S (w[18], w[19], offset); + w[35] = hc_bytealign_S (w[17], w[18], offset); + w[34] = hc_bytealign_S (w[16], w[17], offset); + w[33] = hc_bytealign_S (w[15], w[16], offset); + w[32] = hc_bytealign_S (w[14], w[15], offset); + w[31] = hc_bytealign_S (w[13], w[14], offset); + w[30] = hc_bytealign_S (w[12], w[13], offset); + w[29] = hc_bytealign_S (w[11], w[12], offset); + w[28] = hc_bytealign_S (w[10], w[11], offset); + w[27] = hc_bytealign_S (w[ 9], w[10], offset); + w[26] = hc_bytealign_S (w[ 8], w[ 9], offset); + w[25] = hc_bytealign_S (w[ 7], w[ 8], offset); + w[24] = hc_bytealign_S (w[ 6], w[ 7], offset); + w[23] = hc_bytealign_S (w[ 5], w[ 6], offset); + w[22] = hc_bytealign_S (w[ 4], w[ 5], offset); + w[21] = hc_bytealign_S (w[ 3], w[ 4], offset); + w[20] = hc_bytealign_S (w[ 2], w[ 3], offset); + w[19] = hc_bytealign_S (w[ 1], w[ 2], offset); + w[18] = hc_bytealign_S (w[ 0], w[ 1], offset); + w[17] = hc_bytealign_S ( 0, w[ 0], offset); w[16] = 0; w[15] = 0; w[14] = 0; @@ -43578,52 +43578,52 @@ DECLSPEC void switch_buffer_by_offset_1x64_le_S (u32 *w, const u32 offset) break; case 18: - w[63] = amd_bytealign_S (w[44], w[45], offset); - w[62] = amd_bytealign_S (w[43], w[44], offset); - w[61] = amd_bytealign_S (w[42], w[43], offset); - w[60] = amd_bytealign_S (w[41], w[42], offset); - w[59] = amd_bytealign_S (w[40], w[41], offset); - w[58] = amd_bytealign_S (w[39], w[40], offset); - w[57] = amd_bytealign_S (w[38], w[39], offset); - w[56] = amd_bytealign_S (w[37], w[38], offset); - w[55] = amd_bytealign_S (w[36], w[37], offset); - w[54] = amd_bytealign_S (w[35], w[36], offset); - w[53] = amd_bytealign_S (w[34], w[35], offset); - w[52] = amd_bytealign_S (w[33], w[34], offset); - w[51] = amd_bytealign_S (w[32], w[33], offset); - w[50] = amd_bytealign_S (w[31], w[32], offset); - w[49] = amd_bytealign_S (w[30], w[31], offset); - w[48] = amd_bytealign_S (w[29], w[30], offset); - w[47] = amd_bytealign_S (w[28], w[29], offset); - w[46] = amd_bytealign_S (w[27], w[28], offset); - w[45] = amd_bytealign_S (w[26], w[27], offset); - w[44] = amd_bytealign_S (w[25], w[26], offset); - w[43] = amd_bytealign_S (w[24], w[25], offset); - w[42] = amd_bytealign_S (w[23], w[24], offset); - w[41] = amd_bytealign_S (w[22], w[23], offset); - w[40] = amd_bytealign_S (w[21], w[22], offset); - w[39] = amd_bytealign_S (w[20], w[21], offset); - w[38] = amd_bytealign_S (w[19], w[20], offset); - w[37] = amd_bytealign_S (w[18], w[19], offset); - w[36] = amd_bytealign_S (w[17], w[18], offset); - w[35] = amd_bytealign_S (w[16], w[17], offset); - w[34] = amd_bytealign_S (w[15], w[16], offset); - w[33] = amd_bytealign_S (w[14], w[15], offset); - w[32] = amd_bytealign_S (w[13], w[14], offset); - w[31] = amd_bytealign_S (w[12], w[13], offset); - w[30] = amd_bytealign_S (w[11], w[12], offset); - w[29] = amd_bytealign_S (w[10], w[11], offset); - w[28] = amd_bytealign_S (w[ 9], w[10], offset); - w[27] = amd_bytealign_S (w[ 8], w[ 9], offset); - w[26] = amd_bytealign_S (w[ 7], w[ 8], offset); - w[25] = amd_bytealign_S (w[ 6], w[ 7], offset); - w[24] = amd_bytealign_S (w[ 5], w[ 6], offset); - w[23] = amd_bytealign_S (w[ 4], w[ 5], offset); - w[22] = amd_bytealign_S (w[ 3], w[ 4], offset); - w[21] = amd_bytealign_S (w[ 2], w[ 3], offset); - w[20] = amd_bytealign_S (w[ 1], w[ 2], offset); - w[19] = amd_bytealign_S (w[ 0], w[ 1], offset); - w[18] = amd_bytealign_S ( 0, w[ 0], offset); + w[63] = hc_bytealign_S (w[44], w[45], offset); + w[62] = hc_bytealign_S (w[43], w[44], offset); + w[61] = hc_bytealign_S (w[42], w[43], offset); + w[60] = hc_bytealign_S (w[41], w[42], offset); + w[59] = hc_bytealign_S (w[40], w[41], offset); + w[58] = hc_bytealign_S (w[39], w[40], offset); + w[57] = hc_bytealign_S (w[38], w[39], offset); + w[56] = hc_bytealign_S (w[37], w[38], offset); + w[55] = hc_bytealign_S (w[36], w[37], offset); + w[54] = hc_bytealign_S (w[35], w[36], offset); + w[53] = hc_bytealign_S (w[34], w[35], offset); + w[52] = hc_bytealign_S (w[33], w[34], offset); + w[51] = hc_bytealign_S (w[32], w[33], offset); + w[50] = hc_bytealign_S (w[31], w[32], offset); + w[49] = hc_bytealign_S (w[30], w[31], offset); + w[48] = hc_bytealign_S (w[29], w[30], offset); + w[47] = hc_bytealign_S (w[28], w[29], offset); + w[46] = hc_bytealign_S (w[27], w[28], offset); + w[45] = hc_bytealign_S (w[26], w[27], offset); + w[44] = hc_bytealign_S (w[25], w[26], offset); + w[43] = hc_bytealign_S (w[24], w[25], offset); + w[42] = hc_bytealign_S (w[23], w[24], offset); + w[41] = hc_bytealign_S (w[22], w[23], offset); + w[40] = hc_bytealign_S (w[21], w[22], offset); + w[39] = hc_bytealign_S (w[20], w[21], offset); + w[38] = hc_bytealign_S (w[19], w[20], offset); + w[37] = hc_bytealign_S (w[18], w[19], offset); + w[36] = hc_bytealign_S (w[17], w[18], offset); + w[35] = hc_bytealign_S (w[16], w[17], offset); + w[34] = hc_bytealign_S (w[15], w[16], offset); + w[33] = hc_bytealign_S (w[14], w[15], offset); + w[32] = hc_bytealign_S (w[13], w[14], offset); + w[31] = hc_bytealign_S (w[12], w[13], offset); + w[30] = hc_bytealign_S (w[11], w[12], offset); + w[29] = hc_bytealign_S (w[10], w[11], offset); + w[28] = hc_bytealign_S (w[ 9], w[10], offset); + w[27] = hc_bytealign_S (w[ 8], w[ 9], offset); + w[26] = hc_bytealign_S (w[ 7], w[ 8], offset); + w[25] = hc_bytealign_S (w[ 6], w[ 7], offset); + w[24] = hc_bytealign_S (w[ 5], w[ 6], offset); + w[23] = hc_bytealign_S (w[ 4], w[ 5], offset); + w[22] = hc_bytealign_S (w[ 3], w[ 4], offset); + w[21] = hc_bytealign_S (w[ 2], w[ 3], offset); + w[20] = hc_bytealign_S (w[ 1], w[ 2], offset); + w[19] = hc_bytealign_S (w[ 0], w[ 1], offset); + w[18] = hc_bytealign_S ( 0, w[ 0], offset); w[17] = 0; w[16] = 0; w[15] = 0; @@ -43646,51 +43646,51 @@ DECLSPEC void switch_buffer_by_offset_1x64_le_S (u32 *w, const u32 offset) break; case 19: - w[63] = amd_bytealign_S (w[43], w[44], offset); - w[62] = amd_bytealign_S (w[42], w[43], offset); - w[61] = amd_bytealign_S (w[41], w[42], offset); - w[60] = amd_bytealign_S (w[40], w[41], offset); - w[59] = amd_bytealign_S (w[39], w[40], offset); - w[58] = amd_bytealign_S (w[38], w[39], offset); - w[57] = amd_bytealign_S (w[37], w[38], offset); - w[56] = amd_bytealign_S (w[36], w[37], offset); - w[55] = amd_bytealign_S (w[35], w[36], offset); - w[54] = amd_bytealign_S (w[34], w[35], offset); - w[53] = amd_bytealign_S (w[33], w[34], offset); - w[52] = amd_bytealign_S (w[32], w[33], offset); - w[51] = amd_bytealign_S (w[31], w[32], offset); - w[50] = amd_bytealign_S (w[30], w[31], offset); - w[49] = amd_bytealign_S (w[29], w[30], offset); - w[48] = amd_bytealign_S (w[28], w[29], offset); - w[47] = amd_bytealign_S (w[27], w[28], offset); - w[46] = amd_bytealign_S (w[26], w[27], offset); - w[45] = amd_bytealign_S (w[25], w[26], offset); - w[44] = amd_bytealign_S (w[24], w[25], offset); - w[43] = amd_bytealign_S (w[23], w[24], offset); - w[42] = amd_bytealign_S (w[22], w[23], offset); - w[41] = amd_bytealign_S (w[21], w[22], offset); - w[40] = amd_bytealign_S (w[20], w[21], offset); - w[39] = amd_bytealign_S (w[19], w[20], offset); - w[38] = amd_bytealign_S (w[18], w[19], offset); - w[37] = amd_bytealign_S (w[17], w[18], offset); - w[36] = amd_bytealign_S (w[16], w[17], offset); - w[35] = amd_bytealign_S (w[15], w[16], offset); - w[34] = amd_bytealign_S (w[14], w[15], offset); - w[33] = amd_bytealign_S (w[13], w[14], offset); - w[32] = amd_bytealign_S (w[12], w[13], offset); - w[31] = amd_bytealign_S (w[11], w[12], offset); - w[30] = amd_bytealign_S (w[10], w[11], offset); - w[29] = amd_bytealign_S (w[ 9], w[10], offset); - w[28] = amd_bytealign_S (w[ 8], w[ 9], offset); - w[27] = amd_bytealign_S (w[ 7], w[ 8], offset); - w[26] = amd_bytealign_S (w[ 6], w[ 7], offset); - w[25] = amd_bytealign_S (w[ 5], w[ 6], offset); - w[24] = amd_bytealign_S (w[ 4], w[ 5], offset); - w[23] = amd_bytealign_S (w[ 3], w[ 4], offset); - w[22] = amd_bytealign_S (w[ 2], w[ 3], offset); - w[21] = amd_bytealign_S (w[ 1], w[ 2], offset); - w[20] = amd_bytealign_S (w[ 0], w[ 1], offset); - w[19] = amd_bytealign_S ( 0, w[ 0], offset); + w[63] = hc_bytealign_S (w[43], w[44], offset); + w[62] = hc_bytealign_S (w[42], w[43], offset); + w[61] = hc_bytealign_S (w[41], w[42], offset); + w[60] = hc_bytealign_S (w[40], w[41], offset); + w[59] = hc_bytealign_S (w[39], w[40], offset); + w[58] = hc_bytealign_S (w[38], w[39], offset); + w[57] = hc_bytealign_S (w[37], w[38], offset); + w[56] = hc_bytealign_S (w[36], w[37], offset); + w[55] = hc_bytealign_S (w[35], w[36], offset); + w[54] = hc_bytealign_S (w[34], w[35], offset); + w[53] = hc_bytealign_S (w[33], w[34], offset); + w[52] = hc_bytealign_S (w[32], w[33], offset); + w[51] = hc_bytealign_S (w[31], w[32], offset); + w[50] = hc_bytealign_S (w[30], w[31], offset); + w[49] = hc_bytealign_S (w[29], w[30], offset); + w[48] = hc_bytealign_S (w[28], w[29], offset); + w[47] = hc_bytealign_S (w[27], w[28], offset); + w[46] = hc_bytealign_S (w[26], w[27], offset); + w[45] = hc_bytealign_S (w[25], w[26], offset); + w[44] = hc_bytealign_S (w[24], w[25], offset); + w[43] = hc_bytealign_S (w[23], w[24], offset); + w[42] = hc_bytealign_S (w[22], w[23], offset); + w[41] = hc_bytealign_S (w[21], w[22], offset); + w[40] = hc_bytealign_S (w[20], w[21], offset); + w[39] = hc_bytealign_S (w[19], w[20], offset); + w[38] = hc_bytealign_S (w[18], w[19], offset); + w[37] = hc_bytealign_S (w[17], w[18], offset); + w[36] = hc_bytealign_S (w[16], w[17], offset); + w[35] = hc_bytealign_S (w[15], w[16], offset); + w[34] = hc_bytealign_S (w[14], w[15], offset); + w[33] = hc_bytealign_S (w[13], w[14], offset); + w[32] = hc_bytealign_S (w[12], w[13], offset); + w[31] = hc_bytealign_S (w[11], w[12], offset); + w[30] = hc_bytealign_S (w[10], w[11], offset); + w[29] = hc_bytealign_S (w[ 9], w[10], offset); + w[28] = hc_bytealign_S (w[ 8], w[ 9], offset); + w[27] = hc_bytealign_S (w[ 7], w[ 8], offset); + w[26] = hc_bytealign_S (w[ 6], w[ 7], offset); + w[25] = hc_bytealign_S (w[ 5], w[ 6], offset); + w[24] = hc_bytealign_S (w[ 4], w[ 5], offset); + w[23] = hc_bytealign_S (w[ 3], w[ 4], offset); + w[22] = hc_bytealign_S (w[ 2], w[ 3], offset); + w[21] = hc_bytealign_S (w[ 1], w[ 2], offset); + w[20] = hc_bytealign_S (w[ 0], w[ 1], offset); + w[19] = hc_bytealign_S ( 0, w[ 0], offset); w[18] = 0; w[17] = 0; w[16] = 0; @@ -43714,50 +43714,50 @@ DECLSPEC void switch_buffer_by_offset_1x64_le_S (u32 *w, const u32 offset) break; case 20: - w[63] = amd_bytealign_S (w[42], w[43], offset); - w[62] = amd_bytealign_S (w[41], w[42], offset); - w[61] = amd_bytealign_S (w[40], w[41], offset); - w[60] = amd_bytealign_S (w[39], w[40], offset); - w[59] = amd_bytealign_S (w[38], w[39], offset); - w[58] = amd_bytealign_S (w[37], w[38], offset); - w[57] = amd_bytealign_S (w[36], w[37], offset); - w[56] = amd_bytealign_S (w[35], w[36], offset); - w[55] = amd_bytealign_S (w[34], w[35], offset); - w[54] = amd_bytealign_S (w[33], w[34], offset); - w[53] = amd_bytealign_S (w[32], w[33], offset); - w[52] = amd_bytealign_S (w[31], w[32], offset); - w[51] = amd_bytealign_S (w[30], w[31], offset); - w[50] = amd_bytealign_S (w[29], w[30], offset); - w[49] = amd_bytealign_S (w[28], w[29], offset); - w[48] = amd_bytealign_S (w[27], w[28], offset); - w[47] = amd_bytealign_S (w[26], w[27], offset); - w[46] = amd_bytealign_S (w[25], w[26], offset); - w[45] = amd_bytealign_S (w[24], w[25], offset); - w[44] = amd_bytealign_S (w[23], w[24], offset); - w[43] = amd_bytealign_S (w[22], w[23], offset); - w[42] = amd_bytealign_S (w[21], w[22], offset); - w[41] = amd_bytealign_S (w[20], w[21], offset); - w[40] = amd_bytealign_S (w[19], w[20], offset); - w[39] = amd_bytealign_S (w[18], w[19], offset); - w[38] = amd_bytealign_S (w[17], w[18], offset); - w[37] = amd_bytealign_S (w[16], w[17], offset); - w[36] = amd_bytealign_S (w[15], w[16], offset); - w[35] = amd_bytealign_S (w[14], w[15], offset); - w[34] = amd_bytealign_S (w[13], w[14], offset); - w[33] = amd_bytealign_S (w[12], w[13], offset); - w[32] = amd_bytealign_S (w[11], w[12], offset); - w[31] = amd_bytealign_S (w[10], w[11], offset); - w[30] = amd_bytealign_S (w[ 9], w[10], offset); - w[29] = amd_bytealign_S (w[ 8], w[ 9], offset); - w[28] = amd_bytealign_S (w[ 7], w[ 8], offset); - w[27] = amd_bytealign_S (w[ 6], w[ 7], offset); - w[26] = amd_bytealign_S (w[ 5], w[ 6], offset); - w[25] = amd_bytealign_S (w[ 4], w[ 5], offset); - w[24] = amd_bytealign_S (w[ 3], w[ 4], offset); - w[23] = amd_bytealign_S (w[ 2], w[ 3], offset); - w[22] = amd_bytealign_S (w[ 1], w[ 2], offset); - w[21] = amd_bytealign_S (w[ 0], w[ 1], offset); - w[20] = amd_bytealign_S ( 0, w[ 0], offset); + w[63] = hc_bytealign_S (w[42], w[43], offset); + w[62] = hc_bytealign_S (w[41], w[42], offset); + w[61] = hc_bytealign_S (w[40], w[41], offset); + w[60] = hc_bytealign_S (w[39], w[40], offset); + w[59] = hc_bytealign_S (w[38], w[39], offset); + w[58] = hc_bytealign_S (w[37], w[38], offset); + w[57] = hc_bytealign_S (w[36], w[37], offset); + w[56] = hc_bytealign_S (w[35], w[36], offset); + w[55] = hc_bytealign_S (w[34], w[35], offset); + w[54] = hc_bytealign_S (w[33], w[34], offset); + w[53] = hc_bytealign_S (w[32], w[33], offset); + w[52] = hc_bytealign_S (w[31], w[32], offset); + w[51] = hc_bytealign_S (w[30], w[31], offset); + w[50] = hc_bytealign_S (w[29], w[30], offset); + w[49] = hc_bytealign_S (w[28], w[29], offset); + w[48] = hc_bytealign_S (w[27], w[28], offset); + w[47] = hc_bytealign_S (w[26], w[27], offset); + w[46] = hc_bytealign_S (w[25], w[26], offset); + w[45] = hc_bytealign_S (w[24], w[25], offset); + w[44] = hc_bytealign_S (w[23], w[24], offset); + w[43] = hc_bytealign_S (w[22], w[23], offset); + w[42] = hc_bytealign_S (w[21], w[22], offset); + w[41] = hc_bytealign_S (w[20], w[21], offset); + w[40] = hc_bytealign_S (w[19], w[20], offset); + w[39] = hc_bytealign_S (w[18], w[19], offset); + w[38] = hc_bytealign_S (w[17], w[18], offset); + w[37] = hc_bytealign_S (w[16], w[17], offset); + w[36] = hc_bytealign_S (w[15], w[16], offset); + w[35] = hc_bytealign_S (w[14], w[15], offset); + w[34] = hc_bytealign_S (w[13], w[14], offset); + w[33] = hc_bytealign_S (w[12], w[13], offset); + w[32] = hc_bytealign_S (w[11], w[12], offset); + w[31] = hc_bytealign_S (w[10], w[11], offset); + w[30] = hc_bytealign_S (w[ 9], w[10], offset); + w[29] = hc_bytealign_S (w[ 8], w[ 9], offset); + w[28] = hc_bytealign_S (w[ 7], w[ 8], offset); + w[27] = hc_bytealign_S (w[ 6], w[ 7], offset); + w[26] = hc_bytealign_S (w[ 5], w[ 6], offset); + w[25] = hc_bytealign_S (w[ 4], w[ 5], offset); + w[24] = hc_bytealign_S (w[ 3], w[ 4], offset); + w[23] = hc_bytealign_S (w[ 2], w[ 3], offset); + w[22] = hc_bytealign_S (w[ 1], w[ 2], offset); + w[21] = hc_bytealign_S (w[ 0], w[ 1], offset); + w[20] = hc_bytealign_S ( 0, w[ 0], offset); w[19] = 0; w[18] = 0; w[17] = 0; @@ -43782,49 +43782,49 @@ DECLSPEC void switch_buffer_by_offset_1x64_le_S (u32 *w, const u32 offset) break; case 21: - w[63] = amd_bytealign_S (w[41], w[42], offset); - w[62] = amd_bytealign_S (w[40], w[41], offset); - w[61] = amd_bytealign_S (w[39], w[40], offset); - w[60] = amd_bytealign_S (w[38], w[39], offset); - w[59] = amd_bytealign_S (w[37], w[38], offset); - w[58] = amd_bytealign_S (w[36], w[37], offset); - w[57] = amd_bytealign_S (w[35], w[36], offset); - w[56] = amd_bytealign_S (w[34], w[35], offset); - w[55] = amd_bytealign_S (w[33], w[34], offset); - w[54] = amd_bytealign_S (w[32], w[33], offset); - w[53] = amd_bytealign_S (w[31], w[32], offset); - w[52] = amd_bytealign_S (w[30], w[31], offset); - w[51] = amd_bytealign_S (w[29], w[30], offset); - w[50] = amd_bytealign_S (w[28], w[29], offset); - w[49] = amd_bytealign_S (w[27], w[28], offset); - w[48] = amd_bytealign_S (w[26], w[27], offset); - w[47] = amd_bytealign_S (w[25], w[26], offset); - w[46] = amd_bytealign_S (w[24], w[25], offset); - w[45] = amd_bytealign_S (w[23], w[24], offset); - w[44] = amd_bytealign_S (w[22], w[23], offset); - w[43] = amd_bytealign_S (w[21], w[22], offset); - w[42] = amd_bytealign_S (w[20], w[21], offset); - w[41] = amd_bytealign_S (w[19], w[20], offset); - w[40] = amd_bytealign_S (w[18], w[19], offset); - w[39] = amd_bytealign_S (w[17], w[18], offset); - w[38] = amd_bytealign_S (w[16], w[17], offset); - w[37] = amd_bytealign_S (w[15], w[16], offset); - w[36] = amd_bytealign_S (w[14], w[15], offset); - w[35] = amd_bytealign_S (w[13], w[14], offset); - w[34] = amd_bytealign_S (w[12], w[13], offset); - w[33] = amd_bytealign_S (w[11], w[12], offset); - w[32] = amd_bytealign_S (w[10], w[11], offset); - w[31] = amd_bytealign_S (w[ 9], w[10], offset); - w[30] = amd_bytealign_S (w[ 8], w[ 9], offset); - w[29] = amd_bytealign_S (w[ 7], w[ 8], offset); - w[28] = amd_bytealign_S (w[ 6], w[ 7], offset); - w[27] = amd_bytealign_S (w[ 5], w[ 6], offset); - w[26] = amd_bytealign_S (w[ 4], w[ 5], offset); - w[25] = amd_bytealign_S (w[ 3], w[ 4], offset); - w[24] = amd_bytealign_S (w[ 2], w[ 3], offset); - w[23] = amd_bytealign_S (w[ 1], w[ 2], offset); - w[22] = amd_bytealign_S (w[ 0], w[ 1], offset); - w[21] = amd_bytealign_S ( 0, w[ 0], offset); + w[63] = hc_bytealign_S (w[41], w[42], offset); + w[62] = hc_bytealign_S (w[40], w[41], offset); + w[61] = hc_bytealign_S (w[39], w[40], offset); + w[60] = hc_bytealign_S (w[38], w[39], offset); + w[59] = hc_bytealign_S (w[37], w[38], offset); + w[58] = hc_bytealign_S (w[36], w[37], offset); + w[57] = hc_bytealign_S (w[35], w[36], offset); + w[56] = hc_bytealign_S (w[34], w[35], offset); + w[55] = hc_bytealign_S (w[33], w[34], offset); + w[54] = hc_bytealign_S (w[32], w[33], offset); + w[53] = hc_bytealign_S (w[31], w[32], offset); + w[52] = hc_bytealign_S (w[30], w[31], offset); + w[51] = hc_bytealign_S (w[29], w[30], offset); + w[50] = hc_bytealign_S (w[28], w[29], offset); + w[49] = hc_bytealign_S (w[27], w[28], offset); + w[48] = hc_bytealign_S (w[26], w[27], offset); + w[47] = hc_bytealign_S (w[25], w[26], offset); + w[46] = hc_bytealign_S (w[24], w[25], offset); + w[45] = hc_bytealign_S (w[23], w[24], offset); + w[44] = hc_bytealign_S (w[22], w[23], offset); + w[43] = hc_bytealign_S (w[21], w[22], offset); + w[42] = hc_bytealign_S (w[20], w[21], offset); + w[41] = hc_bytealign_S (w[19], w[20], offset); + w[40] = hc_bytealign_S (w[18], w[19], offset); + w[39] = hc_bytealign_S (w[17], w[18], offset); + w[38] = hc_bytealign_S (w[16], w[17], offset); + w[37] = hc_bytealign_S (w[15], w[16], offset); + w[36] = hc_bytealign_S (w[14], w[15], offset); + w[35] = hc_bytealign_S (w[13], w[14], offset); + w[34] = hc_bytealign_S (w[12], w[13], offset); + w[33] = hc_bytealign_S (w[11], w[12], offset); + w[32] = hc_bytealign_S (w[10], w[11], offset); + w[31] = hc_bytealign_S (w[ 9], w[10], offset); + w[30] = hc_bytealign_S (w[ 8], w[ 9], offset); + w[29] = hc_bytealign_S (w[ 7], w[ 8], offset); + w[28] = hc_bytealign_S (w[ 6], w[ 7], offset); + w[27] = hc_bytealign_S (w[ 5], w[ 6], offset); + w[26] = hc_bytealign_S (w[ 4], w[ 5], offset); + w[25] = hc_bytealign_S (w[ 3], w[ 4], offset); + w[24] = hc_bytealign_S (w[ 2], w[ 3], offset); + w[23] = hc_bytealign_S (w[ 1], w[ 2], offset); + w[22] = hc_bytealign_S (w[ 0], w[ 1], offset); + w[21] = hc_bytealign_S ( 0, w[ 0], offset); w[20] = 0; w[19] = 0; w[18] = 0; @@ -43850,48 +43850,48 @@ DECLSPEC void switch_buffer_by_offset_1x64_le_S (u32 *w, const u32 offset) break; case 22: - w[63] = amd_bytealign_S (w[40], w[41], offset); - w[62] = amd_bytealign_S (w[39], w[40], offset); - w[61] = amd_bytealign_S (w[38], w[39], offset); - w[60] = amd_bytealign_S (w[37], w[38], offset); - w[59] = amd_bytealign_S (w[36], w[37], offset); - w[58] = amd_bytealign_S (w[35], w[36], offset); - w[57] = amd_bytealign_S (w[34], w[35], offset); - w[56] = amd_bytealign_S (w[33], w[34], offset); - w[55] = amd_bytealign_S (w[32], w[33], offset); - w[54] = amd_bytealign_S (w[31], w[32], offset); - w[53] = amd_bytealign_S (w[30], w[31], offset); - w[52] = amd_bytealign_S (w[29], w[30], offset); - w[51] = amd_bytealign_S (w[28], w[29], offset); - w[50] = amd_bytealign_S (w[27], w[28], offset); - w[49] = amd_bytealign_S (w[26], w[27], offset); - w[48] = amd_bytealign_S (w[25], w[26], offset); - w[47] = amd_bytealign_S (w[24], w[25], offset); - w[46] = amd_bytealign_S (w[23], w[24], offset); - w[45] = amd_bytealign_S (w[22], w[23], offset); - w[44] = amd_bytealign_S (w[21], w[22], offset); - w[43] = amd_bytealign_S (w[20], w[21], offset); - w[42] = amd_bytealign_S (w[19], w[20], offset); - w[41] = amd_bytealign_S (w[18], w[19], offset); - w[40] = amd_bytealign_S (w[17], w[18], offset); - w[39] = amd_bytealign_S (w[16], w[17], offset); - w[38] = amd_bytealign_S (w[15], w[16], offset); - w[37] = amd_bytealign_S (w[14], w[15], offset); - w[36] = amd_bytealign_S (w[13], w[14], offset); - w[35] = amd_bytealign_S (w[12], w[13], offset); - w[34] = amd_bytealign_S (w[11], w[12], offset); - w[33] = amd_bytealign_S (w[10], w[11], offset); - w[32] = amd_bytealign_S (w[ 9], w[10], offset); - w[31] = amd_bytealign_S (w[ 8], w[ 9], offset); - w[30] = amd_bytealign_S (w[ 7], w[ 8], offset); - w[29] = amd_bytealign_S (w[ 6], w[ 7], offset); - w[28] = amd_bytealign_S (w[ 5], w[ 6], offset); - w[27] = amd_bytealign_S (w[ 4], w[ 5], offset); - w[26] = amd_bytealign_S (w[ 3], w[ 4], offset); - w[25] = amd_bytealign_S (w[ 2], w[ 3], offset); - w[24] = amd_bytealign_S (w[ 1], w[ 2], offset); - w[23] = amd_bytealign_S (w[ 0], w[ 1], offset); - w[22] = amd_bytealign_S ( 0, w[ 0], offset); + w[63] = hc_bytealign_S (w[40], w[41], offset); + w[62] = hc_bytealign_S (w[39], w[40], offset); + w[61] = hc_bytealign_S (w[38], w[39], offset); + w[60] = hc_bytealign_S (w[37], w[38], offset); + w[59] = hc_bytealign_S (w[36], w[37], offset); + w[58] = hc_bytealign_S (w[35], w[36], offset); + w[57] = hc_bytealign_S (w[34], w[35], offset); + w[56] = hc_bytealign_S (w[33], w[34], offset); + w[55] = hc_bytealign_S (w[32], w[33], offset); + w[54] = hc_bytealign_S (w[31], w[32], offset); + w[53] = hc_bytealign_S (w[30], w[31], offset); + w[52] = hc_bytealign_S (w[29], w[30], offset); + w[51] = hc_bytealign_S (w[28], w[29], offset); + w[50] = hc_bytealign_S (w[27], w[28], offset); + w[49] = hc_bytealign_S (w[26], w[27], offset); + w[48] = hc_bytealign_S (w[25], w[26], offset); + w[47] = hc_bytealign_S (w[24], w[25], offset); + w[46] = hc_bytealign_S (w[23], w[24], offset); + w[45] = hc_bytealign_S (w[22], w[23], offset); + w[44] = hc_bytealign_S (w[21], w[22], offset); + w[43] = hc_bytealign_S (w[20], w[21], offset); + w[42] = hc_bytealign_S (w[19], w[20], offset); + w[41] = hc_bytealign_S (w[18], w[19], offset); + w[40] = hc_bytealign_S (w[17], w[18], offset); + w[39] = hc_bytealign_S (w[16], w[17], offset); + w[38] = hc_bytealign_S (w[15], w[16], offset); + w[37] = hc_bytealign_S (w[14], w[15], offset); + w[36] = hc_bytealign_S (w[13], w[14], offset); + w[35] = hc_bytealign_S (w[12], w[13], offset); + w[34] = hc_bytealign_S (w[11], w[12], offset); + w[33] = hc_bytealign_S (w[10], w[11], offset); + w[32] = hc_bytealign_S (w[ 9], w[10], offset); + w[31] = hc_bytealign_S (w[ 8], w[ 9], offset); + w[30] = hc_bytealign_S (w[ 7], w[ 8], offset); + w[29] = hc_bytealign_S (w[ 6], w[ 7], offset); + w[28] = hc_bytealign_S (w[ 5], w[ 6], offset); + w[27] = hc_bytealign_S (w[ 4], w[ 5], offset); + w[26] = hc_bytealign_S (w[ 3], w[ 4], offset); + w[25] = hc_bytealign_S (w[ 2], w[ 3], offset); + w[24] = hc_bytealign_S (w[ 1], w[ 2], offset); + w[23] = hc_bytealign_S (w[ 0], w[ 1], offset); + w[22] = hc_bytealign_S ( 0, w[ 0], offset); w[21] = 0; w[20] = 0; w[19] = 0; @@ -43918,47 +43918,47 @@ DECLSPEC void switch_buffer_by_offset_1x64_le_S (u32 *w, const u32 offset) break; case 23: - w[63] = amd_bytealign_S (w[39], w[40], offset); - w[62] = amd_bytealign_S (w[38], w[39], offset); - w[61] = amd_bytealign_S (w[37], w[38], offset); - w[60] = amd_bytealign_S (w[36], w[37], offset); - w[59] = amd_bytealign_S (w[35], w[36], offset); - w[58] = amd_bytealign_S (w[34], w[35], offset); - w[57] = amd_bytealign_S (w[33], w[34], offset); - w[56] = amd_bytealign_S (w[32], w[33], offset); - w[55] = amd_bytealign_S (w[31], w[32], offset); - w[54] = amd_bytealign_S (w[30], w[31], offset); - w[53] = amd_bytealign_S (w[29], w[30], offset); - w[52] = amd_bytealign_S (w[28], w[29], offset); - w[51] = amd_bytealign_S (w[27], w[28], offset); - w[50] = amd_bytealign_S (w[26], w[27], offset); - w[49] = amd_bytealign_S (w[25], w[26], offset); - w[48] = amd_bytealign_S (w[24], w[25], offset); - w[47] = amd_bytealign_S (w[23], w[24], offset); - w[46] = amd_bytealign_S (w[22], w[23], offset); - w[45] = amd_bytealign_S (w[21], w[22], offset); - w[44] = amd_bytealign_S (w[20], w[21], offset); - w[43] = amd_bytealign_S (w[19], w[20], offset); - w[42] = amd_bytealign_S (w[18], w[19], offset); - w[41] = amd_bytealign_S (w[17], w[18], offset); - w[40] = amd_bytealign_S (w[16], w[17], offset); - w[39] = amd_bytealign_S (w[15], w[16], offset); - w[38] = amd_bytealign_S (w[14], w[15], offset); - w[37] = amd_bytealign_S (w[13], w[14], offset); - w[36] = amd_bytealign_S (w[12], w[13], offset); - w[35] = amd_bytealign_S (w[11], w[12], offset); - w[34] = amd_bytealign_S (w[10], w[11], offset); - w[33] = amd_bytealign_S (w[ 9], w[10], offset); - w[32] = amd_bytealign_S (w[ 8], w[ 9], offset); - w[31] = amd_bytealign_S (w[ 7], w[ 8], offset); - w[30] = amd_bytealign_S (w[ 6], w[ 7], offset); - w[29] = amd_bytealign_S (w[ 5], w[ 6], offset); - w[28] = amd_bytealign_S (w[ 4], w[ 5], offset); - w[27] = amd_bytealign_S (w[ 3], w[ 4], offset); - w[26] = amd_bytealign_S (w[ 2], w[ 3], offset); - w[25] = amd_bytealign_S (w[ 1], w[ 2], offset); - w[24] = amd_bytealign_S (w[ 0], w[ 1], offset); - w[23] = amd_bytealign_S ( 0, w[ 0], offset); + w[63] = hc_bytealign_S (w[39], w[40], offset); + w[62] = hc_bytealign_S (w[38], w[39], offset); + w[61] = hc_bytealign_S (w[37], w[38], offset); + w[60] = hc_bytealign_S (w[36], w[37], offset); + w[59] = hc_bytealign_S (w[35], w[36], offset); + w[58] = hc_bytealign_S (w[34], w[35], offset); + w[57] = hc_bytealign_S (w[33], w[34], offset); + w[56] = hc_bytealign_S (w[32], w[33], offset); + w[55] = hc_bytealign_S (w[31], w[32], offset); + w[54] = hc_bytealign_S (w[30], w[31], offset); + w[53] = hc_bytealign_S (w[29], w[30], offset); + w[52] = hc_bytealign_S (w[28], w[29], offset); + w[51] = hc_bytealign_S (w[27], w[28], offset); + w[50] = hc_bytealign_S (w[26], w[27], offset); + w[49] = hc_bytealign_S (w[25], w[26], offset); + w[48] = hc_bytealign_S (w[24], w[25], offset); + w[47] = hc_bytealign_S (w[23], w[24], offset); + w[46] = hc_bytealign_S (w[22], w[23], offset); + w[45] = hc_bytealign_S (w[21], w[22], offset); + w[44] = hc_bytealign_S (w[20], w[21], offset); + w[43] = hc_bytealign_S (w[19], w[20], offset); + w[42] = hc_bytealign_S (w[18], w[19], offset); + w[41] = hc_bytealign_S (w[17], w[18], offset); + w[40] = hc_bytealign_S (w[16], w[17], offset); + w[39] = hc_bytealign_S (w[15], w[16], offset); + w[38] = hc_bytealign_S (w[14], w[15], offset); + w[37] = hc_bytealign_S (w[13], w[14], offset); + w[36] = hc_bytealign_S (w[12], w[13], offset); + w[35] = hc_bytealign_S (w[11], w[12], offset); + w[34] = hc_bytealign_S (w[10], w[11], offset); + w[33] = hc_bytealign_S (w[ 9], w[10], offset); + w[32] = hc_bytealign_S (w[ 8], w[ 9], offset); + w[31] = hc_bytealign_S (w[ 7], w[ 8], offset); + w[30] = hc_bytealign_S (w[ 6], w[ 7], offset); + w[29] = hc_bytealign_S (w[ 5], w[ 6], offset); + w[28] = hc_bytealign_S (w[ 4], w[ 5], offset); + w[27] = hc_bytealign_S (w[ 3], w[ 4], offset); + w[26] = hc_bytealign_S (w[ 2], w[ 3], offset); + w[25] = hc_bytealign_S (w[ 1], w[ 2], offset); + w[24] = hc_bytealign_S (w[ 0], w[ 1], offset); + w[23] = hc_bytealign_S ( 0, w[ 0], offset); w[22] = 0; w[21] = 0; w[20] = 0; @@ -43986,46 +43986,46 @@ DECLSPEC void switch_buffer_by_offset_1x64_le_S (u32 *w, const u32 offset) break; case 24: - w[63] = amd_bytealign_S (w[38], w[39], offset); - w[62] = amd_bytealign_S (w[37], w[38], offset); - w[61] = amd_bytealign_S (w[36], w[37], offset); - w[60] = amd_bytealign_S (w[35], w[36], offset); - w[59] = amd_bytealign_S (w[34], w[35], offset); - w[58] = amd_bytealign_S (w[33], w[34], offset); - w[57] = amd_bytealign_S (w[32], w[33], offset); - w[56] = amd_bytealign_S (w[31], w[32], offset); - w[55] = amd_bytealign_S (w[30], w[31], offset); - w[54] = amd_bytealign_S (w[29], w[30], offset); - w[53] = amd_bytealign_S (w[28], w[29], offset); - w[52] = amd_bytealign_S (w[27], w[28], offset); - w[51] = amd_bytealign_S (w[26], w[27], offset); - w[50] = amd_bytealign_S (w[25], w[26], offset); - w[49] = amd_bytealign_S (w[24], w[25], offset); - w[48] = amd_bytealign_S (w[23], w[24], offset); - w[47] = amd_bytealign_S (w[22], w[23], offset); - w[46] = amd_bytealign_S (w[21], w[22], offset); - w[45] = amd_bytealign_S (w[20], w[21], offset); - w[44] = amd_bytealign_S (w[19], w[20], offset); - w[43] = amd_bytealign_S (w[18], w[19], offset); - w[42] = amd_bytealign_S (w[17], w[18], offset); - w[41] = amd_bytealign_S (w[16], w[17], offset); - w[40] = amd_bytealign_S (w[15], w[16], offset); - w[39] = amd_bytealign_S (w[14], w[15], offset); - w[38] = amd_bytealign_S (w[13], w[14], offset); - w[37] = amd_bytealign_S (w[12], w[13], offset); - w[36] = amd_bytealign_S (w[11], w[12], offset); - w[35] = amd_bytealign_S (w[10], w[11], offset); - w[34] = amd_bytealign_S (w[ 9], w[10], offset); - w[33] = amd_bytealign_S (w[ 8], w[ 9], offset); - w[32] = amd_bytealign_S (w[ 7], w[ 8], offset); - w[31] = amd_bytealign_S (w[ 6], w[ 7], offset); - w[30] = amd_bytealign_S (w[ 5], w[ 6], offset); - w[29] = amd_bytealign_S (w[ 4], w[ 5], offset); - w[28] = amd_bytealign_S (w[ 3], w[ 4], offset); - w[27] = amd_bytealign_S (w[ 2], w[ 3], offset); - w[26] = amd_bytealign_S (w[ 1], w[ 2], offset); - w[25] = amd_bytealign_S (w[ 0], w[ 1], offset); - w[24] = amd_bytealign_S ( 0, w[ 0], offset); + w[63] = hc_bytealign_S (w[38], w[39], offset); + w[62] = hc_bytealign_S (w[37], w[38], offset); + w[61] = hc_bytealign_S (w[36], w[37], offset); + w[60] = hc_bytealign_S (w[35], w[36], offset); + w[59] = hc_bytealign_S (w[34], w[35], offset); + w[58] = hc_bytealign_S (w[33], w[34], offset); + w[57] = hc_bytealign_S (w[32], w[33], offset); + w[56] = hc_bytealign_S (w[31], w[32], offset); + w[55] = hc_bytealign_S (w[30], w[31], offset); + w[54] = hc_bytealign_S (w[29], w[30], offset); + w[53] = hc_bytealign_S (w[28], w[29], offset); + w[52] = hc_bytealign_S (w[27], w[28], offset); + w[51] = hc_bytealign_S (w[26], w[27], offset); + w[50] = hc_bytealign_S (w[25], w[26], offset); + w[49] = hc_bytealign_S (w[24], w[25], offset); + w[48] = hc_bytealign_S (w[23], w[24], offset); + w[47] = hc_bytealign_S (w[22], w[23], offset); + w[46] = hc_bytealign_S (w[21], w[22], offset); + w[45] = hc_bytealign_S (w[20], w[21], offset); + w[44] = hc_bytealign_S (w[19], w[20], offset); + w[43] = hc_bytealign_S (w[18], w[19], offset); + w[42] = hc_bytealign_S (w[17], w[18], offset); + w[41] = hc_bytealign_S (w[16], w[17], offset); + w[40] = hc_bytealign_S (w[15], w[16], offset); + w[39] = hc_bytealign_S (w[14], w[15], offset); + w[38] = hc_bytealign_S (w[13], w[14], offset); + w[37] = hc_bytealign_S (w[12], w[13], offset); + w[36] = hc_bytealign_S (w[11], w[12], offset); + w[35] = hc_bytealign_S (w[10], w[11], offset); + w[34] = hc_bytealign_S (w[ 9], w[10], offset); + w[33] = hc_bytealign_S (w[ 8], w[ 9], offset); + w[32] = hc_bytealign_S (w[ 7], w[ 8], offset); + w[31] = hc_bytealign_S (w[ 6], w[ 7], offset); + w[30] = hc_bytealign_S (w[ 5], w[ 6], offset); + w[29] = hc_bytealign_S (w[ 4], w[ 5], offset); + w[28] = hc_bytealign_S (w[ 3], w[ 4], offset); + w[27] = hc_bytealign_S (w[ 2], w[ 3], offset); + w[26] = hc_bytealign_S (w[ 1], w[ 2], offset); + w[25] = hc_bytealign_S (w[ 0], w[ 1], offset); + w[24] = hc_bytealign_S ( 0, w[ 0], offset); w[23] = 0; w[22] = 0; w[21] = 0; @@ -44054,45 +44054,45 @@ DECLSPEC void switch_buffer_by_offset_1x64_le_S (u32 *w, const u32 offset) break; case 25: - w[63] = amd_bytealign_S (w[37], w[38], offset); - w[62] = amd_bytealign_S (w[36], w[37], offset); - w[61] = amd_bytealign_S (w[35], w[36], offset); - w[60] = amd_bytealign_S (w[34], w[35], offset); - w[59] = amd_bytealign_S (w[33], w[34], offset); - w[58] = amd_bytealign_S (w[32], w[33], offset); - w[57] = amd_bytealign_S (w[31], w[32], offset); - w[56] = amd_bytealign_S (w[30], w[31], offset); - w[55] = amd_bytealign_S (w[29], w[30], offset); - w[54] = amd_bytealign_S (w[28], w[29], offset); - w[53] = amd_bytealign_S (w[27], w[28], offset); - w[52] = amd_bytealign_S (w[26], w[27], offset); - w[51] = amd_bytealign_S (w[25], w[26], offset); - w[50] = amd_bytealign_S (w[24], w[25], offset); - w[49] = amd_bytealign_S (w[23], w[24], offset); - w[48] = amd_bytealign_S (w[22], w[23], offset); - w[47] = amd_bytealign_S (w[21], w[22], offset); - w[46] = amd_bytealign_S (w[20], w[21], offset); - w[45] = amd_bytealign_S (w[19], w[20], offset); - w[44] = amd_bytealign_S (w[18], w[19], offset); - w[43] = amd_bytealign_S (w[17], w[18], offset); - w[42] = amd_bytealign_S (w[16], w[17], offset); - w[41] = amd_bytealign_S (w[15], w[16], offset); - w[40] = amd_bytealign_S (w[14], w[15], offset); - w[39] = amd_bytealign_S (w[13], w[14], offset); - w[38] = amd_bytealign_S (w[12], w[13], offset); - w[37] = amd_bytealign_S (w[11], w[12], offset); - w[36] = amd_bytealign_S (w[10], w[11], offset); - w[35] = amd_bytealign_S (w[ 9], w[10], offset); - w[34] = amd_bytealign_S (w[ 8], w[ 9], offset); - w[33] = amd_bytealign_S (w[ 7], w[ 8], offset); - w[32] = amd_bytealign_S (w[ 6], w[ 7], offset); - w[31] = amd_bytealign_S (w[ 5], w[ 6], offset); - w[30] = amd_bytealign_S (w[ 4], w[ 5], offset); - w[29] = amd_bytealign_S (w[ 3], w[ 4], offset); - w[28] = amd_bytealign_S (w[ 2], w[ 3], offset); - w[27] = amd_bytealign_S (w[ 1], w[ 2], offset); - w[26] = amd_bytealign_S (w[ 0], w[ 1], offset); - w[25] = amd_bytealign_S ( 0, w[ 0], offset); + w[63] = hc_bytealign_S (w[37], w[38], offset); + w[62] = hc_bytealign_S (w[36], w[37], offset); + w[61] = hc_bytealign_S (w[35], w[36], offset); + w[60] = hc_bytealign_S (w[34], w[35], offset); + w[59] = hc_bytealign_S (w[33], w[34], offset); + w[58] = hc_bytealign_S (w[32], w[33], offset); + w[57] = hc_bytealign_S (w[31], w[32], offset); + w[56] = hc_bytealign_S (w[30], w[31], offset); + w[55] = hc_bytealign_S (w[29], w[30], offset); + w[54] = hc_bytealign_S (w[28], w[29], offset); + w[53] = hc_bytealign_S (w[27], w[28], offset); + w[52] = hc_bytealign_S (w[26], w[27], offset); + w[51] = hc_bytealign_S (w[25], w[26], offset); + w[50] = hc_bytealign_S (w[24], w[25], offset); + w[49] = hc_bytealign_S (w[23], w[24], offset); + w[48] = hc_bytealign_S (w[22], w[23], offset); + w[47] = hc_bytealign_S (w[21], w[22], offset); + w[46] = hc_bytealign_S (w[20], w[21], offset); + w[45] = hc_bytealign_S (w[19], w[20], offset); + w[44] = hc_bytealign_S (w[18], w[19], offset); + w[43] = hc_bytealign_S (w[17], w[18], offset); + w[42] = hc_bytealign_S (w[16], w[17], offset); + w[41] = hc_bytealign_S (w[15], w[16], offset); + w[40] = hc_bytealign_S (w[14], w[15], offset); + w[39] = hc_bytealign_S (w[13], w[14], offset); + w[38] = hc_bytealign_S (w[12], w[13], offset); + w[37] = hc_bytealign_S (w[11], w[12], offset); + w[36] = hc_bytealign_S (w[10], w[11], offset); + w[35] = hc_bytealign_S (w[ 9], w[10], offset); + w[34] = hc_bytealign_S (w[ 8], w[ 9], offset); + w[33] = hc_bytealign_S (w[ 7], w[ 8], offset); + w[32] = hc_bytealign_S (w[ 6], w[ 7], offset); + w[31] = hc_bytealign_S (w[ 5], w[ 6], offset); + w[30] = hc_bytealign_S (w[ 4], w[ 5], offset); + w[29] = hc_bytealign_S (w[ 3], w[ 4], offset); + w[28] = hc_bytealign_S (w[ 2], w[ 3], offset); + w[27] = hc_bytealign_S (w[ 1], w[ 2], offset); + w[26] = hc_bytealign_S (w[ 0], w[ 1], offset); + w[25] = hc_bytealign_S ( 0, w[ 0], offset); w[24] = 0; w[23] = 0; w[22] = 0; @@ -44122,44 +44122,44 @@ DECLSPEC void switch_buffer_by_offset_1x64_le_S (u32 *w, const u32 offset) break; case 26: - w[63] = amd_bytealign_S (w[36], w[37], offset); - w[62] = amd_bytealign_S (w[35], w[36], offset); - w[61] = amd_bytealign_S (w[34], w[35], offset); - w[60] = amd_bytealign_S (w[33], w[34], offset); - w[59] = amd_bytealign_S (w[32], w[33], offset); - w[58] = amd_bytealign_S (w[31], w[32], offset); - w[57] = amd_bytealign_S (w[30], w[31], offset); - w[56] = amd_bytealign_S (w[29], w[30], offset); - w[55] = amd_bytealign_S (w[28], w[29], offset); - w[54] = amd_bytealign_S (w[27], w[28], offset); - w[53] = amd_bytealign_S (w[26], w[27], offset); - w[52] = amd_bytealign_S (w[25], w[26], offset); - w[51] = amd_bytealign_S (w[24], w[25], offset); - w[50] = amd_bytealign_S (w[23], w[24], offset); - w[49] = amd_bytealign_S (w[22], w[23], offset); - w[48] = amd_bytealign_S (w[21], w[22], offset); - w[47] = amd_bytealign_S (w[20], w[21], offset); - w[46] = amd_bytealign_S (w[19], w[20], offset); - w[45] = amd_bytealign_S (w[18], w[19], offset); - w[44] = amd_bytealign_S (w[17], w[18], offset); - w[43] = amd_bytealign_S (w[16], w[17], offset); - w[42] = amd_bytealign_S (w[15], w[16], offset); - w[41] = amd_bytealign_S (w[14], w[15], offset); - w[40] = amd_bytealign_S (w[13], w[14], offset); - w[39] = amd_bytealign_S (w[12], w[13], offset); - w[38] = amd_bytealign_S (w[11], w[12], offset); - w[37] = amd_bytealign_S (w[10], w[11], offset); - w[36] = amd_bytealign_S (w[ 9], w[10], offset); - w[35] = amd_bytealign_S (w[ 8], w[ 9], offset); - w[34] = amd_bytealign_S (w[ 7], w[ 8], offset); - w[33] = amd_bytealign_S (w[ 6], w[ 7], offset); - w[32] = amd_bytealign_S (w[ 5], w[ 6], offset); - w[31] = amd_bytealign_S (w[ 4], w[ 5], offset); - w[30] = amd_bytealign_S (w[ 3], w[ 4], offset); - w[29] = amd_bytealign_S (w[ 2], w[ 3], offset); - w[28] = amd_bytealign_S (w[ 1], w[ 2], offset); - w[27] = amd_bytealign_S (w[ 0], w[ 1], offset); - w[26] = amd_bytealign_S ( 0, w[ 0], offset); + w[63] = hc_bytealign_S (w[36], w[37], offset); + w[62] = hc_bytealign_S (w[35], w[36], offset); + w[61] = hc_bytealign_S (w[34], w[35], offset); + w[60] = hc_bytealign_S (w[33], w[34], offset); + w[59] = hc_bytealign_S (w[32], w[33], offset); + w[58] = hc_bytealign_S (w[31], w[32], offset); + w[57] = hc_bytealign_S (w[30], w[31], offset); + w[56] = hc_bytealign_S (w[29], w[30], offset); + w[55] = hc_bytealign_S (w[28], w[29], offset); + w[54] = hc_bytealign_S (w[27], w[28], offset); + w[53] = hc_bytealign_S (w[26], w[27], offset); + w[52] = hc_bytealign_S (w[25], w[26], offset); + w[51] = hc_bytealign_S (w[24], w[25], offset); + w[50] = hc_bytealign_S (w[23], w[24], offset); + w[49] = hc_bytealign_S (w[22], w[23], offset); + w[48] = hc_bytealign_S (w[21], w[22], offset); + w[47] = hc_bytealign_S (w[20], w[21], offset); + w[46] = hc_bytealign_S (w[19], w[20], offset); + w[45] = hc_bytealign_S (w[18], w[19], offset); + w[44] = hc_bytealign_S (w[17], w[18], offset); + w[43] = hc_bytealign_S (w[16], w[17], offset); + w[42] = hc_bytealign_S (w[15], w[16], offset); + w[41] = hc_bytealign_S (w[14], w[15], offset); + w[40] = hc_bytealign_S (w[13], w[14], offset); + w[39] = hc_bytealign_S (w[12], w[13], offset); + w[38] = hc_bytealign_S (w[11], w[12], offset); + w[37] = hc_bytealign_S (w[10], w[11], offset); + w[36] = hc_bytealign_S (w[ 9], w[10], offset); + w[35] = hc_bytealign_S (w[ 8], w[ 9], offset); + w[34] = hc_bytealign_S (w[ 7], w[ 8], offset); + w[33] = hc_bytealign_S (w[ 6], w[ 7], offset); + w[32] = hc_bytealign_S (w[ 5], w[ 6], offset); + w[31] = hc_bytealign_S (w[ 4], w[ 5], offset); + w[30] = hc_bytealign_S (w[ 3], w[ 4], offset); + w[29] = hc_bytealign_S (w[ 2], w[ 3], offset); + w[28] = hc_bytealign_S (w[ 1], w[ 2], offset); + w[27] = hc_bytealign_S (w[ 0], w[ 1], offset); + w[26] = hc_bytealign_S ( 0, w[ 0], offset); w[25] = 0; w[24] = 0; w[23] = 0; @@ -44190,43 +44190,43 @@ DECLSPEC void switch_buffer_by_offset_1x64_le_S (u32 *w, const u32 offset) break; case 27: - w[63] = amd_bytealign_S (w[35], w[36], offset); - w[62] = amd_bytealign_S (w[34], w[35], offset); - w[61] = amd_bytealign_S (w[33], w[34], offset); - w[60] = amd_bytealign_S (w[32], w[33], offset); - w[59] = amd_bytealign_S (w[31], w[32], offset); - w[58] = amd_bytealign_S (w[30], w[31], offset); - w[57] = amd_bytealign_S (w[29], w[30], offset); - w[56] = amd_bytealign_S (w[28], w[29], offset); - w[55] = amd_bytealign_S (w[27], w[28], offset); - w[54] = amd_bytealign_S (w[26], w[27], offset); - w[53] = amd_bytealign_S (w[25], w[26], offset); - w[52] = amd_bytealign_S (w[24], w[25], offset); - w[51] = amd_bytealign_S (w[23], w[24], offset); - w[50] = amd_bytealign_S (w[22], w[23], offset); - w[49] = amd_bytealign_S (w[21], w[22], offset); - w[48] = amd_bytealign_S (w[20], w[21], offset); - w[47] = amd_bytealign_S (w[19], w[20], offset); - w[46] = amd_bytealign_S (w[18], w[19], offset); - w[45] = amd_bytealign_S (w[17], w[18], offset); - w[44] = amd_bytealign_S (w[16], w[17], offset); - w[43] = amd_bytealign_S (w[15], w[16], offset); - w[42] = amd_bytealign_S (w[14], w[15], offset); - w[41] = amd_bytealign_S (w[13], w[14], offset); - w[40] = amd_bytealign_S (w[12], w[13], offset); - w[39] = amd_bytealign_S (w[11], w[12], offset); - w[38] = amd_bytealign_S (w[10], w[11], offset); - w[37] = amd_bytealign_S (w[ 9], w[10], offset); - w[36] = amd_bytealign_S (w[ 8], w[ 9], offset); - w[35] = amd_bytealign_S (w[ 7], w[ 8], offset); - w[34] = amd_bytealign_S (w[ 6], w[ 7], offset); - w[33] = amd_bytealign_S (w[ 5], w[ 6], offset); - w[32] = amd_bytealign_S (w[ 4], w[ 5], offset); - w[31] = amd_bytealign_S (w[ 3], w[ 4], offset); - w[30] = amd_bytealign_S (w[ 2], w[ 3], offset); - w[29] = amd_bytealign_S (w[ 1], w[ 2], offset); - w[28] = amd_bytealign_S (w[ 0], w[ 1], offset); - w[27] = amd_bytealign_S ( 0, w[ 0], offset); + w[63] = hc_bytealign_S (w[35], w[36], offset); + w[62] = hc_bytealign_S (w[34], w[35], offset); + w[61] = hc_bytealign_S (w[33], w[34], offset); + w[60] = hc_bytealign_S (w[32], w[33], offset); + w[59] = hc_bytealign_S (w[31], w[32], offset); + w[58] = hc_bytealign_S (w[30], w[31], offset); + w[57] = hc_bytealign_S (w[29], w[30], offset); + w[56] = hc_bytealign_S (w[28], w[29], offset); + w[55] = hc_bytealign_S (w[27], w[28], offset); + w[54] = hc_bytealign_S (w[26], w[27], offset); + w[53] = hc_bytealign_S (w[25], w[26], offset); + w[52] = hc_bytealign_S (w[24], w[25], offset); + w[51] = hc_bytealign_S (w[23], w[24], offset); + w[50] = hc_bytealign_S (w[22], w[23], offset); + w[49] = hc_bytealign_S (w[21], w[22], offset); + w[48] = hc_bytealign_S (w[20], w[21], offset); + w[47] = hc_bytealign_S (w[19], w[20], offset); + w[46] = hc_bytealign_S (w[18], w[19], offset); + w[45] = hc_bytealign_S (w[17], w[18], offset); + w[44] = hc_bytealign_S (w[16], w[17], offset); + w[43] = hc_bytealign_S (w[15], w[16], offset); + w[42] = hc_bytealign_S (w[14], w[15], offset); + w[41] = hc_bytealign_S (w[13], w[14], offset); + w[40] = hc_bytealign_S (w[12], w[13], offset); + w[39] = hc_bytealign_S (w[11], w[12], offset); + w[38] = hc_bytealign_S (w[10], w[11], offset); + w[37] = hc_bytealign_S (w[ 9], w[10], offset); + w[36] = hc_bytealign_S (w[ 8], w[ 9], offset); + w[35] = hc_bytealign_S (w[ 7], w[ 8], offset); + w[34] = hc_bytealign_S (w[ 6], w[ 7], offset); + w[33] = hc_bytealign_S (w[ 5], w[ 6], offset); + w[32] = hc_bytealign_S (w[ 4], w[ 5], offset); + w[31] = hc_bytealign_S (w[ 3], w[ 4], offset); + w[30] = hc_bytealign_S (w[ 2], w[ 3], offset); + w[29] = hc_bytealign_S (w[ 1], w[ 2], offset); + w[28] = hc_bytealign_S (w[ 0], w[ 1], offset); + w[27] = hc_bytealign_S ( 0, w[ 0], offset); w[26] = 0; w[25] = 0; w[24] = 0; @@ -44258,42 +44258,42 @@ DECLSPEC void switch_buffer_by_offset_1x64_le_S (u32 *w, const u32 offset) break; case 28: - w[63] = amd_bytealign_S (w[34], w[35], offset); - w[62] = amd_bytealign_S (w[33], w[34], offset); - w[61] = amd_bytealign_S (w[32], w[33], offset); - w[60] = amd_bytealign_S (w[31], w[32], offset); - w[59] = amd_bytealign_S (w[30], w[31], offset); - w[58] = amd_bytealign_S (w[29], w[30], offset); - w[57] = amd_bytealign_S (w[28], w[29], offset); - w[56] = amd_bytealign_S (w[27], w[28], offset); - w[55] = amd_bytealign_S (w[26], w[27], offset); - w[54] = amd_bytealign_S (w[25], w[26], offset); - w[53] = amd_bytealign_S (w[24], w[25], offset); - w[52] = amd_bytealign_S (w[23], w[24], offset); - w[51] = amd_bytealign_S (w[22], w[23], offset); - w[50] = amd_bytealign_S (w[21], w[22], offset); - w[49] = amd_bytealign_S (w[20], w[21], offset); - w[48] = amd_bytealign_S (w[19], w[20], offset); - w[47] = amd_bytealign_S (w[18], w[19], offset); - w[46] = amd_bytealign_S (w[17], w[18], offset); - w[45] = amd_bytealign_S (w[16], w[17], offset); - w[44] = amd_bytealign_S (w[15], w[16], offset); - w[43] = amd_bytealign_S (w[14], w[15], offset); - w[42] = amd_bytealign_S (w[13], w[14], offset); - w[41] = amd_bytealign_S (w[12], w[13], offset); - w[40] = amd_bytealign_S (w[11], w[12], offset); - w[39] = amd_bytealign_S (w[10], w[11], offset); - w[38] = amd_bytealign_S (w[ 9], w[10], offset); - w[37] = amd_bytealign_S (w[ 8], w[ 9], offset); - w[36] = amd_bytealign_S (w[ 7], w[ 8], offset); - w[35] = amd_bytealign_S (w[ 6], w[ 7], offset); - w[34] = amd_bytealign_S (w[ 5], w[ 6], offset); - w[33] = amd_bytealign_S (w[ 4], w[ 5], offset); - w[32] = amd_bytealign_S (w[ 3], w[ 4], offset); - w[31] = amd_bytealign_S (w[ 2], w[ 3], offset); - w[30] = amd_bytealign_S (w[ 1], w[ 2], offset); - w[29] = amd_bytealign_S (w[ 0], w[ 1], offset); - w[28] = amd_bytealign_S ( 0, w[ 0], offset); + w[63] = hc_bytealign_S (w[34], w[35], offset); + w[62] = hc_bytealign_S (w[33], w[34], offset); + w[61] = hc_bytealign_S (w[32], w[33], offset); + w[60] = hc_bytealign_S (w[31], w[32], offset); + w[59] = hc_bytealign_S (w[30], w[31], offset); + w[58] = hc_bytealign_S (w[29], w[30], offset); + w[57] = hc_bytealign_S (w[28], w[29], offset); + w[56] = hc_bytealign_S (w[27], w[28], offset); + w[55] = hc_bytealign_S (w[26], w[27], offset); + w[54] = hc_bytealign_S (w[25], w[26], offset); + w[53] = hc_bytealign_S (w[24], w[25], offset); + w[52] = hc_bytealign_S (w[23], w[24], offset); + w[51] = hc_bytealign_S (w[22], w[23], offset); + w[50] = hc_bytealign_S (w[21], w[22], offset); + w[49] = hc_bytealign_S (w[20], w[21], offset); + w[48] = hc_bytealign_S (w[19], w[20], offset); + w[47] = hc_bytealign_S (w[18], w[19], offset); + w[46] = hc_bytealign_S (w[17], w[18], offset); + w[45] = hc_bytealign_S (w[16], w[17], offset); + w[44] = hc_bytealign_S (w[15], w[16], offset); + w[43] = hc_bytealign_S (w[14], w[15], offset); + w[42] = hc_bytealign_S (w[13], w[14], offset); + w[41] = hc_bytealign_S (w[12], w[13], offset); + w[40] = hc_bytealign_S (w[11], w[12], offset); + w[39] = hc_bytealign_S (w[10], w[11], offset); + w[38] = hc_bytealign_S (w[ 9], w[10], offset); + w[37] = hc_bytealign_S (w[ 8], w[ 9], offset); + w[36] = hc_bytealign_S (w[ 7], w[ 8], offset); + w[35] = hc_bytealign_S (w[ 6], w[ 7], offset); + w[34] = hc_bytealign_S (w[ 5], w[ 6], offset); + w[33] = hc_bytealign_S (w[ 4], w[ 5], offset); + w[32] = hc_bytealign_S (w[ 3], w[ 4], offset); + w[31] = hc_bytealign_S (w[ 2], w[ 3], offset); + w[30] = hc_bytealign_S (w[ 1], w[ 2], offset); + w[29] = hc_bytealign_S (w[ 0], w[ 1], offset); + w[28] = hc_bytealign_S ( 0, w[ 0], offset); w[27] = 0; w[26] = 0; w[25] = 0; @@ -44326,41 +44326,41 @@ DECLSPEC void switch_buffer_by_offset_1x64_le_S (u32 *w, const u32 offset) break; case 29: - w[63] = amd_bytealign_S (w[33], w[34], offset); - w[62] = amd_bytealign_S (w[32], w[33], offset); - w[61] = amd_bytealign_S (w[31], w[32], offset); - w[60] = amd_bytealign_S (w[30], w[31], offset); - w[59] = amd_bytealign_S (w[29], w[30], offset); - w[58] = amd_bytealign_S (w[28], w[29], offset); - w[57] = amd_bytealign_S (w[27], w[28], offset); - w[56] = amd_bytealign_S (w[26], w[27], offset); - w[55] = amd_bytealign_S (w[25], w[26], offset); - w[54] = amd_bytealign_S (w[24], w[25], offset); - w[53] = amd_bytealign_S (w[23], w[24], offset); - w[52] = amd_bytealign_S (w[22], w[23], offset); - w[51] = amd_bytealign_S (w[21], w[22], offset); - w[50] = amd_bytealign_S (w[20], w[21], offset); - w[49] = amd_bytealign_S (w[19], w[20], offset); - w[48] = amd_bytealign_S (w[18], w[19], offset); - w[47] = amd_bytealign_S (w[17], w[18], offset); - w[46] = amd_bytealign_S (w[16], w[17], offset); - w[45] = amd_bytealign_S (w[15], w[16], offset); - w[44] = amd_bytealign_S (w[14], w[15], offset); - w[43] = amd_bytealign_S (w[13], w[14], offset); - w[42] = amd_bytealign_S (w[12], w[13], offset); - w[41] = amd_bytealign_S (w[11], w[12], offset); - w[40] = amd_bytealign_S (w[10], w[11], offset); - w[39] = amd_bytealign_S (w[ 9], w[10], offset); - w[38] = amd_bytealign_S (w[ 8], w[ 9], offset); - w[37] = amd_bytealign_S (w[ 7], w[ 8], offset); - w[36] = amd_bytealign_S (w[ 6], w[ 7], offset); - w[35] = amd_bytealign_S (w[ 5], w[ 6], offset); - w[34] = amd_bytealign_S (w[ 4], w[ 5], offset); - w[33] = amd_bytealign_S (w[ 3], w[ 4], offset); - w[32] = amd_bytealign_S (w[ 2], w[ 3], offset); - w[31] = amd_bytealign_S (w[ 1], w[ 2], offset); - w[30] = amd_bytealign_S (w[ 0], w[ 1], offset); - w[29] = amd_bytealign_S ( 0, w[ 0], offset); + w[63] = hc_bytealign_S (w[33], w[34], offset); + w[62] = hc_bytealign_S (w[32], w[33], offset); + w[61] = hc_bytealign_S (w[31], w[32], offset); + w[60] = hc_bytealign_S (w[30], w[31], offset); + w[59] = hc_bytealign_S (w[29], w[30], offset); + w[58] = hc_bytealign_S (w[28], w[29], offset); + w[57] = hc_bytealign_S (w[27], w[28], offset); + w[56] = hc_bytealign_S (w[26], w[27], offset); + w[55] = hc_bytealign_S (w[25], w[26], offset); + w[54] = hc_bytealign_S (w[24], w[25], offset); + w[53] = hc_bytealign_S (w[23], w[24], offset); + w[52] = hc_bytealign_S (w[22], w[23], offset); + w[51] = hc_bytealign_S (w[21], w[22], offset); + w[50] = hc_bytealign_S (w[20], w[21], offset); + w[49] = hc_bytealign_S (w[19], w[20], offset); + w[48] = hc_bytealign_S (w[18], w[19], offset); + w[47] = hc_bytealign_S (w[17], w[18], offset); + w[46] = hc_bytealign_S (w[16], w[17], offset); + w[45] = hc_bytealign_S (w[15], w[16], offset); + w[44] = hc_bytealign_S (w[14], w[15], offset); + w[43] = hc_bytealign_S (w[13], w[14], offset); + w[42] = hc_bytealign_S (w[12], w[13], offset); + w[41] = hc_bytealign_S (w[11], w[12], offset); + w[40] = hc_bytealign_S (w[10], w[11], offset); + w[39] = hc_bytealign_S (w[ 9], w[10], offset); + w[38] = hc_bytealign_S (w[ 8], w[ 9], offset); + w[37] = hc_bytealign_S (w[ 7], w[ 8], offset); + w[36] = hc_bytealign_S (w[ 6], w[ 7], offset); + w[35] = hc_bytealign_S (w[ 5], w[ 6], offset); + w[34] = hc_bytealign_S (w[ 4], w[ 5], offset); + w[33] = hc_bytealign_S (w[ 3], w[ 4], offset); + w[32] = hc_bytealign_S (w[ 2], w[ 3], offset); + w[31] = hc_bytealign_S (w[ 1], w[ 2], offset); + w[30] = hc_bytealign_S (w[ 0], w[ 1], offset); + w[29] = hc_bytealign_S ( 0, w[ 0], offset); w[28] = 0; w[27] = 0; w[26] = 0; @@ -44394,40 +44394,40 @@ DECLSPEC void switch_buffer_by_offset_1x64_le_S (u32 *w, const u32 offset) break; case 30: - w[63] = amd_bytealign_S (w[32], w[33], offset); - w[62] = amd_bytealign_S (w[31], w[32], offset); - w[61] = amd_bytealign_S (w[30], w[31], offset); - w[60] = amd_bytealign_S (w[29], w[30], offset); - w[59] = amd_bytealign_S (w[28], w[29], offset); - w[58] = amd_bytealign_S (w[27], w[28], offset); - w[57] = amd_bytealign_S (w[26], w[27], offset); - w[56] = amd_bytealign_S (w[25], w[26], offset); - w[55] = amd_bytealign_S (w[24], w[25], offset); - w[54] = amd_bytealign_S (w[23], w[24], offset); - w[53] = amd_bytealign_S (w[22], w[23], offset); - w[52] = amd_bytealign_S (w[21], w[22], offset); - w[51] = amd_bytealign_S (w[20], w[21], offset); - w[50] = amd_bytealign_S (w[19], w[20], offset); - w[49] = amd_bytealign_S (w[18], w[19], offset); - w[48] = amd_bytealign_S (w[17], w[18], offset); - w[47] = amd_bytealign_S (w[16], w[17], offset); - w[46] = amd_bytealign_S (w[15], w[16], offset); - w[45] = amd_bytealign_S (w[14], w[15], offset); - w[44] = amd_bytealign_S (w[13], w[14], offset); - w[43] = amd_bytealign_S (w[12], w[13], offset); - w[42] = amd_bytealign_S (w[11], w[12], offset); - w[41] = amd_bytealign_S (w[10], w[11], offset); - w[40] = amd_bytealign_S (w[ 9], w[10], offset); - w[39] = amd_bytealign_S (w[ 8], w[ 9], offset); - w[38] = amd_bytealign_S (w[ 7], w[ 8], offset); - w[37] = amd_bytealign_S (w[ 6], w[ 7], offset); - w[36] = amd_bytealign_S (w[ 5], w[ 6], offset); - w[35] = amd_bytealign_S (w[ 4], w[ 5], offset); - w[34] = amd_bytealign_S (w[ 3], w[ 4], offset); - w[33] = amd_bytealign_S (w[ 2], w[ 3], offset); - w[32] = amd_bytealign_S (w[ 1], w[ 2], offset); - w[31] = amd_bytealign_S (w[ 0], w[ 1], offset); - w[30] = amd_bytealign_S ( 0, w[ 0], offset); + w[63] = hc_bytealign_S (w[32], w[33], offset); + w[62] = hc_bytealign_S (w[31], w[32], offset); + w[61] = hc_bytealign_S (w[30], w[31], offset); + w[60] = hc_bytealign_S (w[29], w[30], offset); + w[59] = hc_bytealign_S (w[28], w[29], offset); + w[58] = hc_bytealign_S (w[27], w[28], offset); + w[57] = hc_bytealign_S (w[26], w[27], offset); + w[56] = hc_bytealign_S (w[25], w[26], offset); + w[55] = hc_bytealign_S (w[24], w[25], offset); + w[54] = hc_bytealign_S (w[23], w[24], offset); + w[53] = hc_bytealign_S (w[22], w[23], offset); + w[52] = hc_bytealign_S (w[21], w[22], offset); + w[51] = hc_bytealign_S (w[20], w[21], offset); + w[50] = hc_bytealign_S (w[19], w[20], offset); + w[49] = hc_bytealign_S (w[18], w[19], offset); + w[48] = hc_bytealign_S (w[17], w[18], offset); + w[47] = hc_bytealign_S (w[16], w[17], offset); + w[46] = hc_bytealign_S (w[15], w[16], offset); + w[45] = hc_bytealign_S (w[14], w[15], offset); + w[44] = hc_bytealign_S (w[13], w[14], offset); + w[43] = hc_bytealign_S (w[12], w[13], offset); + w[42] = hc_bytealign_S (w[11], w[12], offset); + w[41] = hc_bytealign_S (w[10], w[11], offset); + w[40] = hc_bytealign_S (w[ 9], w[10], offset); + w[39] = hc_bytealign_S (w[ 8], w[ 9], offset); + w[38] = hc_bytealign_S (w[ 7], w[ 8], offset); + w[37] = hc_bytealign_S (w[ 6], w[ 7], offset); + w[36] = hc_bytealign_S (w[ 5], w[ 6], offset); + w[35] = hc_bytealign_S (w[ 4], w[ 5], offset); + w[34] = hc_bytealign_S (w[ 3], w[ 4], offset); + w[33] = hc_bytealign_S (w[ 2], w[ 3], offset); + w[32] = hc_bytealign_S (w[ 1], w[ 2], offset); + w[31] = hc_bytealign_S (w[ 0], w[ 1], offset); + w[30] = hc_bytealign_S ( 0, w[ 0], offset); w[29] = 0; w[28] = 0; w[27] = 0; @@ -44462,39 +44462,39 @@ DECLSPEC void switch_buffer_by_offset_1x64_le_S (u32 *w, const u32 offset) break; case 31: - w[63] = amd_bytealign_S (w[31], w[32], offset); - w[62] = amd_bytealign_S (w[30], w[31], offset); - w[61] = amd_bytealign_S (w[29], w[30], offset); - w[60] = amd_bytealign_S (w[28], w[29], offset); - w[59] = amd_bytealign_S (w[27], w[28], offset); - w[58] = amd_bytealign_S (w[26], w[27], offset); - w[57] = amd_bytealign_S (w[25], w[26], offset); - w[56] = amd_bytealign_S (w[24], w[25], offset); - w[55] = amd_bytealign_S (w[23], w[24], offset); - w[54] = amd_bytealign_S (w[22], w[23], offset); - w[53] = amd_bytealign_S (w[21], w[22], offset); - w[52] = amd_bytealign_S (w[20], w[21], offset); - w[51] = amd_bytealign_S (w[19], w[20], offset); - w[50] = amd_bytealign_S (w[18], w[19], offset); - w[49] = amd_bytealign_S (w[17], w[18], offset); - w[48] = amd_bytealign_S (w[16], w[17], offset); - w[47] = amd_bytealign_S (w[15], w[16], offset); - w[46] = amd_bytealign_S (w[14], w[15], offset); - w[45] = amd_bytealign_S (w[13], w[14], offset); - w[44] = amd_bytealign_S (w[12], w[13], offset); - w[43] = amd_bytealign_S (w[11], w[12], offset); - w[42] = amd_bytealign_S (w[10], w[11], offset); - w[41] = amd_bytealign_S (w[ 9], w[10], offset); - w[40] = amd_bytealign_S (w[ 8], w[ 9], offset); - w[39] = amd_bytealign_S (w[ 7], w[ 8], offset); - w[38] = amd_bytealign_S (w[ 6], w[ 7], offset); - w[37] = amd_bytealign_S (w[ 5], w[ 6], offset); - w[36] = amd_bytealign_S (w[ 4], w[ 5], offset); - w[35] = amd_bytealign_S (w[ 3], w[ 4], offset); - w[34] = amd_bytealign_S (w[ 2], w[ 3], offset); - w[33] = amd_bytealign_S (w[ 1], w[ 2], offset); - w[32] = amd_bytealign_S (w[ 0], w[ 1], offset); - w[31] = amd_bytealign_S ( 0, w[ 0], offset); + w[63] = hc_bytealign_S (w[31], w[32], offset); + w[62] = hc_bytealign_S (w[30], w[31], offset); + w[61] = hc_bytealign_S (w[29], w[30], offset); + w[60] = hc_bytealign_S (w[28], w[29], offset); + w[59] = hc_bytealign_S (w[27], w[28], offset); + w[58] = hc_bytealign_S (w[26], w[27], offset); + w[57] = hc_bytealign_S (w[25], w[26], offset); + w[56] = hc_bytealign_S (w[24], w[25], offset); + w[55] = hc_bytealign_S (w[23], w[24], offset); + w[54] = hc_bytealign_S (w[22], w[23], offset); + w[53] = hc_bytealign_S (w[21], w[22], offset); + w[52] = hc_bytealign_S (w[20], w[21], offset); + w[51] = hc_bytealign_S (w[19], w[20], offset); + w[50] = hc_bytealign_S (w[18], w[19], offset); + w[49] = hc_bytealign_S (w[17], w[18], offset); + w[48] = hc_bytealign_S (w[16], w[17], offset); + w[47] = hc_bytealign_S (w[15], w[16], offset); + w[46] = hc_bytealign_S (w[14], w[15], offset); + w[45] = hc_bytealign_S (w[13], w[14], offset); + w[44] = hc_bytealign_S (w[12], w[13], offset); + w[43] = hc_bytealign_S (w[11], w[12], offset); + w[42] = hc_bytealign_S (w[10], w[11], offset); + w[41] = hc_bytealign_S (w[ 9], w[10], offset); + w[40] = hc_bytealign_S (w[ 8], w[ 9], offset); + w[39] = hc_bytealign_S (w[ 7], w[ 8], offset); + w[38] = hc_bytealign_S (w[ 6], w[ 7], offset); + w[37] = hc_bytealign_S (w[ 5], w[ 6], offset); + w[36] = hc_bytealign_S (w[ 4], w[ 5], offset); + w[35] = hc_bytealign_S (w[ 3], w[ 4], offset); + w[34] = hc_bytealign_S (w[ 2], w[ 3], offset); + w[33] = hc_bytealign_S (w[ 1], w[ 2], offset); + w[32] = hc_bytealign_S (w[ 0], w[ 1], offset); + w[31] = hc_bytealign_S ( 0, w[ 0], offset); w[30] = 0; w[29] = 0; w[28] = 0; @@ -44530,38 +44530,38 @@ DECLSPEC void switch_buffer_by_offset_1x64_le_S (u32 *w, const u32 offset) break; case 32: - w[63] = amd_bytealign_S (w[30], w[31], offset); - w[62] = amd_bytealign_S (w[29], w[30], offset); - w[61] = amd_bytealign_S (w[28], w[29], offset); - w[60] = amd_bytealign_S (w[27], w[28], offset); - w[59] = amd_bytealign_S (w[26], w[27], offset); - w[58] = amd_bytealign_S (w[25], w[26], offset); - w[57] = amd_bytealign_S (w[24], w[25], offset); - w[56] = amd_bytealign_S (w[23], w[24], offset); - w[55] = amd_bytealign_S (w[22], w[23], offset); - w[54] = amd_bytealign_S (w[21], w[22], offset); - w[53] = amd_bytealign_S (w[20], w[21], offset); - w[52] = amd_bytealign_S (w[19], w[20], offset); - w[51] = amd_bytealign_S (w[18], w[19], offset); - w[50] = amd_bytealign_S (w[17], w[18], offset); - w[49] = amd_bytealign_S (w[16], w[17], offset); - w[48] = amd_bytealign_S (w[15], w[16], offset); - w[47] = amd_bytealign_S (w[14], w[15], offset); - w[46] = amd_bytealign_S (w[13], w[14], offset); - w[45] = amd_bytealign_S (w[12], w[13], offset); - w[44] = amd_bytealign_S (w[11], w[12], offset); - w[43] = amd_bytealign_S (w[10], w[11], offset); - w[42] = amd_bytealign_S (w[ 9], w[10], offset); - w[41] = amd_bytealign_S (w[ 8], w[ 9], offset); - w[40] = amd_bytealign_S (w[ 7], w[ 8], offset); - w[39] = amd_bytealign_S (w[ 6], w[ 7], offset); - w[38] = amd_bytealign_S (w[ 5], w[ 6], offset); - w[37] = amd_bytealign_S (w[ 4], w[ 5], offset); - w[36] = amd_bytealign_S (w[ 3], w[ 4], offset); - w[35] = amd_bytealign_S (w[ 2], w[ 3], offset); - w[34] = amd_bytealign_S (w[ 1], w[ 2], offset); - w[33] = amd_bytealign_S (w[ 0], w[ 1], offset); - w[32] = amd_bytealign_S ( 0, w[ 0], offset); + w[63] = hc_bytealign_S (w[30], w[31], offset); + w[62] = hc_bytealign_S (w[29], w[30], offset); + w[61] = hc_bytealign_S (w[28], w[29], offset); + w[60] = hc_bytealign_S (w[27], w[28], offset); + w[59] = hc_bytealign_S (w[26], w[27], offset); + w[58] = hc_bytealign_S (w[25], w[26], offset); + w[57] = hc_bytealign_S (w[24], w[25], offset); + w[56] = hc_bytealign_S (w[23], w[24], offset); + w[55] = hc_bytealign_S (w[22], w[23], offset); + w[54] = hc_bytealign_S (w[21], w[22], offset); + w[53] = hc_bytealign_S (w[20], w[21], offset); + w[52] = hc_bytealign_S (w[19], w[20], offset); + w[51] = hc_bytealign_S (w[18], w[19], offset); + w[50] = hc_bytealign_S (w[17], w[18], offset); + w[49] = hc_bytealign_S (w[16], w[17], offset); + w[48] = hc_bytealign_S (w[15], w[16], offset); + w[47] = hc_bytealign_S (w[14], w[15], offset); + w[46] = hc_bytealign_S (w[13], w[14], offset); + w[45] = hc_bytealign_S (w[12], w[13], offset); + w[44] = hc_bytealign_S (w[11], w[12], offset); + w[43] = hc_bytealign_S (w[10], w[11], offset); + w[42] = hc_bytealign_S (w[ 9], w[10], offset); + w[41] = hc_bytealign_S (w[ 8], w[ 9], offset); + w[40] = hc_bytealign_S (w[ 7], w[ 8], offset); + w[39] = hc_bytealign_S (w[ 6], w[ 7], offset); + w[38] = hc_bytealign_S (w[ 5], w[ 6], offset); + w[37] = hc_bytealign_S (w[ 4], w[ 5], offset); + w[36] = hc_bytealign_S (w[ 3], w[ 4], offset); + w[35] = hc_bytealign_S (w[ 2], w[ 3], offset); + w[34] = hc_bytealign_S (w[ 1], w[ 2], offset); + w[33] = hc_bytealign_S (w[ 0], w[ 1], offset); + w[32] = hc_bytealign_S ( 0, w[ 0], offset); w[31] = 0; w[30] = 0; w[29] = 0; @@ -44598,37 +44598,37 @@ DECLSPEC void switch_buffer_by_offset_1x64_le_S (u32 *w, const u32 offset) break; case 33: - w[63] = amd_bytealign_S (w[29], w[30], offset); - w[62] = amd_bytealign_S (w[28], w[29], offset); - w[61] = amd_bytealign_S (w[27], w[28], offset); - w[60] = amd_bytealign_S (w[26], w[27], offset); - w[59] = amd_bytealign_S (w[25], w[26], offset); - w[58] = amd_bytealign_S (w[24], w[25], offset); - w[57] = amd_bytealign_S (w[23], w[24], offset); - w[56] = amd_bytealign_S (w[22], w[23], offset); - w[55] = amd_bytealign_S (w[21], w[22], offset); - w[54] = amd_bytealign_S (w[20], w[21], offset); - w[53] = amd_bytealign_S (w[19], w[20], offset); - w[52] = amd_bytealign_S (w[18], w[19], offset); - w[51] = amd_bytealign_S (w[17], w[18], offset); - w[50] = amd_bytealign_S (w[16], w[17], offset); - w[49] = amd_bytealign_S (w[15], w[16], offset); - w[48] = amd_bytealign_S (w[14], w[15], offset); - w[47] = amd_bytealign_S (w[13], w[14], offset); - w[46] = amd_bytealign_S (w[12], w[13], offset); - w[45] = amd_bytealign_S (w[11], w[12], offset); - w[44] = amd_bytealign_S (w[10], w[11], offset); - w[43] = amd_bytealign_S (w[ 9], w[10], offset); - w[42] = amd_bytealign_S (w[ 8], w[ 9], offset); - w[41] = amd_bytealign_S (w[ 7], w[ 8], offset); - w[40] = amd_bytealign_S (w[ 6], w[ 7], offset); - w[39] = amd_bytealign_S (w[ 5], w[ 6], offset); - w[38] = amd_bytealign_S (w[ 4], w[ 5], offset); - w[37] = amd_bytealign_S (w[ 3], w[ 4], offset); - w[36] = amd_bytealign_S (w[ 2], w[ 3], offset); - w[35] = amd_bytealign_S (w[ 1], w[ 2], offset); - w[34] = amd_bytealign_S (w[ 0], w[ 1], offset); - w[33] = amd_bytealign_S ( 0, w[ 0], offset); + w[63] = hc_bytealign_S (w[29], w[30], offset); + w[62] = hc_bytealign_S (w[28], w[29], offset); + w[61] = hc_bytealign_S (w[27], w[28], offset); + w[60] = hc_bytealign_S (w[26], w[27], offset); + w[59] = hc_bytealign_S (w[25], w[26], offset); + w[58] = hc_bytealign_S (w[24], w[25], offset); + w[57] = hc_bytealign_S (w[23], w[24], offset); + w[56] = hc_bytealign_S (w[22], w[23], offset); + w[55] = hc_bytealign_S (w[21], w[22], offset); + w[54] = hc_bytealign_S (w[20], w[21], offset); + w[53] = hc_bytealign_S (w[19], w[20], offset); + w[52] = hc_bytealign_S (w[18], w[19], offset); + w[51] = hc_bytealign_S (w[17], w[18], offset); + w[50] = hc_bytealign_S (w[16], w[17], offset); + w[49] = hc_bytealign_S (w[15], w[16], offset); + w[48] = hc_bytealign_S (w[14], w[15], offset); + w[47] = hc_bytealign_S (w[13], w[14], offset); + w[46] = hc_bytealign_S (w[12], w[13], offset); + w[45] = hc_bytealign_S (w[11], w[12], offset); + w[44] = hc_bytealign_S (w[10], w[11], offset); + w[43] = hc_bytealign_S (w[ 9], w[10], offset); + w[42] = hc_bytealign_S (w[ 8], w[ 9], offset); + w[41] = hc_bytealign_S (w[ 7], w[ 8], offset); + w[40] = hc_bytealign_S (w[ 6], w[ 7], offset); + w[39] = hc_bytealign_S (w[ 5], w[ 6], offset); + w[38] = hc_bytealign_S (w[ 4], w[ 5], offset); + w[37] = hc_bytealign_S (w[ 3], w[ 4], offset); + w[36] = hc_bytealign_S (w[ 2], w[ 3], offset); + w[35] = hc_bytealign_S (w[ 1], w[ 2], offset); + w[34] = hc_bytealign_S (w[ 0], w[ 1], offset); + w[33] = hc_bytealign_S ( 0, w[ 0], offset); w[32] = 0; w[31] = 0; w[30] = 0; @@ -44666,36 +44666,36 @@ DECLSPEC void switch_buffer_by_offset_1x64_le_S (u32 *w, const u32 offset) break; case 34: - w[63] = amd_bytealign_S (w[28], w[29], offset); - w[62] = amd_bytealign_S (w[27], w[28], offset); - w[61] = amd_bytealign_S (w[26], w[27], offset); - w[60] = amd_bytealign_S (w[25], w[26], offset); - w[59] = amd_bytealign_S (w[24], w[25], offset); - w[58] = amd_bytealign_S (w[23], w[24], offset); - w[57] = amd_bytealign_S (w[22], w[23], offset); - w[56] = amd_bytealign_S (w[21], w[22], offset); - w[55] = amd_bytealign_S (w[20], w[21], offset); - w[54] = amd_bytealign_S (w[19], w[20], offset); - w[53] = amd_bytealign_S (w[18], w[19], offset); - w[52] = amd_bytealign_S (w[17], w[18], offset); - w[51] = amd_bytealign_S (w[16], w[17], offset); - w[50] = amd_bytealign_S (w[15], w[16], offset); - w[49] = amd_bytealign_S (w[14], w[15], offset); - w[48] = amd_bytealign_S (w[13], w[14], offset); - w[47] = amd_bytealign_S (w[12], w[13], offset); - w[46] = amd_bytealign_S (w[11], w[12], offset); - w[45] = amd_bytealign_S (w[10], w[11], offset); - w[44] = amd_bytealign_S (w[ 9], w[10], offset); - w[43] = amd_bytealign_S (w[ 8], w[ 9], offset); - w[42] = amd_bytealign_S (w[ 7], w[ 8], offset); - w[41] = amd_bytealign_S (w[ 6], w[ 7], offset); - w[40] = amd_bytealign_S (w[ 5], w[ 6], offset); - w[39] = amd_bytealign_S (w[ 4], w[ 5], offset); - w[38] = amd_bytealign_S (w[ 3], w[ 4], offset); - w[37] = amd_bytealign_S (w[ 2], w[ 3], offset); - w[36] = amd_bytealign_S (w[ 1], w[ 2], offset); - w[35] = amd_bytealign_S (w[ 0], w[ 1], offset); - w[34] = amd_bytealign_S ( 0, w[ 0], offset); + w[63] = hc_bytealign_S (w[28], w[29], offset); + w[62] = hc_bytealign_S (w[27], w[28], offset); + w[61] = hc_bytealign_S (w[26], w[27], offset); + w[60] = hc_bytealign_S (w[25], w[26], offset); + w[59] = hc_bytealign_S (w[24], w[25], offset); + w[58] = hc_bytealign_S (w[23], w[24], offset); + w[57] = hc_bytealign_S (w[22], w[23], offset); + w[56] = hc_bytealign_S (w[21], w[22], offset); + w[55] = hc_bytealign_S (w[20], w[21], offset); + w[54] = hc_bytealign_S (w[19], w[20], offset); + w[53] = hc_bytealign_S (w[18], w[19], offset); + w[52] = hc_bytealign_S (w[17], w[18], offset); + w[51] = hc_bytealign_S (w[16], w[17], offset); + w[50] = hc_bytealign_S (w[15], w[16], offset); + w[49] = hc_bytealign_S (w[14], w[15], offset); + w[48] = hc_bytealign_S (w[13], w[14], offset); + w[47] = hc_bytealign_S (w[12], w[13], offset); + w[46] = hc_bytealign_S (w[11], w[12], offset); + w[45] = hc_bytealign_S (w[10], w[11], offset); + w[44] = hc_bytealign_S (w[ 9], w[10], offset); + w[43] = hc_bytealign_S (w[ 8], w[ 9], offset); + w[42] = hc_bytealign_S (w[ 7], w[ 8], offset); + w[41] = hc_bytealign_S (w[ 6], w[ 7], offset); + w[40] = hc_bytealign_S (w[ 5], w[ 6], offset); + w[39] = hc_bytealign_S (w[ 4], w[ 5], offset); + w[38] = hc_bytealign_S (w[ 3], w[ 4], offset); + w[37] = hc_bytealign_S (w[ 2], w[ 3], offset); + w[36] = hc_bytealign_S (w[ 1], w[ 2], offset); + w[35] = hc_bytealign_S (w[ 0], w[ 1], offset); + w[34] = hc_bytealign_S ( 0, w[ 0], offset); w[33] = 0; w[32] = 0; w[31] = 0; @@ -44734,35 +44734,35 @@ DECLSPEC void switch_buffer_by_offset_1x64_le_S (u32 *w, const u32 offset) break; case 35: - w[63] = amd_bytealign_S (w[27], w[28], offset); - w[62] = amd_bytealign_S (w[26], w[27], offset); - w[61] = amd_bytealign_S (w[25], w[26], offset); - w[60] = amd_bytealign_S (w[24], w[25], offset); - w[59] = amd_bytealign_S (w[23], w[24], offset); - w[58] = amd_bytealign_S (w[22], w[23], offset); - w[57] = amd_bytealign_S (w[21], w[22], offset); - w[56] = amd_bytealign_S (w[20], w[21], offset); - w[55] = amd_bytealign_S (w[19], w[20], offset); - w[54] = amd_bytealign_S (w[18], w[19], offset); - w[53] = amd_bytealign_S (w[17], w[18], offset); - w[52] = amd_bytealign_S (w[16], w[17], offset); - w[51] = amd_bytealign_S (w[15], w[16], offset); - w[50] = amd_bytealign_S (w[14], w[15], offset); - w[49] = amd_bytealign_S (w[13], w[14], offset); - w[48] = amd_bytealign_S (w[12], w[13], offset); - w[47] = amd_bytealign_S (w[11], w[12], offset); - w[46] = amd_bytealign_S (w[10], w[11], offset); - w[45] = amd_bytealign_S (w[ 9], w[10], offset); - w[44] = amd_bytealign_S (w[ 8], w[ 9], offset); - w[43] = amd_bytealign_S (w[ 7], w[ 8], offset); - w[42] = amd_bytealign_S (w[ 6], w[ 7], offset); - w[41] = amd_bytealign_S (w[ 5], w[ 6], offset); - w[40] = amd_bytealign_S (w[ 4], w[ 5], offset); - w[39] = amd_bytealign_S (w[ 3], w[ 4], offset); - w[38] = amd_bytealign_S (w[ 2], w[ 3], offset); - w[37] = amd_bytealign_S (w[ 1], w[ 2], offset); - w[36] = amd_bytealign_S (w[ 0], w[ 1], offset); - w[35] = amd_bytealign_S ( 0, w[ 0], offset); + w[63] = hc_bytealign_S (w[27], w[28], offset); + w[62] = hc_bytealign_S (w[26], w[27], offset); + w[61] = hc_bytealign_S (w[25], w[26], offset); + w[60] = hc_bytealign_S (w[24], w[25], offset); + w[59] = hc_bytealign_S (w[23], w[24], offset); + w[58] = hc_bytealign_S (w[22], w[23], offset); + w[57] = hc_bytealign_S (w[21], w[22], offset); + w[56] = hc_bytealign_S (w[20], w[21], offset); + w[55] = hc_bytealign_S (w[19], w[20], offset); + w[54] = hc_bytealign_S (w[18], w[19], offset); + w[53] = hc_bytealign_S (w[17], w[18], offset); + w[52] = hc_bytealign_S (w[16], w[17], offset); + w[51] = hc_bytealign_S (w[15], w[16], offset); + w[50] = hc_bytealign_S (w[14], w[15], offset); + w[49] = hc_bytealign_S (w[13], w[14], offset); + w[48] = hc_bytealign_S (w[12], w[13], offset); + w[47] = hc_bytealign_S (w[11], w[12], offset); + w[46] = hc_bytealign_S (w[10], w[11], offset); + w[45] = hc_bytealign_S (w[ 9], w[10], offset); + w[44] = hc_bytealign_S (w[ 8], w[ 9], offset); + w[43] = hc_bytealign_S (w[ 7], w[ 8], offset); + w[42] = hc_bytealign_S (w[ 6], w[ 7], offset); + w[41] = hc_bytealign_S (w[ 5], w[ 6], offset); + w[40] = hc_bytealign_S (w[ 4], w[ 5], offset); + w[39] = hc_bytealign_S (w[ 3], w[ 4], offset); + w[38] = hc_bytealign_S (w[ 2], w[ 3], offset); + w[37] = hc_bytealign_S (w[ 1], w[ 2], offset); + w[36] = hc_bytealign_S (w[ 0], w[ 1], offset); + w[35] = hc_bytealign_S ( 0, w[ 0], offset); w[34] = 0; w[33] = 0; w[32] = 0; @@ -44802,34 +44802,34 @@ DECLSPEC void switch_buffer_by_offset_1x64_le_S (u32 *w, const u32 offset) break; case 36: - w[63] = amd_bytealign_S (w[26], w[27], offset); - w[62] = amd_bytealign_S (w[25], w[26], offset); - w[61] = amd_bytealign_S (w[24], w[25], offset); - w[60] = amd_bytealign_S (w[23], w[24], offset); - w[59] = amd_bytealign_S (w[22], w[23], offset); - w[58] = amd_bytealign_S (w[21], w[22], offset); - w[57] = amd_bytealign_S (w[20], w[21], offset); - w[56] = amd_bytealign_S (w[19], w[20], offset); - w[55] = amd_bytealign_S (w[18], w[19], offset); - w[54] = amd_bytealign_S (w[17], w[18], offset); - w[53] = amd_bytealign_S (w[16], w[17], offset); - w[52] = amd_bytealign_S (w[15], w[16], offset); - w[51] = amd_bytealign_S (w[14], w[15], offset); - w[50] = amd_bytealign_S (w[13], w[14], offset); - w[49] = amd_bytealign_S (w[12], w[13], offset); - w[48] = amd_bytealign_S (w[11], w[12], offset); - w[47] = amd_bytealign_S (w[10], w[11], offset); - w[46] = amd_bytealign_S (w[ 9], w[10], offset); - w[45] = amd_bytealign_S (w[ 8], w[ 9], offset); - w[44] = amd_bytealign_S (w[ 7], w[ 8], offset); - w[43] = amd_bytealign_S (w[ 6], w[ 7], offset); - w[42] = amd_bytealign_S (w[ 5], w[ 6], offset); - w[41] = amd_bytealign_S (w[ 4], w[ 5], offset); - w[40] = amd_bytealign_S (w[ 3], w[ 4], offset); - w[39] = amd_bytealign_S (w[ 2], w[ 3], offset); - w[38] = amd_bytealign_S (w[ 1], w[ 2], offset); - w[37] = amd_bytealign_S (w[ 0], w[ 1], offset); - w[36] = amd_bytealign_S ( 0, w[ 0], offset); + w[63] = hc_bytealign_S (w[26], w[27], offset); + w[62] = hc_bytealign_S (w[25], w[26], offset); + w[61] = hc_bytealign_S (w[24], w[25], offset); + w[60] = hc_bytealign_S (w[23], w[24], offset); + w[59] = hc_bytealign_S (w[22], w[23], offset); + w[58] = hc_bytealign_S (w[21], w[22], offset); + w[57] = hc_bytealign_S (w[20], w[21], offset); + w[56] = hc_bytealign_S (w[19], w[20], offset); + w[55] = hc_bytealign_S (w[18], w[19], offset); + w[54] = hc_bytealign_S (w[17], w[18], offset); + w[53] = hc_bytealign_S (w[16], w[17], offset); + w[52] = hc_bytealign_S (w[15], w[16], offset); + w[51] = hc_bytealign_S (w[14], w[15], offset); + w[50] = hc_bytealign_S (w[13], w[14], offset); + w[49] = hc_bytealign_S (w[12], w[13], offset); + w[48] = hc_bytealign_S (w[11], w[12], offset); + w[47] = hc_bytealign_S (w[10], w[11], offset); + w[46] = hc_bytealign_S (w[ 9], w[10], offset); + w[45] = hc_bytealign_S (w[ 8], w[ 9], offset); + w[44] = hc_bytealign_S (w[ 7], w[ 8], offset); + w[43] = hc_bytealign_S (w[ 6], w[ 7], offset); + w[42] = hc_bytealign_S (w[ 5], w[ 6], offset); + w[41] = hc_bytealign_S (w[ 4], w[ 5], offset); + w[40] = hc_bytealign_S (w[ 3], w[ 4], offset); + w[39] = hc_bytealign_S (w[ 2], w[ 3], offset); + w[38] = hc_bytealign_S (w[ 1], w[ 2], offset); + w[37] = hc_bytealign_S (w[ 0], w[ 1], offset); + w[36] = hc_bytealign_S ( 0, w[ 0], offset); w[35] = 0; w[34] = 0; w[33] = 0; @@ -44870,33 +44870,33 @@ DECLSPEC void switch_buffer_by_offset_1x64_le_S (u32 *w, const u32 offset) break; case 37: - w[63] = amd_bytealign_S (w[25], w[26], offset); - w[62] = amd_bytealign_S (w[24], w[25], offset); - w[61] = amd_bytealign_S (w[23], w[24], offset); - w[60] = amd_bytealign_S (w[22], w[23], offset); - w[59] = amd_bytealign_S (w[21], w[22], offset); - w[58] = amd_bytealign_S (w[20], w[21], offset); - w[57] = amd_bytealign_S (w[19], w[20], offset); - w[56] = amd_bytealign_S (w[18], w[19], offset); - w[55] = amd_bytealign_S (w[17], w[18], offset); - w[54] = amd_bytealign_S (w[16], w[17], offset); - w[53] = amd_bytealign_S (w[15], w[16], offset); - w[52] = amd_bytealign_S (w[14], w[15], offset); - w[51] = amd_bytealign_S (w[13], w[14], offset); - w[50] = amd_bytealign_S (w[12], w[13], offset); - w[49] = amd_bytealign_S (w[11], w[12], offset); - w[48] = amd_bytealign_S (w[10], w[11], offset); - w[47] = amd_bytealign_S (w[ 9], w[10], offset); - w[46] = amd_bytealign_S (w[ 8], w[ 9], offset); - w[45] = amd_bytealign_S (w[ 7], w[ 8], offset); - w[44] = amd_bytealign_S (w[ 6], w[ 7], offset); - w[43] = amd_bytealign_S (w[ 5], w[ 6], offset); - w[42] = amd_bytealign_S (w[ 4], w[ 5], offset); - w[41] = amd_bytealign_S (w[ 3], w[ 4], offset); - w[40] = amd_bytealign_S (w[ 2], w[ 3], offset); - w[39] = amd_bytealign_S (w[ 1], w[ 2], offset); - w[38] = amd_bytealign_S (w[ 0], w[ 1], offset); - w[37] = amd_bytealign_S ( 0, w[ 0], offset); + w[63] = hc_bytealign_S (w[25], w[26], offset); + w[62] = hc_bytealign_S (w[24], w[25], offset); + w[61] = hc_bytealign_S (w[23], w[24], offset); + w[60] = hc_bytealign_S (w[22], w[23], offset); + w[59] = hc_bytealign_S (w[21], w[22], offset); + w[58] = hc_bytealign_S (w[20], w[21], offset); + w[57] = hc_bytealign_S (w[19], w[20], offset); + w[56] = hc_bytealign_S (w[18], w[19], offset); + w[55] = hc_bytealign_S (w[17], w[18], offset); + w[54] = hc_bytealign_S (w[16], w[17], offset); + w[53] = hc_bytealign_S (w[15], w[16], offset); + w[52] = hc_bytealign_S (w[14], w[15], offset); + w[51] = hc_bytealign_S (w[13], w[14], offset); + w[50] = hc_bytealign_S (w[12], w[13], offset); + w[49] = hc_bytealign_S (w[11], w[12], offset); + w[48] = hc_bytealign_S (w[10], w[11], offset); + w[47] = hc_bytealign_S (w[ 9], w[10], offset); + w[46] = hc_bytealign_S (w[ 8], w[ 9], offset); + w[45] = hc_bytealign_S (w[ 7], w[ 8], offset); + w[44] = hc_bytealign_S (w[ 6], w[ 7], offset); + w[43] = hc_bytealign_S (w[ 5], w[ 6], offset); + w[42] = hc_bytealign_S (w[ 4], w[ 5], offset); + w[41] = hc_bytealign_S (w[ 3], w[ 4], offset); + w[40] = hc_bytealign_S (w[ 2], w[ 3], offset); + w[39] = hc_bytealign_S (w[ 1], w[ 2], offset); + w[38] = hc_bytealign_S (w[ 0], w[ 1], offset); + w[37] = hc_bytealign_S ( 0, w[ 0], offset); w[36] = 0; w[35] = 0; w[34] = 0; @@ -44938,32 +44938,32 @@ DECLSPEC void switch_buffer_by_offset_1x64_le_S (u32 *w, const u32 offset) break; case 38: - w[63] = amd_bytealign_S (w[24], w[25], offset); - w[62] = amd_bytealign_S (w[23], w[24], offset); - w[61] = amd_bytealign_S (w[22], w[23], offset); - w[60] = amd_bytealign_S (w[21], w[22], offset); - w[59] = amd_bytealign_S (w[20], w[21], offset); - w[58] = amd_bytealign_S (w[19], w[20], offset); - w[57] = amd_bytealign_S (w[18], w[19], offset); - w[56] = amd_bytealign_S (w[17], w[18], offset); - w[55] = amd_bytealign_S (w[16], w[17], offset); - w[54] = amd_bytealign_S (w[15], w[16], offset); - w[53] = amd_bytealign_S (w[14], w[15], offset); - w[52] = amd_bytealign_S (w[13], w[14], offset); - w[51] = amd_bytealign_S (w[12], w[13], offset); - w[50] = amd_bytealign_S (w[11], w[12], offset); - w[49] = amd_bytealign_S (w[10], w[11], offset); - w[48] = amd_bytealign_S (w[ 9], w[10], offset); - w[47] = amd_bytealign_S (w[ 8], w[ 9], offset); - w[46] = amd_bytealign_S (w[ 7], w[ 8], offset); - w[45] = amd_bytealign_S (w[ 6], w[ 7], offset); - w[44] = amd_bytealign_S (w[ 5], w[ 6], offset); - w[43] = amd_bytealign_S (w[ 4], w[ 5], offset); - w[42] = amd_bytealign_S (w[ 3], w[ 4], offset); - w[41] = amd_bytealign_S (w[ 2], w[ 3], offset); - w[40] = amd_bytealign_S (w[ 1], w[ 2], offset); - w[39] = amd_bytealign_S (w[ 0], w[ 1], offset); - w[38] = amd_bytealign_S ( 0, w[ 0], offset); + w[63] = hc_bytealign_S (w[24], w[25], offset); + w[62] = hc_bytealign_S (w[23], w[24], offset); + w[61] = hc_bytealign_S (w[22], w[23], offset); + w[60] = hc_bytealign_S (w[21], w[22], offset); + w[59] = hc_bytealign_S (w[20], w[21], offset); + w[58] = hc_bytealign_S (w[19], w[20], offset); + w[57] = hc_bytealign_S (w[18], w[19], offset); + w[56] = hc_bytealign_S (w[17], w[18], offset); + w[55] = hc_bytealign_S (w[16], w[17], offset); + w[54] = hc_bytealign_S (w[15], w[16], offset); + w[53] = hc_bytealign_S (w[14], w[15], offset); + w[52] = hc_bytealign_S (w[13], w[14], offset); + w[51] = hc_bytealign_S (w[12], w[13], offset); + w[50] = hc_bytealign_S (w[11], w[12], offset); + w[49] = hc_bytealign_S (w[10], w[11], offset); + w[48] = hc_bytealign_S (w[ 9], w[10], offset); + w[47] = hc_bytealign_S (w[ 8], w[ 9], offset); + w[46] = hc_bytealign_S (w[ 7], w[ 8], offset); + w[45] = hc_bytealign_S (w[ 6], w[ 7], offset); + w[44] = hc_bytealign_S (w[ 5], w[ 6], offset); + w[43] = hc_bytealign_S (w[ 4], w[ 5], offset); + w[42] = hc_bytealign_S (w[ 3], w[ 4], offset); + w[41] = hc_bytealign_S (w[ 2], w[ 3], offset); + w[40] = hc_bytealign_S (w[ 1], w[ 2], offset); + w[39] = hc_bytealign_S (w[ 0], w[ 1], offset); + w[38] = hc_bytealign_S ( 0, w[ 0], offset); w[37] = 0; w[36] = 0; w[35] = 0; @@ -45006,31 +45006,31 @@ DECLSPEC void switch_buffer_by_offset_1x64_le_S (u32 *w, const u32 offset) break; case 39: - w[63] = amd_bytealign_S (w[23], w[24], offset); - w[62] = amd_bytealign_S (w[22], w[23], offset); - w[61] = amd_bytealign_S (w[21], w[22], offset); - w[60] = amd_bytealign_S (w[20], w[21], offset); - w[59] = amd_bytealign_S (w[19], w[20], offset); - w[58] = amd_bytealign_S (w[18], w[19], offset); - w[57] = amd_bytealign_S (w[17], w[18], offset); - w[56] = amd_bytealign_S (w[16], w[17], offset); - w[55] = amd_bytealign_S (w[15], w[16], offset); - w[54] = amd_bytealign_S (w[14], w[15], offset); - w[53] = amd_bytealign_S (w[13], w[14], offset); - w[52] = amd_bytealign_S (w[12], w[13], offset); - w[51] = amd_bytealign_S (w[11], w[12], offset); - w[50] = amd_bytealign_S (w[10], w[11], offset); - w[49] = amd_bytealign_S (w[ 9], w[10], offset); - w[48] = amd_bytealign_S (w[ 8], w[ 9], offset); - w[47] = amd_bytealign_S (w[ 7], w[ 8], offset); - w[46] = amd_bytealign_S (w[ 6], w[ 7], offset); - w[45] = amd_bytealign_S (w[ 5], w[ 6], offset); - w[44] = amd_bytealign_S (w[ 4], w[ 5], offset); - w[43] = amd_bytealign_S (w[ 3], w[ 4], offset); - w[42] = amd_bytealign_S (w[ 2], w[ 3], offset); - w[41] = amd_bytealign_S (w[ 1], w[ 2], offset); - w[40] = amd_bytealign_S (w[ 0], w[ 1], offset); - w[39] = amd_bytealign_S ( 0, w[ 0], offset); + w[63] = hc_bytealign_S (w[23], w[24], offset); + w[62] = hc_bytealign_S (w[22], w[23], offset); + w[61] = hc_bytealign_S (w[21], w[22], offset); + w[60] = hc_bytealign_S (w[20], w[21], offset); + w[59] = hc_bytealign_S (w[19], w[20], offset); + w[58] = hc_bytealign_S (w[18], w[19], offset); + w[57] = hc_bytealign_S (w[17], w[18], offset); + w[56] = hc_bytealign_S (w[16], w[17], offset); + w[55] = hc_bytealign_S (w[15], w[16], offset); + w[54] = hc_bytealign_S (w[14], w[15], offset); + w[53] = hc_bytealign_S (w[13], w[14], offset); + w[52] = hc_bytealign_S (w[12], w[13], offset); + w[51] = hc_bytealign_S (w[11], w[12], offset); + w[50] = hc_bytealign_S (w[10], w[11], offset); + w[49] = hc_bytealign_S (w[ 9], w[10], offset); + w[48] = hc_bytealign_S (w[ 8], w[ 9], offset); + w[47] = hc_bytealign_S (w[ 7], w[ 8], offset); + w[46] = hc_bytealign_S (w[ 6], w[ 7], offset); + w[45] = hc_bytealign_S (w[ 5], w[ 6], offset); + w[44] = hc_bytealign_S (w[ 4], w[ 5], offset); + w[43] = hc_bytealign_S (w[ 3], w[ 4], offset); + w[42] = hc_bytealign_S (w[ 2], w[ 3], offset); + w[41] = hc_bytealign_S (w[ 1], w[ 2], offset); + w[40] = hc_bytealign_S (w[ 0], w[ 1], offset); + w[39] = hc_bytealign_S ( 0, w[ 0], offset); w[38] = 0; w[37] = 0; w[36] = 0; @@ -45074,30 +45074,30 @@ DECLSPEC void switch_buffer_by_offset_1x64_le_S (u32 *w, const u32 offset) break; case 40: - w[63] = amd_bytealign_S (w[22], w[23], offset); - w[62] = amd_bytealign_S (w[21], w[22], offset); - w[61] = amd_bytealign_S (w[20], w[21], offset); - w[60] = amd_bytealign_S (w[19], w[20], offset); - w[59] = amd_bytealign_S (w[18], w[19], offset); - w[58] = amd_bytealign_S (w[17], w[18], offset); - w[57] = amd_bytealign_S (w[16], w[17], offset); - w[56] = amd_bytealign_S (w[15], w[16], offset); - w[55] = amd_bytealign_S (w[14], w[15], offset); - w[54] = amd_bytealign_S (w[13], w[14], offset); - w[53] = amd_bytealign_S (w[12], w[13], offset); - w[52] = amd_bytealign_S (w[11], w[12], offset); - w[51] = amd_bytealign_S (w[10], w[11], offset); - w[50] = amd_bytealign_S (w[ 9], w[10], offset); - w[49] = amd_bytealign_S (w[ 8], w[ 9], offset); - w[48] = amd_bytealign_S (w[ 7], w[ 8], offset); - w[47] = amd_bytealign_S (w[ 6], w[ 7], offset); - w[46] = amd_bytealign_S (w[ 5], w[ 6], offset); - w[45] = amd_bytealign_S (w[ 4], w[ 5], offset); - w[44] = amd_bytealign_S (w[ 3], w[ 4], offset); - w[43] = amd_bytealign_S (w[ 2], w[ 3], offset); - w[42] = amd_bytealign_S (w[ 1], w[ 2], offset); - w[41] = amd_bytealign_S (w[ 0], w[ 1], offset); - w[40] = amd_bytealign_S ( 0, w[ 0], offset); + w[63] = hc_bytealign_S (w[22], w[23], offset); + w[62] = hc_bytealign_S (w[21], w[22], offset); + w[61] = hc_bytealign_S (w[20], w[21], offset); + w[60] = hc_bytealign_S (w[19], w[20], offset); + w[59] = hc_bytealign_S (w[18], w[19], offset); + w[58] = hc_bytealign_S (w[17], w[18], offset); + w[57] = hc_bytealign_S (w[16], w[17], offset); + w[56] = hc_bytealign_S (w[15], w[16], offset); + w[55] = hc_bytealign_S (w[14], w[15], offset); + w[54] = hc_bytealign_S (w[13], w[14], offset); + w[53] = hc_bytealign_S (w[12], w[13], offset); + w[52] = hc_bytealign_S (w[11], w[12], offset); + w[51] = hc_bytealign_S (w[10], w[11], offset); + w[50] = hc_bytealign_S (w[ 9], w[10], offset); + w[49] = hc_bytealign_S (w[ 8], w[ 9], offset); + w[48] = hc_bytealign_S (w[ 7], w[ 8], offset); + w[47] = hc_bytealign_S (w[ 6], w[ 7], offset); + w[46] = hc_bytealign_S (w[ 5], w[ 6], offset); + w[45] = hc_bytealign_S (w[ 4], w[ 5], offset); + w[44] = hc_bytealign_S (w[ 3], w[ 4], offset); + w[43] = hc_bytealign_S (w[ 2], w[ 3], offset); + w[42] = hc_bytealign_S (w[ 1], w[ 2], offset); + w[41] = hc_bytealign_S (w[ 0], w[ 1], offset); + w[40] = hc_bytealign_S ( 0, w[ 0], offset); w[39] = 0; w[38] = 0; w[37] = 0; @@ -45142,29 +45142,29 @@ DECLSPEC void switch_buffer_by_offset_1x64_le_S (u32 *w, const u32 offset) break; case 41: - w[63] = amd_bytealign_S (w[21], w[22], offset); - w[62] = amd_bytealign_S (w[20], w[21], offset); - w[61] = amd_bytealign_S (w[19], w[20], offset); - w[60] = amd_bytealign_S (w[18], w[19], offset); - w[59] = amd_bytealign_S (w[17], w[18], offset); - w[58] = amd_bytealign_S (w[16], w[17], offset); - w[57] = amd_bytealign_S (w[15], w[16], offset); - w[56] = amd_bytealign_S (w[14], w[15], offset); - w[55] = amd_bytealign_S (w[13], w[14], offset); - w[54] = amd_bytealign_S (w[12], w[13], offset); - w[53] = amd_bytealign_S (w[11], w[12], offset); - w[52] = amd_bytealign_S (w[10], w[11], offset); - w[51] = amd_bytealign_S (w[ 9], w[10], offset); - w[50] = amd_bytealign_S (w[ 8], w[ 9], offset); - w[49] = amd_bytealign_S (w[ 7], w[ 8], offset); - w[48] = amd_bytealign_S (w[ 6], w[ 7], offset); - w[47] = amd_bytealign_S (w[ 5], w[ 6], offset); - w[46] = amd_bytealign_S (w[ 4], w[ 5], offset); - w[45] = amd_bytealign_S (w[ 3], w[ 4], offset); - w[44] = amd_bytealign_S (w[ 2], w[ 3], offset); - w[43] = amd_bytealign_S (w[ 1], w[ 2], offset); - w[42] = amd_bytealign_S (w[ 0], w[ 1], offset); - w[41] = amd_bytealign_S ( 0, w[ 0], offset); + w[63] = hc_bytealign_S (w[21], w[22], offset); + w[62] = hc_bytealign_S (w[20], w[21], offset); + w[61] = hc_bytealign_S (w[19], w[20], offset); + w[60] = hc_bytealign_S (w[18], w[19], offset); + w[59] = hc_bytealign_S (w[17], w[18], offset); + w[58] = hc_bytealign_S (w[16], w[17], offset); + w[57] = hc_bytealign_S (w[15], w[16], offset); + w[56] = hc_bytealign_S (w[14], w[15], offset); + w[55] = hc_bytealign_S (w[13], w[14], offset); + w[54] = hc_bytealign_S (w[12], w[13], offset); + w[53] = hc_bytealign_S (w[11], w[12], offset); + w[52] = hc_bytealign_S (w[10], w[11], offset); + w[51] = hc_bytealign_S (w[ 9], w[10], offset); + w[50] = hc_bytealign_S (w[ 8], w[ 9], offset); + w[49] = hc_bytealign_S (w[ 7], w[ 8], offset); + w[48] = hc_bytealign_S (w[ 6], w[ 7], offset); + w[47] = hc_bytealign_S (w[ 5], w[ 6], offset); + w[46] = hc_bytealign_S (w[ 4], w[ 5], offset); + w[45] = hc_bytealign_S (w[ 3], w[ 4], offset); + w[44] = hc_bytealign_S (w[ 2], w[ 3], offset); + w[43] = hc_bytealign_S (w[ 1], w[ 2], offset); + w[42] = hc_bytealign_S (w[ 0], w[ 1], offset); + w[41] = hc_bytealign_S ( 0, w[ 0], offset); w[40] = 0; w[39] = 0; w[38] = 0; @@ -45210,28 +45210,28 @@ DECLSPEC void switch_buffer_by_offset_1x64_le_S (u32 *w, const u32 offset) break; case 42: - w[63] = amd_bytealign_S (w[20], w[21], offset); - w[62] = amd_bytealign_S (w[19], w[20], offset); - w[61] = amd_bytealign_S (w[18], w[19], offset); - w[60] = amd_bytealign_S (w[17], w[18], offset); - w[59] = amd_bytealign_S (w[16], w[17], offset); - w[58] = amd_bytealign_S (w[15], w[16], offset); - w[57] = amd_bytealign_S (w[14], w[15], offset); - w[56] = amd_bytealign_S (w[13], w[14], offset); - w[55] = amd_bytealign_S (w[12], w[13], offset); - w[54] = amd_bytealign_S (w[11], w[12], offset); - w[53] = amd_bytealign_S (w[10], w[11], offset); - w[52] = amd_bytealign_S (w[ 9], w[10], offset); - w[51] = amd_bytealign_S (w[ 8], w[ 9], offset); - w[50] = amd_bytealign_S (w[ 7], w[ 8], offset); - w[49] = amd_bytealign_S (w[ 6], w[ 7], offset); - w[48] = amd_bytealign_S (w[ 5], w[ 6], offset); - w[47] = amd_bytealign_S (w[ 4], w[ 5], offset); - w[46] = amd_bytealign_S (w[ 3], w[ 4], offset); - w[45] = amd_bytealign_S (w[ 2], w[ 3], offset); - w[44] = amd_bytealign_S (w[ 1], w[ 2], offset); - w[43] = amd_bytealign_S (w[ 0], w[ 1], offset); - w[42] = amd_bytealign_S ( 0, w[ 0], offset); + w[63] = hc_bytealign_S (w[20], w[21], offset); + w[62] = hc_bytealign_S (w[19], w[20], offset); + w[61] = hc_bytealign_S (w[18], w[19], offset); + w[60] = hc_bytealign_S (w[17], w[18], offset); + w[59] = hc_bytealign_S (w[16], w[17], offset); + w[58] = hc_bytealign_S (w[15], w[16], offset); + w[57] = hc_bytealign_S (w[14], w[15], offset); + w[56] = hc_bytealign_S (w[13], w[14], offset); + w[55] = hc_bytealign_S (w[12], w[13], offset); + w[54] = hc_bytealign_S (w[11], w[12], offset); + w[53] = hc_bytealign_S (w[10], w[11], offset); + w[52] = hc_bytealign_S (w[ 9], w[10], offset); + w[51] = hc_bytealign_S (w[ 8], w[ 9], offset); + w[50] = hc_bytealign_S (w[ 7], w[ 8], offset); + w[49] = hc_bytealign_S (w[ 6], w[ 7], offset); + w[48] = hc_bytealign_S (w[ 5], w[ 6], offset); + w[47] = hc_bytealign_S (w[ 4], w[ 5], offset); + w[46] = hc_bytealign_S (w[ 3], w[ 4], offset); + w[45] = hc_bytealign_S (w[ 2], w[ 3], offset); + w[44] = hc_bytealign_S (w[ 1], w[ 2], offset); + w[43] = hc_bytealign_S (w[ 0], w[ 1], offset); + w[42] = hc_bytealign_S ( 0, w[ 0], offset); w[41] = 0; w[40] = 0; w[39] = 0; @@ -45278,27 +45278,27 @@ DECLSPEC void switch_buffer_by_offset_1x64_le_S (u32 *w, const u32 offset) break; case 43: - w[63] = amd_bytealign_S (w[19], w[20], offset); - w[62] = amd_bytealign_S (w[18], w[19], offset); - w[61] = amd_bytealign_S (w[17], w[18], offset); - w[60] = amd_bytealign_S (w[16], w[17], offset); - w[59] = amd_bytealign_S (w[15], w[16], offset); - w[58] = amd_bytealign_S (w[14], w[15], offset); - w[57] = amd_bytealign_S (w[13], w[14], offset); - w[56] = amd_bytealign_S (w[12], w[13], offset); - w[55] = amd_bytealign_S (w[11], w[12], offset); - w[54] = amd_bytealign_S (w[10], w[11], offset); - w[53] = amd_bytealign_S (w[ 9], w[10], offset); - w[52] = amd_bytealign_S (w[ 8], w[ 9], offset); - w[51] = amd_bytealign_S (w[ 7], w[ 8], offset); - w[50] = amd_bytealign_S (w[ 6], w[ 7], offset); - w[49] = amd_bytealign_S (w[ 5], w[ 6], offset); - w[48] = amd_bytealign_S (w[ 4], w[ 5], offset); - w[47] = amd_bytealign_S (w[ 3], w[ 4], offset); - w[46] = amd_bytealign_S (w[ 2], w[ 3], offset); - w[45] = amd_bytealign_S (w[ 1], w[ 2], offset); - w[44] = amd_bytealign_S (w[ 0], w[ 1], offset); - w[43] = amd_bytealign_S ( 0, w[ 0], offset); + w[63] = hc_bytealign_S (w[19], w[20], offset); + w[62] = hc_bytealign_S (w[18], w[19], offset); + w[61] = hc_bytealign_S (w[17], w[18], offset); + w[60] = hc_bytealign_S (w[16], w[17], offset); + w[59] = hc_bytealign_S (w[15], w[16], offset); + w[58] = hc_bytealign_S (w[14], w[15], offset); + w[57] = hc_bytealign_S (w[13], w[14], offset); + w[56] = hc_bytealign_S (w[12], w[13], offset); + w[55] = hc_bytealign_S (w[11], w[12], offset); + w[54] = hc_bytealign_S (w[10], w[11], offset); + w[53] = hc_bytealign_S (w[ 9], w[10], offset); + w[52] = hc_bytealign_S (w[ 8], w[ 9], offset); + w[51] = hc_bytealign_S (w[ 7], w[ 8], offset); + w[50] = hc_bytealign_S (w[ 6], w[ 7], offset); + w[49] = hc_bytealign_S (w[ 5], w[ 6], offset); + w[48] = hc_bytealign_S (w[ 4], w[ 5], offset); + w[47] = hc_bytealign_S (w[ 3], w[ 4], offset); + w[46] = hc_bytealign_S (w[ 2], w[ 3], offset); + w[45] = hc_bytealign_S (w[ 1], w[ 2], offset); + w[44] = hc_bytealign_S (w[ 0], w[ 1], offset); + w[43] = hc_bytealign_S ( 0, w[ 0], offset); w[42] = 0; w[41] = 0; w[40] = 0; @@ -45346,26 +45346,26 @@ DECLSPEC void switch_buffer_by_offset_1x64_le_S (u32 *w, const u32 offset) break; case 44: - w[63] = amd_bytealign_S (w[18], w[19], offset); - w[62] = amd_bytealign_S (w[17], w[18], offset); - w[61] = amd_bytealign_S (w[16], w[17], offset); - w[60] = amd_bytealign_S (w[15], w[16], offset); - w[59] = amd_bytealign_S (w[14], w[15], offset); - w[58] = amd_bytealign_S (w[13], w[14], offset); - w[57] = amd_bytealign_S (w[12], w[13], offset); - w[56] = amd_bytealign_S (w[11], w[12], offset); - w[55] = amd_bytealign_S (w[10], w[11], offset); - w[54] = amd_bytealign_S (w[ 9], w[10], offset); - w[53] = amd_bytealign_S (w[ 8], w[ 9], offset); - w[52] = amd_bytealign_S (w[ 7], w[ 8], offset); - w[51] = amd_bytealign_S (w[ 6], w[ 7], offset); - w[50] = amd_bytealign_S (w[ 5], w[ 6], offset); - w[49] = amd_bytealign_S (w[ 4], w[ 5], offset); - w[48] = amd_bytealign_S (w[ 3], w[ 4], offset); - w[47] = amd_bytealign_S (w[ 2], w[ 3], offset); - w[46] = amd_bytealign_S (w[ 1], w[ 2], offset); - w[45] = amd_bytealign_S (w[ 0], w[ 1], offset); - w[44] = amd_bytealign_S ( 0, w[ 0], offset); + w[63] = hc_bytealign_S (w[18], w[19], offset); + w[62] = hc_bytealign_S (w[17], w[18], offset); + w[61] = hc_bytealign_S (w[16], w[17], offset); + w[60] = hc_bytealign_S (w[15], w[16], offset); + w[59] = hc_bytealign_S (w[14], w[15], offset); + w[58] = hc_bytealign_S (w[13], w[14], offset); + w[57] = hc_bytealign_S (w[12], w[13], offset); + w[56] = hc_bytealign_S (w[11], w[12], offset); + w[55] = hc_bytealign_S (w[10], w[11], offset); + w[54] = hc_bytealign_S (w[ 9], w[10], offset); + w[53] = hc_bytealign_S (w[ 8], w[ 9], offset); + w[52] = hc_bytealign_S (w[ 7], w[ 8], offset); + w[51] = hc_bytealign_S (w[ 6], w[ 7], offset); + w[50] = hc_bytealign_S (w[ 5], w[ 6], offset); + w[49] = hc_bytealign_S (w[ 4], w[ 5], offset); + w[48] = hc_bytealign_S (w[ 3], w[ 4], offset); + w[47] = hc_bytealign_S (w[ 2], w[ 3], offset); + w[46] = hc_bytealign_S (w[ 1], w[ 2], offset); + w[45] = hc_bytealign_S (w[ 0], w[ 1], offset); + w[44] = hc_bytealign_S ( 0, w[ 0], offset); w[43] = 0; w[42] = 0; w[41] = 0; @@ -45414,25 +45414,25 @@ DECLSPEC void switch_buffer_by_offset_1x64_le_S (u32 *w, const u32 offset) break; case 45: - w[63] = amd_bytealign_S (w[17], w[18], offset); - w[62] = amd_bytealign_S (w[16], w[17], offset); - w[61] = amd_bytealign_S (w[15], w[16], offset); - w[60] = amd_bytealign_S (w[14], w[15], offset); - w[59] = amd_bytealign_S (w[13], w[14], offset); - w[58] = amd_bytealign_S (w[12], w[13], offset); - w[57] = amd_bytealign_S (w[11], w[12], offset); - w[56] = amd_bytealign_S (w[10], w[11], offset); - w[55] = amd_bytealign_S (w[ 9], w[10], offset); - w[54] = amd_bytealign_S (w[ 8], w[ 9], offset); - w[53] = amd_bytealign_S (w[ 7], w[ 8], offset); - w[52] = amd_bytealign_S (w[ 6], w[ 7], offset); - w[51] = amd_bytealign_S (w[ 5], w[ 6], offset); - w[50] = amd_bytealign_S (w[ 4], w[ 5], offset); - w[49] = amd_bytealign_S (w[ 3], w[ 4], offset); - w[48] = amd_bytealign_S (w[ 2], w[ 3], offset); - w[47] = amd_bytealign_S (w[ 1], w[ 2], offset); - w[46] = amd_bytealign_S (w[ 0], w[ 1], offset); - w[45] = amd_bytealign_S ( 0, w[ 0], offset); + w[63] = hc_bytealign_S (w[17], w[18], offset); + w[62] = hc_bytealign_S (w[16], w[17], offset); + w[61] = hc_bytealign_S (w[15], w[16], offset); + w[60] = hc_bytealign_S (w[14], w[15], offset); + w[59] = hc_bytealign_S (w[13], w[14], offset); + w[58] = hc_bytealign_S (w[12], w[13], offset); + w[57] = hc_bytealign_S (w[11], w[12], offset); + w[56] = hc_bytealign_S (w[10], w[11], offset); + w[55] = hc_bytealign_S (w[ 9], w[10], offset); + w[54] = hc_bytealign_S (w[ 8], w[ 9], offset); + w[53] = hc_bytealign_S (w[ 7], w[ 8], offset); + w[52] = hc_bytealign_S (w[ 6], w[ 7], offset); + w[51] = hc_bytealign_S (w[ 5], w[ 6], offset); + w[50] = hc_bytealign_S (w[ 4], w[ 5], offset); + w[49] = hc_bytealign_S (w[ 3], w[ 4], offset); + w[48] = hc_bytealign_S (w[ 2], w[ 3], offset); + w[47] = hc_bytealign_S (w[ 1], w[ 2], offset); + w[46] = hc_bytealign_S (w[ 0], w[ 1], offset); + w[45] = hc_bytealign_S ( 0, w[ 0], offset); w[44] = 0; w[43] = 0; w[42] = 0; @@ -45482,24 +45482,24 @@ DECLSPEC void switch_buffer_by_offset_1x64_le_S (u32 *w, const u32 offset) break; case 46: - w[63] = amd_bytealign_S (w[16], w[17], offset); - w[62] = amd_bytealign_S (w[15], w[16], offset); - w[61] = amd_bytealign_S (w[14], w[15], offset); - w[60] = amd_bytealign_S (w[13], w[14], offset); - w[59] = amd_bytealign_S (w[12], w[13], offset); - w[58] = amd_bytealign_S (w[11], w[12], offset); - w[57] = amd_bytealign_S (w[10], w[11], offset); - w[56] = amd_bytealign_S (w[ 9], w[10], offset); - w[55] = amd_bytealign_S (w[ 8], w[ 9], offset); - w[54] = amd_bytealign_S (w[ 7], w[ 8], offset); - w[53] = amd_bytealign_S (w[ 6], w[ 7], offset); - w[52] = amd_bytealign_S (w[ 5], w[ 6], offset); - w[51] = amd_bytealign_S (w[ 4], w[ 5], offset); - w[50] = amd_bytealign_S (w[ 3], w[ 4], offset); - w[49] = amd_bytealign_S (w[ 2], w[ 3], offset); - w[48] = amd_bytealign_S (w[ 1], w[ 2], offset); - w[47] = amd_bytealign_S (w[ 0], w[ 1], offset); - w[46] = amd_bytealign_S ( 0, w[ 0], offset); + w[63] = hc_bytealign_S (w[16], w[17], offset); + w[62] = hc_bytealign_S (w[15], w[16], offset); + w[61] = hc_bytealign_S (w[14], w[15], offset); + w[60] = hc_bytealign_S (w[13], w[14], offset); + w[59] = hc_bytealign_S (w[12], w[13], offset); + w[58] = hc_bytealign_S (w[11], w[12], offset); + w[57] = hc_bytealign_S (w[10], w[11], offset); + w[56] = hc_bytealign_S (w[ 9], w[10], offset); + w[55] = hc_bytealign_S (w[ 8], w[ 9], offset); + w[54] = hc_bytealign_S (w[ 7], w[ 8], offset); + w[53] = hc_bytealign_S (w[ 6], w[ 7], offset); + w[52] = hc_bytealign_S (w[ 5], w[ 6], offset); + w[51] = hc_bytealign_S (w[ 4], w[ 5], offset); + w[50] = hc_bytealign_S (w[ 3], w[ 4], offset); + w[49] = hc_bytealign_S (w[ 2], w[ 3], offset); + w[48] = hc_bytealign_S (w[ 1], w[ 2], offset); + w[47] = hc_bytealign_S (w[ 0], w[ 1], offset); + w[46] = hc_bytealign_S ( 0, w[ 0], offset); w[45] = 0; w[44] = 0; w[43] = 0; @@ -45550,23 +45550,23 @@ DECLSPEC void switch_buffer_by_offset_1x64_le_S (u32 *w, const u32 offset) break; case 47: - w[63] = amd_bytealign_S (w[15], w[16], offset); - w[62] = amd_bytealign_S (w[14], w[15], offset); - w[61] = amd_bytealign_S (w[13], w[14], offset); - w[60] = amd_bytealign_S (w[12], w[13], offset); - w[59] = amd_bytealign_S (w[11], w[12], offset); - w[58] = amd_bytealign_S (w[10], w[11], offset); - w[57] = amd_bytealign_S (w[ 9], w[10], offset); - w[56] = amd_bytealign_S (w[ 8], w[ 9], offset); - w[55] = amd_bytealign_S (w[ 7], w[ 8], offset); - w[54] = amd_bytealign_S (w[ 6], w[ 7], offset); - w[53] = amd_bytealign_S (w[ 5], w[ 6], offset); - w[52] = amd_bytealign_S (w[ 4], w[ 5], offset); - w[51] = amd_bytealign_S (w[ 3], w[ 4], offset); - w[50] = amd_bytealign_S (w[ 2], w[ 3], offset); - w[49] = amd_bytealign_S (w[ 1], w[ 2], offset); - w[48] = amd_bytealign_S (w[ 0], w[ 1], offset); - w[47] = amd_bytealign_S ( 0, w[ 0], offset); + w[63] = hc_bytealign_S (w[15], w[16], offset); + w[62] = hc_bytealign_S (w[14], w[15], offset); + w[61] = hc_bytealign_S (w[13], w[14], offset); + w[60] = hc_bytealign_S (w[12], w[13], offset); + w[59] = hc_bytealign_S (w[11], w[12], offset); + w[58] = hc_bytealign_S (w[10], w[11], offset); + w[57] = hc_bytealign_S (w[ 9], w[10], offset); + w[56] = hc_bytealign_S (w[ 8], w[ 9], offset); + w[55] = hc_bytealign_S (w[ 7], w[ 8], offset); + w[54] = hc_bytealign_S (w[ 6], w[ 7], offset); + w[53] = hc_bytealign_S (w[ 5], w[ 6], offset); + w[52] = hc_bytealign_S (w[ 4], w[ 5], offset); + w[51] = hc_bytealign_S (w[ 3], w[ 4], offset); + w[50] = hc_bytealign_S (w[ 2], w[ 3], offset); + w[49] = hc_bytealign_S (w[ 1], w[ 2], offset); + w[48] = hc_bytealign_S (w[ 0], w[ 1], offset); + w[47] = hc_bytealign_S ( 0, w[ 0], offset); w[46] = 0; w[45] = 0; w[44] = 0; @@ -45618,22 +45618,22 @@ DECLSPEC void switch_buffer_by_offset_1x64_le_S (u32 *w, const u32 offset) break; case 48: - w[63] = amd_bytealign_S (w[14], w[15], offset); - w[62] = amd_bytealign_S (w[13], w[14], offset); - w[61] = amd_bytealign_S (w[12], w[13], offset); - w[60] = amd_bytealign_S (w[11], w[12], offset); - w[59] = amd_bytealign_S (w[10], w[11], offset); - w[58] = amd_bytealign_S (w[ 9], w[10], offset); - w[57] = amd_bytealign_S (w[ 8], w[ 9], offset); - w[56] = amd_bytealign_S (w[ 7], w[ 8], offset); - w[55] = amd_bytealign_S (w[ 6], w[ 7], offset); - w[54] = amd_bytealign_S (w[ 5], w[ 6], offset); - w[53] = amd_bytealign_S (w[ 4], w[ 5], offset); - w[52] = amd_bytealign_S (w[ 3], w[ 4], offset); - w[51] = amd_bytealign_S (w[ 2], w[ 3], offset); - w[50] = amd_bytealign_S (w[ 1], w[ 2], offset); - w[49] = amd_bytealign_S (w[ 0], w[ 1], offset); - w[48] = amd_bytealign_S ( 0, w[ 0], offset); + w[63] = hc_bytealign_S (w[14], w[15], offset); + w[62] = hc_bytealign_S (w[13], w[14], offset); + w[61] = hc_bytealign_S (w[12], w[13], offset); + w[60] = hc_bytealign_S (w[11], w[12], offset); + w[59] = hc_bytealign_S (w[10], w[11], offset); + w[58] = hc_bytealign_S (w[ 9], w[10], offset); + w[57] = hc_bytealign_S (w[ 8], w[ 9], offset); + w[56] = hc_bytealign_S (w[ 7], w[ 8], offset); + w[55] = hc_bytealign_S (w[ 6], w[ 7], offset); + w[54] = hc_bytealign_S (w[ 5], w[ 6], offset); + w[53] = hc_bytealign_S (w[ 4], w[ 5], offset); + w[52] = hc_bytealign_S (w[ 3], w[ 4], offset); + w[51] = hc_bytealign_S (w[ 2], w[ 3], offset); + w[50] = hc_bytealign_S (w[ 1], w[ 2], offset); + w[49] = hc_bytealign_S (w[ 0], w[ 1], offset); + w[48] = hc_bytealign_S ( 0, w[ 0], offset); w[47] = 0; w[46] = 0; w[45] = 0; @@ -45686,21 +45686,21 @@ DECLSPEC void switch_buffer_by_offset_1x64_le_S (u32 *w, const u32 offset) break; case 49: - w[63] = amd_bytealign_S (w[13], w[14], offset); - w[62] = amd_bytealign_S (w[12], w[13], offset); - w[61] = amd_bytealign_S (w[11], w[12], offset); - w[60] = amd_bytealign_S (w[10], w[11], offset); - w[59] = amd_bytealign_S (w[ 9], w[10], offset); - w[58] = amd_bytealign_S (w[ 8], w[ 9], offset); - w[57] = amd_bytealign_S (w[ 7], w[ 8], offset); - w[56] = amd_bytealign_S (w[ 6], w[ 7], offset); - w[55] = amd_bytealign_S (w[ 5], w[ 6], offset); - w[54] = amd_bytealign_S (w[ 4], w[ 5], offset); - w[53] = amd_bytealign_S (w[ 3], w[ 4], offset); - w[52] = amd_bytealign_S (w[ 2], w[ 3], offset); - w[51] = amd_bytealign_S (w[ 1], w[ 2], offset); - w[50] = amd_bytealign_S (w[ 0], w[ 1], offset); - w[49] = amd_bytealign_S ( 0, w[ 0], offset); + w[63] = hc_bytealign_S (w[13], w[14], offset); + w[62] = hc_bytealign_S (w[12], w[13], offset); + w[61] = hc_bytealign_S (w[11], w[12], offset); + w[60] = hc_bytealign_S (w[10], w[11], offset); + w[59] = hc_bytealign_S (w[ 9], w[10], offset); + w[58] = hc_bytealign_S (w[ 8], w[ 9], offset); + w[57] = hc_bytealign_S (w[ 7], w[ 8], offset); + w[56] = hc_bytealign_S (w[ 6], w[ 7], offset); + w[55] = hc_bytealign_S (w[ 5], w[ 6], offset); + w[54] = hc_bytealign_S (w[ 4], w[ 5], offset); + w[53] = hc_bytealign_S (w[ 3], w[ 4], offset); + w[52] = hc_bytealign_S (w[ 2], w[ 3], offset); + w[51] = hc_bytealign_S (w[ 1], w[ 2], offset); + w[50] = hc_bytealign_S (w[ 0], w[ 1], offset); + w[49] = hc_bytealign_S ( 0, w[ 0], offset); w[48] = 0; w[47] = 0; w[46] = 0; @@ -45754,20 +45754,20 @@ DECLSPEC void switch_buffer_by_offset_1x64_le_S (u32 *w, const u32 offset) break; case 50: - w[63] = amd_bytealign_S (w[12], w[13], offset); - w[62] = amd_bytealign_S (w[11], w[12], offset); - w[61] = amd_bytealign_S (w[10], w[11], offset); - w[60] = amd_bytealign_S (w[ 9], w[10], offset); - w[59] = amd_bytealign_S (w[ 8], w[ 9], offset); - w[58] = amd_bytealign_S (w[ 7], w[ 8], offset); - w[57] = amd_bytealign_S (w[ 6], w[ 7], offset); - w[56] = amd_bytealign_S (w[ 5], w[ 6], offset); - w[55] = amd_bytealign_S (w[ 4], w[ 5], offset); - w[54] = amd_bytealign_S (w[ 3], w[ 4], offset); - w[53] = amd_bytealign_S (w[ 2], w[ 3], offset); - w[52] = amd_bytealign_S (w[ 1], w[ 2], offset); - w[51] = amd_bytealign_S (w[ 0], w[ 1], offset); - w[50] = amd_bytealign_S ( 0, w[ 0], offset); + w[63] = hc_bytealign_S (w[12], w[13], offset); + w[62] = hc_bytealign_S (w[11], w[12], offset); + w[61] = hc_bytealign_S (w[10], w[11], offset); + w[60] = hc_bytealign_S (w[ 9], w[10], offset); + w[59] = hc_bytealign_S (w[ 8], w[ 9], offset); + w[58] = hc_bytealign_S (w[ 7], w[ 8], offset); + w[57] = hc_bytealign_S (w[ 6], w[ 7], offset); + w[56] = hc_bytealign_S (w[ 5], w[ 6], offset); + w[55] = hc_bytealign_S (w[ 4], w[ 5], offset); + w[54] = hc_bytealign_S (w[ 3], w[ 4], offset); + w[53] = hc_bytealign_S (w[ 2], w[ 3], offset); + w[52] = hc_bytealign_S (w[ 1], w[ 2], offset); + w[51] = hc_bytealign_S (w[ 0], w[ 1], offset); + w[50] = hc_bytealign_S ( 0, w[ 0], offset); w[49] = 0; w[48] = 0; w[47] = 0; @@ -45822,19 +45822,19 @@ DECLSPEC void switch_buffer_by_offset_1x64_le_S (u32 *w, const u32 offset) break; case 51: - w[63] = amd_bytealign_S (w[11], w[12], offset); - w[62] = amd_bytealign_S (w[10], w[11], offset); - w[61] = amd_bytealign_S (w[ 9], w[10], offset); - w[60] = amd_bytealign_S (w[ 8], w[ 9], offset); - w[59] = amd_bytealign_S (w[ 7], w[ 8], offset); - w[58] = amd_bytealign_S (w[ 6], w[ 7], offset); - w[57] = amd_bytealign_S (w[ 5], w[ 6], offset); - w[56] = amd_bytealign_S (w[ 4], w[ 5], offset); - w[55] = amd_bytealign_S (w[ 3], w[ 4], offset); - w[54] = amd_bytealign_S (w[ 2], w[ 3], offset); - w[53] = amd_bytealign_S (w[ 1], w[ 2], offset); - w[52] = amd_bytealign_S (w[ 0], w[ 1], offset); - w[51] = amd_bytealign_S ( 0, w[ 0], offset); + w[63] = hc_bytealign_S (w[11], w[12], offset); + w[62] = hc_bytealign_S (w[10], w[11], offset); + w[61] = hc_bytealign_S (w[ 9], w[10], offset); + w[60] = hc_bytealign_S (w[ 8], w[ 9], offset); + w[59] = hc_bytealign_S (w[ 7], w[ 8], offset); + w[58] = hc_bytealign_S (w[ 6], w[ 7], offset); + w[57] = hc_bytealign_S (w[ 5], w[ 6], offset); + w[56] = hc_bytealign_S (w[ 4], w[ 5], offset); + w[55] = hc_bytealign_S (w[ 3], w[ 4], offset); + w[54] = hc_bytealign_S (w[ 2], w[ 3], offset); + w[53] = hc_bytealign_S (w[ 1], w[ 2], offset); + w[52] = hc_bytealign_S (w[ 0], w[ 1], offset); + w[51] = hc_bytealign_S ( 0, w[ 0], offset); w[50] = 0; w[49] = 0; w[48] = 0; @@ -45890,18 +45890,18 @@ DECLSPEC void switch_buffer_by_offset_1x64_le_S (u32 *w, const u32 offset) break; case 52: - w[63] = amd_bytealign_S (w[10], w[11], offset); - w[62] = amd_bytealign_S (w[ 9], w[10], offset); - w[61] = amd_bytealign_S (w[ 8], w[ 9], offset); - w[60] = amd_bytealign_S (w[ 7], w[ 8], offset); - w[59] = amd_bytealign_S (w[ 6], w[ 7], offset); - w[58] = amd_bytealign_S (w[ 5], w[ 6], offset); - w[57] = amd_bytealign_S (w[ 4], w[ 5], offset); - w[56] = amd_bytealign_S (w[ 3], w[ 4], offset); - w[55] = amd_bytealign_S (w[ 2], w[ 3], offset); - w[54] = amd_bytealign_S (w[ 1], w[ 2], offset); - w[53] = amd_bytealign_S (w[ 0], w[ 1], offset); - w[52] = amd_bytealign_S ( 0, w[ 0], offset); + w[63] = hc_bytealign_S (w[10], w[11], offset); + w[62] = hc_bytealign_S (w[ 9], w[10], offset); + w[61] = hc_bytealign_S (w[ 8], w[ 9], offset); + w[60] = hc_bytealign_S (w[ 7], w[ 8], offset); + w[59] = hc_bytealign_S (w[ 6], w[ 7], offset); + w[58] = hc_bytealign_S (w[ 5], w[ 6], offset); + w[57] = hc_bytealign_S (w[ 4], w[ 5], offset); + w[56] = hc_bytealign_S (w[ 3], w[ 4], offset); + w[55] = hc_bytealign_S (w[ 2], w[ 3], offset); + w[54] = hc_bytealign_S (w[ 1], w[ 2], offset); + w[53] = hc_bytealign_S (w[ 0], w[ 1], offset); + w[52] = hc_bytealign_S ( 0, w[ 0], offset); w[51] = 0; w[50] = 0; w[49] = 0; @@ -45958,17 +45958,17 @@ DECLSPEC void switch_buffer_by_offset_1x64_le_S (u32 *w, const u32 offset) break; case 53: - w[63] = amd_bytealign_S (w[ 9], w[10], offset); - w[62] = amd_bytealign_S (w[ 8], w[ 9], offset); - w[61] = amd_bytealign_S (w[ 7], w[ 8], offset); - w[60] = amd_bytealign_S (w[ 6], w[ 7], offset); - w[59] = amd_bytealign_S (w[ 5], w[ 6], offset); - w[58] = amd_bytealign_S (w[ 4], w[ 5], offset); - w[57] = amd_bytealign_S (w[ 3], w[ 4], offset); - w[56] = amd_bytealign_S (w[ 2], w[ 3], offset); - w[55] = amd_bytealign_S (w[ 1], w[ 2], offset); - w[54] = amd_bytealign_S (w[ 0], w[ 1], offset); - w[53] = amd_bytealign_S ( 0, w[ 0], offset); + w[63] = hc_bytealign_S (w[ 9], w[10], offset); + w[62] = hc_bytealign_S (w[ 8], w[ 9], offset); + w[61] = hc_bytealign_S (w[ 7], w[ 8], offset); + w[60] = hc_bytealign_S (w[ 6], w[ 7], offset); + w[59] = hc_bytealign_S (w[ 5], w[ 6], offset); + w[58] = hc_bytealign_S (w[ 4], w[ 5], offset); + w[57] = hc_bytealign_S (w[ 3], w[ 4], offset); + w[56] = hc_bytealign_S (w[ 2], w[ 3], offset); + w[55] = hc_bytealign_S (w[ 1], w[ 2], offset); + w[54] = hc_bytealign_S (w[ 0], w[ 1], offset); + w[53] = hc_bytealign_S ( 0, w[ 0], offset); w[52] = 0; w[51] = 0; w[50] = 0; @@ -46026,16 +46026,16 @@ DECLSPEC void switch_buffer_by_offset_1x64_le_S (u32 *w, const u32 offset) break; case 54: - w[63] = amd_bytealign_S (w[ 8], w[ 9], offset); - w[62] = amd_bytealign_S (w[ 7], w[ 8], offset); - w[61] = amd_bytealign_S (w[ 6], w[ 7], offset); - w[60] = amd_bytealign_S (w[ 5], w[ 6], offset); - w[59] = amd_bytealign_S (w[ 4], w[ 5], offset); - w[58] = amd_bytealign_S (w[ 3], w[ 4], offset); - w[57] = amd_bytealign_S (w[ 2], w[ 3], offset); - w[56] = amd_bytealign_S (w[ 1], w[ 2], offset); - w[55] = amd_bytealign_S (w[ 0], w[ 1], offset); - w[54] = amd_bytealign_S ( 0, w[ 0], offset); + w[63] = hc_bytealign_S (w[ 8], w[ 9], offset); + w[62] = hc_bytealign_S (w[ 7], w[ 8], offset); + w[61] = hc_bytealign_S (w[ 6], w[ 7], offset); + w[60] = hc_bytealign_S (w[ 5], w[ 6], offset); + w[59] = hc_bytealign_S (w[ 4], w[ 5], offset); + w[58] = hc_bytealign_S (w[ 3], w[ 4], offset); + w[57] = hc_bytealign_S (w[ 2], w[ 3], offset); + w[56] = hc_bytealign_S (w[ 1], w[ 2], offset); + w[55] = hc_bytealign_S (w[ 0], w[ 1], offset); + w[54] = hc_bytealign_S ( 0, w[ 0], offset); w[53] = 0; w[52] = 0; w[51] = 0; @@ -46094,15 +46094,15 @@ DECLSPEC void switch_buffer_by_offset_1x64_le_S (u32 *w, const u32 offset) break; case 55: - w[63] = amd_bytealign_S (w[ 7], w[ 8], offset); - w[62] = amd_bytealign_S (w[ 6], w[ 7], offset); - w[61] = amd_bytealign_S (w[ 5], w[ 6], offset); - w[60] = amd_bytealign_S (w[ 4], w[ 5], offset); - w[59] = amd_bytealign_S (w[ 3], w[ 4], offset); - w[58] = amd_bytealign_S (w[ 2], w[ 3], offset); - w[57] = amd_bytealign_S (w[ 1], w[ 2], offset); - w[56] = amd_bytealign_S (w[ 0], w[ 1], offset); - w[55] = amd_bytealign_S ( 0, w[ 0], offset); + w[63] = hc_bytealign_S (w[ 7], w[ 8], offset); + w[62] = hc_bytealign_S (w[ 6], w[ 7], offset); + w[61] = hc_bytealign_S (w[ 5], w[ 6], offset); + w[60] = hc_bytealign_S (w[ 4], w[ 5], offset); + w[59] = hc_bytealign_S (w[ 3], w[ 4], offset); + w[58] = hc_bytealign_S (w[ 2], w[ 3], offset); + w[57] = hc_bytealign_S (w[ 1], w[ 2], offset); + w[56] = hc_bytealign_S (w[ 0], w[ 1], offset); + w[55] = hc_bytealign_S ( 0, w[ 0], offset); w[54] = 0; w[53] = 0; w[52] = 0; @@ -46162,14 +46162,14 @@ DECLSPEC void switch_buffer_by_offset_1x64_le_S (u32 *w, const u32 offset) break; case 56: - w[63] = amd_bytealign_S (w[ 6], w[ 7], offset); - w[62] = amd_bytealign_S (w[ 5], w[ 6], offset); - w[61] = amd_bytealign_S (w[ 4], w[ 5], offset); - w[60] = amd_bytealign_S (w[ 3], w[ 4], offset); - w[59] = amd_bytealign_S (w[ 2], w[ 3], offset); - w[58] = amd_bytealign_S (w[ 1], w[ 2], offset); - w[57] = amd_bytealign_S (w[ 0], w[ 1], offset); - w[56] = amd_bytealign_S ( 0, w[ 0], offset); + w[63] = hc_bytealign_S (w[ 6], w[ 7], offset); + w[62] = hc_bytealign_S (w[ 5], w[ 6], offset); + w[61] = hc_bytealign_S (w[ 4], w[ 5], offset); + w[60] = hc_bytealign_S (w[ 3], w[ 4], offset); + w[59] = hc_bytealign_S (w[ 2], w[ 3], offset); + w[58] = hc_bytealign_S (w[ 1], w[ 2], offset); + w[57] = hc_bytealign_S (w[ 0], w[ 1], offset); + w[56] = hc_bytealign_S ( 0, w[ 0], offset); w[55] = 0; w[54] = 0; w[53] = 0; @@ -46230,13 +46230,13 @@ DECLSPEC void switch_buffer_by_offset_1x64_le_S (u32 *w, const u32 offset) break; case 57: - w[63] = amd_bytealign_S (w[ 5], w[ 6], offset); - w[62] = amd_bytealign_S (w[ 4], w[ 5], offset); - w[61] = amd_bytealign_S (w[ 3], w[ 4], offset); - w[60] = amd_bytealign_S (w[ 2], w[ 3], offset); - w[59] = amd_bytealign_S (w[ 1], w[ 2], offset); - w[58] = amd_bytealign_S (w[ 0], w[ 1], offset); - w[57] = amd_bytealign_S ( 0, w[ 0], offset); + w[63] = hc_bytealign_S (w[ 5], w[ 6], offset); + w[62] = hc_bytealign_S (w[ 4], w[ 5], offset); + w[61] = hc_bytealign_S (w[ 3], w[ 4], offset); + w[60] = hc_bytealign_S (w[ 2], w[ 3], offset); + w[59] = hc_bytealign_S (w[ 1], w[ 2], offset); + w[58] = hc_bytealign_S (w[ 0], w[ 1], offset); + w[57] = hc_bytealign_S ( 0, w[ 0], offset); w[56] = 0; w[55] = 0; w[54] = 0; @@ -46298,12 +46298,12 @@ DECLSPEC void switch_buffer_by_offset_1x64_le_S (u32 *w, const u32 offset) break; case 58: - w[63] = amd_bytealign_S (w[ 4], w[ 5], offset); - w[62] = amd_bytealign_S (w[ 3], w[ 4], offset); - w[61] = amd_bytealign_S (w[ 2], w[ 3], offset); - w[60] = amd_bytealign_S (w[ 1], w[ 2], offset); - w[59] = amd_bytealign_S (w[ 0], w[ 1], offset); - w[58] = amd_bytealign_S ( 0, w[ 0], offset); + w[63] = hc_bytealign_S (w[ 4], w[ 5], offset); + w[62] = hc_bytealign_S (w[ 3], w[ 4], offset); + w[61] = hc_bytealign_S (w[ 2], w[ 3], offset); + w[60] = hc_bytealign_S (w[ 1], w[ 2], offset); + w[59] = hc_bytealign_S (w[ 0], w[ 1], offset); + w[58] = hc_bytealign_S ( 0, w[ 0], offset); w[57] = 0; w[56] = 0; w[55] = 0; @@ -46366,11 +46366,11 @@ DECLSPEC void switch_buffer_by_offset_1x64_le_S (u32 *w, const u32 offset) break; case 59: - w[63] = amd_bytealign_S (w[ 3], w[ 4], offset); - w[62] = amd_bytealign_S (w[ 2], w[ 3], offset); - w[61] = amd_bytealign_S (w[ 1], w[ 2], offset); - w[60] = amd_bytealign_S (w[ 0], w[ 1], offset); - w[59] = amd_bytealign_S ( 0, w[ 0], offset); + w[63] = hc_bytealign_S (w[ 3], w[ 4], offset); + w[62] = hc_bytealign_S (w[ 2], w[ 3], offset); + w[61] = hc_bytealign_S (w[ 1], w[ 2], offset); + w[60] = hc_bytealign_S (w[ 0], w[ 1], offset); + w[59] = hc_bytealign_S ( 0, w[ 0], offset); w[58] = 0; w[57] = 0; w[56] = 0; @@ -46434,10 +46434,10 @@ DECLSPEC void switch_buffer_by_offset_1x64_le_S (u32 *w, const u32 offset) break; case 60: - w[63] = amd_bytealign_S (w[ 2], w[ 3], offset); - w[62] = amd_bytealign_S (w[ 1], w[ 2], offset); - w[61] = amd_bytealign_S (w[ 0], w[ 1], offset); - w[60] = amd_bytealign_S ( 0, w[ 0], offset); + w[63] = hc_bytealign_S (w[ 2], w[ 3], offset); + w[62] = hc_bytealign_S (w[ 1], w[ 2], offset); + w[61] = hc_bytealign_S (w[ 0], w[ 1], offset); + w[60] = hc_bytealign_S ( 0, w[ 0], offset); w[59] = 0; w[58] = 0; w[57] = 0; @@ -46502,9 +46502,9 @@ DECLSPEC void switch_buffer_by_offset_1x64_le_S (u32 *w, const u32 offset) break; case 61: - w[63] = amd_bytealign_S (w[ 1], w[ 2], offset); - w[62] = amd_bytealign_S (w[ 0], w[ 1], offset); - w[61] = amd_bytealign_S ( 0, w[ 0], offset); + w[63] = hc_bytealign_S (w[ 1], w[ 2], offset); + w[62] = hc_bytealign_S (w[ 0], w[ 1], offset); + w[61] = hc_bytealign_S ( 0, w[ 0], offset); w[60] = 0; w[59] = 0; w[58] = 0; @@ -46570,8 +46570,8 @@ DECLSPEC void switch_buffer_by_offset_1x64_le_S (u32 *w, const u32 offset) break; case 62: - w[63] = amd_bytealign_S (w[ 0], w[ 1], offset); - w[62] = amd_bytealign_S ( 0, w[ 0], offset); + w[63] = hc_bytealign_S (w[ 0], w[ 1], offset); + w[62] = hc_bytealign_S ( 0, w[ 0], offset); w[61] = 0; w[60] = 0; w[59] = 0; @@ -46638,7 +46638,7 @@ DECLSPEC void switch_buffer_by_offset_1x64_le_S (u32 *w, const u32 offset) break; case 63: - w[63] = amd_bytealign_S ( 0, w[ 0], offset); + w[63] = hc_bytealign_S ( 0, w[ 0], offset); w[62] = 0; w[61] = 0; w[60] = 0; @@ -46724,271 +46724,271 @@ DECLSPEC void switch_buffer_by_offset_1x64_le_S (u32 *w, const u32 offset) switch (offset_switch) { case 0: - w[63] = __byte_perm_S (w[62], w[63], selector); - w[62] = __byte_perm_S (w[61], w[62], selector); - w[61] = __byte_perm_S (w[60], w[61], selector); - w[60] = __byte_perm_S (w[59], w[60], selector); - w[59] = __byte_perm_S (w[58], w[59], selector); - w[58] = __byte_perm_S (w[57], w[58], selector); - w[57] = __byte_perm_S (w[56], w[57], selector); - w[56] = __byte_perm_S (w[55], w[56], selector); - w[55] = __byte_perm_S (w[54], w[55], selector); - w[54] = __byte_perm_S (w[53], w[54], selector); - w[53] = __byte_perm_S (w[52], w[53], selector); - w[52] = __byte_perm_S (w[51], w[52], selector); - w[51] = __byte_perm_S (w[50], w[51], selector); - w[50] = __byte_perm_S (w[49], w[50], selector); - w[49] = __byte_perm_S (w[48], w[49], selector); - w[48] = __byte_perm_S (w[47], w[48], selector); - w[47] = __byte_perm_S (w[46], w[47], selector); - w[46] = __byte_perm_S (w[45], w[46], selector); - w[45] = __byte_perm_S (w[44], w[45], selector); - w[44] = __byte_perm_S (w[43], w[44], selector); - w[43] = __byte_perm_S (w[42], w[43], selector); - w[42] = __byte_perm_S (w[41], w[42], selector); - w[41] = __byte_perm_S (w[40], w[41], selector); - w[40] = __byte_perm_S (w[39], w[40], selector); - w[39] = __byte_perm_S (w[38], w[39], selector); - w[38] = __byte_perm_S (w[37], w[38], selector); - w[37] = __byte_perm_S (w[36], w[37], selector); - w[36] = __byte_perm_S (w[35], w[36], selector); - w[35] = __byte_perm_S (w[34], w[35], selector); - w[34] = __byte_perm_S (w[33], w[34], selector); - w[33] = __byte_perm_S (w[32], w[33], selector); - w[32] = __byte_perm_S (w[31], w[32], selector); - w[31] = __byte_perm_S (w[30], w[31], selector); - w[30] = __byte_perm_S (w[29], w[30], selector); - w[29] = __byte_perm_S (w[28], w[29], selector); - w[28] = __byte_perm_S (w[27], w[28], selector); - w[27] = __byte_perm_S (w[26], w[27], selector); - w[26] = __byte_perm_S (w[25], w[26], selector); - w[25] = __byte_perm_S (w[24], w[25], selector); - w[24] = __byte_perm_S (w[23], w[24], selector); - w[23] = __byte_perm_S (w[22], w[23], selector); - w[22] = __byte_perm_S (w[21], w[22], selector); - w[21] = __byte_perm_S (w[20], w[21], selector); - w[20] = __byte_perm_S (w[19], w[20], selector); - w[19] = __byte_perm_S (w[18], w[19], selector); - w[18] = __byte_perm_S (w[17], w[18], selector); - w[17] = __byte_perm_S (w[16], w[17], selector); - w[16] = __byte_perm_S (w[15], w[16], selector); - w[15] = __byte_perm_S (w[14], w[15], selector); - w[14] = __byte_perm_S (w[13], w[14], selector); - w[13] = __byte_perm_S (w[12], w[13], selector); - w[12] = __byte_perm_S (w[11], w[12], selector); - w[11] = __byte_perm_S (w[10], w[11], selector); - w[10] = __byte_perm_S (w[ 9], w[10], selector); - w[ 9] = __byte_perm_S (w[ 8], w[ 9], selector); - w[ 8] = __byte_perm_S (w[ 7], w[ 8], selector); - w[ 7] = __byte_perm_S (w[ 6], w[ 7], selector); - w[ 6] = __byte_perm_S (w[ 5], w[ 6], selector); - w[ 5] = __byte_perm_S (w[ 4], w[ 5], selector); - w[ 4] = __byte_perm_S (w[ 3], w[ 4], selector); - w[ 3] = __byte_perm_S (w[ 2], w[ 3], selector); - w[ 2] = __byte_perm_S (w[ 1], w[ 2], selector); - w[ 1] = __byte_perm_S (w[ 0], w[ 1], selector); - w[ 0] = __byte_perm_S ( 0, w[ 0], selector); + w[63] = hc_byte_perm_S (w[62], w[63], selector); + w[62] = hc_byte_perm_S (w[61], w[62], selector); + w[61] = hc_byte_perm_S (w[60], w[61], selector); + w[60] = hc_byte_perm_S (w[59], w[60], selector); + w[59] = hc_byte_perm_S (w[58], w[59], selector); + w[58] = hc_byte_perm_S (w[57], w[58], selector); + w[57] = hc_byte_perm_S (w[56], w[57], selector); + w[56] = hc_byte_perm_S (w[55], w[56], selector); + w[55] = hc_byte_perm_S (w[54], w[55], selector); + w[54] = hc_byte_perm_S (w[53], w[54], selector); + w[53] = hc_byte_perm_S (w[52], w[53], selector); + w[52] = hc_byte_perm_S (w[51], w[52], selector); + w[51] = hc_byte_perm_S (w[50], w[51], selector); + w[50] = hc_byte_perm_S (w[49], w[50], selector); + w[49] = hc_byte_perm_S (w[48], w[49], selector); + w[48] = hc_byte_perm_S (w[47], w[48], selector); + w[47] = hc_byte_perm_S (w[46], w[47], selector); + w[46] = hc_byte_perm_S (w[45], w[46], selector); + w[45] = hc_byte_perm_S (w[44], w[45], selector); + w[44] = hc_byte_perm_S (w[43], w[44], selector); + w[43] = hc_byte_perm_S (w[42], w[43], selector); + w[42] = hc_byte_perm_S (w[41], w[42], selector); + w[41] = hc_byte_perm_S (w[40], w[41], selector); + w[40] = hc_byte_perm_S (w[39], w[40], selector); + w[39] = hc_byte_perm_S (w[38], w[39], selector); + w[38] = hc_byte_perm_S (w[37], w[38], selector); + w[37] = hc_byte_perm_S (w[36], w[37], selector); + w[36] = hc_byte_perm_S (w[35], w[36], selector); + w[35] = hc_byte_perm_S (w[34], w[35], selector); + w[34] = hc_byte_perm_S (w[33], w[34], selector); + w[33] = hc_byte_perm_S (w[32], w[33], selector); + w[32] = hc_byte_perm_S (w[31], w[32], selector); + w[31] = hc_byte_perm_S (w[30], w[31], selector); + w[30] = hc_byte_perm_S (w[29], w[30], selector); + w[29] = hc_byte_perm_S (w[28], w[29], selector); + w[28] = hc_byte_perm_S (w[27], w[28], selector); + w[27] = hc_byte_perm_S (w[26], w[27], selector); + w[26] = hc_byte_perm_S (w[25], w[26], selector); + w[25] = hc_byte_perm_S (w[24], w[25], selector); + w[24] = hc_byte_perm_S (w[23], w[24], selector); + w[23] = hc_byte_perm_S (w[22], w[23], selector); + w[22] = hc_byte_perm_S (w[21], w[22], selector); + w[21] = hc_byte_perm_S (w[20], w[21], selector); + w[20] = hc_byte_perm_S (w[19], w[20], selector); + w[19] = hc_byte_perm_S (w[18], w[19], selector); + w[18] = hc_byte_perm_S (w[17], w[18], selector); + w[17] = hc_byte_perm_S (w[16], w[17], selector); + w[16] = hc_byte_perm_S (w[15], w[16], selector); + w[15] = hc_byte_perm_S (w[14], w[15], selector); + w[14] = hc_byte_perm_S (w[13], w[14], selector); + w[13] = hc_byte_perm_S (w[12], w[13], selector); + w[12] = hc_byte_perm_S (w[11], w[12], selector); + w[11] = hc_byte_perm_S (w[10], w[11], selector); + w[10] = hc_byte_perm_S (w[ 9], w[10], selector); + w[ 9] = hc_byte_perm_S (w[ 8], w[ 9], selector); + w[ 8] = hc_byte_perm_S (w[ 7], w[ 8], selector); + w[ 7] = hc_byte_perm_S (w[ 6], w[ 7], selector); + w[ 6] = hc_byte_perm_S (w[ 5], w[ 6], selector); + w[ 5] = hc_byte_perm_S (w[ 4], w[ 5], selector); + w[ 4] = hc_byte_perm_S (w[ 3], w[ 4], selector); + w[ 3] = hc_byte_perm_S (w[ 2], w[ 3], selector); + w[ 2] = hc_byte_perm_S (w[ 1], w[ 2], selector); + w[ 1] = hc_byte_perm_S (w[ 0], w[ 1], selector); + w[ 0] = hc_byte_perm_S ( 0, w[ 0], selector); break; case 1: - w[63] = __byte_perm_S (w[61], w[62], selector); - w[62] = __byte_perm_S (w[60], w[61], selector); - w[61] = __byte_perm_S (w[59], w[60], selector); - w[60] = __byte_perm_S (w[58], w[59], selector); - w[59] = __byte_perm_S (w[57], w[58], selector); - w[58] = __byte_perm_S (w[56], w[57], selector); - w[57] = __byte_perm_S (w[55], w[56], selector); - w[56] = __byte_perm_S (w[54], w[55], selector); - w[55] = __byte_perm_S (w[53], w[54], selector); - w[54] = __byte_perm_S (w[52], w[53], selector); - w[53] = __byte_perm_S (w[51], w[52], selector); - w[52] = __byte_perm_S (w[50], w[51], selector); - w[51] = __byte_perm_S (w[49], w[50], selector); - w[50] = __byte_perm_S (w[48], w[49], selector); - w[49] = __byte_perm_S (w[47], w[48], selector); - w[48] = __byte_perm_S (w[46], w[47], selector); - w[47] = __byte_perm_S (w[45], w[46], selector); - w[46] = __byte_perm_S (w[44], w[45], selector); - w[45] = __byte_perm_S (w[43], w[44], selector); - w[44] = __byte_perm_S (w[42], w[43], selector); - w[43] = __byte_perm_S (w[41], w[42], selector); - w[42] = __byte_perm_S (w[40], w[41], selector); - w[41] = __byte_perm_S (w[39], w[40], selector); - w[40] = __byte_perm_S (w[38], w[39], selector); - w[39] = __byte_perm_S (w[37], w[38], selector); - w[38] = __byte_perm_S (w[36], w[37], selector); - w[37] = __byte_perm_S (w[35], w[36], selector); - w[36] = __byte_perm_S (w[34], w[35], selector); - w[35] = __byte_perm_S (w[33], w[34], selector); - w[34] = __byte_perm_S (w[32], w[33], selector); - w[33] = __byte_perm_S (w[31], w[32], selector); - w[32] = __byte_perm_S (w[30], w[31], selector); - w[31] = __byte_perm_S (w[29], w[30], selector); - w[30] = __byte_perm_S (w[28], w[29], selector); - w[29] = __byte_perm_S (w[27], w[28], selector); - w[28] = __byte_perm_S (w[26], w[27], selector); - w[27] = __byte_perm_S (w[25], w[26], selector); - w[26] = __byte_perm_S (w[24], w[25], selector); - w[25] = __byte_perm_S (w[23], w[24], selector); - w[24] = __byte_perm_S (w[22], w[23], selector); - w[23] = __byte_perm_S (w[21], w[22], selector); - w[22] = __byte_perm_S (w[20], w[21], selector); - w[21] = __byte_perm_S (w[19], w[20], selector); - w[20] = __byte_perm_S (w[18], w[19], selector); - w[19] = __byte_perm_S (w[17], w[18], selector); - w[18] = __byte_perm_S (w[16], w[17], selector); - w[17] = __byte_perm_S (w[15], w[16], selector); - w[16] = __byte_perm_S (w[14], w[15], selector); - w[15] = __byte_perm_S (w[13], w[14], selector); - w[14] = __byte_perm_S (w[12], w[13], selector); - w[13] = __byte_perm_S (w[11], w[12], selector); - w[12] = __byte_perm_S (w[10], w[11], selector); - w[11] = __byte_perm_S (w[ 9], w[10], selector); - w[10] = __byte_perm_S (w[ 8], w[ 9], selector); - w[ 9] = __byte_perm_S (w[ 7], w[ 8], selector); - w[ 8] = __byte_perm_S (w[ 6], w[ 7], selector); - w[ 7] = __byte_perm_S (w[ 5], w[ 6], selector); - w[ 6] = __byte_perm_S (w[ 4], w[ 5], selector); - w[ 5] = __byte_perm_S (w[ 3], w[ 4], selector); - w[ 4] = __byte_perm_S (w[ 2], w[ 3], selector); - w[ 3] = __byte_perm_S (w[ 1], w[ 2], selector); - w[ 2] = __byte_perm_S (w[ 0], w[ 1], selector); - w[ 1] = __byte_perm_S ( 0, w[ 0], selector); + w[63] = hc_byte_perm_S (w[61], w[62], selector); + w[62] = hc_byte_perm_S (w[60], w[61], selector); + w[61] = hc_byte_perm_S (w[59], w[60], selector); + w[60] = hc_byte_perm_S (w[58], w[59], selector); + w[59] = hc_byte_perm_S (w[57], w[58], selector); + w[58] = hc_byte_perm_S (w[56], w[57], selector); + w[57] = hc_byte_perm_S (w[55], w[56], selector); + w[56] = hc_byte_perm_S (w[54], w[55], selector); + w[55] = hc_byte_perm_S (w[53], w[54], selector); + w[54] = hc_byte_perm_S (w[52], w[53], selector); + w[53] = hc_byte_perm_S (w[51], w[52], selector); + w[52] = hc_byte_perm_S (w[50], w[51], selector); + w[51] = hc_byte_perm_S (w[49], w[50], selector); + w[50] = hc_byte_perm_S (w[48], w[49], selector); + w[49] = hc_byte_perm_S (w[47], w[48], selector); + w[48] = hc_byte_perm_S (w[46], w[47], selector); + w[47] = hc_byte_perm_S (w[45], w[46], selector); + w[46] = hc_byte_perm_S (w[44], w[45], selector); + w[45] = hc_byte_perm_S (w[43], w[44], selector); + w[44] = hc_byte_perm_S (w[42], w[43], selector); + w[43] = hc_byte_perm_S (w[41], w[42], selector); + w[42] = hc_byte_perm_S (w[40], w[41], selector); + w[41] = hc_byte_perm_S (w[39], w[40], selector); + w[40] = hc_byte_perm_S (w[38], w[39], selector); + w[39] = hc_byte_perm_S (w[37], w[38], selector); + w[38] = hc_byte_perm_S (w[36], w[37], selector); + w[37] = hc_byte_perm_S (w[35], w[36], selector); + w[36] = hc_byte_perm_S (w[34], w[35], selector); + w[35] = hc_byte_perm_S (w[33], w[34], selector); + w[34] = hc_byte_perm_S (w[32], w[33], selector); + w[33] = hc_byte_perm_S (w[31], w[32], selector); + w[32] = hc_byte_perm_S (w[30], w[31], selector); + w[31] = hc_byte_perm_S (w[29], w[30], selector); + w[30] = hc_byte_perm_S (w[28], w[29], selector); + w[29] = hc_byte_perm_S (w[27], w[28], selector); + w[28] = hc_byte_perm_S (w[26], w[27], selector); + w[27] = hc_byte_perm_S (w[25], w[26], selector); + w[26] = hc_byte_perm_S (w[24], w[25], selector); + w[25] = hc_byte_perm_S (w[23], w[24], selector); + w[24] = hc_byte_perm_S (w[22], w[23], selector); + w[23] = hc_byte_perm_S (w[21], w[22], selector); + w[22] = hc_byte_perm_S (w[20], w[21], selector); + w[21] = hc_byte_perm_S (w[19], w[20], selector); + w[20] = hc_byte_perm_S (w[18], w[19], selector); + w[19] = hc_byte_perm_S (w[17], w[18], selector); + w[18] = hc_byte_perm_S (w[16], w[17], selector); + w[17] = hc_byte_perm_S (w[15], w[16], selector); + w[16] = hc_byte_perm_S (w[14], w[15], selector); + w[15] = hc_byte_perm_S (w[13], w[14], selector); + w[14] = hc_byte_perm_S (w[12], w[13], selector); + w[13] = hc_byte_perm_S (w[11], w[12], selector); + w[12] = hc_byte_perm_S (w[10], w[11], selector); + w[11] = hc_byte_perm_S (w[ 9], w[10], selector); + w[10] = hc_byte_perm_S (w[ 8], w[ 9], selector); + w[ 9] = hc_byte_perm_S (w[ 7], w[ 8], selector); + w[ 8] = hc_byte_perm_S (w[ 6], w[ 7], selector); + w[ 7] = hc_byte_perm_S (w[ 5], w[ 6], selector); + w[ 6] = hc_byte_perm_S (w[ 4], w[ 5], selector); + w[ 5] = hc_byte_perm_S (w[ 3], w[ 4], selector); + w[ 4] = hc_byte_perm_S (w[ 2], w[ 3], selector); + w[ 3] = hc_byte_perm_S (w[ 1], w[ 2], selector); + w[ 2] = hc_byte_perm_S (w[ 0], w[ 1], selector); + w[ 1] = hc_byte_perm_S ( 0, w[ 0], selector); w[ 0] = 0; break; case 2: - w[63] = __byte_perm_S (w[60], w[61], selector); - w[62] = __byte_perm_S (w[59], w[60], selector); - w[61] = __byte_perm_S (w[58], w[59], selector); - w[60] = __byte_perm_S (w[57], w[58], selector); - w[59] = __byte_perm_S (w[56], w[57], selector); - w[58] = __byte_perm_S (w[55], w[56], selector); - w[57] = __byte_perm_S (w[54], w[55], selector); - w[56] = __byte_perm_S (w[53], w[54], selector); - w[55] = __byte_perm_S (w[52], w[53], selector); - w[54] = __byte_perm_S (w[51], w[52], selector); - w[53] = __byte_perm_S (w[50], w[51], selector); - w[52] = __byte_perm_S (w[49], w[50], selector); - w[51] = __byte_perm_S (w[48], w[49], selector); - w[50] = __byte_perm_S (w[47], w[48], selector); - w[49] = __byte_perm_S (w[46], w[47], selector); - w[48] = __byte_perm_S (w[45], w[46], selector); - w[47] = __byte_perm_S (w[44], w[45], selector); - w[46] = __byte_perm_S (w[43], w[44], selector); - w[45] = __byte_perm_S (w[42], w[43], selector); - w[44] = __byte_perm_S (w[41], w[42], selector); - w[43] = __byte_perm_S (w[40], w[41], selector); - w[42] = __byte_perm_S (w[39], w[40], selector); - w[41] = __byte_perm_S (w[38], w[39], selector); - w[40] = __byte_perm_S (w[37], w[38], selector); - w[39] = __byte_perm_S (w[36], w[37], selector); - w[38] = __byte_perm_S (w[35], w[36], selector); - w[37] = __byte_perm_S (w[34], w[35], selector); - w[36] = __byte_perm_S (w[33], w[34], selector); - w[35] = __byte_perm_S (w[32], w[33], selector); - w[34] = __byte_perm_S (w[31], w[32], selector); - w[33] = __byte_perm_S (w[30], w[31], selector); - w[32] = __byte_perm_S (w[29], w[30], selector); - w[31] = __byte_perm_S (w[28], w[29], selector); - w[30] = __byte_perm_S (w[27], w[28], selector); - w[29] = __byte_perm_S (w[26], w[27], selector); - w[28] = __byte_perm_S (w[25], w[26], selector); - w[27] = __byte_perm_S (w[24], w[25], selector); - w[26] = __byte_perm_S (w[23], w[24], selector); - w[25] = __byte_perm_S (w[22], w[23], selector); - w[24] = __byte_perm_S (w[21], w[22], selector); - w[23] = __byte_perm_S (w[20], w[21], selector); - w[22] = __byte_perm_S (w[19], w[20], selector); - w[21] = __byte_perm_S (w[18], w[19], selector); - w[20] = __byte_perm_S (w[17], w[18], selector); - w[19] = __byte_perm_S (w[16], w[17], selector); - w[18] = __byte_perm_S (w[15], w[16], selector); - w[17] = __byte_perm_S (w[14], w[15], selector); - w[16] = __byte_perm_S (w[13], w[14], selector); - w[15] = __byte_perm_S (w[12], w[13], selector); - w[14] = __byte_perm_S (w[11], w[12], selector); - w[13] = __byte_perm_S (w[10], w[11], selector); - w[12] = __byte_perm_S (w[ 9], w[10], selector); - w[11] = __byte_perm_S (w[ 8], w[ 9], selector); - w[10] = __byte_perm_S (w[ 7], w[ 8], selector); - w[ 9] = __byte_perm_S (w[ 6], w[ 7], selector); - w[ 8] = __byte_perm_S (w[ 5], w[ 6], selector); - w[ 7] = __byte_perm_S (w[ 4], w[ 5], selector); - w[ 6] = __byte_perm_S (w[ 3], w[ 4], selector); - w[ 5] = __byte_perm_S (w[ 2], w[ 3], selector); - w[ 4] = __byte_perm_S (w[ 1], w[ 2], selector); - w[ 3] = __byte_perm_S (w[ 0], w[ 1], selector); - w[ 2] = __byte_perm_S ( 0, w[ 0], selector); + w[63] = hc_byte_perm_S (w[60], w[61], selector); + w[62] = hc_byte_perm_S (w[59], w[60], selector); + w[61] = hc_byte_perm_S (w[58], w[59], selector); + w[60] = hc_byte_perm_S (w[57], w[58], selector); + w[59] = hc_byte_perm_S (w[56], w[57], selector); + w[58] = hc_byte_perm_S (w[55], w[56], selector); + w[57] = hc_byte_perm_S (w[54], w[55], selector); + w[56] = hc_byte_perm_S (w[53], w[54], selector); + w[55] = hc_byte_perm_S (w[52], w[53], selector); + w[54] = hc_byte_perm_S (w[51], w[52], selector); + w[53] = hc_byte_perm_S (w[50], w[51], selector); + w[52] = hc_byte_perm_S (w[49], w[50], selector); + w[51] = hc_byte_perm_S (w[48], w[49], selector); + w[50] = hc_byte_perm_S (w[47], w[48], selector); + w[49] = hc_byte_perm_S (w[46], w[47], selector); + w[48] = hc_byte_perm_S (w[45], w[46], selector); + w[47] = hc_byte_perm_S (w[44], w[45], selector); + w[46] = hc_byte_perm_S (w[43], w[44], selector); + w[45] = hc_byte_perm_S (w[42], w[43], selector); + w[44] = hc_byte_perm_S (w[41], w[42], selector); + w[43] = hc_byte_perm_S (w[40], w[41], selector); + w[42] = hc_byte_perm_S (w[39], w[40], selector); + w[41] = hc_byte_perm_S (w[38], w[39], selector); + w[40] = hc_byte_perm_S (w[37], w[38], selector); + w[39] = hc_byte_perm_S (w[36], w[37], selector); + w[38] = hc_byte_perm_S (w[35], w[36], selector); + w[37] = hc_byte_perm_S (w[34], w[35], selector); + w[36] = hc_byte_perm_S (w[33], w[34], selector); + w[35] = hc_byte_perm_S (w[32], w[33], selector); + w[34] = hc_byte_perm_S (w[31], w[32], selector); + w[33] = hc_byte_perm_S (w[30], w[31], selector); + w[32] = hc_byte_perm_S (w[29], w[30], selector); + w[31] = hc_byte_perm_S (w[28], w[29], selector); + w[30] = hc_byte_perm_S (w[27], w[28], selector); + w[29] = hc_byte_perm_S (w[26], w[27], selector); + w[28] = hc_byte_perm_S (w[25], w[26], selector); + w[27] = hc_byte_perm_S (w[24], w[25], selector); + w[26] = hc_byte_perm_S (w[23], w[24], selector); + w[25] = hc_byte_perm_S (w[22], w[23], selector); + w[24] = hc_byte_perm_S (w[21], w[22], selector); + w[23] = hc_byte_perm_S (w[20], w[21], selector); + w[22] = hc_byte_perm_S (w[19], w[20], selector); + w[21] = hc_byte_perm_S (w[18], w[19], selector); + w[20] = hc_byte_perm_S (w[17], w[18], selector); + w[19] = hc_byte_perm_S (w[16], w[17], selector); + w[18] = hc_byte_perm_S (w[15], w[16], selector); + w[17] = hc_byte_perm_S (w[14], w[15], selector); + w[16] = hc_byte_perm_S (w[13], w[14], selector); + w[15] = hc_byte_perm_S (w[12], w[13], selector); + w[14] = hc_byte_perm_S (w[11], w[12], selector); + w[13] = hc_byte_perm_S (w[10], w[11], selector); + w[12] = hc_byte_perm_S (w[ 9], w[10], selector); + w[11] = hc_byte_perm_S (w[ 8], w[ 9], selector); + w[10] = hc_byte_perm_S (w[ 7], w[ 8], selector); + w[ 9] = hc_byte_perm_S (w[ 6], w[ 7], selector); + w[ 8] = hc_byte_perm_S (w[ 5], w[ 6], selector); + w[ 7] = hc_byte_perm_S (w[ 4], w[ 5], selector); + w[ 6] = hc_byte_perm_S (w[ 3], w[ 4], selector); + w[ 5] = hc_byte_perm_S (w[ 2], w[ 3], selector); + w[ 4] = hc_byte_perm_S (w[ 1], w[ 2], selector); + w[ 3] = hc_byte_perm_S (w[ 0], w[ 1], selector); + w[ 2] = hc_byte_perm_S ( 0, w[ 0], selector); w[ 1] = 0; w[ 0] = 0; break; case 3: - w[63] = __byte_perm_S (w[59], w[60], selector); - w[62] = __byte_perm_S (w[58], w[59], selector); - w[61] = __byte_perm_S (w[57], w[58], selector); - w[60] = __byte_perm_S (w[56], w[57], selector); - w[59] = __byte_perm_S (w[55], w[56], selector); - w[58] = __byte_perm_S (w[54], w[55], selector); - w[57] = __byte_perm_S (w[53], w[54], selector); - w[56] = __byte_perm_S (w[52], w[53], selector); - w[55] = __byte_perm_S (w[51], w[52], selector); - w[54] = __byte_perm_S (w[50], w[51], selector); - w[53] = __byte_perm_S (w[49], w[50], selector); - w[52] = __byte_perm_S (w[48], w[49], selector); - w[51] = __byte_perm_S (w[47], w[48], selector); - w[50] = __byte_perm_S (w[46], w[47], selector); - w[49] = __byte_perm_S (w[45], w[46], selector); - w[48] = __byte_perm_S (w[44], w[45], selector); - w[47] = __byte_perm_S (w[43], w[44], selector); - w[46] = __byte_perm_S (w[42], w[43], selector); - w[45] = __byte_perm_S (w[41], w[42], selector); - w[44] = __byte_perm_S (w[40], w[41], selector); - w[43] = __byte_perm_S (w[39], w[40], selector); - w[42] = __byte_perm_S (w[38], w[39], selector); - w[41] = __byte_perm_S (w[37], w[38], selector); - w[40] = __byte_perm_S (w[36], w[37], selector); - w[39] = __byte_perm_S (w[35], w[36], selector); - w[38] = __byte_perm_S (w[34], w[35], selector); - w[37] = __byte_perm_S (w[33], w[34], selector); - w[36] = __byte_perm_S (w[32], w[33], selector); - w[35] = __byte_perm_S (w[31], w[32], selector); - w[34] = __byte_perm_S (w[30], w[31], selector); - w[33] = __byte_perm_S (w[29], w[30], selector); - w[32] = __byte_perm_S (w[28], w[29], selector); - w[31] = __byte_perm_S (w[27], w[28], selector); - w[30] = __byte_perm_S (w[26], w[27], selector); - w[29] = __byte_perm_S (w[25], w[26], selector); - w[28] = __byte_perm_S (w[24], w[25], selector); - w[27] = __byte_perm_S (w[23], w[24], selector); - w[26] = __byte_perm_S (w[22], w[23], selector); - w[25] = __byte_perm_S (w[21], w[22], selector); - w[24] = __byte_perm_S (w[20], w[21], selector); - w[23] = __byte_perm_S (w[19], w[20], selector); - w[22] = __byte_perm_S (w[18], w[19], selector); - w[21] = __byte_perm_S (w[17], w[18], selector); - w[20] = __byte_perm_S (w[16], w[17], selector); - w[19] = __byte_perm_S (w[15], w[16], selector); - w[18] = __byte_perm_S (w[14], w[15], selector); - w[17] = __byte_perm_S (w[13], w[14], selector); - w[16] = __byte_perm_S (w[12], w[13], selector); - w[15] = __byte_perm_S (w[11], w[12], selector); - w[14] = __byte_perm_S (w[10], w[11], selector); - w[13] = __byte_perm_S (w[ 9], w[10], selector); - w[12] = __byte_perm_S (w[ 8], w[ 9], selector); - w[11] = __byte_perm_S (w[ 7], w[ 8], selector); - w[10] = __byte_perm_S (w[ 6], w[ 7], selector); - w[ 9] = __byte_perm_S (w[ 5], w[ 6], selector); - w[ 8] = __byte_perm_S (w[ 4], w[ 5], selector); - w[ 7] = __byte_perm_S (w[ 3], w[ 4], selector); - w[ 6] = __byte_perm_S (w[ 2], w[ 3], selector); - w[ 5] = __byte_perm_S (w[ 1], w[ 2], selector); - w[ 4] = __byte_perm_S (w[ 0], w[ 1], selector); - w[ 3] = __byte_perm_S ( 0, w[ 0], selector); + w[63] = hc_byte_perm_S (w[59], w[60], selector); + w[62] = hc_byte_perm_S (w[58], w[59], selector); + w[61] = hc_byte_perm_S (w[57], w[58], selector); + w[60] = hc_byte_perm_S (w[56], w[57], selector); + w[59] = hc_byte_perm_S (w[55], w[56], selector); + w[58] = hc_byte_perm_S (w[54], w[55], selector); + w[57] = hc_byte_perm_S (w[53], w[54], selector); + w[56] = hc_byte_perm_S (w[52], w[53], selector); + w[55] = hc_byte_perm_S (w[51], w[52], selector); + w[54] = hc_byte_perm_S (w[50], w[51], selector); + w[53] = hc_byte_perm_S (w[49], w[50], selector); + w[52] = hc_byte_perm_S (w[48], w[49], selector); + w[51] = hc_byte_perm_S (w[47], w[48], selector); + w[50] = hc_byte_perm_S (w[46], w[47], selector); + w[49] = hc_byte_perm_S (w[45], w[46], selector); + w[48] = hc_byte_perm_S (w[44], w[45], selector); + w[47] = hc_byte_perm_S (w[43], w[44], selector); + w[46] = hc_byte_perm_S (w[42], w[43], selector); + w[45] = hc_byte_perm_S (w[41], w[42], selector); + w[44] = hc_byte_perm_S (w[40], w[41], selector); + w[43] = hc_byte_perm_S (w[39], w[40], selector); + w[42] = hc_byte_perm_S (w[38], w[39], selector); + w[41] = hc_byte_perm_S (w[37], w[38], selector); + w[40] = hc_byte_perm_S (w[36], w[37], selector); + w[39] = hc_byte_perm_S (w[35], w[36], selector); + w[38] = hc_byte_perm_S (w[34], w[35], selector); + w[37] = hc_byte_perm_S (w[33], w[34], selector); + w[36] = hc_byte_perm_S (w[32], w[33], selector); + w[35] = hc_byte_perm_S (w[31], w[32], selector); + w[34] = hc_byte_perm_S (w[30], w[31], selector); + w[33] = hc_byte_perm_S (w[29], w[30], selector); + w[32] = hc_byte_perm_S (w[28], w[29], selector); + w[31] = hc_byte_perm_S (w[27], w[28], selector); + w[30] = hc_byte_perm_S (w[26], w[27], selector); + w[29] = hc_byte_perm_S (w[25], w[26], selector); + w[28] = hc_byte_perm_S (w[24], w[25], selector); + w[27] = hc_byte_perm_S (w[23], w[24], selector); + w[26] = hc_byte_perm_S (w[22], w[23], selector); + w[25] = hc_byte_perm_S (w[21], w[22], selector); + w[24] = hc_byte_perm_S (w[20], w[21], selector); + w[23] = hc_byte_perm_S (w[19], w[20], selector); + w[22] = hc_byte_perm_S (w[18], w[19], selector); + w[21] = hc_byte_perm_S (w[17], w[18], selector); + w[20] = hc_byte_perm_S (w[16], w[17], selector); + w[19] = hc_byte_perm_S (w[15], w[16], selector); + w[18] = hc_byte_perm_S (w[14], w[15], selector); + w[17] = hc_byte_perm_S (w[13], w[14], selector); + w[16] = hc_byte_perm_S (w[12], w[13], selector); + w[15] = hc_byte_perm_S (w[11], w[12], selector); + w[14] = hc_byte_perm_S (w[10], w[11], selector); + w[13] = hc_byte_perm_S (w[ 9], w[10], selector); + w[12] = hc_byte_perm_S (w[ 8], w[ 9], selector); + w[11] = hc_byte_perm_S (w[ 7], w[ 8], selector); + w[10] = hc_byte_perm_S (w[ 6], w[ 7], selector); + w[ 9] = hc_byte_perm_S (w[ 5], w[ 6], selector); + w[ 8] = hc_byte_perm_S (w[ 4], w[ 5], selector); + w[ 7] = hc_byte_perm_S (w[ 3], w[ 4], selector); + w[ 6] = hc_byte_perm_S (w[ 2], w[ 3], selector); + w[ 5] = hc_byte_perm_S (w[ 1], w[ 2], selector); + w[ 4] = hc_byte_perm_S (w[ 0], w[ 1], selector); + w[ 3] = hc_byte_perm_S ( 0, w[ 0], selector); w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; @@ -46996,66 +46996,66 @@ DECLSPEC void switch_buffer_by_offset_1x64_le_S (u32 *w, const u32 offset) break; case 4: - w[63] = __byte_perm_S (w[58], w[59], selector); - w[62] = __byte_perm_S (w[57], w[58], selector); - w[61] = __byte_perm_S (w[56], w[57], selector); - w[60] = __byte_perm_S (w[55], w[56], selector); - w[59] = __byte_perm_S (w[54], w[55], selector); - w[58] = __byte_perm_S (w[53], w[54], selector); - w[57] = __byte_perm_S (w[52], w[53], selector); - w[56] = __byte_perm_S (w[51], w[52], selector); - w[55] = __byte_perm_S (w[50], w[51], selector); - w[54] = __byte_perm_S (w[49], w[50], selector); - w[53] = __byte_perm_S (w[48], w[49], selector); - w[52] = __byte_perm_S (w[47], w[48], selector); - w[51] = __byte_perm_S (w[46], w[47], selector); - w[50] = __byte_perm_S (w[45], w[46], selector); - w[49] = __byte_perm_S (w[44], w[45], selector); - w[48] = __byte_perm_S (w[43], w[44], selector); - w[47] = __byte_perm_S (w[42], w[43], selector); - w[46] = __byte_perm_S (w[41], w[42], selector); - w[45] = __byte_perm_S (w[40], w[41], selector); - w[44] = __byte_perm_S (w[39], w[40], selector); - w[43] = __byte_perm_S (w[38], w[39], selector); - w[42] = __byte_perm_S (w[37], w[38], selector); - w[41] = __byte_perm_S (w[36], w[37], selector); - w[40] = __byte_perm_S (w[35], w[36], selector); - w[39] = __byte_perm_S (w[34], w[35], selector); - w[38] = __byte_perm_S (w[33], w[34], selector); - w[37] = __byte_perm_S (w[32], w[33], selector); - w[36] = __byte_perm_S (w[31], w[32], selector); - w[35] = __byte_perm_S (w[30], w[31], selector); - w[34] = __byte_perm_S (w[29], w[30], selector); - w[33] = __byte_perm_S (w[28], w[29], selector); - w[32] = __byte_perm_S (w[27], w[28], selector); - w[31] = __byte_perm_S (w[26], w[27], selector); - w[30] = __byte_perm_S (w[25], w[26], selector); - w[29] = __byte_perm_S (w[24], w[25], selector); - w[28] = __byte_perm_S (w[23], w[24], selector); - w[27] = __byte_perm_S (w[22], w[23], selector); - w[26] = __byte_perm_S (w[21], w[22], selector); - w[25] = __byte_perm_S (w[20], w[21], selector); - w[24] = __byte_perm_S (w[19], w[20], selector); - w[23] = __byte_perm_S (w[18], w[19], selector); - w[22] = __byte_perm_S (w[17], w[18], selector); - w[21] = __byte_perm_S (w[16], w[17], selector); - w[20] = __byte_perm_S (w[15], w[16], selector); - w[19] = __byte_perm_S (w[14], w[15], selector); - w[18] = __byte_perm_S (w[13], w[14], selector); - w[17] = __byte_perm_S (w[12], w[13], selector); - w[16] = __byte_perm_S (w[11], w[12], selector); - w[15] = __byte_perm_S (w[10], w[11], selector); - w[14] = __byte_perm_S (w[ 9], w[10], selector); - w[13] = __byte_perm_S (w[ 8], w[ 9], selector); - w[12] = __byte_perm_S (w[ 7], w[ 8], selector); - w[11] = __byte_perm_S (w[ 6], w[ 7], selector); - w[10] = __byte_perm_S (w[ 5], w[ 6], selector); - w[ 9] = __byte_perm_S (w[ 4], w[ 5], selector); - w[ 8] = __byte_perm_S (w[ 3], w[ 4], selector); - w[ 7] = __byte_perm_S (w[ 2], w[ 3], selector); - w[ 6] = __byte_perm_S (w[ 1], w[ 2], selector); - w[ 5] = __byte_perm_S (w[ 0], w[ 1], selector); - w[ 4] = __byte_perm_S ( 0, w[ 0], selector); + w[63] = hc_byte_perm_S (w[58], w[59], selector); + w[62] = hc_byte_perm_S (w[57], w[58], selector); + w[61] = hc_byte_perm_S (w[56], w[57], selector); + w[60] = hc_byte_perm_S (w[55], w[56], selector); + w[59] = hc_byte_perm_S (w[54], w[55], selector); + w[58] = hc_byte_perm_S (w[53], w[54], selector); + w[57] = hc_byte_perm_S (w[52], w[53], selector); + w[56] = hc_byte_perm_S (w[51], w[52], selector); + w[55] = hc_byte_perm_S (w[50], w[51], selector); + w[54] = hc_byte_perm_S (w[49], w[50], selector); + w[53] = hc_byte_perm_S (w[48], w[49], selector); + w[52] = hc_byte_perm_S (w[47], w[48], selector); + w[51] = hc_byte_perm_S (w[46], w[47], selector); + w[50] = hc_byte_perm_S (w[45], w[46], selector); + w[49] = hc_byte_perm_S (w[44], w[45], selector); + w[48] = hc_byte_perm_S (w[43], w[44], selector); + w[47] = hc_byte_perm_S (w[42], w[43], selector); + w[46] = hc_byte_perm_S (w[41], w[42], selector); + w[45] = hc_byte_perm_S (w[40], w[41], selector); + w[44] = hc_byte_perm_S (w[39], w[40], selector); + w[43] = hc_byte_perm_S (w[38], w[39], selector); + w[42] = hc_byte_perm_S (w[37], w[38], selector); + w[41] = hc_byte_perm_S (w[36], w[37], selector); + w[40] = hc_byte_perm_S (w[35], w[36], selector); + w[39] = hc_byte_perm_S (w[34], w[35], selector); + w[38] = hc_byte_perm_S (w[33], w[34], selector); + w[37] = hc_byte_perm_S (w[32], w[33], selector); + w[36] = hc_byte_perm_S (w[31], w[32], selector); + w[35] = hc_byte_perm_S (w[30], w[31], selector); + w[34] = hc_byte_perm_S (w[29], w[30], selector); + w[33] = hc_byte_perm_S (w[28], w[29], selector); + w[32] = hc_byte_perm_S (w[27], w[28], selector); + w[31] = hc_byte_perm_S (w[26], w[27], selector); + w[30] = hc_byte_perm_S (w[25], w[26], selector); + w[29] = hc_byte_perm_S (w[24], w[25], selector); + w[28] = hc_byte_perm_S (w[23], w[24], selector); + w[27] = hc_byte_perm_S (w[22], w[23], selector); + w[26] = hc_byte_perm_S (w[21], w[22], selector); + w[25] = hc_byte_perm_S (w[20], w[21], selector); + w[24] = hc_byte_perm_S (w[19], w[20], selector); + w[23] = hc_byte_perm_S (w[18], w[19], selector); + w[22] = hc_byte_perm_S (w[17], w[18], selector); + w[21] = hc_byte_perm_S (w[16], w[17], selector); + w[20] = hc_byte_perm_S (w[15], w[16], selector); + w[19] = hc_byte_perm_S (w[14], w[15], selector); + w[18] = hc_byte_perm_S (w[13], w[14], selector); + w[17] = hc_byte_perm_S (w[12], w[13], selector); + w[16] = hc_byte_perm_S (w[11], w[12], selector); + w[15] = hc_byte_perm_S (w[10], w[11], selector); + w[14] = hc_byte_perm_S (w[ 9], w[10], selector); + w[13] = hc_byte_perm_S (w[ 8], w[ 9], selector); + w[12] = hc_byte_perm_S (w[ 7], w[ 8], selector); + w[11] = hc_byte_perm_S (w[ 6], w[ 7], selector); + w[10] = hc_byte_perm_S (w[ 5], w[ 6], selector); + w[ 9] = hc_byte_perm_S (w[ 4], w[ 5], selector); + w[ 8] = hc_byte_perm_S (w[ 3], w[ 4], selector); + w[ 7] = hc_byte_perm_S (w[ 2], w[ 3], selector); + w[ 6] = hc_byte_perm_S (w[ 1], w[ 2], selector); + w[ 5] = hc_byte_perm_S (w[ 0], w[ 1], selector); + w[ 4] = hc_byte_perm_S ( 0, w[ 0], selector); w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; @@ -47064,65 +47064,65 @@ DECLSPEC void switch_buffer_by_offset_1x64_le_S (u32 *w, const u32 offset) break; case 5: - w[63] = __byte_perm_S (w[57], w[58], selector); - w[62] = __byte_perm_S (w[56], w[57], selector); - w[61] = __byte_perm_S (w[55], w[56], selector); - w[60] = __byte_perm_S (w[54], w[55], selector); - w[59] = __byte_perm_S (w[53], w[54], selector); - w[58] = __byte_perm_S (w[52], w[53], selector); - w[57] = __byte_perm_S (w[51], w[52], selector); - w[56] = __byte_perm_S (w[50], w[51], selector); - w[55] = __byte_perm_S (w[49], w[50], selector); - w[54] = __byte_perm_S (w[48], w[49], selector); - w[53] = __byte_perm_S (w[47], w[48], selector); - w[52] = __byte_perm_S (w[46], w[47], selector); - w[51] = __byte_perm_S (w[45], w[46], selector); - w[50] = __byte_perm_S (w[44], w[45], selector); - w[49] = __byte_perm_S (w[43], w[44], selector); - w[48] = __byte_perm_S (w[42], w[43], selector); - w[47] = __byte_perm_S (w[41], w[42], selector); - w[46] = __byte_perm_S (w[40], w[41], selector); - w[45] = __byte_perm_S (w[39], w[40], selector); - w[44] = __byte_perm_S (w[38], w[39], selector); - w[43] = __byte_perm_S (w[37], w[38], selector); - w[42] = __byte_perm_S (w[36], w[37], selector); - w[41] = __byte_perm_S (w[35], w[36], selector); - w[40] = __byte_perm_S (w[34], w[35], selector); - w[39] = __byte_perm_S (w[33], w[34], selector); - w[38] = __byte_perm_S (w[32], w[33], selector); - w[37] = __byte_perm_S (w[31], w[32], selector); - w[36] = __byte_perm_S (w[30], w[31], selector); - w[35] = __byte_perm_S (w[29], w[30], selector); - w[34] = __byte_perm_S (w[28], w[29], selector); - w[33] = __byte_perm_S (w[27], w[28], selector); - w[32] = __byte_perm_S (w[26], w[27], selector); - w[31] = __byte_perm_S (w[25], w[26], selector); - w[30] = __byte_perm_S (w[24], w[25], selector); - w[29] = __byte_perm_S (w[23], w[24], selector); - w[28] = __byte_perm_S (w[22], w[23], selector); - w[27] = __byte_perm_S (w[21], w[22], selector); - w[26] = __byte_perm_S (w[20], w[21], selector); - w[25] = __byte_perm_S (w[19], w[20], selector); - w[24] = __byte_perm_S (w[18], w[19], selector); - w[23] = __byte_perm_S (w[17], w[18], selector); - w[22] = __byte_perm_S (w[16], w[17], selector); - w[21] = __byte_perm_S (w[15], w[16], selector); - w[20] = __byte_perm_S (w[14], w[15], selector); - w[19] = __byte_perm_S (w[13], w[14], selector); - w[18] = __byte_perm_S (w[12], w[13], selector); - w[17] = __byte_perm_S (w[11], w[12], selector); - w[16] = __byte_perm_S (w[10], w[11], selector); - w[15] = __byte_perm_S (w[ 9], w[10], selector); - w[14] = __byte_perm_S (w[ 8], w[ 9], selector); - w[13] = __byte_perm_S (w[ 7], w[ 8], selector); - w[12] = __byte_perm_S (w[ 6], w[ 7], selector); - w[11] = __byte_perm_S (w[ 5], w[ 6], selector); - w[10] = __byte_perm_S (w[ 4], w[ 5], selector); - w[ 9] = __byte_perm_S (w[ 3], w[ 4], selector); - w[ 8] = __byte_perm_S (w[ 2], w[ 3], selector); - w[ 7] = __byte_perm_S (w[ 1], w[ 2], selector); - w[ 6] = __byte_perm_S (w[ 0], w[ 1], selector); - w[ 5] = __byte_perm_S ( 0, w[ 0], selector); + w[63] = hc_byte_perm_S (w[57], w[58], selector); + w[62] = hc_byte_perm_S (w[56], w[57], selector); + w[61] = hc_byte_perm_S (w[55], w[56], selector); + w[60] = hc_byte_perm_S (w[54], w[55], selector); + w[59] = hc_byte_perm_S (w[53], w[54], selector); + w[58] = hc_byte_perm_S (w[52], w[53], selector); + w[57] = hc_byte_perm_S (w[51], w[52], selector); + w[56] = hc_byte_perm_S (w[50], w[51], selector); + w[55] = hc_byte_perm_S (w[49], w[50], selector); + w[54] = hc_byte_perm_S (w[48], w[49], selector); + w[53] = hc_byte_perm_S (w[47], w[48], selector); + w[52] = hc_byte_perm_S (w[46], w[47], selector); + w[51] = hc_byte_perm_S (w[45], w[46], selector); + w[50] = hc_byte_perm_S (w[44], w[45], selector); + w[49] = hc_byte_perm_S (w[43], w[44], selector); + w[48] = hc_byte_perm_S (w[42], w[43], selector); + w[47] = hc_byte_perm_S (w[41], w[42], selector); + w[46] = hc_byte_perm_S (w[40], w[41], selector); + w[45] = hc_byte_perm_S (w[39], w[40], selector); + w[44] = hc_byte_perm_S (w[38], w[39], selector); + w[43] = hc_byte_perm_S (w[37], w[38], selector); + w[42] = hc_byte_perm_S (w[36], w[37], selector); + w[41] = hc_byte_perm_S (w[35], w[36], selector); + w[40] = hc_byte_perm_S (w[34], w[35], selector); + w[39] = hc_byte_perm_S (w[33], w[34], selector); + w[38] = hc_byte_perm_S (w[32], w[33], selector); + w[37] = hc_byte_perm_S (w[31], w[32], selector); + w[36] = hc_byte_perm_S (w[30], w[31], selector); + w[35] = hc_byte_perm_S (w[29], w[30], selector); + w[34] = hc_byte_perm_S (w[28], w[29], selector); + w[33] = hc_byte_perm_S (w[27], w[28], selector); + w[32] = hc_byte_perm_S (w[26], w[27], selector); + w[31] = hc_byte_perm_S (w[25], w[26], selector); + w[30] = hc_byte_perm_S (w[24], w[25], selector); + w[29] = hc_byte_perm_S (w[23], w[24], selector); + w[28] = hc_byte_perm_S (w[22], w[23], selector); + w[27] = hc_byte_perm_S (w[21], w[22], selector); + w[26] = hc_byte_perm_S (w[20], w[21], selector); + w[25] = hc_byte_perm_S (w[19], w[20], selector); + w[24] = hc_byte_perm_S (w[18], w[19], selector); + w[23] = hc_byte_perm_S (w[17], w[18], selector); + w[22] = hc_byte_perm_S (w[16], w[17], selector); + w[21] = hc_byte_perm_S (w[15], w[16], selector); + w[20] = hc_byte_perm_S (w[14], w[15], selector); + w[19] = hc_byte_perm_S (w[13], w[14], selector); + w[18] = hc_byte_perm_S (w[12], w[13], selector); + w[17] = hc_byte_perm_S (w[11], w[12], selector); + w[16] = hc_byte_perm_S (w[10], w[11], selector); + w[15] = hc_byte_perm_S (w[ 9], w[10], selector); + w[14] = hc_byte_perm_S (w[ 8], w[ 9], selector); + w[13] = hc_byte_perm_S (w[ 7], w[ 8], selector); + w[12] = hc_byte_perm_S (w[ 6], w[ 7], selector); + w[11] = hc_byte_perm_S (w[ 5], w[ 6], selector); + w[10] = hc_byte_perm_S (w[ 4], w[ 5], selector); + w[ 9] = hc_byte_perm_S (w[ 3], w[ 4], selector); + w[ 8] = hc_byte_perm_S (w[ 2], w[ 3], selector); + w[ 7] = hc_byte_perm_S (w[ 1], w[ 2], selector); + w[ 6] = hc_byte_perm_S (w[ 0], w[ 1], selector); + w[ 5] = hc_byte_perm_S ( 0, w[ 0], selector); w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; @@ -47132,64 +47132,64 @@ DECLSPEC void switch_buffer_by_offset_1x64_le_S (u32 *w, const u32 offset) break; case 6: - w[63] = __byte_perm_S (w[56], w[57], selector); - w[62] = __byte_perm_S (w[55], w[56], selector); - w[61] = __byte_perm_S (w[54], w[55], selector); - w[60] = __byte_perm_S (w[53], w[54], selector); - w[59] = __byte_perm_S (w[52], w[53], selector); - w[58] = __byte_perm_S (w[51], w[52], selector); - w[57] = __byte_perm_S (w[50], w[51], selector); - w[56] = __byte_perm_S (w[49], w[50], selector); - w[55] = __byte_perm_S (w[48], w[49], selector); - w[54] = __byte_perm_S (w[47], w[48], selector); - w[53] = __byte_perm_S (w[46], w[47], selector); - w[52] = __byte_perm_S (w[45], w[46], selector); - w[51] = __byte_perm_S (w[44], w[45], selector); - w[50] = __byte_perm_S (w[43], w[44], selector); - w[49] = __byte_perm_S (w[42], w[43], selector); - w[48] = __byte_perm_S (w[41], w[42], selector); - w[47] = __byte_perm_S (w[40], w[41], selector); - w[46] = __byte_perm_S (w[39], w[40], selector); - w[45] = __byte_perm_S (w[38], w[39], selector); - w[44] = __byte_perm_S (w[37], w[38], selector); - w[43] = __byte_perm_S (w[36], w[37], selector); - w[42] = __byte_perm_S (w[35], w[36], selector); - w[41] = __byte_perm_S (w[34], w[35], selector); - w[40] = __byte_perm_S (w[33], w[34], selector); - w[39] = __byte_perm_S (w[32], w[33], selector); - w[38] = __byte_perm_S (w[31], w[32], selector); - w[37] = __byte_perm_S (w[30], w[31], selector); - w[36] = __byte_perm_S (w[29], w[30], selector); - w[35] = __byte_perm_S (w[28], w[29], selector); - w[34] = __byte_perm_S (w[27], w[28], selector); - w[33] = __byte_perm_S (w[26], w[27], selector); - w[32] = __byte_perm_S (w[25], w[26], selector); - w[31] = __byte_perm_S (w[24], w[25], selector); - w[30] = __byte_perm_S (w[23], w[24], selector); - w[29] = __byte_perm_S (w[22], w[23], selector); - w[28] = __byte_perm_S (w[21], w[22], selector); - w[27] = __byte_perm_S (w[20], w[21], selector); - w[26] = __byte_perm_S (w[19], w[20], selector); - w[25] = __byte_perm_S (w[18], w[19], selector); - w[24] = __byte_perm_S (w[17], w[18], selector); - w[23] = __byte_perm_S (w[16], w[17], selector); - w[22] = __byte_perm_S (w[15], w[16], selector); - w[21] = __byte_perm_S (w[14], w[15], selector); - w[20] = __byte_perm_S (w[13], w[14], selector); - w[19] = __byte_perm_S (w[12], w[13], selector); - w[18] = __byte_perm_S (w[11], w[12], selector); - w[17] = __byte_perm_S (w[10], w[11], selector); - w[16] = __byte_perm_S (w[ 9], w[10], selector); - w[15] = __byte_perm_S (w[ 8], w[ 9], selector); - w[14] = __byte_perm_S (w[ 7], w[ 8], selector); - w[13] = __byte_perm_S (w[ 6], w[ 7], selector); - w[12] = __byte_perm_S (w[ 5], w[ 6], selector); - w[11] = __byte_perm_S (w[ 4], w[ 5], selector); - w[10] = __byte_perm_S (w[ 3], w[ 4], selector); - w[ 9] = __byte_perm_S (w[ 2], w[ 3], selector); - w[ 8] = __byte_perm_S (w[ 1], w[ 2], selector); - w[ 7] = __byte_perm_S (w[ 0], w[ 1], selector); - w[ 6] = __byte_perm_S ( 0, w[ 0], selector); + w[63] = hc_byte_perm_S (w[56], w[57], selector); + w[62] = hc_byte_perm_S (w[55], w[56], selector); + w[61] = hc_byte_perm_S (w[54], w[55], selector); + w[60] = hc_byte_perm_S (w[53], w[54], selector); + w[59] = hc_byte_perm_S (w[52], w[53], selector); + w[58] = hc_byte_perm_S (w[51], w[52], selector); + w[57] = hc_byte_perm_S (w[50], w[51], selector); + w[56] = hc_byte_perm_S (w[49], w[50], selector); + w[55] = hc_byte_perm_S (w[48], w[49], selector); + w[54] = hc_byte_perm_S (w[47], w[48], selector); + w[53] = hc_byte_perm_S (w[46], w[47], selector); + w[52] = hc_byte_perm_S (w[45], w[46], selector); + w[51] = hc_byte_perm_S (w[44], w[45], selector); + w[50] = hc_byte_perm_S (w[43], w[44], selector); + w[49] = hc_byte_perm_S (w[42], w[43], selector); + w[48] = hc_byte_perm_S (w[41], w[42], selector); + w[47] = hc_byte_perm_S (w[40], w[41], selector); + w[46] = hc_byte_perm_S (w[39], w[40], selector); + w[45] = hc_byte_perm_S (w[38], w[39], selector); + w[44] = hc_byte_perm_S (w[37], w[38], selector); + w[43] = hc_byte_perm_S (w[36], w[37], selector); + w[42] = hc_byte_perm_S (w[35], w[36], selector); + w[41] = hc_byte_perm_S (w[34], w[35], selector); + w[40] = hc_byte_perm_S (w[33], w[34], selector); + w[39] = hc_byte_perm_S (w[32], w[33], selector); + w[38] = hc_byte_perm_S (w[31], w[32], selector); + w[37] = hc_byte_perm_S (w[30], w[31], selector); + w[36] = hc_byte_perm_S (w[29], w[30], selector); + w[35] = hc_byte_perm_S (w[28], w[29], selector); + w[34] = hc_byte_perm_S (w[27], w[28], selector); + w[33] = hc_byte_perm_S (w[26], w[27], selector); + w[32] = hc_byte_perm_S (w[25], w[26], selector); + w[31] = hc_byte_perm_S (w[24], w[25], selector); + w[30] = hc_byte_perm_S (w[23], w[24], selector); + w[29] = hc_byte_perm_S (w[22], w[23], selector); + w[28] = hc_byte_perm_S (w[21], w[22], selector); + w[27] = hc_byte_perm_S (w[20], w[21], selector); + w[26] = hc_byte_perm_S (w[19], w[20], selector); + w[25] = hc_byte_perm_S (w[18], w[19], selector); + w[24] = hc_byte_perm_S (w[17], w[18], selector); + w[23] = hc_byte_perm_S (w[16], w[17], selector); + w[22] = hc_byte_perm_S (w[15], w[16], selector); + w[21] = hc_byte_perm_S (w[14], w[15], selector); + w[20] = hc_byte_perm_S (w[13], w[14], selector); + w[19] = hc_byte_perm_S (w[12], w[13], selector); + w[18] = hc_byte_perm_S (w[11], w[12], selector); + w[17] = hc_byte_perm_S (w[10], w[11], selector); + w[16] = hc_byte_perm_S (w[ 9], w[10], selector); + w[15] = hc_byte_perm_S (w[ 8], w[ 9], selector); + w[14] = hc_byte_perm_S (w[ 7], w[ 8], selector); + w[13] = hc_byte_perm_S (w[ 6], w[ 7], selector); + w[12] = hc_byte_perm_S (w[ 5], w[ 6], selector); + w[11] = hc_byte_perm_S (w[ 4], w[ 5], selector); + w[10] = hc_byte_perm_S (w[ 3], w[ 4], selector); + w[ 9] = hc_byte_perm_S (w[ 2], w[ 3], selector); + w[ 8] = hc_byte_perm_S (w[ 1], w[ 2], selector); + w[ 7] = hc_byte_perm_S (w[ 0], w[ 1], selector); + w[ 6] = hc_byte_perm_S ( 0, w[ 0], selector); w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; @@ -47200,63 +47200,63 @@ DECLSPEC void switch_buffer_by_offset_1x64_le_S (u32 *w, const u32 offset) break; case 7: - w[63] = __byte_perm_S (w[55], w[56], selector); - w[62] = __byte_perm_S (w[54], w[55], selector); - w[61] = __byte_perm_S (w[53], w[54], selector); - w[60] = __byte_perm_S (w[52], w[53], selector); - w[59] = __byte_perm_S (w[51], w[52], selector); - w[58] = __byte_perm_S (w[50], w[51], selector); - w[57] = __byte_perm_S (w[49], w[50], selector); - w[56] = __byte_perm_S (w[48], w[49], selector); - w[55] = __byte_perm_S (w[47], w[48], selector); - w[54] = __byte_perm_S (w[46], w[47], selector); - w[53] = __byte_perm_S (w[45], w[46], selector); - w[52] = __byte_perm_S (w[44], w[45], selector); - w[51] = __byte_perm_S (w[43], w[44], selector); - w[50] = __byte_perm_S (w[42], w[43], selector); - w[49] = __byte_perm_S (w[41], w[42], selector); - w[48] = __byte_perm_S (w[40], w[41], selector); - w[47] = __byte_perm_S (w[39], w[40], selector); - w[46] = __byte_perm_S (w[38], w[39], selector); - w[45] = __byte_perm_S (w[37], w[38], selector); - w[44] = __byte_perm_S (w[36], w[37], selector); - w[43] = __byte_perm_S (w[35], w[36], selector); - w[42] = __byte_perm_S (w[34], w[35], selector); - w[41] = __byte_perm_S (w[33], w[34], selector); - w[40] = __byte_perm_S (w[32], w[33], selector); - w[39] = __byte_perm_S (w[31], w[32], selector); - w[38] = __byte_perm_S (w[30], w[31], selector); - w[37] = __byte_perm_S (w[29], w[30], selector); - w[36] = __byte_perm_S (w[28], w[29], selector); - w[35] = __byte_perm_S (w[27], w[28], selector); - w[34] = __byte_perm_S (w[26], w[27], selector); - w[33] = __byte_perm_S (w[25], w[26], selector); - w[32] = __byte_perm_S (w[24], w[25], selector); - w[31] = __byte_perm_S (w[23], w[24], selector); - w[30] = __byte_perm_S (w[22], w[23], selector); - w[29] = __byte_perm_S (w[21], w[22], selector); - w[28] = __byte_perm_S (w[20], w[21], selector); - w[27] = __byte_perm_S (w[19], w[20], selector); - w[26] = __byte_perm_S (w[18], w[19], selector); - w[25] = __byte_perm_S (w[17], w[18], selector); - w[24] = __byte_perm_S (w[16], w[17], selector); - w[23] = __byte_perm_S (w[15], w[16], selector); - w[22] = __byte_perm_S (w[14], w[15], selector); - w[21] = __byte_perm_S (w[13], w[14], selector); - w[20] = __byte_perm_S (w[12], w[13], selector); - w[19] = __byte_perm_S (w[11], w[12], selector); - w[18] = __byte_perm_S (w[10], w[11], selector); - w[17] = __byte_perm_S (w[ 9], w[10], selector); - w[16] = __byte_perm_S (w[ 8], w[ 9], selector); - w[15] = __byte_perm_S (w[ 7], w[ 8], selector); - w[14] = __byte_perm_S (w[ 6], w[ 7], selector); - w[13] = __byte_perm_S (w[ 5], w[ 6], selector); - w[12] = __byte_perm_S (w[ 4], w[ 5], selector); - w[11] = __byte_perm_S (w[ 3], w[ 4], selector); - w[10] = __byte_perm_S (w[ 2], w[ 3], selector); - w[ 9] = __byte_perm_S (w[ 1], w[ 2], selector); - w[ 8] = __byte_perm_S (w[ 0], w[ 1], selector); - w[ 7] = __byte_perm_S ( 0, w[ 0], selector); + w[63] = hc_byte_perm_S (w[55], w[56], selector); + w[62] = hc_byte_perm_S (w[54], w[55], selector); + w[61] = hc_byte_perm_S (w[53], w[54], selector); + w[60] = hc_byte_perm_S (w[52], w[53], selector); + w[59] = hc_byte_perm_S (w[51], w[52], selector); + w[58] = hc_byte_perm_S (w[50], w[51], selector); + w[57] = hc_byte_perm_S (w[49], w[50], selector); + w[56] = hc_byte_perm_S (w[48], w[49], selector); + w[55] = hc_byte_perm_S (w[47], w[48], selector); + w[54] = hc_byte_perm_S (w[46], w[47], selector); + w[53] = hc_byte_perm_S (w[45], w[46], selector); + w[52] = hc_byte_perm_S (w[44], w[45], selector); + w[51] = hc_byte_perm_S (w[43], w[44], selector); + w[50] = hc_byte_perm_S (w[42], w[43], selector); + w[49] = hc_byte_perm_S (w[41], w[42], selector); + w[48] = hc_byte_perm_S (w[40], w[41], selector); + w[47] = hc_byte_perm_S (w[39], w[40], selector); + w[46] = hc_byte_perm_S (w[38], w[39], selector); + w[45] = hc_byte_perm_S (w[37], w[38], selector); + w[44] = hc_byte_perm_S (w[36], w[37], selector); + w[43] = hc_byte_perm_S (w[35], w[36], selector); + w[42] = hc_byte_perm_S (w[34], w[35], selector); + w[41] = hc_byte_perm_S (w[33], w[34], selector); + w[40] = hc_byte_perm_S (w[32], w[33], selector); + w[39] = hc_byte_perm_S (w[31], w[32], selector); + w[38] = hc_byte_perm_S (w[30], w[31], selector); + w[37] = hc_byte_perm_S (w[29], w[30], selector); + w[36] = hc_byte_perm_S (w[28], w[29], selector); + w[35] = hc_byte_perm_S (w[27], w[28], selector); + w[34] = hc_byte_perm_S (w[26], w[27], selector); + w[33] = hc_byte_perm_S (w[25], w[26], selector); + w[32] = hc_byte_perm_S (w[24], w[25], selector); + w[31] = hc_byte_perm_S (w[23], w[24], selector); + w[30] = hc_byte_perm_S (w[22], w[23], selector); + w[29] = hc_byte_perm_S (w[21], w[22], selector); + w[28] = hc_byte_perm_S (w[20], w[21], selector); + w[27] = hc_byte_perm_S (w[19], w[20], selector); + w[26] = hc_byte_perm_S (w[18], w[19], selector); + w[25] = hc_byte_perm_S (w[17], w[18], selector); + w[24] = hc_byte_perm_S (w[16], w[17], selector); + w[23] = hc_byte_perm_S (w[15], w[16], selector); + w[22] = hc_byte_perm_S (w[14], w[15], selector); + w[21] = hc_byte_perm_S (w[13], w[14], selector); + w[20] = hc_byte_perm_S (w[12], w[13], selector); + w[19] = hc_byte_perm_S (w[11], w[12], selector); + w[18] = hc_byte_perm_S (w[10], w[11], selector); + w[17] = hc_byte_perm_S (w[ 9], w[10], selector); + w[16] = hc_byte_perm_S (w[ 8], w[ 9], selector); + w[15] = hc_byte_perm_S (w[ 7], w[ 8], selector); + w[14] = hc_byte_perm_S (w[ 6], w[ 7], selector); + w[13] = hc_byte_perm_S (w[ 5], w[ 6], selector); + w[12] = hc_byte_perm_S (w[ 4], w[ 5], selector); + w[11] = hc_byte_perm_S (w[ 3], w[ 4], selector); + w[10] = hc_byte_perm_S (w[ 2], w[ 3], selector); + w[ 9] = hc_byte_perm_S (w[ 1], w[ 2], selector); + w[ 8] = hc_byte_perm_S (w[ 0], w[ 1], selector); + w[ 7] = hc_byte_perm_S ( 0, w[ 0], selector); w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; @@ -47268,62 +47268,62 @@ DECLSPEC void switch_buffer_by_offset_1x64_le_S (u32 *w, const u32 offset) break; case 8: - w[63] = __byte_perm_S (w[54], w[55], selector); - w[62] = __byte_perm_S (w[53], w[54], selector); - w[61] = __byte_perm_S (w[52], w[53], selector); - w[60] = __byte_perm_S (w[51], w[52], selector); - w[59] = __byte_perm_S (w[50], w[51], selector); - w[58] = __byte_perm_S (w[49], w[50], selector); - w[57] = __byte_perm_S (w[48], w[49], selector); - w[56] = __byte_perm_S (w[47], w[48], selector); - w[55] = __byte_perm_S (w[46], w[47], selector); - w[54] = __byte_perm_S (w[45], w[46], selector); - w[53] = __byte_perm_S (w[44], w[45], selector); - w[52] = __byte_perm_S (w[43], w[44], selector); - w[51] = __byte_perm_S (w[42], w[43], selector); - w[50] = __byte_perm_S (w[41], w[42], selector); - w[49] = __byte_perm_S (w[40], w[41], selector); - w[48] = __byte_perm_S (w[39], w[40], selector); - w[47] = __byte_perm_S (w[38], w[39], selector); - w[46] = __byte_perm_S (w[37], w[38], selector); - w[45] = __byte_perm_S (w[36], w[37], selector); - w[44] = __byte_perm_S (w[35], w[36], selector); - w[43] = __byte_perm_S (w[34], w[35], selector); - w[42] = __byte_perm_S (w[33], w[34], selector); - w[41] = __byte_perm_S (w[32], w[33], selector); - w[40] = __byte_perm_S (w[31], w[32], selector); - w[39] = __byte_perm_S (w[30], w[31], selector); - w[38] = __byte_perm_S (w[29], w[30], selector); - w[37] = __byte_perm_S (w[28], w[29], selector); - w[36] = __byte_perm_S (w[27], w[28], selector); - w[35] = __byte_perm_S (w[26], w[27], selector); - w[34] = __byte_perm_S (w[25], w[26], selector); - w[33] = __byte_perm_S (w[24], w[25], selector); - w[32] = __byte_perm_S (w[23], w[24], selector); - w[31] = __byte_perm_S (w[22], w[23], selector); - w[30] = __byte_perm_S (w[21], w[22], selector); - w[29] = __byte_perm_S (w[20], w[21], selector); - w[28] = __byte_perm_S (w[19], w[20], selector); - w[27] = __byte_perm_S (w[18], w[19], selector); - w[26] = __byte_perm_S (w[17], w[18], selector); - w[25] = __byte_perm_S (w[16], w[17], selector); - w[24] = __byte_perm_S (w[15], w[16], selector); - w[23] = __byte_perm_S (w[14], w[15], selector); - w[22] = __byte_perm_S (w[13], w[14], selector); - w[21] = __byte_perm_S (w[12], w[13], selector); - w[20] = __byte_perm_S (w[11], w[12], selector); - w[19] = __byte_perm_S (w[10], w[11], selector); - w[18] = __byte_perm_S (w[ 9], w[10], selector); - w[17] = __byte_perm_S (w[ 8], w[ 9], selector); - w[16] = __byte_perm_S (w[ 7], w[ 8], selector); - w[15] = __byte_perm_S (w[ 6], w[ 7], selector); - w[14] = __byte_perm_S (w[ 5], w[ 6], selector); - w[13] = __byte_perm_S (w[ 4], w[ 5], selector); - w[12] = __byte_perm_S (w[ 3], w[ 4], selector); - w[11] = __byte_perm_S (w[ 2], w[ 3], selector); - w[10] = __byte_perm_S (w[ 1], w[ 2], selector); - w[ 9] = __byte_perm_S (w[ 0], w[ 1], selector); - w[ 8] = __byte_perm_S ( 0, w[ 0], selector); + w[63] = hc_byte_perm_S (w[54], w[55], selector); + w[62] = hc_byte_perm_S (w[53], w[54], selector); + w[61] = hc_byte_perm_S (w[52], w[53], selector); + w[60] = hc_byte_perm_S (w[51], w[52], selector); + w[59] = hc_byte_perm_S (w[50], w[51], selector); + w[58] = hc_byte_perm_S (w[49], w[50], selector); + w[57] = hc_byte_perm_S (w[48], w[49], selector); + w[56] = hc_byte_perm_S (w[47], w[48], selector); + w[55] = hc_byte_perm_S (w[46], w[47], selector); + w[54] = hc_byte_perm_S (w[45], w[46], selector); + w[53] = hc_byte_perm_S (w[44], w[45], selector); + w[52] = hc_byte_perm_S (w[43], w[44], selector); + w[51] = hc_byte_perm_S (w[42], w[43], selector); + w[50] = hc_byte_perm_S (w[41], w[42], selector); + w[49] = hc_byte_perm_S (w[40], w[41], selector); + w[48] = hc_byte_perm_S (w[39], w[40], selector); + w[47] = hc_byte_perm_S (w[38], w[39], selector); + w[46] = hc_byte_perm_S (w[37], w[38], selector); + w[45] = hc_byte_perm_S (w[36], w[37], selector); + w[44] = hc_byte_perm_S (w[35], w[36], selector); + w[43] = hc_byte_perm_S (w[34], w[35], selector); + w[42] = hc_byte_perm_S (w[33], w[34], selector); + w[41] = hc_byte_perm_S (w[32], w[33], selector); + w[40] = hc_byte_perm_S (w[31], w[32], selector); + w[39] = hc_byte_perm_S (w[30], w[31], selector); + w[38] = hc_byte_perm_S (w[29], w[30], selector); + w[37] = hc_byte_perm_S (w[28], w[29], selector); + w[36] = hc_byte_perm_S (w[27], w[28], selector); + w[35] = hc_byte_perm_S (w[26], w[27], selector); + w[34] = hc_byte_perm_S (w[25], w[26], selector); + w[33] = hc_byte_perm_S (w[24], w[25], selector); + w[32] = hc_byte_perm_S (w[23], w[24], selector); + w[31] = hc_byte_perm_S (w[22], w[23], selector); + w[30] = hc_byte_perm_S (w[21], w[22], selector); + w[29] = hc_byte_perm_S (w[20], w[21], selector); + w[28] = hc_byte_perm_S (w[19], w[20], selector); + w[27] = hc_byte_perm_S (w[18], w[19], selector); + w[26] = hc_byte_perm_S (w[17], w[18], selector); + w[25] = hc_byte_perm_S (w[16], w[17], selector); + w[24] = hc_byte_perm_S (w[15], w[16], selector); + w[23] = hc_byte_perm_S (w[14], w[15], selector); + w[22] = hc_byte_perm_S (w[13], w[14], selector); + w[21] = hc_byte_perm_S (w[12], w[13], selector); + w[20] = hc_byte_perm_S (w[11], w[12], selector); + w[19] = hc_byte_perm_S (w[10], w[11], selector); + w[18] = hc_byte_perm_S (w[ 9], w[10], selector); + w[17] = hc_byte_perm_S (w[ 8], w[ 9], selector); + w[16] = hc_byte_perm_S (w[ 7], w[ 8], selector); + w[15] = hc_byte_perm_S (w[ 6], w[ 7], selector); + w[14] = hc_byte_perm_S (w[ 5], w[ 6], selector); + w[13] = hc_byte_perm_S (w[ 4], w[ 5], selector); + w[12] = hc_byte_perm_S (w[ 3], w[ 4], selector); + w[11] = hc_byte_perm_S (w[ 2], w[ 3], selector); + w[10] = hc_byte_perm_S (w[ 1], w[ 2], selector); + w[ 9] = hc_byte_perm_S (w[ 0], w[ 1], selector); + w[ 8] = hc_byte_perm_S ( 0, w[ 0], selector); w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; @@ -47336,61 +47336,61 @@ DECLSPEC void switch_buffer_by_offset_1x64_le_S (u32 *w, const u32 offset) break; case 9: - w[63] = __byte_perm_S (w[53], w[54], selector); - w[62] = __byte_perm_S (w[52], w[53], selector); - w[61] = __byte_perm_S (w[51], w[52], selector); - w[60] = __byte_perm_S (w[50], w[51], selector); - w[59] = __byte_perm_S (w[49], w[50], selector); - w[58] = __byte_perm_S (w[48], w[49], selector); - w[57] = __byte_perm_S (w[47], w[48], selector); - w[56] = __byte_perm_S (w[46], w[47], selector); - w[55] = __byte_perm_S (w[45], w[46], selector); - w[54] = __byte_perm_S (w[44], w[45], selector); - w[53] = __byte_perm_S (w[43], w[44], selector); - w[52] = __byte_perm_S (w[42], w[43], selector); - w[51] = __byte_perm_S (w[41], w[42], selector); - w[50] = __byte_perm_S (w[40], w[41], selector); - w[49] = __byte_perm_S (w[39], w[40], selector); - w[48] = __byte_perm_S (w[38], w[39], selector); - w[47] = __byte_perm_S (w[37], w[38], selector); - w[46] = __byte_perm_S (w[36], w[37], selector); - w[45] = __byte_perm_S (w[35], w[36], selector); - w[44] = __byte_perm_S (w[34], w[35], selector); - w[43] = __byte_perm_S (w[33], w[34], selector); - w[42] = __byte_perm_S (w[32], w[33], selector); - w[41] = __byte_perm_S (w[31], w[32], selector); - w[40] = __byte_perm_S (w[30], w[31], selector); - w[39] = __byte_perm_S (w[29], w[30], selector); - w[38] = __byte_perm_S (w[28], w[29], selector); - w[37] = __byte_perm_S (w[27], w[28], selector); - w[36] = __byte_perm_S (w[26], w[27], selector); - w[35] = __byte_perm_S (w[25], w[26], selector); - w[34] = __byte_perm_S (w[24], w[25], selector); - w[33] = __byte_perm_S (w[23], w[24], selector); - w[32] = __byte_perm_S (w[22], w[23], selector); - w[31] = __byte_perm_S (w[21], w[22], selector); - w[30] = __byte_perm_S (w[20], w[21], selector); - w[29] = __byte_perm_S (w[19], w[20], selector); - w[28] = __byte_perm_S (w[18], w[19], selector); - w[27] = __byte_perm_S (w[17], w[18], selector); - w[26] = __byte_perm_S (w[16], w[17], selector); - w[25] = __byte_perm_S (w[15], w[16], selector); - w[24] = __byte_perm_S (w[14], w[15], selector); - w[23] = __byte_perm_S (w[13], w[14], selector); - w[22] = __byte_perm_S (w[12], w[13], selector); - w[21] = __byte_perm_S (w[11], w[12], selector); - w[20] = __byte_perm_S (w[10], w[11], selector); - w[19] = __byte_perm_S (w[ 9], w[10], selector); - w[18] = __byte_perm_S (w[ 8], w[ 9], selector); - w[17] = __byte_perm_S (w[ 7], w[ 8], selector); - w[16] = __byte_perm_S (w[ 6], w[ 7], selector); - w[15] = __byte_perm_S (w[ 5], w[ 6], selector); - w[14] = __byte_perm_S (w[ 4], w[ 5], selector); - w[13] = __byte_perm_S (w[ 3], w[ 4], selector); - w[12] = __byte_perm_S (w[ 2], w[ 3], selector); - w[11] = __byte_perm_S (w[ 1], w[ 2], selector); - w[10] = __byte_perm_S (w[ 0], w[ 1], selector); - w[ 9] = __byte_perm_S ( 0, w[ 0], selector); + w[63] = hc_byte_perm_S (w[53], w[54], selector); + w[62] = hc_byte_perm_S (w[52], w[53], selector); + w[61] = hc_byte_perm_S (w[51], w[52], selector); + w[60] = hc_byte_perm_S (w[50], w[51], selector); + w[59] = hc_byte_perm_S (w[49], w[50], selector); + w[58] = hc_byte_perm_S (w[48], w[49], selector); + w[57] = hc_byte_perm_S (w[47], w[48], selector); + w[56] = hc_byte_perm_S (w[46], w[47], selector); + w[55] = hc_byte_perm_S (w[45], w[46], selector); + w[54] = hc_byte_perm_S (w[44], w[45], selector); + w[53] = hc_byte_perm_S (w[43], w[44], selector); + w[52] = hc_byte_perm_S (w[42], w[43], selector); + w[51] = hc_byte_perm_S (w[41], w[42], selector); + w[50] = hc_byte_perm_S (w[40], w[41], selector); + w[49] = hc_byte_perm_S (w[39], w[40], selector); + w[48] = hc_byte_perm_S (w[38], w[39], selector); + w[47] = hc_byte_perm_S (w[37], w[38], selector); + w[46] = hc_byte_perm_S (w[36], w[37], selector); + w[45] = hc_byte_perm_S (w[35], w[36], selector); + w[44] = hc_byte_perm_S (w[34], w[35], selector); + w[43] = hc_byte_perm_S (w[33], w[34], selector); + w[42] = hc_byte_perm_S (w[32], w[33], selector); + w[41] = hc_byte_perm_S (w[31], w[32], selector); + w[40] = hc_byte_perm_S (w[30], w[31], selector); + w[39] = hc_byte_perm_S (w[29], w[30], selector); + w[38] = hc_byte_perm_S (w[28], w[29], selector); + w[37] = hc_byte_perm_S (w[27], w[28], selector); + w[36] = hc_byte_perm_S (w[26], w[27], selector); + w[35] = hc_byte_perm_S (w[25], w[26], selector); + w[34] = hc_byte_perm_S (w[24], w[25], selector); + w[33] = hc_byte_perm_S (w[23], w[24], selector); + w[32] = hc_byte_perm_S (w[22], w[23], selector); + w[31] = hc_byte_perm_S (w[21], w[22], selector); + w[30] = hc_byte_perm_S (w[20], w[21], selector); + w[29] = hc_byte_perm_S (w[19], w[20], selector); + w[28] = hc_byte_perm_S (w[18], w[19], selector); + w[27] = hc_byte_perm_S (w[17], w[18], selector); + w[26] = hc_byte_perm_S (w[16], w[17], selector); + w[25] = hc_byte_perm_S (w[15], w[16], selector); + w[24] = hc_byte_perm_S (w[14], w[15], selector); + w[23] = hc_byte_perm_S (w[13], w[14], selector); + w[22] = hc_byte_perm_S (w[12], w[13], selector); + w[21] = hc_byte_perm_S (w[11], w[12], selector); + w[20] = hc_byte_perm_S (w[10], w[11], selector); + w[19] = hc_byte_perm_S (w[ 9], w[10], selector); + w[18] = hc_byte_perm_S (w[ 8], w[ 9], selector); + w[17] = hc_byte_perm_S (w[ 7], w[ 8], selector); + w[16] = hc_byte_perm_S (w[ 6], w[ 7], selector); + w[15] = hc_byte_perm_S (w[ 5], w[ 6], selector); + w[14] = hc_byte_perm_S (w[ 4], w[ 5], selector); + w[13] = hc_byte_perm_S (w[ 3], w[ 4], selector); + w[12] = hc_byte_perm_S (w[ 2], w[ 3], selector); + w[11] = hc_byte_perm_S (w[ 1], w[ 2], selector); + w[10] = hc_byte_perm_S (w[ 0], w[ 1], selector); + w[ 9] = hc_byte_perm_S ( 0, w[ 0], selector); w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; @@ -47404,60 +47404,60 @@ DECLSPEC void switch_buffer_by_offset_1x64_le_S (u32 *w, const u32 offset) break; case 10: - w[63] = __byte_perm_S (w[52], w[53], selector); - w[62] = __byte_perm_S (w[51], w[52], selector); - w[61] = __byte_perm_S (w[50], w[51], selector); - w[60] = __byte_perm_S (w[49], w[50], selector); - w[59] = __byte_perm_S (w[48], w[49], selector); - w[58] = __byte_perm_S (w[47], w[48], selector); - w[57] = __byte_perm_S (w[46], w[47], selector); - w[56] = __byte_perm_S (w[45], w[46], selector); - w[55] = __byte_perm_S (w[44], w[45], selector); - w[54] = __byte_perm_S (w[43], w[44], selector); - w[53] = __byte_perm_S (w[42], w[43], selector); - w[52] = __byte_perm_S (w[41], w[42], selector); - w[51] = __byte_perm_S (w[40], w[41], selector); - w[50] = __byte_perm_S (w[39], w[40], selector); - w[49] = __byte_perm_S (w[38], w[39], selector); - w[48] = __byte_perm_S (w[37], w[38], selector); - w[47] = __byte_perm_S (w[36], w[37], selector); - w[46] = __byte_perm_S (w[35], w[36], selector); - w[45] = __byte_perm_S (w[34], w[35], selector); - w[44] = __byte_perm_S (w[33], w[34], selector); - w[43] = __byte_perm_S (w[32], w[33], selector); - w[42] = __byte_perm_S (w[31], w[32], selector); - w[41] = __byte_perm_S (w[30], w[31], selector); - w[40] = __byte_perm_S (w[29], w[30], selector); - w[39] = __byte_perm_S (w[28], w[29], selector); - w[38] = __byte_perm_S (w[27], w[28], selector); - w[37] = __byte_perm_S (w[26], w[27], selector); - w[36] = __byte_perm_S (w[25], w[26], selector); - w[35] = __byte_perm_S (w[24], w[25], selector); - w[34] = __byte_perm_S (w[23], w[24], selector); - w[33] = __byte_perm_S (w[22], w[23], selector); - w[32] = __byte_perm_S (w[21], w[22], selector); - w[31] = __byte_perm_S (w[20], w[21], selector); - w[30] = __byte_perm_S (w[19], w[20], selector); - w[29] = __byte_perm_S (w[18], w[19], selector); - w[28] = __byte_perm_S (w[17], w[18], selector); - w[27] = __byte_perm_S (w[16], w[17], selector); - w[26] = __byte_perm_S (w[15], w[16], selector); - w[25] = __byte_perm_S (w[14], w[15], selector); - w[24] = __byte_perm_S (w[13], w[14], selector); - w[23] = __byte_perm_S (w[12], w[13], selector); - w[22] = __byte_perm_S (w[11], w[12], selector); - w[21] = __byte_perm_S (w[10], w[11], selector); - w[20] = __byte_perm_S (w[ 9], w[10], selector); - w[19] = __byte_perm_S (w[ 8], w[ 9], selector); - w[18] = __byte_perm_S (w[ 7], w[ 8], selector); - w[17] = __byte_perm_S (w[ 6], w[ 7], selector); - w[16] = __byte_perm_S (w[ 5], w[ 6], selector); - w[15] = __byte_perm_S (w[ 4], w[ 5], selector); - w[14] = __byte_perm_S (w[ 3], w[ 4], selector); - w[13] = __byte_perm_S (w[ 2], w[ 3], selector); - w[12] = __byte_perm_S (w[ 1], w[ 2], selector); - w[11] = __byte_perm_S (w[ 0], w[ 1], selector); - w[10] = __byte_perm_S ( 0, w[ 0], selector); + w[63] = hc_byte_perm_S (w[52], w[53], selector); + w[62] = hc_byte_perm_S (w[51], w[52], selector); + w[61] = hc_byte_perm_S (w[50], w[51], selector); + w[60] = hc_byte_perm_S (w[49], w[50], selector); + w[59] = hc_byte_perm_S (w[48], w[49], selector); + w[58] = hc_byte_perm_S (w[47], w[48], selector); + w[57] = hc_byte_perm_S (w[46], w[47], selector); + w[56] = hc_byte_perm_S (w[45], w[46], selector); + w[55] = hc_byte_perm_S (w[44], w[45], selector); + w[54] = hc_byte_perm_S (w[43], w[44], selector); + w[53] = hc_byte_perm_S (w[42], w[43], selector); + w[52] = hc_byte_perm_S (w[41], w[42], selector); + w[51] = hc_byte_perm_S (w[40], w[41], selector); + w[50] = hc_byte_perm_S (w[39], w[40], selector); + w[49] = hc_byte_perm_S (w[38], w[39], selector); + w[48] = hc_byte_perm_S (w[37], w[38], selector); + w[47] = hc_byte_perm_S (w[36], w[37], selector); + w[46] = hc_byte_perm_S (w[35], w[36], selector); + w[45] = hc_byte_perm_S (w[34], w[35], selector); + w[44] = hc_byte_perm_S (w[33], w[34], selector); + w[43] = hc_byte_perm_S (w[32], w[33], selector); + w[42] = hc_byte_perm_S (w[31], w[32], selector); + w[41] = hc_byte_perm_S (w[30], w[31], selector); + w[40] = hc_byte_perm_S (w[29], w[30], selector); + w[39] = hc_byte_perm_S (w[28], w[29], selector); + w[38] = hc_byte_perm_S (w[27], w[28], selector); + w[37] = hc_byte_perm_S (w[26], w[27], selector); + w[36] = hc_byte_perm_S (w[25], w[26], selector); + w[35] = hc_byte_perm_S (w[24], w[25], selector); + w[34] = hc_byte_perm_S (w[23], w[24], selector); + w[33] = hc_byte_perm_S (w[22], w[23], selector); + w[32] = hc_byte_perm_S (w[21], w[22], selector); + w[31] = hc_byte_perm_S (w[20], w[21], selector); + w[30] = hc_byte_perm_S (w[19], w[20], selector); + w[29] = hc_byte_perm_S (w[18], w[19], selector); + w[28] = hc_byte_perm_S (w[17], w[18], selector); + w[27] = hc_byte_perm_S (w[16], w[17], selector); + w[26] = hc_byte_perm_S (w[15], w[16], selector); + w[25] = hc_byte_perm_S (w[14], w[15], selector); + w[24] = hc_byte_perm_S (w[13], w[14], selector); + w[23] = hc_byte_perm_S (w[12], w[13], selector); + w[22] = hc_byte_perm_S (w[11], w[12], selector); + w[21] = hc_byte_perm_S (w[10], w[11], selector); + w[20] = hc_byte_perm_S (w[ 9], w[10], selector); + w[19] = hc_byte_perm_S (w[ 8], w[ 9], selector); + w[18] = hc_byte_perm_S (w[ 7], w[ 8], selector); + w[17] = hc_byte_perm_S (w[ 6], w[ 7], selector); + w[16] = hc_byte_perm_S (w[ 5], w[ 6], selector); + w[15] = hc_byte_perm_S (w[ 4], w[ 5], selector); + w[14] = hc_byte_perm_S (w[ 3], w[ 4], selector); + w[13] = hc_byte_perm_S (w[ 2], w[ 3], selector); + w[12] = hc_byte_perm_S (w[ 1], w[ 2], selector); + w[11] = hc_byte_perm_S (w[ 0], w[ 1], selector); + w[10] = hc_byte_perm_S ( 0, w[ 0], selector); w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; @@ -47472,59 +47472,59 @@ DECLSPEC void switch_buffer_by_offset_1x64_le_S (u32 *w, const u32 offset) break; case 11: - w[63] = __byte_perm_S (w[51], w[52], selector); - w[62] = __byte_perm_S (w[50], w[51], selector); - w[61] = __byte_perm_S (w[49], w[50], selector); - w[60] = __byte_perm_S (w[48], w[49], selector); - w[59] = __byte_perm_S (w[47], w[48], selector); - w[58] = __byte_perm_S (w[46], w[47], selector); - w[57] = __byte_perm_S (w[45], w[46], selector); - w[56] = __byte_perm_S (w[44], w[45], selector); - w[55] = __byte_perm_S (w[43], w[44], selector); - w[54] = __byte_perm_S (w[42], w[43], selector); - w[53] = __byte_perm_S (w[41], w[42], selector); - w[52] = __byte_perm_S (w[40], w[41], selector); - w[51] = __byte_perm_S (w[39], w[40], selector); - w[50] = __byte_perm_S (w[38], w[39], selector); - w[49] = __byte_perm_S (w[37], w[38], selector); - w[48] = __byte_perm_S (w[36], w[37], selector); - w[47] = __byte_perm_S (w[35], w[36], selector); - w[46] = __byte_perm_S (w[34], w[35], selector); - w[45] = __byte_perm_S (w[33], w[34], selector); - w[44] = __byte_perm_S (w[32], w[33], selector); - w[43] = __byte_perm_S (w[31], w[32], selector); - w[42] = __byte_perm_S (w[30], w[31], selector); - w[41] = __byte_perm_S (w[29], w[30], selector); - w[40] = __byte_perm_S (w[28], w[29], selector); - w[39] = __byte_perm_S (w[27], w[28], selector); - w[38] = __byte_perm_S (w[26], w[27], selector); - w[37] = __byte_perm_S (w[25], w[26], selector); - w[36] = __byte_perm_S (w[24], w[25], selector); - w[35] = __byte_perm_S (w[23], w[24], selector); - w[34] = __byte_perm_S (w[22], w[23], selector); - w[33] = __byte_perm_S (w[21], w[22], selector); - w[32] = __byte_perm_S (w[20], w[21], selector); - w[31] = __byte_perm_S (w[19], w[20], selector); - w[30] = __byte_perm_S (w[18], w[19], selector); - w[29] = __byte_perm_S (w[17], w[18], selector); - w[28] = __byte_perm_S (w[16], w[17], selector); - w[27] = __byte_perm_S (w[15], w[16], selector); - w[26] = __byte_perm_S (w[14], w[15], selector); - w[25] = __byte_perm_S (w[13], w[14], selector); - w[24] = __byte_perm_S (w[12], w[13], selector); - w[23] = __byte_perm_S (w[11], w[12], selector); - w[22] = __byte_perm_S (w[10], w[11], selector); - w[21] = __byte_perm_S (w[ 9], w[10], selector); - w[20] = __byte_perm_S (w[ 8], w[ 9], selector); - w[19] = __byte_perm_S (w[ 7], w[ 8], selector); - w[18] = __byte_perm_S (w[ 6], w[ 7], selector); - w[17] = __byte_perm_S (w[ 5], w[ 6], selector); - w[16] = __byte_perm_S (w[ 4], w[ 5], selector); - w[15] = __byte_perm_S (w[ 3], w[ 4], selector); - w[14] = __byte_perm_S (w[ 2], w[ 3], selector); - w[13] = __byte_perm_S (w[ 1], w[ 2], selector); - w[12] = __byte_perm_S (w[ 0], w[ 1], selector); - w[11] = __byte_perm_S ( 0, w[ 0], selector); + w[63] = hc_byte_perm_S (w[51], w[52], selector); + w[62] = hc_byte_perm_S (w[50], w[51], selector); + w[61] = hc_byte_perm_S (w[49], w[50], selector); + w[60] = hc_byte_perm_S (w[48], w[49], selector); + w[59] = hc_byte_perm_S (w[47], w[48], selector); + w[58] = hc_byte_perm_S (w[46], w[47], selector); + w[57] = hc_byte_perm_S (w[45], w[46], selector); + w[56] = hc_byte_perm_S (w[44], w[45], selector); + w[55] = hc_byte_perm_S (w[43], w[44], selector); + w[54] = hc_byte_perm_S (w[42], w[43], selector); + w[53] = hc_byte_perm_S (w[41], w[42], selector); + w[52] = hc_byte_perm_S (w[40], w[41], selector); + w[51] = hc_byte_perm_S (w[39], w[40], selector); + w[50] = hc_byte_perm_S (w[38], w[39], selector); + w[49] = hc_byte_perm_S (w[37], w[38], selector); + w[48] = hc_byte_perm_S (w[36], w[37], selector); + w[47] = hc_byte_perm_S (w[35], w[36], selector); + w[46] = hc_byte_perm_S (w[34], w[35], selector); + w[45] = hc_byte_perm_S (w[33], w[34], selector); + w[44] = hc_byte_perm_S (w[32], w[33], selector); + w[43] = hc_byte_perm_S (w[31], w[32], selector); + w[42] = hc_byte_perm_S (w[30], w[31], selector); + w[41] = hc_byte_perm_S (w[29], w[30], selector); + w[40] = hc_byte_perm_S (w[28], w[29], selector); + w[39] = hc_byte_perm_S (w[27], w[28], selector); + w[38] = hc_byte_perm_S (w[26], w[27], selector); + w[37] = hc_byte_perm_S (w[25], w[26], selector); + w[36] = hc_byte_perm_S (w[24], w[25], selector); + w[35] = hc_byte_perm_S (w[23], w[24], selector); + w[34] = hc_byte_perm_S (w[22], w[23], selector); + w[33] = hc_byte_perm_S (w[21], w[22], selector); + w[32] = hc_byte_perm_S (w[20], w[21], selector); + w[31] = hc_byte_perm_S (w[19], w[20], selector); + w[30] = hc_byte_perm_S (w[18], w[19], selector); + w[29] = hc_byte_perm_S (w[17], w[18], selector); + w[28] = hc_byte_perm_S (w[16], w[17], selector); + w[27] = hc_byte_perm_S (w[15], w[16], selector); + w[26] = hc_byte_perm_S (w[14], w[15], selector); + w[25] = hc_byte_perm_S (w[13], w[14], selector); + w[24] = hc_byte_perm_S (w[12], w[13], selector); + w[23] = hc_byte_perm_S (w[11], w[12], selector); + w[22] = hc_byte_perm_S (w[10], w[11], selector); + w[21] = hc_byte_perm_S (w[ 9], w[10], selector); + w[20] = hc_byte_perm_S (w[ 8], w[ 9], selector); + w[19] = hc_byte_perm_S (w[ 7], w[ 8], selector); + w[18] = hc_byte_perm_S (w[ 6], w[ 7], selector); + w[17] = hc_byte_perm_S (w[ 5], w[ 6], selector); + w[16] = hc_byte_perm_S (w[ 4], w[ 5], selector); + w[15] = hc_byte_perm_S (w[ 3], w[ 4], selector); + w[14] = hc_byte_perm_S (w[ 2], w[ 3], selector); + w[13] = hc_byte_perm_S (w[ 1], w[ 2], selector); + w[12] = hc_byte_perm_S (w[ 0], w[ 1], selector); + w[11] = hc_byte_perm_S ( 0, w[ 0], selector); w[10] = 0; w[ 9] = 0; w[ 8] = 0; @@ -47540,58 +47540,58 @@ DECLSPEC void switch_buffer_by_offset_1x64_le_S (u32 *w, const u32 offset) break; case 12: - w[63] = __byte_perm_S (w[50], w[51], selector); - w[62] = __byte_perm_S (w[49], w[50], selector); - w[61] = __byte_perm_S (w[48], w[49], selector); - w[60] = __byte_perm_S (w[47], w[48], selector); - w[59] = __byte_perm_S (w[46], w[47], selector); - w[58] = __byte_perm_S (w[45], w[46], selector); - w[57] = __byte_perm_S (w[44], w[45], selector); - w[56] = __byte_perm_S (w[43], w[44], selector); - w[55] = __byte_perm_S (w[42], w[43], selector); - w[54] = __byte_perm_S (w[41], w[42], selector); - w[53] = __byte_perm_S (w[40], w[41], selector); - w[52] = __byte_perm_S (w[39], w[40], selector); - w[51] = __byte_perm_S (w[38], w[39], selector); - w[50] = __byte_perm_S (w[37], w[38], selector); - w[49] = __byte_perm_S (w[36], w[37], selector); - w[48] = __byte_perm_S (w[35], w[36], selector); - w[47] = __byte_perm_S (w[34], w[35], selector); - w[46] = __byte_perm_S (w[33], w[34], selector); - w[45] = __byte_perm_S (w[32], w[33], selector); - w[44] = __byte_perm_S (w[31], w[32], selector); - w[43] = __byte_perm_S (w[30], w[31], selector); - w[42] = __byte_perm_S (w[29], w[30], selector); - w[41] = __byte_perm_S (w[28], w[29], selector); - w[40] = __byte_perm_S (w[27], w[28], selector); - w[39] = __byte_perm_S (w[26], w[27], selector); - w[38] = __byte_perm_S (w[25], w[26], selector); - w[37] = __byte_perm_S (w[24], w[25], selector); - w[36] = __byte_perm_S (w[23], w[24], selector); - w[35] = __byte_perm_S (w[22], w[23], selector); - w[34] = __byte_perm_S (w[21], w[22], selector); - w[33] = __byte_perm_S (w[20], w[21], selector); - w[32] = __byte_perm_S (w[19], w[20], selector); - w[31] = __byte_perm_S (w[18], w[19], selector); - w[30] = __byte_perm_S (w[17], w[18], selector); - w[29] = __byte_perm_S (w[16], w[17], selector); - w[28] = __byte_perm_S (w[15], w[16], selector); - w[27] = __byte_perm_S (w[14], w[15], selector); - w[26] = __byte_perm_S (w[13], w[14], selector); - w[25] = __byte_perm_S (w[12], w[13], selector); - w[24] = __byte_perm_S (w[11], w[12], selector); - w[23] = __byte_perm_S (w[10], w[11], selector); - w[22] = __byte_perm_S (w[ 9], w[10], selector); - w[21] = __byte_perm_S (w[ 8], w[ 9], selector); - w[20] = __byte_perm_S (w[ 7], w[ 8], selector); - w[19] = __byte_perm_S (w[ 6], w[ 7], selector); - w[18] = __byte_perm_S (w[ 5], w[ 6], selector); - w[17] = __byte_perm_S (w[ 4], w[ 5], selector); - w[16] = __byte_perm_S (w[ 3], w[ 4], selector); - w[15] = __byte_perm_S (w[ 2], w[ 3], selector); - w[14] = __byte_perm_S (w[ 1], w[ 2], selector); - w[13] = __byte_perm_S (w[ 0], w[ 1], selector); - w[12] = __byte_perm_S ( 0, w[ 0], selector); + w[63] = hc_byte_perm_S (w[50], w[51], selector); + w[62] = hc_byte_perm_S (w[49], w[50], selector); + w[61] = hc_byte_perm_S (w[48], w[49], selector); + w[60] = hc_byte_perm_S (w[47], w[48], selector); + w[59] = hc_byte_perm_S (w[46], w[47], selector); + w[58] = hc_byte_perm_S (w[45], w[46], selector); + w[57] = hc_byte_perm_S (w[44], w[45], selector); + w[56] = hc_byte_perm_S (w[43], w[44], selector); + w[55] = hc_byte_perm_S (w[42], w[43], selector); + w[54] = hc_byte_perm_S (w[41], w[42], selector); + w[53] = hc_byte_perm_S (w[40], w[41], selector); + w[52] = hc_byte_perm_S (w[39], w[40], selector); + w[51] = hc_byte_perm_S (w[38], w[39], selector); + w[50] = hc_byte_perm_S (w[37], w[38], selector); + w[49] = hc_byte_perm_S (w[36], w[37], selector); + w[48] = hc_byte_perm_S (w[35], w[36], selector); + w[47] = hc_byte_perm_S (w[34], w[35], selector); + w[46] = hc_byte_perm_S (w[33], w[34], selector); + w[45] = hc_byte_perm_S (w[32], w[33], selector); + w[44] = hc_byte_perm_S (w[31], w[32], selector); + w[43] = hc_byte_perm_S (w[30], w[31], selector); + w[42] = hc_byte_perm_S (w[29], w[30], selector); + w[41] = hc_byte_perm_S (w[28], w[29], selector); + w[40] = hc_byte_perm_S (w[27], w[28], selector); + w[39] = hc_byte_perm_S (w[26], w[27], selector); + w[38] = hc_byte_perm_S (w[25], w[26], selector); + w[37] = hc_byte_perm_S (w[24], w[25], selector); + w[36] = hc_byte_perm_S (w[23], w[24], selector); + w[35] = hc_byte_perm_S (w[22], w[23], selector); + w[34] = hc_byte_perm_S (w[21], w[22], selector); + w[33] = hc_byte_perm_S (w[20], w[21], selector); + w[32] = hc_byte_perm_S (w[19], w[20], selector); + w[31] = hc_byte_perm_S (w[18], w[19], selector); + w[30] = hc_byte_perm_S (w[17], w[18], selector); + w[29] = hc_byte_perm_S (w[16], w[17], selector); + w[28] = hc_byte_perm_S (w[15], w[16], selector); + w[27] = hc_byte_perm_S (w[14], w[15], selector); + w[26] = hc_byte_perm_S (w[13], w[14], selector); + w[25] = hc_byte_perm_S (w[12], w[13], selector); + w[24] = hc_byte_perm_S (w[11], w[12], selector); + w[23] = hc_byte_perm_S (w[10], w[11], selector); + w[22] = hc_byte_perm_S (w[ 9], w[10], selector); + w[21] = hc_byte_perm_S (w[ 8], w[ 9], selector); + w[20] = hc_byte_perm_S (w[ 7], w[ 8], selector); + w[19] = hc_byte_perm_S (w[ 6], w[ 7], selector); + w[18] = hc_byte_perm_S (w[ 5], w[ 6], selector); + w[17] = hc_byte_perm_S (w[ 4], w[ 5], selector); + w[16] = hc_byte_perm_S (w[ 3], w[ 4], selector); + w[15] = hc_byte_perm_S (w[ 2], w[ 3], selector); + w[14] = hc_byte_perm_S (w[ 1], w[ 2], selector); + w[13] = hc_byte_perm_S (w[ 0], w[ 1], selector); + w[12] = hc_byte_perm_S ( 0, w[ 0], selector); w[11] = 0; w[10] = 0; w[ 9] = 0; @@ -47608,57 +47608,57 @@ DECLSPEC void switch_buffer_by_offset_1x64_le_S (u32 *w, const u32 offset) break; case 13: - w[63] = __byte_perm_S (w[49], w[50], selector); - w[62] = __byte_perm_S (w[48], w[49], selector); - w[61] = __byte_perm_S (w[47], w[48], selector); - w[60] = __byte_perm_S (w[46], w[47], selector); - w[59] = __byte_perm_S (w[45], w[46], selector); - w[58] = __byte_perm_S (w[44], w[45], selector); - w[57] = __byte_perm_S (w[43], w[44], selector); - w[56] = __byte_perm_S (w[42], w[43], selector); - w[55] = __byte_perm_S (w[41], w[42], selector); - w[54] = __byte_perm_S (w[40], w[41], selector); - w[53] = __byte_perm_S (w[39], w[40], selector); - w[52] = __byte_perm_S (w[38], w[39], selector); - w[51] = __byte_perm_S (w[37], w[38], selector); - w[50] = __byte_perm_S (w[36], w[37], selector); - w[49] = __byte_perm_S (w[35], w[36], selector); - w[48] = __byte_perm_S (w[34], w[35], selector); - w[47] = __byte_perm_S (w[33], w[34], selector); - w[46] = __byte_perm_S (w[32], w[33], selector); - w[45] = __byte_perm_S (w[31], w[32], selector); - w[44] = __byte_perm_S (w[30], w[31], selector); - w[43] = __byte_perm_S (w[29], w[30], selector); - w[42] = __byte_perm_S (w[28], w[29], selector); - w[41] = __byte_perm_S (w[27], w[28], selector); - w[40] = __byte_perm_S (w[26], w[27], selector); - w[39] = __byte_perm_S (w[25], w[26], selector); - w[38] = __byte_perm_S (w[24], w[25], selector); - w[37] = __byte_perm_S (w[23], w[24], selector); - w[36] = __byte_perm_S (w[22], w[23], selector); - w[35] = __byte_perm_S (w[21], w[22], selector); - w[34] = __byte_perm_S (w[20], w[21], selector); - w[33] = __byte_perm_S (w[19], w[20], selector); - w[32] = __byte_perm_S (w[18], w[19], selector); - w[31] = __byte_perm_S (w[17], w[18], selector); - w[30] = __byte_perm_S (w[16], w[17], selector); - w[29] = __byte_perm_S (w[15], w[16], selector); - w[28] = __byte_perm_S (w[14], w[15], selector); - w[27] = __byte_perm_S (w[13], w[14], selector); - w[26] = __byte_perm_S (w[12], w[13], selector); - w[25] = __byte_perm_S (w[11], w[12], selector); - w[24] = __byte_perm_S (w[10], w[11], selector); - w[23] = __byte_perm_S (w[ 9], w[10], selector); - w[22] = __byte_perm_S (w[ 8], w[ 9], selector); - w[21] = __byte_perm_S (w[ 7], w[ 8], selector); - w[20] = __byte_perm_S (w[ 6], w[ 7], selector); - w[19] = __byte_perm_S (w[ 5], w[ 6], selector); - w[18] = __byte_perm_S (w[ 4], w[ 5], selector); - w[17] = __byte_perm_S (w[ 3], w[ 4], selector); - w[16] = __byte_perm_S (w[ 2], w[ 3], selector); - w[15] = __byte_perm_S (w[ 1], w[ 2], selector); - w[14] = __byte_perm_S (w[ 0], w[ 1], selector); - w[13] = __byte_perm_S ( 0, w[ 0], selector); + w[63] = hc_byte_perm_S (w[49], w[50], selector); + w[62] = hc_byte_perm_S (w[48], w[49], selector); + w[61] = hc_byte_perm_S (w[47], w[48], selector); + w[60] = hc_byte_perm_S (w[46], w[47], selector); + w[59] = hc_byte_perm_S (w[45], w[46], selector); + w[58] = hc_byte_perm_S (w[44], w[45], selector); + w[57] = hc_byte_perm_S (w[43], w[44], selector); + w[56] = hc_byte_perm_S (w[42], w[43], selector); + w[55] = hc_byte_perm_S (w[41], w[42], selector); + w[54] = hc_byte_perm_S (w[40], w[41], selector); + w[53] = hc_byte_perm_S (w[39], w[40], selector); + w[52] = hc_byte_perm_S (w[38], w[39], selector); + w[51] = hc_byte_perm_S (w[37], w[38], selector); + w[50] = hc_byte_perm_S (w[36], w[37], selector); + w[49] = hc_byte_perm_S (w[35], w[36], selector); + w[48] = hc_byte_perm_S (w[34], w[35], selector); + w[47] = hc_byte_perm_S (w[33], w[34], selector); + w[46] = hc_byte_perm_S (w[32], w[33], selector); + w[45] = hc_byte_perm_S (w[31], w[32], selector); + w[44] = hc_byte_perm_S (w[30], w[31], selector); + w[43] = hc_byte_perm_S (w[29], w[30], selector); + w[42] = hc_byte_perm_S (w[28], w[29], selector); + w[41] = hc_byte_perm_S (w[27], w[28], selector); + w[40] = hc_byte_perm_S (w[26], w[27], selector); + w[39] = hc_byte_perm_S (w[25], w[26], selector); + w[38] = hc_byte_perm_S (w[24], w[25], selector); + w[37] = hc_byte_perm_S (w[23], w[24], selector); + w[36] = hc_byte_perm_S (w[22], w[23], selector); + w[35] = hc_byte_perm_S (w[21], w[22], selector); + w[34] = hc_byte_perm_S (w[20], w[21], selector); + w[33] = hc_byte_perm_S (w[19], w[20], selector); + w[32] = hc_byte_perm_S (w[18], w[19], selector); + w[31] = hc_byte_perm_S (w[17], w[18], selector); + w[30] = hc_byte_perm_S (w[16], w[17], selector); + w[29] = hc_byte_perm_S (w[15], w[16], selector); + w[28] = hc_byte_perm_S (w[14], w[15], selector); + w[27] = hc_byte_perm_S (w[13], w[14], selector); + w[26] = hc_byte_perm_S (w[12], w[13], selector); + w[25] = hc_byte_perm_S (w[11], w[12], selector); + w[24] = hc_byte_perm_S (w[10], w[11], selector); + w[23] = hc_byte_perm_S (w[ 9], w[10], selector); + w[22] = hc_byte_perm_S (w[ 8], w[ 9], selector); + w[21] = hc_byte_perm_S (w[ 7], w[ 8], selector); + w[20] = hc_byte_perm_S (w[ 6], w[ 7], selector); + w[19] = hc_byte_perm_S (w[ 5], w[ 6], selector); + w[18] = hc_byte_perm_S (w[ 4], w[ 5], selector); + w[17] = hc_byte_perm_S (w[ 3], w[ 4], selector); + w[16] = hc_byte_perm_S (w[ 2], w[ 3], selector); + w[15] = hc_byte_perm_S (w[ 1], w[ 2], selector); + w[14] = hc_byte_perm_S (w[ 0], w[ 1], selector); + w[13] = hc_byte_perm_S ( 0, w[ 0], selector); w[12] = 0; w[11] = 0; w[10] = 0; @@ -47676,56 +47676,56 @@ DECLSPEC void switch_buffer_by_offset_1x64_le_S (u32 *w, const u32 offset) break; case 14: - w[63] = __byte_perm_S (w[48], w[49], selector); - w[62] = __byte_perm_S (w[47], w[48], selector); - w[61] = __byte_perm_S (w[46], w[47], selector); - w[60] = __byte_perm_S (w[45], w[46], selector); - w[59] = __byte_perm_S (w[44], w[45], selector); - w[58] = __byte_perm_S (w[43], w[44], selector); - w[57] = __byte_perm_S (w[42], w[43], selector); - w[56] = __byte_perm_S (w[41], w[42], selector); - w[55] = __byte_perm_S (w[40], w[41], selector); - w[54] = __byte_perm_S (w[39], w[40], selector); - w[53] = __byte_perm_S (w[38], w[39], selector); - w[52] = __byte_perm_S (w[37], w[38], selector); - w[51] = __byte_perm_S (w[36], w[37], selector); - w[50] = __byte_perm_S (w[35], w[36], selector); - w[49] = __byte_perm_S (w[34], w[35], selector); - w[48] = __byte_perm_S (w[33], w[34], selector); - w[47] = __byte_perm_S (w[32], w[33], selector); - w[46] = __byte_perm_S (w[31], w[32], selector); - w[45] = __byte_perm_S (w[30], w[31], selector); - w[44] = __byte_perm_S (w[29], w[30], selector); - w[43] = __byte_perm_S (w[28], w[29], selector); - w[42] = __byte_perm_S (w[27], w[28], selector); - w[41] = __byte_perm_S (w[26], w[27], selector); - w[40] = __byte_perm_S (w[25], w[26], selector); - w[39] = __byte_perm_S (w[24], w[25], selector); - w[38] = __byte_perm_S (w[23], w[24], selector); - w[37] = __byte_perm_S (w[22], w[23], selector); - w[36] = __byte_perm_S (w[21], w[22], selector); - w[35] = __byte_perm_S (w[20], w[21], selector); - w[34] = __byte_perm_S (w[19], w[20], selector); - w[33] = __byte_perm_S (w[18], w[19], selector); - w[32] = __byte_perm_S (w[17], w[18], selector); - w[31] = __byte_perm_S (w[16], w[17], selector); - w[30] = __byte_perm_S (w[15], w[16], selector); - w[29] = __byte_perm_S (w[14], w[15], selector); - w[28] = __byte_perm_S (w[13], w[14], selector); - w[27] = __byte_perm_S (w[12], w[13], selector); - w[26] = __byte_perm_S (w[11], w[12], selector); - w[25] = __byte_perm_S (w[10], w[11], selector); - w[24] = __byte_perm_S (w[ 9], w[10], selector); - w[23] = __byte_perm_S (w[ 8], w[ 9], selector); - w[22] = __byte_perm_S (w[ 7], w[ 8], selector); - w[21] = __byte_perm_S (w[ 6], w[ 7], selector); - w[20] = __byte_perm_S (w[ 5], w[ 6], selector); - w[19] = __byte_perm_S (w[ 4], w[ 5], selector); - w[18] = __byte_perm_S (w[ 3], w[ 4], selector); - w[17] = __byte_perm_S (w[ 2], w[ 3], selector); - w[16] = __byte_perm_S (w[ 1], w[ 2], selector); - w[15] = __byte_perm_S (w[ 0], w[ 1], selector); - w[14] = __byte_perm_S ( 0, w[ 0], selector); + w[63] = hc_byte_perm_S (w[48], w[49], selector); + w[62] = hc_byte_perm_S (w[47], w[48], selector); + w[61] = hc_byte_perm_S (w[46], w[47], selector); + w[60] = hc_byte_perm_S (w[45], w[46], selector); + w[59] = hc_byte_perm_S (w[44], w[45], selector); + w[58] = hc_byte_perm_S (w[43], w[44], selector); + w[57] = hc_byte_perm_S (w[42], w[43], selector); + w[56] = hc_byte_perm_S (w[41], w[42], selector); + w[55] = hc_byte_perm_S (w[40], w[41], selector); + w[54] = hc_byte_perm_S (w[39], w[40], selector); + w[53] = hc_byte_perm_S (w[38], w[39], selector); + w[52] = hc_byte_perm_S (w[37], w[38], selector); + w[51] = hc_byte_perm_S (w[36], w[37], selector); + w[50] = hc_byte_perm_S (w[35], w[36], selector); + w[49] = hc_byte_perm_S (w[34], w[35], selector); + w[48] = hc_byte_perm_S (w[33], w[34], selector); + w[47] = hc_byte_perm_S (w[32], w[33], selector); + w[46] = hc_byte_perm_S (w[31], w[32], selector); + w[45] = hc_byte_perm_S (w[30], w[31], selector); + w[44] = hc_byte_perm_S (w[29], w[30], selector); + w[43] = hc_byte_perm_S (w[28], w[29], selector); + w[42] = hc_byte_perm_S (w[27], w[28], selector); + w[41] = hc_byte_perm_S (w[26], w[27], selector); + w[40] = hc_byte_perm_S (w[25], w[26], selector); + w[39] = hc_byte_perm_S (w[24], w[25], selector); + w[38] = hc_byte_perm_S (w[23], w[24], selector); + w[37] = hc_byte_perm_S (w[22], w[23], selector); + w[36] = hc_byte_perm_S (w[21], w[22], selector); + w[35] = hc_byte_perm_S (w[20], w[21], selector); + w[34] = hc_byte_perm_S (w[19], w[20], selector); + w[33] = hc_byte_perm_S (w[18], w[19], selector); + w[32] = hc_byte_perm_S (w[17], w[18], selector); + w[31] = hc_byte_perm_S (w[16], w[17], selector); + w[30] = hc_byte_perm_S (w[15], w[16], selector); + w[29] = hc_byte_perm_S (w[14], w[15], selector); + w[28] = hc_byte_perm_S (w[13], w[14], selector); + w[27] = hc_byte_perm_S (w[12], w[13], selector); + w[26] = hc_byte_perm_S (w[11], w[12], selector); + w[25] = hc_byte_perm_S (w[10], w[11], selector); + w[24] = hc_byte_perm_S (w[ 9], w[10], selector); + w[23] = hc_byte_perm_S (w[ 8], w[ 9], selector); + w[22] = hc_byte_perm_S (w[ 7], w[ 8], selector); + w[21] = hc_byte_perm_S (w[ 6], w[ 7], selector); + w[20] = hc_byte_perm_S (w[ 5], w[ 6], selector); + w[19] = hc_byte_perm_S (w[ 4], w[ 5], selector); + w[18] = hc_byte_perm_S (w[ 3], w[ 4], selector); + w[17] = hc_byte_perm_S (w[ 2], w[ 3], selector); + w[16] = hc_byte_perm_S (w[ 1], w[ 2], selector); + w[15] = hc_byte_perm_S (w[ 0], w[ 1], selector); + w[14] = hc_byte_perm_S ( 0, w[ 0], selector); w[13] = 0; w[12] = 0; w[11] = 0; @@ -47744,55 +47744,55 @@ DECLSPEC void switch_buffer_by_offset_1x64_le_S (u32 *w, const u32 offset) break; case 15: - w[63] = __byte_perm_S (w[47], w[48], selector); - w[62] = __byte_perm_S (w[46], w[47], selector); - w[61] = __byte_perm_S (w[45], w[46], selector); - w[60] = __byte_perm_S (w[44], w[45], selector); - w[59] = __byte_perm_S (w[43], w[44], selector); - w[58] = __byte_perm_S (w[42], w[43], selector); - w[57] = __byte_perm_S (w[41], w[42], selector); - w[56] = __byte_perm_S (w[40], w[41], selector); - w[55] = __byte_perm_S (w[39], w[40], selector); - w[54] = __byte_perm_S (w[38], w[39], selector); - w[53] = __byte_perm_S (w[37], w[38], selector); - w[52] = __byte_perm_S (w[36], w[37], selector); - w[51] = __byte_perm_S (w[35], w[36], selector); - w[50] = __byte_perm_S (w[34], w[35], selector); - w[49] = __byte_perm_S (w[33], w[34], selector); - w[48] = __byte_perm_S (w[32], w[33], selector); - w[47] = __byte_perm_S (w[31], w[32], selector); - w[46] = __byte_perm_S (w[30], w[31], selector); - w[45] = __byte_perm_S (w[29], w[30], selector); - w[44] = __byte_perm_S (w[28], w[29], selector); - w[43] = __byte_perm_S (w[27], w[28], selector); - w[42] = __byte_perm_S (w[26], w[27], selector); - w[41] = __byte_perm_S (w[25], w[26], selector); - w[40] = __byte_perm_S (w[24], w[25], selector); - w[39] = __byte_perm_S (w[23], w[24], selector); - w[38] = __byte_perm_S (w[22], w[23], selector); - w[37] = __byte_perm_S (w[21], w[22], selector); - w[36] = __byte_perm_S (w[20], w[21], selector); - w[35] = __byte_perm_S (w[19], w[20], selector); - w[34] = __byte_perm_S (w[18], w[19], selector); - w[33] = __byte_perm_S (w[17], w[18], selector); - w[32] = __byte_perm_S (w[16], w[17], selector); - w[31] = __byte_perm_S (w[15], w[16], selector); - w[30] = __byte_perm_S (w[14], w[15], selector); - w[29] = __byte_perm_S (w[13], w[14], selector); - w[28] = __byte_perm_S (w[12], w[13], selector); - w[27] = __byte_perm_S (w[11], w[12], selector); - w[26] = __byte_perm_S (w[10], w[11], selector); - w[25] = __byte_perm_S (w[ 9], w[10], selector); - w[24] = __byte_perm_S (w[ 8], w[ 9], selector); - w[23] = __byte_perm_S (w[ 7], w[ 8], selector); - w[22] = __byte_perm_S (w[ 6], w[ 7], selector); - w[21] = __byte_perm_S (w[ 5], w[ 6], selector); - w[20] = __byte_perm_S (w[ 4], w[ 5], selector); - w[19] = __byte_perm_S (w[ 3], w[ 4], selector); - w[18] = __byte_perm_S (w[ 2], w[ 3], selector); - w[17] = __byte_perm_S (w[ 1], w[ 2], selector); - w[16] = __byte_perm_S (w[ 0], w[ 1], selector); - w[15] = __byte_perm_S ( 0, w[ 0], selector); + w[63] = hc_byte_perm_S (w[47], w[48], selector); + w[62] = hc_byte_perm_S (w[46], w[47], selector); + w[61] = hc_byte_perm_S (w[45], w[46], selector); + w[60] = hc_byte_perm_S (w[44], w[45], selector); + w[59] = hc_byte_perm_S (w[43], w[44], selector); + w[58] = hc_byte_perm_S (w[42], w[43], selector); + w[57] = hc_byte_perm_S (w[41], w[42], selector); + w[56] = hc_byte_perm_S (w[40], w[41], selector); + w[55] = hc_byte_perm_S (w[39], w[40], selector); + w[54] = hc_byte_perm_S (w[38], w[39], selector); + w[53] = hc_byte_perm_S (w[37], w[38], selector); + w[52] = hc_byte_perm_S (w[36], w[37], selector); + w[51] = hc_byte_perm_S (w[35], w[36], selector); + w[50] = hc_byte_perm_S (w[34], w[35], selector); + w[49] = hc_byte_perm_S (w[33], w[34], selector); + w[48] = hc_byte_perm_S (w[32], w[33], selector); + w[47] = hc_byte_perm_S (w[31], w[32], selector); + w[46] = hc_byte_perm_S (w[30], w[31], selector); + w[45] = hc_byte_perm_S (w[29], w[30], selector); + w[44] = hc_byte_perm_S (w[28], w[29], selector); + w[43] = hc_byte_perm_S (w[27], w[28], selector); + w[42] = hc_byte_perm_S (w[26], w[27], selector); + w[41] = hc_byte_perm_S (w[25], w[26], selector); + w[40] = hc_byte_perm_S (w[24], w[25], selector); + w[39] = hc_byte_perm_S (w[23], w[24], selector); + w[38] = hc_byte_perm_S (w[22], w[23], selector); + w[37] = hc_byte_perm_S (w[21], w[22], selector); + w[36] = hc_byte_perm_S (w[20], w[21], selector); + w[35] = hc_byte_perm_S (w[19], w[20], selector); + w[34] = hc_byte_perm_S (w[18], w[19], selector); + w[33] = hc_byte_perm_S (w[17], w[18], selector); + w[32] = hc_byte_perm_S (w[16], w[17], selector); + w[31] = hc_byte_perm_S (w[15], w[16], selector); + w[30] = hc_byte_perm_S (w[14], w[15], selector); + w[29] = hc_byte_perm_S (w[13], w[14], selector); + w[28] = hc_byte_perm_S (w[12], w[13], selector); + w[27] = hc_byte_perm_S (w[11], w[12], selector); + w[26] = hc_byte_perm_S (w[10], w[11], selector); + w[25] = hc_byte_perm_S (w[ 9], w[10], selector); + w[24] = hc_byte_perm_S (w[ 8], w[ 9], selector); + w[23] = hc_byte_perm_S (w[ 7], w[ 8], selector); + w[22] = hc_byte_perm_S (w[ 6], w[ 7], selector); + w[21] = hc_byte_perm_S (w[ 5], w[ 6], selector); + w[20] = hc_byte_perm_S (w[ 4], w[ 5], selector); + w[19] = hc_byte_perm_S (w[ 3], w[ 4], selector); + w[18] = hc_byte_perm_S (w[ 2], w[ 3], selector); + w[17] = hc_byte_perm_S (w[ 1], w[ 2], selector); + w[16] = hc_byte_perm_S (w[ 0], w[ 1], selector); + w[15] = hc_byte_perm_S ( 0, w[ 0], selector); w[14] = 0; w[13] = 0; w[12] = 0; @@ -47812,54 +47812,54 @@ DECLSPEC void switch_buffer_by_offset_1x64_le_S (u32 *w, const u32 offset) break; case 16: - w[63] = __byte_perm_S (w[46], w[47], selector); - w[62] = __byte_perm_S (w[45], w[46], selector); - w[61] = __byte_perm_S (w[44], w[45], selector); - w[60] = __byte_perm_S (w[43], w[44], selector); - w[59] = __byte_perm_S (w[42], w[43], selector); - w[58] = __byte_perm_S (w[41], w[42], selector); - w[57] = __byte_perm_S (w[40], w[41], selector); - w[56] = __byte_perm_S (w[39], w[40], selector); - w[55] = __byte_perm_S (w[38], w[39], selector); - w[54] = __byte_perm_S (w[37], w[38], selector); - w[53] = __byte_perm_S (w[36], w[37], selector); - w[52] = __byte_perm_S (w[35], w[36], selector); - w[51] = __byte_perm_S (w[34], w[35], selector); - w[50] = __byte_perm_S (w[33], w[34], selector); - w[49] = __byte_perm_S (w[32], w[33], selector); - w[48] = __byte_perm_S (w[31], w[32], selector); - w[47] = __byte_perm_S (w[30], w[31], selector); - w[46] = __byte_perm_S (w[29], w[30], selector); - w[45] = __byte_perm_S (w[28], w[29], selector); - w[44] = __byte_perm_S (w[27], w[28], selector); - w[43] = __byte_perm_S (w[26], w[27], selector); - w[42] = __byte_perm_S (w[25], w[26], selector); - w[41] = __byte_perm_S (w[24], w[25], selector); - w[40] = __byte_perm_S (w[23], w[24], selector); - w[39] = __byte_perm_S (w[22], w[23], selector); - w[38] = __byte_perm_S (w[21], w[22], selector); - w[37] = __byte_perm_S (w[20], w[21], selector); - w[36] = __byte_perm_S (w[19], w[20], selector); - w[35] = __byte_perm_S (w[18], w[19], selector); - w[34] = __byte_perm_S (w[17], w[18], selector); - w[33] = __byte_perm_S (w[16], w[17], selector); - w[32] = __byte_perm_S (w[15], w[16], selector); - w[31] = __byte_perm_S (w[14], w[15], selector); - w[30] = __byte_perm_S (w[13], w[14], selector); - w[29] = __byte_perm_S (w[12], w[13], selector); - w[28] = __byte_perm_S (w[11], w[12], selector); - w[27] = __byte_perm_S (w[10], w[11], selector); - w[26] = __byte_perm_S (w[ 9], w[10], selector); - w[25] = __byte_perm_S (w[ 8], w[ 9], selector); - w[24] = __byte_perm_S (w[ 7], w[ 8], selector); - w[23] = __byte_perm_S (w[ 6], w[ 7], selector); - w[22] = __byte_perm_S (w[ 5], w[ 6], selector); - w[21] = __byte_perm_S (w[ 4], w[ 5], selector); - w[20] = __byte_perm_S (w[ 3], w[ 4], selector); - w[19] = __byte_perm_S (w[ 2], w[ 3], selector); - w[18] = __byte_perm_S (w[ 1], w[ 2], selector); - w[17] = __byte_perm_S (w[ 0], w[ 1], selector); - w[16] = __byte_perm_S ( 0, w[ 0], selector); + w[63] = hc_byte_perm_S (w[46], w[47], selector); + w[62] = hc_byte_perm_S (w[45], w[46], selector); + w[61] = hc_byte_perm_S (w[44], w[45], selector); + w[60] = hc_byte_perm_S (w[43], w[44], selector); + w[59] = hc_byte_perm_S (w[42], w[43], selector); + w[58] = hc_byte_perm_S (w[41], w[42], selector); + w[57] = hc_byte_perm_S (w[40], w[41], selector); + w[56] = hc_byte_perm_S (w[39], w[40], selector); + w[55] = hc_byte_perm_S (w[38], w[39], selector); + w[54] = hc_byte_perm_S (w[37], w[38], selector); + w[53] = hc_byte_perm_S (w[36], w[37], selector); + w[52] = hc_byte_perm_S (w[35], w[36], selector); + w[51] = hc_byte_perm_S (w[34], w[35], selector); + w[50] = hc_byte_perm_S (w[33], w[34], selector); + w[49] = hc_byte_perm_S (w[32], w[33], selector); + w[48] = hc_byte_perm_S (w[31], w[32], selector); + w[47] = hc_byte_perm_S (w[30], w[31], selector); + w[46] = hc_byte_perm_S (w[29], w[30], selector); + w[45] = hc_byte_perm_S (w[28], w[29], selector); + w[44] = hc_byte_perm_S (w[27], w[28], selector); + w[43] = hc_byte_perm_S (w[26], w[27], selector); + w[42] = hc_byte_perm_S (w[25], w[26], selector); + w[41] = hc_byte_perm_S (w[24], w[25], selector); + w[40] = hc_byte_perm_S (w[23], w[24], selector); + w[39] = hc_byte_perm_S (w[22], w[23], selector); + w[38] = hc_byte_perm_S (w[21], w[22], selector); + w[37] = hc_byte_perm_S (w[20], w[21], selector); + w[36] = hc_byte_perm_S (w[19], w[20], selector); + w[35] = hc_byte_perm_S (w[18], w[19], selector); + w[34] = hc_byte_perm_S (w[17], w[18], selector); + w[33] = hc_byte_perm_S (w[16], w[17], selector); + w[32] = hc_byte_perm_S (w[15], w[16], selector); + w[31] = hc_byte_perm_S (w[14], w[15], selector); + w[30] = hc_byte_perm_S (w[13], w[14], selector); + w[29] = hc_byte_perm_S (w[12], w[13], selector); + w[28] = hc_byte_perm_S (w[11], w[12], selector); + w[27] = hc_byte_perm_S (w[10], w[11], selector); + w[26] = hc_byte_perm_S (w[ 9], w[10], selector); + w[25] = hc_byte_perm_S (w[ 8], w[ 9], selector); + w[24] = hc_byte_perm_S (w[ 7], w[ 8], selector); + w[23] = hc_byte_perm_S (w[ 6], w[ 7], selector); + w[22] = hc_byte_perm_S (w[ 5], w[ 6], selector); + w[21] = hc_byte_perm_S (w[ 4], w[ 5], selector); + w[20] = hc_byte_perm_S (w[ 3], w[ 4], selector); + w[19] = hc_byte_perm_S (w[ 2], w[ 3], selector); + w[18] = hc_byte_perm_S (w[ 1], w[ 2], selector); + w[17] = hc_byte_perm_S (w[ 0], w[ 1], selector); + w[16] = hc_byte_perm_S ( 0, w[ 0], selector); w[15] = 0; w[14] = 0; w[13] = 0; @@ -47880,53 +47880,53 @@ DECLSPEC void switch_buffer_by_offset_1x64_le_S (u32 *w, const u32 offset) break; case 17: - w[63] = __byte_perm_S (w[45], w[46], selector); - w[62] = __byte_perm_S (w[44], w[45], selector); - w[61] = __byte_perm_S (w[43], w[44], selector); - w[60] = __byte_perm_S (w[42], w[43], selector); - w[59] = __byte_perm_S (w[41], w[42], selector); - w[58] = __byte_perm_S (w[40], w[41], selector); - w[57] = __byte_perm_S (w[39], w[40], selector); - w[56] = __byte_perm_S (w[38], w[39], selector); - w[55] = __byte_perm_S (w[37], w[38], selector); - w[54] = __byte_perm_S (w[36], w[37], selector); - w[53] = __byte_perm_S (w[35], w[36], selector); - w[52] = __byte_perm_S (w[34], w[35], selector); - w[51] = __byte_perm_S (w[33], w[34], selector); - w[50] = __byte_perm_S (w[32], w[33], selector); - w[49] = __byte_perm_S (w[31], w[32], selector); - w[48] = __byte_perm_S (w[30], w[31], selector); - w[47] = __byte_perm_S (w[29], w[30], selector); - w[46] = __byte_perm_S (w[28], w[29], selector); - w[45] = __byte_perm_S (w[27], w[28], selector); - w[44] = __byte_perm_S (w[26], w[27], selector); - w[43] = __byte_perm_S (w[25], w[26], selector); - w[42] = __byte_perm_S (w[24], w[25], selector); - w[41] = __byte_perm_S (w[23], w[24], selector); - w[40] = __byte_perm_S (w[22], w[23], selector); - w[39] = __byte_perm_S (w[21], w[22], selector); - w[38] = __byte_perm_S (w[20], w[21], selector); - w[37] = __byte_perm_S (w[19], w[20], selector); - w[36] = __byte_perm_S (w[18], w[19], selector); - w[35] = __byte_perm_S (w[17], w[18], selector); - w[34] = __byte_perm_S (w[16], w[17], selector); - w[33] = __byte_perm_S (w[15], w[16], selector); - w[32] = __byte_perm_S (w[14], w[15], selector); - w[31] = __byte_perm_S (w[13], w[14], selector); - w[30] = __byte_perm_S (w[12], w[13], selector); - w[29] = __byte_perm_S (w[11], w[12], selector); - w[28] = __byte_perm_S (w[10], w[11], selector); - w[27] = __byte_perm_S (w[ 9], w[10], selector); - w[26] = __byte_perm_S (w[ 8], w[ 9], selector); - w[25] = __byte_perm_S (w[ 7], w[ 8], selector); - w[24] = __byte_perm_S (w[ 6], w[ 7], selector); - w[23] = __byte_perm_S (w[ 5], w[ 6], selector); - w[22] = __byte_perm_S (w[ 4], w[ 5], selector); - w[21] = __byte_perm_S (w[ 3], w[ 4], selector); - w[20] = __byte_perm_S (w[ 2], w[ 3], selector); - w[19] = __byte_perm_S (w[ 1], w[ 2], selector); - w[18] = __byte_perm_S (w[ 0], w[ 1], selector); - w[17] = __byte_perm_S ( 0, w[ 0], selector); + w[63] = hc_byte_perm_S (w[45], w[46], selector); + w[62] = hc_byte_perm_S (w[44], w[45], selector); + w[61] = hc_byte_perm_S (w[43], w[44], selector); + w[60] = hc_byte_perm_S (w[42], w[43], selector); + w[59] = hc_byte_perm_S (w[41], w[42], selector); + w[58] = hc_byte_perm_S (w[40], w[41], selector); + w[57] = hc_byte_perm_S (w[39], w[40], selector); + w[56] = hc_byte_perm_S (w[38], w[39], selector); + w[55] = hc_byte_perm_S (w[37], w[38], selector); + w[54] = hc_byte_perm_S (w[36], w[37], selector); + w[53] = hc_byte_perm_S (w[35], w[36], selector); + w[52] = hc_byte_perm_S (w[34], w[35], selector); + w[51] = hc_byte_perm_S (w[33], w[34], selector); + w[50] = hc_byte_perm_S (w[32], w[33], selector); + w[49] = hc_byte_perm_S (w[31], w[32], selector); + w[48] = hc_byte_perm_S (w[30], w[31], selector); + w[47] = hc_byte_perm_S (w[29], w[30], selector); + w[46] = hc_byte_perm_S (w[28], w[29], selector); + w[45] = hc_byte_perm_S (w[27], w[28], selector); + w[44] = hc_byte_perm_S (w[26], w[27], selector); + w[43] = hc_byte_perm_S (w[25], w[26], selector); + w[42] = hc_byte_perm_S (w[24], w[25], selector); + w[41] = hc_byte_perm_S (w[23], w[24], selector); + w[40] = hc_byte_perm_S (w[22], w[23], selector); + w[39] = hc_byte_perm_S (w[21], w[22], selector); + w[38] = hc_byte_perm_S (w[20], w[21], selector); + w[37] = hc_byte_perm_S (w[19], w[20], selector); + w[36] = hc_byte_perm_S (w[18], w[19], selector); + w[35] = hc_byte_perm_S (w[17], w[18], selector); + w[34] = hc_byte_perm_S (w[16], w[17], selector); + w[33] = hc_byte_perm_S (w[15], w[16], selector); + w[32] = hc_byte_perm_S (w[14], w[15], selector); + w[31] = hc_byte_perm_S (w[13], w[14], selector); + w[30] = hc_byte_perm_S (w[12], w[13], selector); + w[29] = hc_byte_perm_S (w[11], w[12], selector); + w[28] = hc_byte_perm_S (w[10], w[11], selector); + w[27] = hc_byte_perm_S (w[ 9], w[10], selector); + w[26] = hc_byte_perm_S (w[ 8], w[ 9], selector); + w[25] = hc_byte_perm_S (w[ 7], w[ 8], selector); + w[24] = hc_byte_perm_S (w[ 6], w[ 7], selector); + w[23] = hc_byte_perm_S (w[ 5], w[ 6], selector); + w[22] = hc_byte_perm_S (w[ 4], w[ 5], selector); + w[21] = hc_byte_perm_S (w[ 3], w[ 4], selector); + w[20] = hc_byte_perm_S (w[ 2], w[ 3], selector); + w[19] = hc_byte_perm_S (w[ 1], w[ 2], selector); + w[18] = hc_byte_perm_S (w[ 0], w[ 1], selector); + w[17] = hc_byte_perm_S ( 0, w[ 0], selector); w[16] = 0; w[15] = 0; w[14] = 0; @@ -47948,52 +47948,52 @@ DECLSPEC void switch_buffer_by_offset_1x64_le_S (u32 *w, const u32 offset) break; case 18: - w[63] = __byte_perm_S (w[44], w[45], selector); - w[62] = __byte_perm_S (w[43], w[44], selector); - w[61] = __byte_perm_S (w[42], w[43], selector); - w[60] = __byte_perm_S (w[41], w[42], selector); - w[59] = __byte_perm_S (w[40], w[41], selector); - w[58] = __byte_perm_S (w[39], w[40], selector); - w[57] = __byte_perm_S (w[38], w[39], selector); - w[56] = __byte_perm_S (w[37], w[38], selector); - w[55] = __byte_perm_S (w[36], w[37], selector); - w[54] = __byte_perm_S (w[35], w[36], selector); - w[53] = __byte_perm_S (w[34], w[35], selector); - w[52] = __byte_perm_S (w[33], w[34], selector); - w[51] = __byte_perm_S (w[32], w[33], selector); - w[50] = __byte_perm_S (w[31], w[32], selector); - w[49] = __byte_perm_S (w[30], w[31], selector); - w[48] = __byte_perm_S (w[29], w[30], selector); - w[47] = __byte_perm_S (w[28], w[29], selector); - w[46] = __byte_perm_S (w[27], w[28], selector); - w[45] = __byte_perm_S (w[26], w[27], selector); - w[44] = __byte_perm_S (w[25], w[26], selector); - w[43] = __byte_perm_S (w[24], w[25], selector); - w[42] = __byte_perm_S (w[23], w[24], selector); - w[41] = __byte_perm_S (w[22], w[23], selector); - w[40] = __byte_perm_S (w[21], w[22], selector); - w[39] = __byte_perm_S (w[20], w[21], selector); - w[38] = __byte_perm_S (w[19], w[20], selector); - w[37] = __byte_perm_S (w[18], w[19], selector); - w[36] = __byte_perm_S (w[17], w[18], selector); - w[35] = __byte_perm_S (w[16], w[17], selector); - w[34] = __byte_perm_S (w[15], w[16], selector); - w[33] = __byte_perm_S (w[14], w[15], selector); - w[32] = __byte_perm_S (w[13], w[14], selector); - w[31] = __byte_perm_S (w[12], w[13], selector); - w[30] = __byte_perm_S (w[11], w[12], selector); - w[29] = __byte_perm_S (w[10], w[11], selector); - w[28] = __byte_perm_S (w[ 9], w[10], selector); - w[27] = __byte_perm_S (w[ 8], w[ 9], selector); - w[26] = __byte_perm_S (w[ 7], w[ 8], selector); - w[25] = __byte_perm_S (w[ 6], w[ 7], selector); - w[24] = __byte_perm_S (w[ 5], w[ 6], selector); - w[23] = __byte_perm_S (w[ 4], w[ 5], selector); - w[22] = __byte_perm_S (w[ 3], w[ 4], selector); - w[21] = __byte_perm_S (w[ 2], w[ 3], selector); - w[20] = __byte_perm_S (w[ 1], w[ 2], selector); - w[19] = __byte_perm_S (w[ 0], w[ 1], selector); - w[18] = __byte_perm_S ( 0, w[ 0], selector); + w[63] = hc_byte_perm_S (w[44], w[45], selector); + w[62] = hc_byte_perm_S (w[43], w[44], selector); + w[61] = hc_byte_perm_S (w[42], w[43], selector); + w[60] = hc_byte_perm_S (w[41], w[42], selector); + w[59] = hc_byte_perm_S (w[40], w[41], selector); + w[58] = hc_byte_perm_S (w[39], w[40], selector); + w[57] = hc_byte_perm_S (w[38], w[39], selector); + w[56] = hc_byte_perm_S (w[37], w[38], selector); + w[55] = hc_byte_perm_S (w[36], w[37], selector); + w[54] = hc_byte_perm_S (w[35], w[36], selector); + w[53] = hc_byte_perm_S (w[34], w[35], selector); + w[52] = hc_byte_perm_S (w[33], w[34], selector); + w[51] = hc_byte_perm_S (w[32], w[33], selector); + w[50] = hc_byte_perm_S (w[31], w[32], selector); + w[49] = hc_byte_perm_S (w[30], w[31], selector); + w[48] = hc_byte_perm_S (w[29], w[30], selector); + w[47] = hc_byte_perm_S (w[28], w[29], selector); + w[46] = hc_byte_perm_S (w[27], w[28], selector); + w[45] = hc_byte_perm_S (w[26], w[27], selector); + w[44] = hc_byte_perm_S (w[25], w[26], selector); + w[43] = hc_byte_perm_S (w[24], w[25], selector); + w[42] = hc_byte_perm_S (w[23], w[24], selector); + w[41] = hc_byte_perm_S (w[22], w[23], selector); + w[40] = hc_byte_perm_S (w[21], w[22], selector); + w[39] = hc_byte_perm_S (w[20], w[21], selector); + w[38] = hc_byte_perm_S (w[19], w[20], selector); + w[37] = hc_byte_perm_S (w[18], w[19], selector); + w[36] = hc_byte_perm_S (w[17], w[18], selector); + w[35] = hc_byte_perm_S (w[16], w[17], selector); + w[34] = hc_byte_perm_S (w[15], w[16], selector); + w[33] = hc_byte_perm_S (w[14], w[15], selector); + w[32] = hc_byte_perm_S (w[13], w[14], selector); + w[31] = hc_byte_perm_S (w[12], w[13], selector); + w[30] = hc_byte_perm_S (w[11], w[12], selector); + w[29] = hc_byte_perm_S (w[10], w[11], selector); + w[28] = hc_byte_perm_S (w[ 9], w[10], selector); + w[27] = hc_byte_perm_S (w[ 8], w[ 9], selector); + w[26] = hc_byte_perm_S (w[ 7], w[ 8], selector); + w[25] = hc_byte_perm_S (w[ 6], w[ 7], selector); + w[24] = hc_byte_perm_S (w[ 5], w[ 6], selector); + w[23] = hc_byte_perm_S (w[ 4], w[ 5], selector); + w[22] = hc_byte_perm_S (w[ 3], w[ 4], selector); + w[21] = hc_byte_perm_S (w[ 2], w[ 3], selector); + w[20] = hc_byte_perm_S (w[ 1], w[ 2], selector); + w[19] = hc_byte_perm_S (w[ 0], w[ 1], selector); + w[18] = hc_byte_perm_S ( 0, w[ 0], selector); w[17] = 0; w[16] = 0; w[15] = 0; @@ -48016,51 +48016,51 @@ DECLSPEC void switch_buffer_by_offset_1x64_le_S (u32 *w, const u32 offset) break; case 19: - w[63] = __byte_perm_S (w[43], w[44], selector); - w[62] = __byte_perm_S (w[42], w[43], selector); - w[61] = __byte_perm_S (w[41], w[42], selector); - w[60] = __byte_perm_S (w[40], w[41], selector); - w[59] = __byte_perm_S (w[39], w[40], selector); - w[58] = __byte_perm_S (w[38], w[39], selector); - w[57] = __byte_perm_S (w[37], w[38], selector); - w[56] = __byte_perm_S (w[36], w[37], selector); - w[55] = __byte_perm_S (w[35], w[36], selector); - w[54] = __byte_perm_S (w[34], w[35], selector); - w[53] = __byte_perm_S (w[33], w[34], selector); - w[52] = __byte_perm_S (w[32], w[33], selector); - w[51] = __byte_perm_S (w[31], w[32], selector); - w[50] = __byte_perm_S (w[30], w[31], selector); - w[49] = __byte_perm_S (w[29], w[30], selector); - w[48] = __byte_perm_S (w[28], w[29], selector); - w[47] = __byte_perm_S (w[27], w[28], selector); - w[46] = __byte_perm_S (w[26], w[27], selector); - w[45] = __byte_perm_S (w[25], w[26], selector); - w[44] = __byte_perm_S (w[24], w[25], selector); - w[43] = __byte_perm_S (w[23], w[24], selector); - w[42] = __byte_perm_S (w[22], w[23], selector); - w[41] = __byte_perm_S (w[21], w[22], selector); - w[40] = __byte_perm_S (w[20], w[21], selector); - w[39] = __byte_perm_S (w[19], w[20], selector); - w[38] = __byte_perm_S (w[18], w[19], selector); - w[37] = __byte_perm_S (w[17], w[18], selector); - w[36] = __byte_perm_S (w[16], w[17], selector); - w[35] = __byte_perm_S (w[15], w[16], selector); - w[34] = __byte_perm_S (w[14], w[15], selector); - w[33] = __byte_perm_S (w[13], w[14], selector); - w[32] = __byte_perm_S (w[12], w[13], selector); - w[31] = __byte_perm_S (w[11], w[12], selector); - w[30] = __byte_perm_S (w[10], w[11], selector); - w[29] = __byte_perm_S (w[ 9], w[10], selector); - w[28] = __byte_perm_S (w[ 8], w[ 9], selector); - w[27] = __byte_perm_S (w[ 7], w[ 8], selector); - w[26] = __byte_perm_S (w[ 6], w[ 7], selector); - w[25] = __byte_perm_S (w[ 5], w[ 6], selector); - w[24] = __byte_perm_S (w[ 4], w[ 5], selector); - w[23] = __byte_perm_S (w[ 3], w[ 4], selector); - w[22] = __byte_perm_S (w[ 2], w[ 3], selector); - w[21] = __byte_perm_S (w[ 1], w[ 2], selector); - w[20] = __byte_perm_S (w[ 0], w[ 1], selector); - w[19] = __byte_perm_S ( 0, w[ 0], selector); + w[63] = hc_byte_perm_S (w[43], w[44], selector); + w[62] = hc_byte_perm_S (w[42], w[43], selector); + w[61] = hc_byte_perm_S (w[41], w[42], selector); + w[60] = hc_byte_perm_S (w[40], w[41], selector); + w[59] = hc_byte_perm_S (w[39], w[40], selector); + w[58] = hc_byte_perm_S (w[38], w[39], selector); + w[57] = hc_byte_perm_S (w[37], w[38], selector); + w[56] = hc_byte_perm_S (w[36], w[37], selector); + w[55] = hc_byte_perm_S (w[35], w[36], selector); + w[54] = hc_byte_perm_S (w[34], w[35], selector); + w[53] = hc_byte_perm_S (w[33], w[34], selector); + w[52] = hc_byte_perm_S (w[32], w[33], selector); + w[51] = hc_byte_perm_S (w[31], w[32], selector); + w[50] = hc_byte_perm_S (w[30], w[31], selector); + w[49] = hc_byte_perm_S (w[29], w[30], selector); + w[48] = hc_byte_perm_S (w[28], w[29], selector); + w[47] = hc_byte_perm_S (w[27], w[28], selector); + w[46] = hc_byte_perm_S (w[26], w[27], selector); + w[45] = hc_byte_perm_S (w[25], w[26], selector); + w[44] = hc_byte_perm_S (w[24], w[25], selector); + w[43] = hc_byte_perm_S (w[23], w[24], selector); + w[42] = hc_byte_perm_S (w[22], w[23], selector); + w[41] = hc_byte_perm_S (w[21], w[22], selector); + w[40] = hc_byte_perm_S (w[20], w[21], selector); + w[39] = hc_byte_perm_S (w[19], w[20], selector); + w[38] = hc_byte_perm_S (w[18], w[19], selector); + w[37] = hc_byte_perm_S (w[17], w[18], selector); + w[36] = hc_byte_perm_S (w[16], w[17], selector); + w[35] = hc_byte_perm_S (w[15], w[16], selector); + w[34] = hc_byte_perm_S (w[14], w[15], selector); + w[33] = hc_byte_perm_S (w[13], w[14], selector); + w[32] = hc_byte_perm_S (w[12], w[13], selector); + w[31] = hc_byte_perm_S (w[11], w[12], selector); + w[30] = hc_byte_perm_S (w[10], w[11], selector); + w[29] = hc_byte_perm_S (w[ 9], w[10], selector); + w[28] = hc_byte_perm_S (w[ 8], w[ 9], selector); + w[27] = hc_byte_perm_S (w[ 7], w[ 8], selector); + w[26] = hc_byte_perm_S (w[ 6], w[ 7], selector); + w[25] = hc_byte_perm_S (w[ 5], w[ 6], selector); + w[24] = hc_byte_perm_S (w[ 4], w[ 5], selector); + w[23] = hc_byte_perm_S (w[ 3], w[ 4], selector); + w[22] = hc_byte_perm_S (w[ 2], w[ 3], selector); + w[21] = hc_byte_perm_S (w[ 1], w[ 2], selector); + w[20] = hc_byte_perm_S (w[ 0], w[ 1], selector); + w[19] = hc_byte_perm_S ( 0, w[ 0], selector); w[18] = 0; w[17] = 0; w[16] = 0; @@ -48084,50 +48084,50 @@ DECLSPEC void switch_buffer_by_offset_1x64_le_S (u32 *w, const u32 offset) break; case 20: - w[63] = __byte_perm_S (w[42], w[43], selector); - w[62] = __byte_perm_S (w[41], w[42], selector); - w[61] = __byte_perm_S (w[40], w[41], selector); - w[60] = __byte_perm_S (w[39], w[40], selector); - w[59] = __byte_perm_S (w[38], w[39], selector); - w[58] = __byte_perm_S (w[37], w[38], selector); - w[57] = __byte_perm_S (w[36], w[37], selector); - w[56] = __byte_perm_S (w[35], w[36], selector); - w[55] = __byte_perm_S (w[34], w[35], selector); - w[54] = __byte_perm_S (w[33], w[34], selector); - w[53] = __byte_perm_S (w[32], w[33], selector); - w[52] = __byte_perm_S (w[31], w[32], selector); - w[51] = __byte_perm_S (w[30], w[31], selector); - w[50] = __byte_perm_S (w[29], w[30], selector); - w[49] = __byte_perm_S (w[28], w[29], selector); - w[48] = __byte_perm_S (w[27], w[28], selector); - w[47] = __byte_perm_S (w[26], w[27], selector); - w[46] = __byte_perm_S (w[25], w[26], selector); - w[45] = __byte_perm_S (w[24], w[25], selector); - w[44] = __byte_perm_S (w[23], w[24], selector); - w[43] = __byte_perm_S (w[22], w[23], selector); - w[42] = __byte_perm_S (w[21], w[22], selector); - w[41] = __byte_perm_S (w[20], w[21], selector); - w[40] = __byte_perm_S (w[19], w[20], selector); - w[39] = __byte_perm_S (w[18], w[19], selector); - w[38] = __byte_perm_S (w[17], w[18], selector); - w[37] = __byte_perm_S (w[16], w[17], selector); - w[36] = __byte_perm_S (w[15], w[16], selector); - w[35] = __byte_perm_S (w[14], w[15], selector); - w[34] = __byte_perm_S (w[13], w[14], selector); - w[33] = __byte_perm_S (w[12], w[13], selector); - w[32] = __byte_perm_S (w[11], w[12], selector); - w[31] = __byte_perm_S (w[10], w[11], selector); - w[30] = __byte_perm_S (w[ 9], w[10], selector); - w[29] = __byte_perm_S (w[ 8], w[ 9], selector); - w[28] = __byte_perm_S (w[ 7], w[ 8], selector); - w[27] = __byte_perm_S (w[ 6], w[ 7], selector); - w[26] = __byte_perm_S (w[ 5], w[ 6], selector); - w[25] = __byte_perm_S (w[ 4], w[ 5], selector); - w[24] = __byte_perm_S (w[ 3], w[ 4], selector); - w[23] = __byte_perm_S (w[ 2], w[ 3], selector); - w[22] = __byte_perm_S (w[ 1], w[ 2], selector); - w[21] = __byte_perm_S (w[ 0], w[ 1], selector); - w[20] = __byte_perm_S ( 0, w[ 0], selector); + w[63] = hc_byte_perm_S (w[42], w[43], selector); + w[62] = hc_byte_perm_S (w[41], w[42], selector); + w[61] = hc_byte_perm_S (w[40], w[41], selector); + w[60] = hc_byte_perm_S (w[39], w[40], selector); + w[59] = hc_byte_perm_S (w[38], w[39], selector); + w[58] = hc_byte_perm_S (w[37], w[38], selector); + w[57] = hc_byte_perm_S (w[36], w[37], selector); + w[56] = hc_byte_perm_S (w[35], w[36], selector); + w[55] = hc_byte_perm_S (w[34], w[35], selector); + w[54] = hc_byte_perm_S (w[33], w[34], selector); + w[53] = hc_byte_perm_S (w[32], w[33], selector); + w[52] = hc_byte_perm_S (w[31], w[32], selector); + w[51] = hc_byte_perm_S (w[30], w[31], selector); + w[50] = hc_byte_perm_S (w[29], w[30], selector); + w[49] = hc_byte_perm_S (w[28], w[29], selector); + w[48] = hc_byte_perm_S (w[27], w[28], selector); + w[47] = hc_byte_perm_S (w[26], w[27], selector); + w[46] = hc_byte_perm_S (w[25], w[26], selector); + w[45] = hc_byte_perm_S (w[24], w[25], selector); + w[44] = hc_byte_perm_S (w[23], w[24], selector); + w[43] = hc_byte_perm_S (w[22], w[23], selector); + w[42] = hc_byte_perm_S (w[21], w[22], selector); + w[41] = hc_byte_perm_S (w[20], w[21], selector); + w[40] = hc_byte_perm_S (w[19], w[20], selector); + w[39] = hc_byte_perm_S (w[18], w[19], selector); + w[38] = hc_byte_perm_S (w[17], w[18], selector); + w[37] = hc_byte_perm_S (w[16], w[17], selector); + w[36] = hc_byte_perm_S (w[15], w[16], selector); + w[35] = hc_byte_perm_S (w[14], w[15], selector); + w[34] = hc_byte_perm_S (w[13], w[14], selector); + w[33] = hc_byte_perm_S (w[12], w[13], selector); + w[32] = hc_byte_perm_S (w[11], w[12], selector); + w[31] = hc_byte_perm_S (w[10], w[11], selector); + w[30] = hc_byte_perm_S (w[ 9], w[10], selector); + w[29] = hc_byte_perm_S (w[ 8], w[ 9], selector); + w[28] = hc_byte_perm_S (w[ 7], w[ 8], selector); + w[27] = hc_byte_perm_S (w[ 6], w[ 7], selector); + w[26] = hc_byte_perm_S (w[ 5], w[ 6], selector); + w[25] = hc_byte_perm_S (w[ 4], w[ 5], selector); + w[24] = hc_byte_perm_S (w[ 3], w[ 4], selector); + w[23] = hc_byte_perm_S (w[ 2], w[ 3], selector); + w[22] = hc_byte_perm_S (w[ 1], w[ 2], selector); + w[21] = hc_byte_perm_S (w[ 0], w[ 1], selector); + w[20] = hc_byte_perm_S ( 0, w[ 0], selector); w[19] = 0; w[18] = 0; w[17] = 0; @@ -48152,49 +48152,49 @@ DECLSPEC void switch_buffer_by_offset_1x64_le_S (u32 *w, const u32 offset) break; case 21: - w[63] = __byte_perm_S (w[41], w[42], selector); - w[62] = __byte_perm_S (w[40], w[41], selector); - w[61] = __byte_perm_S (w[39], w[40], selector); - w[60] = __byte_perm_S (w[38], w[39], selector); - w[59] = __byte_perm_S (w[37], w[38], selector); - w[58] = __byte_perm_S (w[36], w[37], selector); - w[57] = __byte_perm_S (w[35], w[36], selector); - w[56] = __byte_perm_S (w[34], w[35], selector); - w[55] = __byte_perm_S (w[33], w[34], selector); - w[54] = __byte_perm_S (w[32], w[33], selector); - w[53] = __byte_perm_S (w[31], w[32], selector); - w[52] = __byte_perm_S (w[30], w[31], selector); - w[51] = __byte_perm_S (w[29], w[30], selector); - w[50] = __byte_perm_S (w[28], w[29], selector); - w[49] = __byte_perm_S (w[27], w[28], selector); - w[48] = __byte_perm_S (w[26], w[27], selector); - w[47] = __byte_perm_S (w[25], w[26], selector); - w[46] = __byte_perm_S (w[24], w[25], selector); - w[45] = __byte_perm_S (w[23], w[24], selector); - w[44] = __byte_perm_S (w[22], w[23], selector); - w[43] = __byte_perm_S (w[21], w[22], selector); - w[42] = __byte_perm_S (w[20], w[21], selector); - w[41] = __byte_perm_S (w[19], w[20], selector); - w[40] = __byte_perm_S (w[18], w[19], selector); - w[39] = __byte_perm_S (w[17], w[18], selector); - w[38] = __byte_perm_S (w[16], w[17], selector); - w[37] = __byte_perm_S (w[15], w[16], selector); - w[36] = __byte_perm_S (w[14], w[15], selector); - w[35] = __byte_perm_S (w[13], w[14], selector); - w[34] = __byte_perm_S (w[12], w[13], selector); - w[33] = __byte_perm_S (w[11], w[12], selector); - w[32] = __byte_perm_S (w[10], w[11], selector); - w[31] = __byte_perm_S (w[ 9], w[10], selector); - w[30] = __byte_perm_S (w[ 8], w[ 9], selector); - w[29] = __byte_perm_S (w[ 7], w[ 8], selector); - w[28] = __byte_perm_S (w[ 6], w[ 7], selector); - w[27] = __byte_perm_S (w[ 5], w[ 6], selector); - w[26] = __byte_perm_S (w[ 4], w[ 5], selector); - w[25] = __byte_perm_S (w[ 3], w[ 4], selector); - w[24] = __byte_perm_S (w[ 2], w[ 3], selector); - w[23] = __byte_perm_S (w[ 1], w[ 2], selector); - w[22] = __byte_perm_S (w[ 0], w[ 1], selector); - w[21] = __byte_perm_S ( 0, w[ 0], selector); + w[63] = hc_byte_perm_S (w[41], w[42], selector); + w[62] = hc_byte_perm_S (w[40], w[41], selector); + w[61] = hc_byte_perm_S (w[39], w[40], selector); + w[60] = hc_byte_perm_S (w[38], w[39], selector); + w[59] = hc_byte_perm_S (w[37], w[38], selector); + w[58] = hc_byte_perm_S (w[36], w[37], selector); + w[57] = hc_byte_perm_S (w[35], w[36], selector); + w[56] = hc_byte_perm_S (w[34], w[35], selector); + w[55] = hc_byte_perm_S (w[33], w[34], selector); + w[54] = hc_byte_perm_S (w[32], w[33], selector); + w[53] = hc_byte_perm_S (w[31], w[32], selector); + w[52] = hc_byte_perm_S (w[30], w[31], selector); + w[51] = hc_byte_perm_S (w[29], w[30], selector); + w[50] = hc_byte_perm_S (w[28], w[29], selector); + w[49] = hc_byte_perm_S (w[27], w[28], selector); + w[48] = hc_byte_perm_S (w[26], w[27], selector); + w[47] = hc_byte_perm_S (w[25], w[26], selector); + w[46] = hc_byte_perm_S (w[24], w[25], selector); + w[45] = hc_byte_perm_S (w[23], w[24], selector); + w[44] = hc_byte_perm_S (w[22], w[23], selector); + w[43] = hc_byte_perm_S (w[21], w[22], selector); + w[42] = hc_byte_perm_S (w[20], w[21], selector); + w[41] = hc_byte_perm_S (w[19], w[20], selector); + w[40] = hc_byte_perm_S (w[18], w[19], selector); + w[39] = hc_byte_perm_S (w[17], w[18], selector); + w[38] = hc_byte_perm_S (w[16], w[17], selector); + w[37] = hc_byte_perm_S (w[15], w[16], selector); + w[36] = hc_byte_perm_S (w[14], w[15], selector); + w[35] = hc_byte_perm_S (w[13], w[14], selector); + w[34] = hc_byte_perm_S (w[12], w[13], selector); + w[33] = hc_byte_perm_S (w[11], w[12], selector); + w[32] = hc_byte_perm_S (w[10], w[11], selector); + w[31] = hc_byte_perm_S (w[ 9], w[10], selector); + w[30] = hc_byte_perm_S (w[ 8], w[ 9], selector); + w[29] = hc_byte_perm_S (w[ 7], w[ 8], selector); + w[28] = hc_byte_perm_S (w[ 6], w[ 7], selector); + w[27] = hc_byte_perm_S (w[ 5], w[ 6], selector); + w[26] = hc_byte_perm_S (w[ 4], w[ 5], selector); + w[25] = hc_byte_perm_S (w[ 3], w[ 4], selector); + w[24] = hc_byte_perm_S (w[ 2], w[ 3], selector); + w[23] = hc_byte_perm_S (w[ 1], w[ 2], selector); + w[22] = hc_byte_perm_S (w[ 0], w[ 1], selector); + w[21] = hc_byte_perm_S ( 0, w[ 0], selector); w[20] = 0; w[19] = 0; w[18] = 0; @@ -48220,48 +48220,48 @@ DECLSPEC void switch_buffer_by_offset_1x64_le_S (u32 *w, const u32 offset) break; case 22: - w[63] = __byte_perm_S (w[40], w[41], selector); - w[62] = __byte_perm_S (w[39], w[40], selector); - w[61] = __byte_perm_S (w[38], w[39], selector); - w[60] = __byte_perm_S (w[37], w[38], selector); - w[59] = __byte_perm_S (w[36], w[37], selector); - w[58] = __byte_perm_S (w[35], w[36], selector); - w[57] = __byte_perm_S (w[34], w[35], selector); - w[56] = __byte_perm_S (w[33], w[34], selector); - w[55] = __byte_perm_S (w[32], w[33], selector); - w[54] = __byte_perm_S (w[31], w[32], selector); - w[53] = __byte_perm_S (w[30], w[31], selector); - w[52] = __byte_perm_S (w[29], w[30], selector); - w[51] = __byte_perm_S (w[28], w[29], selector); - w[50] = __byte_perm_S (w[27], w[28], selector); - w[49] = __byte_perm_S (w[26], w[27], selector); - w[48] = __byte_perm_S (w[25], w[26], selector); - w[47] = __byte_perm_S (w[24], w[25], selector); - w[46] = __byte_perm_S (w[23], w[24], selector); - w[45] = __byte_perm_S (w[22], w[23], selector); - w[44] = __byte_perm_S (w[21], w[22], selector); - w[43] = __byte_perm_S (w[20], w[21], selector); - w[42] = __byte_perm_S (w[19], w[20], selector); - w[41] = __byte_perm_S (w[18], w[19], selector); - w[40] = __byte_perm_S (w[17], w[18], selector); - w[39] = __byte_perm_S (w[16], w[17], selector); - w[38] = __byte_perm_S (w[15], w[16], selector); - w[37] = __byte_perm_S (w[14], w[15], selector); - w[36] = __byte_perm_S (w[13], w[14], selector); - w[35] = __byte_perm_S (w[12], w[13], selector); - w[34] = __byte_perm_S (w[11], w[12], selector); - w[33] = __byte_perm_S (w[10], w[11], selector); - w[32] = __byte_perm_S (w[ 9], w[10], selector); - w[31] = __byte_perm_S (w[ 8], w[ 9], selector); - w[30] = __byte_perm_S (w[ 7], w[ 8], selector); - w[29] = __byte_perm_S (w[ 6], w[ 7], selector); - w[28] = __byte_perm_S (w[ 5], w[ 6], selector); - w[27] = __byte_perm_S (w[ 4], w[ 5], selector); - w[26] = __byte_perm_S (w[ 3], w[ 4], selector); - w[25] = __byte_perm_S (w[ 2], w[ 3], selector); - w[24] = __byte_perm_S (w[ 1], w[ 2], selector); - w[23] = __byte_perm_S (w[ 0], w[ 1], selector); - w[22] = __byte_perm_S ( 0, w[ 0], selector); + w[63] = hc_byte_perm_S (w[40], w[41], selector); + w[62] = hc_byte_perm_S (w[39], w[40], selector); + w[61] = hc_byte_perm_S (w[38], w[39], selector); + w[60] = hc_byte_perm_S (w[37], w[38], selector); + w[59] = hc_byte_perm_S (w[36], w[37], selector); + w[58] = hc_byte_perm_S (w[35], w[36], selector); + w[57] = hc_byte_perm_S (w[34], w[35], selector); + w[56] = hc_byte_perm_S (w[33], w[34], selector); + w[55] = hc_byte_perm_S (w[32], w[33], selector); + w[54] = hc_byte_perm_S (w[31], w[32], selector); + w[53] = hc_byte_perm_S (w[30], w[31], selector); + w[52] = hc_byte_perm_S (w[29], w[30], selector); + w[51] = hc_byte_perm_S (w[28], w[29], selector); + w[50] = hc_byte_perm_S (w[27], w[28], selector); + w[49] = hc_byte_perm_S (w[26], w[27], selector); + w[48] = hc_byte_perm_S (w[25], w[26], selector); + w[47] = hc_byte_perm_S (w[24], w[25], selector); + w[46] = hc_byte_perm_S (w[23], w[24], selector); + w[45] = hc_byte_perm_S (w[22], w[23], selector); + w[44] = hc_byte_perm_S (w[21], w[22], selector); + w[43] = hc_byte_perm_S (w[20], w[21], selector); + w[42] = hc_byte_perm_S (w[19], w[20], selector); + w[41] = hc_byte_perm_S (w[18], w[19], selector); + w[40] = hc_byte_perm_S (w[17], w[18], selector); + w[39] = hc_byte_perm_S (w[16], w[17], selector); + w[38] = hc_byte_perm_S (w[15], w[16], selector); + w[37] = hc_byte_perm_S (w[14], w[15], selector); + w[36] = hc_byte_perm_S (w[13], w[14], selector); + w[35] = hc_byte_perm_S (w[12], w[13], selector); + w[34] = hc_byte_perm_S (w[11], w[12], selector); + w[33] = hc_byte_perm_S (w[10], w[11], selector); + w[32] = hc_byte_perm_S (w[ 9], w[10], selector); + w[31] = hc_byte_perm_S (w[ 8], w[ 9], selector); + w[30] = hc_byte_perm_S (w[ 7], w[ 8], selector); + w[29] = hc_byte_perm_S (w[ 6], w[ 7], selector); + w[28] = hc_byte_perm_S (w[ 5], w[ 6], selector); + w[27] = hc_byte_perm_S (w[ 4], w[ 5], selector); + w[26] = hc_byte_perm_S (w[ 3], w[ 4], selector); + w[25] = hc_byte_perm_S (w[ 2], w[ 3], selector); + w[24] = hc_byte_perm_S (w[ 1], w[ 2], selector); + w[23] = hc_byte_perm_S (w[ 0], w[ 1], selector); + w[22] = hc_byte_perm_S ( 0, w[ 0], selector); w[21] = 0; w[20] = 0; w[19] = 0; @@ -48288,47 +48288,47 @@ DECLSPEC void switch_buffer_by_offset_1x64_le_S (u32 *w, const u32 offset) break; case 23: - w[63] = __byte_perm_S (w[39], w[40], selector); - w[62] = __byte_perm_S (w[38], w[39], selector); - w[61] = __byte_perm_S (w[37], w[38], selector); - w[60] = __byte_perm_S (w[36], w[37], selector); - w[59] = __byte_perm_S (w[35], w[36], selector); - w[58] = __byte_perm_S (w[34], w[35], selector); - w[57] = __byte_perm_S (w[33], w[34], selector); - w[56] = __byte_perm_S (w[32], w[33], selector); - w[55] = __byte_perm_S (w[31], w[32], selector); - w[54] = __byte_perm_S (w[30], w[31], selector); - w[53] = __byte_perm_S (w[29], w[30], selector); - w[52] = __byte_perm_S (w[28], w[29], selector); - w[51] = __byte_perm_S (w[27], w[28], selector); - w[50] = __byte_perm_S (w[26], w[27], selector); - w[49] = __byte_perm_S (w[25], w[26], selector); - w[48] = __byte_perm_S (w[24], w[25], selector); - w[47] = __byte_perm_S (w[23], w[24], selector); - w[46] = __byte_perm_S (w[22], w[23], selector); - w[45] = __byte_perm_S (w[21], w[22], selector); - w[44] = __byte_perm_S (w[20], w[21], selector); - w[43] = __byte_perm_S (w[19], w[20], selector); - w[42] = __byte_perm_S (w[18], w[19], selector); - w[41] = __byte_perm_S (w[17], w[18], selector); - w[40] = __byte_perm_S (w[16], w[17], selector); - w[39] = __byte_perm_S (w[15], w[16], selector); - w[38] = __byte_perm_S (w[14], w[15], selector); - w[37] = __byte_perm_S (w[13], w[14], selector); - w[36] = __byte_perm_S (w[12], w[13], selector); - w[35] = __byte_perm_S (w[11], w[12], selector); - w[34] = __byte_perm_S (w[10], w[11], selector); - w[33] = __byte_perm_S (w[ 9], w[10], selector); - w[32] = __byte_perm_S (w[ 8], w[ 9], selector); - w[31] = __byte_perm_S (w[ 7], w[ 8], selector); - w[30] = __byte_perm_S (w[ 6], w[ 7], selector); - w[29] = __byte_perm_S (w[ 5], w[ 6], selector); - w[28] = __byte_perm_S (w[ 4], w[ 5], selector); - w[27] = __byte_perm_S (w[ 3], w[ 4], selector); - w[26] = __byte_perm_S (w[ 2], w[ 3], selector); - w[25] = __byte_perm_S (w[ 1], w[ 2], selector); - w[24] = __byte_perm_S (w[ 0], w[ 1], selector); - w[23] = __byte_perm_S ( 0, w[ 0], selector); + w[63] = hc_byte_perm_S (w[39], w[40], selector); + w[62] = hc_byte_perm_S (w[38], w[39], selector); + w[61] = hc_byte_perm_S (w[37], w[38], selector); + w[60] = hc_byte_perm_S (w[36], w[37], selector); + w[59] = hc_byte_perm_S (w[35], w[36], selector); + w[58] = hc_byte_perm_S (w[34], w[35], selector); + w[57] = hc_byte_perm_S (w[33], w[34], selector); + w[56] = hc_byte_perm_S (w[32], w[33], selector); + w[55] = hc_byte_perm_S (w[31], w[32], selector); + w[54] = hc_byte_perm_S (w[30], w[31], selector); + w[53] = hc_byte_perm_S (w[29], w[30], selector); + w[52] = hc_byte_perm_S (w[28], w[29], selector); + w[51] = hc_byte_perm_S (w[27], w[28], selector); + w[50] = hc_byte_perm_S (w[26], w[27], selector); + w[49] = hc_byte_perm_S (w[25], w[26], selector); + w[48] = hc_byte_perm_S (w[24], w[25], selector); + w[47] = hc_byte_perm_S (w[23], w[24], selector); + w[46] = hc_byte_perm_S (w[22], w[23], selector); + w[45] = hc_byte_perm_S (w[21], w[22], selector); + w[44] = hc_byte_perm_S (w[20], w[21], selector); + w[43] = hc_byte_perm_S (w[19], w[20], selector); + w[42] = hc_byte_perm_S (w[18], w[19], selector); + w[41] = hc_byte_perm_S (w[17], w[18], selector); + w[40] = hc_byte_perm_S (w[16], w[17], selector); + w[39] = hc_byte_perm_S (w[15], w[16], selector); + w[38] = hc_byte_perm_S (w[14], w[15], selector); + w[37] = hc_byte_perm_S (w[13], w[14], selector); + w[36] = hc_byte_perm_S (w[12], w[13], selector); + w[35] = hc_byte_perm_S (w[11], w[12], selector); + w[34] = hc_byte_perm_S (w[10], w[11], selector); + w[33] = hc_byte_perm_S (w[ 9], w[10], selector); + w[32] = hc_byte_perm_S (w[ 8], w[ 9], selector); + w[31] = hc_byte_perm_S (w[ 7], w[ 8], selector); + w[30] = hc_byte_perm_S (w[ 6], w[ 7], selector); + w[29] = hc_byte_perm_S (w[ 5], w[ 6], selector); + w[28] = hc_byte_perm_S (w[ 4], w[ 5], selector); + w[27] = hc_byte_perm_S (w[ 3], w[ 4], selector); + w[26] = hc_byte_perm_S (w[ 2], w[ 3], selector); + w[25] = hc_byte_perm_S (w[ 1], w[ 2], selector); + w[24] = hc_byte_perm_S (w[ 0], w[ 1], selector); + w[23] = hc_byte_perm_S ( 0, w[ 0], selector); w[22] = 0; w[21] = 0; w[20] = 0; @@ -48356,46 +48356,46 @@ DECLSPEC void switch_buffer_by_offset_1x64_le_S (u32 *w, const u32 offset) break; case 24: - w[63] = __byte_perm_S (w[38], w[39], selector); - w[62] = __byte_perm_S (w[37], w[38], selector); - w[61] = __byte_perm_S (w[36], w[37], selector); - w[60] = __byte_perm_S (w[35], w[36], selector); - w[59] = __byte_perm_S (w[34], w[35], selector); - w[58] = __byte_perm_S (w[33], w[34], selector); - w[57] = __byte_perm_S (w[32], w[33], selector); - w[56] = __byte_perm_S (w[31], w[32], selector); - w[55] = __byte_perm_S (w[30], w[31], selector); - w[54] = __byte_perm_S (w[29], w[30], selector); - w[53] = __byte_perm_S (w[28], w[29], selector); - w[52] = __byte_perm_S (w[27], w[28], selector); - w[51] = __byte_perm_S (w[26], w[27], selector); - w[50] = __byte_perm_S (w[25], w[26], selector); - w[49] = __byte_perm_S (w[24], w[25], selector); - w[48] = __byte_perm_S (w[23], w[24], selector); - w[47] = __byte_perm_S (w[22], w[23], selector); - w[46] = __byte_perm_S (w[21], w[22], selector); - w[45] = __byte_perm_S (w[20], w[21], selector); - w[44] = __byte_perm_S (w[19], w[20], selector); - w[43] = __byte_perm_S (w[18], w[19], selector); - w[42] = __byte_perm_S (w[17], w[18], selector); - w[41] = __byte_perm_S (w[16], w[17], selector); - w[40] = __byte_perm_S (w[15], w[16], selector); - w[39] = __byte_perm_S (w[14], w[15], selector); - w[38] = __byte_perm_S (w[13], w[14], selector); - w[37] = __byte_perm_S (w[12], w[13], selector); - w[36] = __byte_perm_S (w[11], w[12], selector); - w[35] = __byte_perm_S (w[10], w[11], selector); - w[34] = __byte_perm_S (w[ 9], w[10], selector); - w[33] = __byte_perm_S (w[ 8], w[ 9], selector); - w[32] = __byte_perm_S (w[ 7], w[ 8], selector); - w[31] = __byte_perm_S (w[ 6], w[ 7], selector); - w[30] = __byte_perm_S (w[ 5], w[ 6], selector); - w[29] = __byte_perm_S (w[ 4], w[ 5], selector); - w[28] = __byte_perm_S (w[ 3], w[ 4], selector); - w[27] = __byte_perm_S (w[ 2], w[ 3], selector); - w[26] = __byte_perm_S (w[ 1], w[ 2], selector); - w[25] = __byte_perm_S (w[ 0], w[ 1], selector); - w[24] = __byte_perm_S ( 0, w[ 0], selector); + w[63] = hc_byte_perm_S (w[38], w[39], selector); + w[62] = hc_byte_perm_S (w[37], w[38], selector); + w[61] = hc_byte_perm_S (w[36], w[37], selector); + w[60] = hc_byte_perm_S (w[35], w[36], selector); + w[59] = hc_byte_perm_S (w[34], w[35], selector); + w[58] = hc_byte_perm_S (w[33], w[34], selector); + w[57] = hc_byte_perm_S (w[32], w[33], selector); + w[56] = hc_byte_perm_S (w[31], w[32], selector); + w[55] = hc_byte_perm_S (w[30], w[31], selector); + w[54] = hc_byte_perm_S (w[29], w[30], selector); + w[53] = hc_byte_perm_S (w[28], w[29], selector); + w[52] = hc_byte_perm_S (w[27], w[28], selector); + w[51] = hc_byte_perm_S (w[26], w[27], selector); + w[50] = hc_byte_perm_S (w[25], w[26], selector); + w[49] = hc_byte_perm_S (w[24], w[25], selector); + w[48] = hc_byte_perm_S (w[23], w[24], selector); + w[47] = hc_byte_perm_S (w[22], w[23], selector); + w[46] = hc_byte_perm_S (w[21], w[22], selector); + w[45] = hc_byte_perm_S (w[20], w[21], selector); + w[44] = hc_byte_perm_S (w[19], w[20], selector); + w[43] = hc_byte_perm_S (w[18], w[19], selector); + w[42] = hc_byte_perm_S (w[17], w[18], selector); + w[41] = hc_byte_perm_S (w[16], w[17], selector); + w[40] = hc_byte_perm_S (w[15], w[16], selector); + w[39] = hc_byte_perm_S (w[14], w[15], selector); + w[38] = hc_byte_perm_S (w[13], w[14], selector); + w[37] = hc_byte_perm_S (w[12], w[13], selector); + w[36] = hc_byte_perm_S (w[11], w[12], selector); + w[35] = hc_byte_perm_S (w[10], w[11], selector); + w[34] = hc_byte_perm_S (w[ 9], w[10], selector); + w[33] = hc_byte_perm_S (w[ 8], w[ 9], selector); + w[32] = hc_byte_perm_S (w[ 7], w[ 8], selector); + w[31] = hc_byte_perm_S (w[ 6], w[ 7], selector); + w[30] = hc_byte_perm_S (w[ 5], w[ 6], selector); + w[29] = hc_byte_perm_S (w[ 4], w[ 5], selector); + w[28] = hc_byte_perm_S (w[ 3], w[ 4], selector); + w[27] = hc_byte_perm_S (w[ 2], w[ 3], selector); + w[26] = hc_byte_perm_S (w[ 1], w[ 2], selector); + w[25] = hc_byte_perm_S (w[ 0], w[ 1], selector); + w[24] = hc_byte_perm_S ( 0, w[ 0], selector); w[23] = 0; w[22] = 0; w[21] = 0; @@ -48424,45 +48424,45 @@ DECLSPEC void switch_buffer_by_offset_1x64_le_S (u32 *w, const u32 offset) break; case 25: - w[63] = __byte_perm_S (w[37], w[38], selector); - w[62] = __byte_perm_S (w[36], w[37], selector); - w[61] = __byte_perm_S (w[35], w[36], selector); - w[60] = __byte_perm_S (w[34], w[35], selector); - w[59] = __byte_perm_S (w[33], w[34], selector); - w[58] = __byte_perm_S (w[32], w[33], selector); - w[57] = __byte_perm_S (w[31], w[32], selector); - w[56] = __byte_perm_S (w[30], w[31], selector); - w[55] = __byte_perm_S (w[29], w[30], selector); - w[54] = __byte_perm_S (w[28], w[29], selector); - w[53] = __byte_perm_S (w[27], w[28], selector); - w[52] = __byte_perm_S (w[26], w[27], selector); - w[51] = __byte_perm_S (w[25], w[26], selector); - w[50] = __byte_perm_S (w[24], w[25], selector); - w[49] = __byte_perm_S (w[23], w[24], selector); - w[48] = __byte_perm_S (w[22], w[23], selector); - w[47] = __byte_perm_S (w[21], w[22], selector); - w[46] = __byte_perm_S (w[20], w[21], selector); - w[45] = __byte_perm_S (w[19], w[20], selector); - w[44] = __byte_perm_S (w[18], w[19], selector); - w[43] = __byte_perm_S (w[17], w[18], selector); - w[42] = __byte_perm_S (w[16], w[17], selector); - w[41] = __byte_perm_S (w[15], w[16], selector); - w[40] = __byte_perm_S (w[14], w[15], selector); - w[39] = __byte_perm_S (w[13], w[14], selector); - w[38] = __byte_perm_S (w[12], w[13], selector); - w[37] = __byte_perm_S (w[11], w[12], selector); - w[36] = __byte_perm_S (w[10], w[11], selector); - w[35] = __byte_perm_S (w[ 9], w[10], selector); - w[34] = __byte_perm_S (w[ 8], w[ 9], selector); - w[33] = __byte_perm_S (w[ 7], w[ 8], selector); - w[32] = __byte_perm_S (w[ 6], w[ 7], selector); - w[31] = __byte_perm_S (w[ 5], w[ 6], selector); - w[30] = __byte_perm_S (w[ 4], w[ 5], selector); - w[29] = __byte_perm_S (w[ 3], w[ 4], selector); - w[28] = __byte_perm_S (w[ 2], w[ 3], selector); - w[27] = __byte_perm_S (w[ 1], w[ 2], selector); - w[26] = __byte_perm_S (w[ 0], w[ 1], selector); - w[25] = __byte_perm_S ( 0, w[ 0], selector); + w[63] = hc_byte_perm_S (w[37], w[38], selector); + w[62] = hc_byte_perm_S (w[36], w[37], selector); + w[61] = hc_byte_perm_S (w[35], w[36], selector); + w[60] = hc_byte_perm_S (w[34], w[35], selector); + w[59] = hc_byte_perm_S (w[33], w[34], selector); + w[58] = hc_byte_perm_S (w[32], w[33], selector); + w[57] = hc_byte_perm_S (w[31], w[32], selector); + w[56] = hc_byte_perm_S (w[30], w[31], selector); + w[55] = hc_byte_perm_S (w[29], w[30], selector); + w[54] = hc_byte_perm_S (w[28], w[29], selector); + w[53] = hc_byte_perm_S (w[27], w[28], selector); + w[52] = hc_byte_perm_S (w[26], w[27], selector); + w[51] = hc_byte_perm_S (w[25], w[26], selector); + w[50] = hc_byte_perm_S (w[24], w[25], selector); + w[49] = hc_byte_perm_S (w[23], w[24], selector); + w[48] = hc_byte_perm_S (w[22], w[23], selector); + w[47] = hc_byte_perm_S (w[21], w[22], selector); + w[46] = hc_byte_perm_S (w[20], w[21], selector); + w[45] = hc_byte_perm_S (w[19], w[20], selector); + w[44] = hc_byte_perm_S (w[18], w[19], selector); + w[43] = hc_byte_perm_S (w[17], w[18], selector); + w[42] = hc_byte_perm_S (w[16], w[17], selector); + w[41] = hc_byte_perm_S (w[15], w[16], selector); + w[40] = hc_byte_perm_S (w[14], w[15], selector); + w[39] = hc_byte_perm_S (w[13], w[14], selector); + w[38] = hc_byte_perm_S (w[12], w[13], selector); + w[37] = hc_byte_perm_S (w[11], w[12], selector); + w[36] = hc_byte_perm_S (w[10], w[11], selector); + w[35] = hc_byte_perm_S (w[ 9], w[10], selector); + w[34] = hc_byte_perm_S (w[ 8], w[ 9], selector); + w[33] = hc_byte_perm_S (w[ 7], w[ 8], selector); + w[32] = hc_byte_perm_S (w[ 6], w[ 7], selector); + w[31] = hc_byte_perm_S (w[ 5], w[ 6], selector); + w[30] = hc_byte_perm_S (w[ 4], w[ 5], selector); + w[29] = hc_byte_perm_S (w[ 3], w[ 4], selector); + w[28] = hc_byte_perm_S (w[ 2], w[ 3], selector); + w[27] = hc_byte_perm_S (w[ 1], w[ 2], selector); + w[26] = hc_byte_perm_S (w[ 0], w[ 1], selector); + w[25] = hc_byte_perm_S ( 0, w[ 0], selector); w[24] = 0; w[23] = 0; w[22] = 0; @@ -48492,44 +48492,44 @@ DECLSPEC void switch_buffer_by_offset_1x64_le_S (u32 *w, const u32 offset) break; case 26: - w[63] = __byte_perm_S (w[36], w[37], selector); - w[62] = __byte_perm_S (w[35], w[36], selector); - w[61] = __byte_perm_S (w[34], w[35], selector); - w[60] = __byte_perm_S (w[33], w[34], selector); - w[59] = __byte_perm_S (w[32], w[33], selector); - w[58] = __byte_perm_S (w[31], w[32], selector); - w[57] = __byte_perm_S (w[30], w[31], selector); - w[56] = __byte_perm_S (w[29], w[30], selector); - w[55] = __byte_perm_S (w[28], w[29], selector); - w[54] = __byte_perm_S (w[27], w[28], selector); - w[53] = __byte_perm_S (w[26], w[27], selector); - w[52] = __byte_perm_S (w[25], w[26], selector); - w[51] = __byte_perm_S (w[24], w[25], selector); - w[50] = __byte_perm_S (w[23], w[24], selector); - w[49] = __byte_perm_S (w[22], w[23], selector); - w[48] = __byte_perm_S (w[21], w[22], selector); - w[47] = __byte_perm_S (w[20], w[21], selector); - w[46] = __byte_perm_S (w[19], w[20], selector); - w[45] = __byte_perm_S (w[18], w[19], selector); - w[44] = __byte_perm_S (w[17], w[18], selector); - w[43] = __byte_perm_S (w[16], w[17], selector); - w[42] = __byte_perm_S (w[15], w[16], selector); - w[41] = __byte_perm_S (w[14], w[15], selector); - w[40] = __byte_perm_S (w[13], w[14], selector); - w[39] = __byte_perm_S (w[12], w[13], selector); - w[38] = __byte_perm_S (w[11], w[12], selector); - w[37] = __byte_perm_S (w[10], w[11], selector); - w[36] = __byte_perm_S (w[ 9], w[10], selector); - w[35] = __byte_perm_S (w[ 8], w[ 9], selector); - w[34] = __byte_perm_S (w[ 7], w[ 8], selector); - w[33] = __byte_perm_S (w[ 6], w[ 7], selector); - w[32] = __byte_perm_S (w[ 5], w[ 6], selector); - w[31] = __byte_perm_S (w[ 4], w[ 5], selector); - w[30] = __byte_perm_S (w[ 3], w[ 4], selector); - w[29] = __byte_perm_S (w[ 2], w[ 3], selector); - w[28] = __byte_perm_S (w[ 1], w[ 2], selector); - w[27] = __byte_perm_S (w[ 0], w[ 1], selector); - w[26] = __byte_perm_S ( 0, w[ 0], selector); + w[63] = hc_byte_perm_S (w[36], w[37], selector); + w[62] = hc_byte_perm_S (w[35], w[36], selector); + w[61] = hc_byte_perm_S (w[34], w[35], selector); + w[60] = hc_byte_perm_S (w[33], w[34], selector); + w[59] = hc_byte_perm_S (w[32], w[33], selector); + w[58] = hc_byte_perm_S (w[31], w[32], selector); + w[57] = hc_byte_perm_S (w[30], w[31], selector); + w[56] = hc_byte_perm_S (w[29], w[30], selector); + w[55] = hc_byte_perm_S (w[28], w[29], selector); + w[54] = hc_byte_perm_S (w[27], w[28], selector); + w[53] = hc_byte_perm_S (w[26], w[27], selector); + w[52] = hc_byte_perm_S (w[25], w[26], selector); + w[51] = hc_byte_perm_S (w[24], w[25], selector); + w[50] = hc_byte_perm_S (w[23], w[24], selector); + w[49] = hc_byte_perm_S (w[22], w[23], selector); + w[48] = hc_byte_perm_S (w[21], w[22], selector); + w[47] = hc_byte_perm_S (w[20], w[21], selector); + w[46] = hc_byte_perm_S (w[19], w[20], selector); + w[45] = hc_byte_perm_S (w[18], w[19], selector); + w[44] = hc_byte_perm_S (w[17], w[18], selector); + w[43] = hc_byte_perm_S (w[16], w[17], selector); + w[42] = hc_byte_perm_S (w[15], w[16], selector); + w[41] = hc_byte_perm_S (w[14], w[15], selector); + w[40] = hc_byte_perm_S (w[13], w[14], selector); + w[39] = hc_byte_perm_S (w[12], w[13], selector); + w[38] = hc_byte_perm_S (w[11], w[12], selector); + w[37] = hc_byte_perm_S (w[10], w[11], selector); + w[36] = hc_byte_perm_S (w[ 9], w[10], selector); + w[35] = hc_byte_perm_S (w[ 8], w[ 9], selector); + w[34] = hc_byte_perm_S (w[ 7], w[ 8], selector); + w[33] = hc_byte_perm_S (w[ 6], w[ 7], selector); + w[32] = hc_byte_perm_S (w[ 5], w[ 6], selector); + w[31] = hc_byte_perm_S (w[ 4], w[ 5], selector); + w[30] = hc_byte_perm_S (w[ 3], w[ 4], selector); + w[29] = hc_byte_perm_S (w[ 2], w[ 3], selector); + w[28] = hc_byte_perm_S (w[ 1], w[ 2], selector); + w[27] = hc_byte_perm_S (w[ 0], w[ 1], selector); + w[26] = hc_byte_perm_S ( 0, w[ 0], selector); w[25] = 0; w[24] = 0; w[23] = 0; @@ -48560,43 +48560,43 @@ DECLSPEC void switch_buffer_by_offset_1x64_le_S (u32 *w, const u32 offset) break; case 27: - w[63] = __byte_perm_S (w[35], w[36], selector); - w[62] = __byte_perm_S (w[34], w[35], selector); - w[61] = __byte_perm_S (w[33], w[34], selector); - w[60] = __byte_perm_S (w[32], w[33], selector); - w[59] = __byte_perm_S (w[31], w[32], selector); - w[58] = __byte_perm_S (w[30], w[31], selector); - w[57] = __byte_perm_S (w[29], w[30], selector); - w[56] = __byte_perm_S (w[28], w[29], selector); - w[55] = __byte_perm_S (w[27], w[28], selector); - w[54] = __byte_perm_S (w[26], w[27], selector); - w[53] = __byte_perm_S (w[25], w[26], selector); - w[52] = __byte_perm_S (w[24], w[25], selector); - w[51] = __byte_perm_S (w[23], w[24], selector); - w[50] = __byte_perm_S (w[22], w[23], selector); - w[49] = __byte_perm_S (w[21], w[22], selector); - w[48] = __byte_perm_S (w[20], w[21], selector); - w[47] = __byte_perm_S (w[19], w[20], selector); - w[46] = __byte_perm_S (w[18], w[19], selector); - w[45] = __byte_perm_S (w[17], w[18], selector); - w[44] = __byte_perm_S (w[16], w[17], selector); - w[43] = __byte_perm_S (w[15], w[16], selector); - w[42] = __byte_perm_S (w[14], w[15], selector); - w[41] = __byte_perm_S (w[13], w[14], selector); - w[40] = __byte_perm_S (w[12], w[13], selector); - w[39] = __byte_perm_S (w[11], w[12], selector); - w[38] = __byte_perm_S (w[10], w[11], selector); - w[37] = __byte_perm_S (w[ 9], w[10], selector); - w[36] = __byte_perm_S (w[ 8], w[ 9], selector); - w[35] = __byte_perm_S (w[ 7], w[ 8], selector); - w[34] = __byte_perm_S (w[ 6], w[ 7], selector); - w[33] = __byte_perm_S (w[ 5], w[ 6], selector); - w[32] = __byte_perm_S (w[ 4], w[ 5], selector); - w[31] = __byte_perm_S (w[ 3], w[ 4], selector); - w[30] = __byte_perm_S (w[ 2], w[ 3], selector); - w[29] = __byte_perm_S (w[ 1], w[ 2], selector); - w[28] = __byte_perm_S (w[ 0], w[ 1], selector); - w[27] = __byte_perm_S ( 0, w[ 0], selector); + w[63] = hc_byte_perm_S (w[35], w[36], selector); + w[62] = hc_byte_perm_S (w[34], w[35], selector); + w[61] = hc_byte_perm_S (w[33], w[34], selector); + w[60] = hc_byte_perm_S (w[32], w[33], selector); + w[59] = hc_byte_perm_S (w[31], w[32], selector); + w[58] = hc_byte_perm_S (w[30], w[31], selector); + w[57] = hc_byte_perm_S (w[29], w[30], selector); + w[56] = hc_byte_perm_S (w[28], w[29], selector); + w[55] = hc_byte_perm_S (w[27], w[28], selector); + w[54] = hc_byte_perm_S (w[26], w[27], selector); + w[53] = hc_byte_perm_S (w[25], w[26], selector); + w[52] = hc_byte_perm_S (w[24], w[25], selector); + w[51] = hc_byte_perm_S (w[23], w[24], selector); + w[50] = hc_byte_perm_S (w[22], w[23], selector); + w[49] = hc_byte_perm_S (w[21], w[22], selector); + w[48] = hc_byte_perm_S (w[20], w[21], selector); + w[47] = hc_byte_perm_S (w[19], w[20], selector); + w[46] = hc_byte_perm_S (w[18], w[19], selector); + w[45] = hc_byte_perm_S (w[17], w[18], selector); + w[44] = hc_byte_perm_S (w[16], w[17], selector); + w[43] = hc_byte_perm_S (w[15], w[16], selector); + w[42] = hc_byte_perm_S (w[14], w[15], selector); + w[41] = hc_byte_perm_S (w[13], w[14], selector); + w[40] = hc_byte_perm_S (w[12], w[13], selector); + w[39] = hc_byte_perm_S (w[11], w[12], selector); + w[38] = hc_byte_perm_S (w[10], w[11], selector); + w[37] = hc_byte_perm_S (w[ 9], w[10], selector); + w[36] = hc_byte_perm_S (w[ 8], w[ 9], selector); + w[35] = hc_byte_perm_S (w[ 7], w[ 8], selector); + w[34] = hc_byte_perm_S (w[ 6], w[ 7], selector); + w[33] = hc_byte_perm_S (w[ 5], w[ 6], selector); + w[32] = hc_byte_perm_S (w[ 4], w[ 5], selector); + w[31] = hc_byte_perm_S (w[ 3], w[ 4], selector); + w[30] = hc_byte_perm_S (w[ 2], w[ 3], selector); + w[29] = hc_byte_perm_S (w[ 1], w[ 2], selector); + w[28] = hc_byte_perm_S (w[ 0], w[ 1], selector); + w[27] = hc_byte_perm_S ( 0, w[ 0], selector); w[26] = 0; w[25] = 0; w[24] = 0; @@ -48628,42 +48628,42 @@ DECLSPEC void switch_buffer_by_offset_1x64_le_S (u32 *w, const u32 offset) break; case 28: - w[63] = __byte_perm_S (w[34], w[35], selector); - w[62] = __byte_perm_S (w[33], w[34], selector); - w[61] = __byte_perm_S (w[32], w[33], selector); - w[60] = __byte_perm_S (w[31], w[32], selector); - w[59] = __byte_perm_S (w[30], w[31], selector); - w[58] = __byte_perm_S (w[29], w[30], selector); - w[57] = __byte_perm_S (w[28], w[29], selector); - w[56] = __byte_perm_S (w[27], w[28], selector); - w[55] = __byte_perm_S (w[26], w[27], selector); - w[54] = __byte_perm_S (w[25], w[26], selector); - w[53] = __byte_perm_S (w[24], w[25], selector); - w[52] = __byte_perm_S (w[23], w[24], selector); - w[51] = __byte_perm_S (w[22], w[23], selector); - w[50] = __byte_perm_S (w[21], w[22], selector); - w[49] = __byte_perm_S (w[20], w[21], selector); - w[48] = __byte_perm_S (w[19], w[20], selector); - w[47] = __byte_perm_S (w[18], w[19], selector); - w[46] = __byte_perm_S (w[17], w[18], selector); - w[45] = __byte_perm_S (w[16], w[17], selector); - w[44] = __byte_perm_S (w[15], w[16], selector); - w[43] = __byte_perm_S (w[14], w[15], selector); - w[42] = __byte_perm_S (w[13], w[14], selector); - w[41] = __byte_perm_S (w[12], w[13], selector); - w[40] = __byte_perm_S (w[11], w[12], selector); - w[39] = __byte_perm_S (w[10], w[11], selector); - w[38] = __byte_perm_S (w[ 9], w[10], selector); - w[37] = __byte_perm_S (w[ 8], w[ 9], selector); - w[36] = __byte_perm_S (w[ 7], w[ 8], selector); - w[35] = __byte_perm_S (w[ 6], w[ 7], selector); - w[34] = __byte_perm_S (w[ 5], w[ 6], selector); - w[33] = __byte_perm_S (w[ 4], w[ 5], selector); - w[32] = __byte_perm_S (w[ 3], w[ 4], selector); - w[31] = __byte_perm_S (w[ 2], w[ 3], selector); - w[30] = __byte_perm_S (w[ 1], w[ 2], selector); - w[29] = __byte_perm_S (w[ 0], w[ 1], selector); - w[28] = __byte_perm_S ( 0, w[ 0], selector); + w[63] = hc_byte_perm_S (w[34], w[35], selector); + w[62] = hc_byte_perm_S (w[33], w[34], selector); + w[61] = hc_byte_perm_S (w[32], w[33], selector); + w[60] = hc_byte_perm_S (w[31], w[32], selector); + w[59] = hc_byte_perm_S (w[30], w[31], selector); + w[58] = hc_byte_perm_S (w[29], w[30], selector); + w[57] = hc_byte_perm_S (w[28], w[29], selector); + w[56] = hc_byte_perm_S (w[27], w[28], selector); + w[55] = hc_byte_perm_S (w[26], w[27], selector); + w[54] = hc_byte_perm_S (w[25], w[26], selector); + w[53] = hc_byte_perm_S (w[24], w[25], selector); + w[52] = hc_byte_perm_S (w[23], w[24], selector); + w[51] = hc_byte_perm_S (w[22], w[23], selector); + w[50] = hc_byte_perm_S (w[21], w[22], selector); + w[49] = hc_byte_perm_S (w[20], w[21], selector); + w[48] = hc_byte_perm_S (w[19], w[20], selector); + w[47] = hc_byte_perm_S (w[18], w[19], selector); + w[46] = hc_byte_perm_S (w[17], w[18], selector); + w[45] = hc_byte_perm_S (w[16], w[17], selector); + w[44] = hc_byte_perm_S (w[15], w[16], selector); + w[43] = hc_byte_perm_S (w[14], w[15], selector); + w[42] = hc_byte_perm_S (w[13], w[14], selector); + w[41] = hc_byte_perm_S (w[12], w[13], selector); + w[40] = hc_byte_perm_S (w[11], w[12], selector); + w[39] = hc_byte_perm_S (w[10], w[11], selector); + w[38] = hc_byte_perm_S (w[ 9], w[10], selector); + w[37] = hc_byte_perm_S (w[ 8], w[ 9], selector); + w[36] = hc_byte_perm_S (w[ 7], w[ 8], selector); + w[35] = hc_byte_perm_S (w[ 6], w[ 7], selector); + w[34] = hc_byte_perm_S (w[ 5], w[ 6], selector); + w[33] = hc_byte_perm_S (w[ 4], w[ 5], selector); + w[32] = hc_byte_perm_S (w[ 3], w[ 4], selector); + w[31] = hc_byte_perm_S (w[ 2], w[ 3], selector); + w[30] = hc_byte_perm_S (w[ 1], w[ 2], selector); + w[29] = hc_byte_perm_S (w[ 0], w[ 1], selector); + w[28] = hc_byte_perm_S ( 0, w[ 0], selector); w[27] = 0; w[26] = 0; w[25] = 0; @@ -48696,41 +48696,41 @@ DECLSPEC void switch_buffer_by_offset_1x64_le_S (u32 *w, const u32 offset) break; case 29: - w[63] = __byte_perm_S (w[33], w[34], selector); - w[62] = __byte_perm_S (w[32], w[33], selector); - w[61] = __byte_perm_S (w[31], w[32], selector); - w[60] = __byte_perm_S (w[30], w[31], selector); - w[59] = __byte_perm_S (w[29], w[30], selector); - w[58] = __byte_perm_S (w[28], w[29], selector); - w[57] = __byte_perm_S (w[27], w[28], selector); - w[56] = __byte_perm_S (w[26], w[27], selector); - w[55] = __byte_perm_S (w[25], w[26], selector); - w[54] = __byte_perm_S (w[24], w[25], selector); - w[53] = __byte_perm_S (w[23], w[24], selector); - w[52] = __byte_perm_S (w[22], w[23], selector); - w[51] = __byte_perm_S (w[21], w[22], selector); - w[50] = __byte_perm_S (w[20], w[21], selector); - w[49] = __byte_perm_S (w[19], w[20], selector); - w[48] = __byte_perm_S (w[18], w[19], selector); - w[47] = __byte_perm_S (w[17], w[18], selector); - w[46] = __byte_perm_S (w[16], w[17], selector); - w[45] = __byte_perm_S (w[15], w[16], selector); - w[44] = __byte_perm_S (w[14], w[15], selector); - w[43] = __byte_perm_S (w[13], w[14], selector); - w[42] = __byte_perm_S (w[12], w[13], selector); - w[41] = __byte_perm_S (w[11], w[12], selector); - w[40] = __byte_perm_S (w[10], w[11], selector); - w[39] = __byte_perm_S (w[ 9], w[10], selector); - w[38] = __byte_perm_S (w[ 8], w[ 9], selector); - w[37] = __byte_perm_S (w[ 7], w[ 8], selector); - w[36] = __byte_perm_S (w[ 6], w[ 7], selector); - w[35] = __byte_perm_S (w[ 5], w[ 6], selector); - w[34] = __byte_perm_S (w[ 4], w[ 5], selector); - w[33] = __byte_perm_S (w[ 3], w[ 4], selector); - w[32] = __byte_perm_S (w[ 2], w[ 3], selector); - w[31] = __byte_perm_S (w[ 1], w[ 2], selector); - w[30] = __byte_perm_S (w[ 0], w[ 1], selector); - w[29] = __byte_perm_S ( 0, w[ 0], selector); + w[63] = hc_byte_perm_S (w[33], w[34], selector); + w[62] = hc_byte_perm_S (w[32], w[33], selector); + w[61] = hc_byte_perm_S (w[31], w[32], selector); + w[60] = hc_byte_perm_S (w[30], w[31], selector); + w[59] = hc_byte_perm_S (w[29], w[30], selector); + w[58] = hc_byte_perm_S (w[28], w[29], selector); + w[57] = hc_byte_perm_S (w[27], w[28], selector); + w[56] = hc_byte_perm_S (w[26], w[27], selector); + w[55] = hc_byte_perm_S (w[25], w[26], selector); + w[54] = hc_byte_perm_S (w[24], w[25], selector); + w[53] = hc_byte_perm_S (w[23], w[24], selector); + w[52] = hc_byte_perm_S (w[22], w[23], selector); + w[51] = hc_byte_perm_S (w[21], w[22], selector); + w[50] = hc_byte_perm_S (w[20], w[21], selector); + w[49] = hc_byte_perm_S (w[19], w[20], selector); + w[48] = hc_byte_perm_S (w[18], w[19], selector); + w[47] = hc_byte_perm_S (w[17], w[18], selector); + w[46] = hc_byte_perm_S (w[16], w[17], selector); + w[45] = hc_byte_perm_S (w[15], w[16], selector); + w[44] = hc_byte_perm_S (w[14], w[15], selector); + w[43] = hc_byte_perm_S (w[13], w[14], selector); + w[42] = hc_byte_perm_S (w[12], w[13], selector); + w[41] = hc_byte_perm_S (w[11], w[12], selector); + w[40] = hc_byte_perm_S (w[10], w[11], selector); + w[39] = hc_byte_perm_S (w[ 9], w[10], selector); + w[38] = hc_byte_perm_S (w[ 8], w[ 9], selector); + w[37] = hc_byte_perm_S (w[ 7], w[ 8], selector); + w[36] = hc_byte_perm_S (w[ 6], w[ 7], selector); + w[35] = hc_byte_perm_S (w[ 5], w[ 6], selector); + w[34] = hc_byte_perm_S (w[ 4], w[ 5], selector); + w[33] = hc_byte_perm_S (w[ 3], w[ 4], selector); + w[32] = hc_byte_perm_S (w[ 2], w[ 3], selector); + w[31] = hc_byte_perm_S (w[ 1], w[ 2], selector); + w[30] = hc_byte_perm_S (w[ 0], w[ 1], selector); + w[29] = hc_byte_perm_S ( 0, w[ 0], selector); w[28] = 0; w[27] = 0; w[26] = 0; @@ -48764,40 +48764,40 @@ DECLSPEC void switch_buffer_by_offset_1x64_le_S (u32 *w, const u32 offset) break; case 30: - w[63] = __byte_perm_S (w[32], w[33], selector); - w[62] = __byte_perm_S (w[31], w[32], selector); - w[61] = __byte_perm_S (w[30], w[31], selector); - w[60] = __byte_perm_S (w[29], w[30], selector); - w[59] = __byte_perm_S (w[28], w[29], selector); - w[58] = __byte_perm_S (w[27], w[28], selector); - w[57] = __byte_perm_S (w[26], w[27], selector); - w[56] = __byte_perm_S (w[25], w[26], selector); - w[55] = __byte_perm_S (w[24], w[25], selector); - w[54] = __byte_perm_S (w[23], w[24], selector); - w[53] = __byte_perm_S (w[22], w[23], selector); - w[52] = __byte_perm_S (w[21], w[22], selector); - w[51] = __byte_perm_S (w[20], w[21], selector); - w[50] = __byte_perm_S (w[19], w[20], selector); - w[49] = __byte_perm_S (w[18], w[19], selector); - w[48] = __byte_perm_S (w[17], w[18], selector); - w[47] = __byte_perm_S (w[16], w[17], selector); - w[46] = __byte_perm_S (w[15], w[16], selector); - w[45] = __byte_perm_S (w[14], w[15], selector); - w[44] = __byte_perm_S (w[13], w[14], selector); - w[43] = __byte_perm_S (w[12], w[13], selector); - w[42] = __byte_perm_S (w[11], w[12], selector); - w[41] = __byte_perm_S (w[10], w[11], selector); - w[40] = __byte_perm_S (w[ 9], w[10], selector); - w[39] = __byte_perm_S (w[ 8], w[ 9], selector); - w[38] = __byte_perm_S (w[ 7], w[ 8], selector); - w[37] = __byte_perm_S (w[ 6], w[ 7], selector); - w[36] = __byte_perm_S (w[ 5], w[ 6], selector); - w[35] = __byte_perm_S (w[ 4], w[ 5], selector); - w[34] = __byte_perm_S (w[ 3], w[ 4], selector); - w[33] = __byte_perm_S (w[ 2], w[ 3], selector); - w[32] = __byte_perm_S (w[ 1], w[ 2], selector); - w[31] = __byte_perm_S (w[ 0], w[ 1], selector); - w[30] = __byte_perm_S ( 0, w[ 0], selector); + w[63] = hc_byte_perm_S (w[32], w[33], selector); + w[62] = hc_byte_perm_S (w[31], w[32], selector); + w[61] = hc_byte_perm_S (w[30], w[31], selector); + w[60] = hc_byte_perm_S (w[29], w[30], selector); + w[59] = hc_byte_perm_S (w[28], w[29], selector); + w[58] = hc_byte_perm_S (w[27], w[28], selector); + w[57] = hc_byte_perm_S (w[26], w[27], selector); + w[56] = hc_byte_perm_S (w[25], w[26], selector); + w[55] = hc_byte_perm_S (w[24], w[25], selector); + w[54] = hc_byte_perm_S (w[23], w[24], selector); + w[53] = hc_byte_perm_S (w[22], w[23], selector); + w[52] = hc_byte_perm_S (w[21], w[22], selector); + w[51] = hc_byte_perm_S (w[20], w[21], selector); + w[50] = hc_byte_perm_S (w[19], w[20], selector); + w[49] = hc_byte_perm_S (w[18], w[19], selector); + w[48] = hc_byte_perm_S (w[17], w[18], selector); + w[47] = hc_byte_perm_S (w[16], w[17], selector); + w[46] = hc_byte_perm_S (w[15], w[16], selector); + w[45] = hc_byte_perm_S (w[14], w[15], selector); + w[44] = hc_byte_perm_S (w[13], w[14], selector); + w[43] = hc_byte_perm_S (w[12], w[13], selector); + w[42] = hc_byte_perm_S (w[11], w[12], selector); + w[41] = hc_byte_perm_S (w[10], w[11], selector); + w[40] = hc_byte_perm_S (w[ 9], w[10], selector); + w[39] = hc_byte_perm_S (w[ 8], w[ 9], selector); + w[38] = hc_byte_perm_S (w[ 7], w[ 8], selector); + w[37] = hc_byte_perm_S (w[ 6], w[ 7], selector); + w[36] = hc_byte_perm_S (w[ 5], w[ 6], selector); + w[35] = hc_byte_perm_S (w[ 4], w[ 5], selector); + w[34] = hc_byte_perm_S (w[ 3], w[ 4], selector); + w[33] = hc_byte_perm_S (w[ 2], w[ 3], selector); + w[32] = hc_byte_perm_S (w[ 1], w[ 2], selector); + w[31] = hc_byte_perm_S (w[ 0], w[ 1], selector); + w[30] = hc_byte_perm_S ( 0, w[ 0], selector); w[29] = 0; w[28] = 0; w[27] = 0; @@ -48832,39 +48832,39 @@ DECLSPEC void switch_buffer_by_offset_1x64_le_S (u32 *w, const u32 offset) break; case 31: - w[63] = __byte_perm_S (w[31], w[32], selector); - w[62] = __byte_perm_S (w[30], w[31], selector); - w[61] = __byte_perm_S (w[29], w[30], selector); - w[60] = __byte_perm_S (w[28], w[29], selector); - w[59] = __byte_perm_S (w[27], w[28], selector); - w[58] = __byte_perm_S (w[26], w[27], selector); - w[57] = __byte_perm_S (w[25], w[26], selector); - w[56] = __byte_perm_S (w[24], w[25], selector); - w[55] = __byte_perm_S (w[23], w[24], selector); - w[54] = __byte_perm_S (w[22], w[23], selector); - w[53] = __byte_perm_S (w[21], w[22], selector); - w[52] = __byte_perm_S (w[20], w[21], selector); - w[51] = __byte_perm_S (w[19], w[20], selector); - w[50] = __byte_perm_S (w[18], w[19], selector); - w[49] = __byte_perm_S (w[17], w[18], selector); - w[48] = __byte_perm_S (w[16], w[17], selector); - w[47] = __byte_perm_S (w[15], w[16], selector); - w[46] = __byte_perm_S (w[14], w[15], selector); - w[45] = __byte_perm_S (w[13], w[14], selector); - w[44] = __byte_perm_S (w[12], w[13], selector); - w[43] = __byte_perm_S (w[11], w[12], selector); - w[42] = __byte_perm_S (w[10], w[11], selector); - w[41] = __byte_perm_S (w[ 9], w[10], selector); - w[40] = __byte_perm_S (w[ 8], w[ 9], selector); - w[39] = __byte_perm_S (w[ 7], w[ 8], selector); - w[38] = __byte_perm_S (w[ 6], w[ 7], selector); - w[37] = __byte_perm_S (w[ 5], w[ 6], selector); - w[36] = __byte_perm_S (w[ 4], w[ 5], selector); - w[35] = __byte_perm_S (w[ 3], w[ 4], selector); - w[34] = __byte_perm_S (w[ 2], w[ 3], selector); - w[33] = __byte_perm_S (w[ 1], w[ 2], selector); - w[32] = __byte_perm_S (w[ 0], w[ 1], selector); - w[31] = __byte_perm_S ( 0, w[ 0], selector); + w[63] = hc_byte_perm_S (w[31], w[32], selector); + w[62] = hc_byte_perm_S (w[30], w[31], selector); + w[61] = hc_byte_perm_S (w[29], w[30], selector); + w[60] = hc_byte_perm_S (w[28], w[29], selector); + w[59] = hc_byte_perm_S (w[27], w[28], selector); + w[58] = hc_byte_perm_S (w[26], w[27], selector); + w[57] = hc_byte_perm_S (w[25], w[26], selector); + w[56] = hc_byte_perm_S (w[24], w[25], selector); + w[55] = hc_byte_perm_S (w[23], w[24], selector); + w[54] = hc_byte_perm_S (w[22], w[23], selector); + w[53] = hc_byte_perm_S (w[21], w[22], selector); + w[52] = hc_byte_perm_S (w[20], w[21], selector); + w[51] = hc_byte_perm_S (w[19], w[20], selector); + w[50] = hc_byte_perm_S (w[18], w[19], selector); + w[49] = hc_byte_perm_S (w[17], w[18], selector); + w[48] = hc_byte_perm_S (w[16], w[17], selector); + w[47] = hc_byte_perm_S (w[15], w[16], selector); + w[46] = hc_byte_perm_S (w[14], w[15], selector); + w[45] = hc_byte_perm_S (w[13], w[14], selector); + w[44] = hc_byte_perm_S (w[12], w[13], selector); + w[43] = hc_byte_perm_S (w[11], w[12], selector); + w[42] = hc_byte_perm_S (w[10], w[11], selector); + w[41] = hc_byte_perm_S (w[ 9], w[10], selector); + w[40] = hc_byte_perm_S (w[ 8], w[ 9], selector); + w[39] = hc_byte_perm_S (w[ 7], w[ 8], selector); + w[38] = hc_byte_perm_S (w[ 6], w[ 7], selector); + w[37] = hc_byte_perm_S (w[ 5], w[ 6], selector); + w[36] = hc_byte_perm_S (w[ 4], w[ 5], selector); + w[35] = hc_byte_perm_S (w[ 3], w[ 4], selector); + w[34] = hc_byte_perm_S (w[ 2], w[ 3], selector); + w[33] = hc_byte_perm_S (w[ 1], w[ 2], selector); + w[32] = hc_byte_perm_S (w[ 0], w[ 1], selector); + w[31] = hc_byte_perm_S ( 0, w[ 0], selector); w[30] = 0; w[29] = 0; w[28] = 0; @@ -48900,38 +48900,38 @@ DECLSPEC void switch_buffer_by_offset_1x64_le_S (u32 *w, const u32 offset) break; case 32: - w[63] = __byte_perm_S (w[30], w[31], selector); - w[62] = __byte_perm_S (w[29], w[30], selector); - w[61] = __byte_perm_S (w[28], w[29], selector); - w[60] = __byte_perm_S (w[27], w[28], selector); - w[59] = __byte_perm_S (w[26], w[27], selector); - w[58] = __byte_perm_S (w[25], w[26], selector); - w[57] = __byte_perm_S (w[24], w[25], selector); - w[56] = __byte_perm_S (w[23], w[24], selector); - w[55] = __byte_perm_S (w[22], w[23], selector); - w[54] = __byte_perm_S (w[21], w[22], selector); - w[53] = __byte_perm_S (w[20], w[21], selector); - w[52] = __byte_perm_S (w[19], w[20], selector); - w[51] = __byte_perm_S (w[18], w[19], selector); - w[50] = __byte_perm_S (w[17], w[18], selector); - w[49] = __byte_perm_S (w[16], w[17], selector); - w[48] = __byte_perm_S (w[15], w[16], selector); - w[47] = __byte_perm_S (w[14], w[15], selector); - w[46] = __byte_perm_S (w[13], w[14], selector); - w[45] = __byte_perm_S (w[12], w[13], selector); - w[44] = __byte_perm_S (w[11], w[12], selector); - w[43] = __byte_perm_S (w[10], w[11], selector); - w[42] = __byte_perm_S (w[ 9], w[10], selector); - w[41] = __byte_perm_S (w[ 8], w[ 9], selector); - w[40] = __byte_perm_S (w[ 7], w[ 8], selector); - w[39] = __byte_perm_S (w[ 6], w[ 7], selector); - w[38] = __byte_perm_S (w[ 5], w[ 6], selector); - w[37] = __byte_perm_S (w[ 4], w[ 5], selector); - w[36] = __byte_perm_S (w[ 3], w[ 4], selector); - w[35] = __byte_perm_S (w[ 2], w[ 3], selector); - w[34] = __byte_perm_S (w[ 1], w[ 2], selector); - w[33] = __byte_perm_S (w[ 0], w[ 1], selector); - w[32] = __byte_perm_S ( 0, w[ 0], selector); + w[63] = hc_byte_perm_S (w[30], w[31], selector); + w[62] = hc_byte_perm_S (w[29], w[30], selector); + w[61] = hc_byte_perm_S (w[28], w[29], selector); + w[60] = hc_byte_perm_S (w[27], w[28], selector); + w[59] = hc_byte_perm_S (w[26], w[27], selector); + w[58] = hc_byte_perm_S (w[25], w[26], selector); + w[57] = hc_byte_perm_S (w[24], w[25], selector); + w[56] = hc_byte_perm_S (w[23], w[24], selector); + w[55] = hc_byte_perm_S (w[22], w[23], selector); + w[54] = hc_byte_perm_S (w[21], w[22], selector); + w[53] = hc_byte_perm_S (w[20], w[21], selector); + w[52] = hc_byte_perm_S (w[19], w[20], selector); + w[51] = hc_byte_perm_S (w[18], w[19], selector); + w[50] = hc_byte_perm_S (w[17], w[18], selector); + w[49] = hc_byte_perm_S (w[16], w[17], selector); + w[48] = hc_byte_perm_S (w[15], w[16], selector); + w[47] = hc_byte_perm_S (w[14], w[15], selector); + w[46] = hc_byte_perm_S (w[13], w[14], selector); + w[45] = hc_byte_perm_S (w[12], w[13], selector); + w[44] = hc_byte_perm_S (w[11], w[12], selector); + w[43] = hc_byte_perm_S (w[10], w[11], selector); + w[42] = hc_byte_perm_S (w[ 9], w[10], selector); + w[41] = hc_byte_perm_S (w[ 8], w[ 9], selector); + w[40] = hc_byte_perm_S (w[ 7], w[ 8], selector); + w[39] = hc_byte_perm_S (w[ 6], w[ 7], selector); + w[38] = hc_byte_perm_S (w[ 5], w[ 6], selector); + w[37] = hc_byte_perm_S (w[ 4], w[ 5], selector); + w[36] = hc_byte_perm_S (w[ 3], w[ 4], selector); + w[35] = hc_byte_perm_S (w[ 2], w[ 3], selector); + w[34] = hc_byte_perm_S (w[ 1], w[ 2], selector); + w[33] = hc_byte_perm_S (w[ 0], w[ 1], selector); + w[32] = hc_byte_perm_S ( 0, w[ 0], selector); w[31] = 0; w[30] = 0; w[29] = 0; @@ -48968,37 +48968,37 @@ DECLSPEC void switch_buffer_by_offset_1x64_le_S (u32 *w, const u32 offset) break; case 33: - w[63] = __byte_perm_S (w[29], w[30], selector); - w[62] = __byte_perm_S (w[28], w[29], selector); - w[61] = __byte_perm_S (w[27], w[28], selector); - w[60] = __byte_perm_S (w[26], w[27], selector); - w[59] = __byte_perm_S (w[25], w[26], selector); - w[58] = __byte_perm_S (w[24], w[25], selector); - w[57] = __byte_perm_S (w[23], w[24], selector); - w[56] = __byte_perm_S (w[22], w[23], selector); - w[55] = __byte_perm_S (w[21], w[22], selector); - w[54] = __byte_perm_S (w[20], w[21], selector); - w[53] = __byte_perm_S (w[19], w[20], selector); - w[52] = __byte_perm_S (w[18], w[19], selector); - w[51] = __byte_perm_S (w[17], w[18], selector); - w[50] = __byte_perm_S (w[16], w[17], selector); - w[49] = __byte_perm_S (w[15], w[16], selector); - w[48] = __byte_perm_S (w[14], w[15], selector); - w[47] = __byte_perm_S (w[13], w[14], selector); - w[46] = __byte_perm_S (w[12], w[13], selector); - w[45] = __byte_perm_S (w[11], w[12], selector); - w[44] = __byte_perm_S (w[10], w[11], selector); - w[43] = __byte_perm_S (w[ 9], w[10], selector); - w[42] = __byte_perm_S (w[ 8], w[ 9], selector); - w[41] = __byte_perm_S (w[ 7], w[ 8], selector); - w[40] = __byte_perm_S (w[ 6], w[ 7], selector); - w[39] = __byte_perm_S (w[ 5], w[ 6], selector); - w[38] = __byte_perm_S (w[ 4], w[ 5], selector); - w[37] = __byte_perm_S (w[ 3], w[ 4], selector); - w[36] = __byte_perm_S (w[ 2], w[ 3], selector); - w[35] = __byte_perm_S (w[ 1], w[ 2], selector); - w[34] = __byte_perm_S (w[ 0], w[ 1], selector); - w[33] = __byte_perm_S ( 0, w[ 0], selector); + w[63] = hc_byte_perm_S (w[29], w[30], selector); + w[62] = hc_byte_perm_S (w[28], w[29], selector); + w[61] = hc_byte_perm_S (w[27], w[28], selector); + w[60] = hc_byte_perm_S (w[26], w[27], selector); + w[59] = hc_byte_perm_S (w[25], w[26], selector); + w[58] = hc_byte_perm_S (w[24], w[25], selector); + w[57] = hc_byte_perm_S (w[23], w[24], selector); + w[56] = hc_byte_perm_S (w[22], w[23], selector); + w[55] = hc_byte_perm_S (w[21], w[22], selector); + w[54] = hc_byte_perm_S (w[20], w[21], selector); + w[53] = hc_byte_perm_S (w[19], w[20], selector); + w[52] = hc_byte_perm_S (w[18], w[19], selector); + w[51] = hc_byte_perm_S (w[17], w[18], selector); + w[50] = hc_byte_perm_S (w[16], w[17], selector); + w[49] = hc_byte_perm_S (w[15], w[16], selector); + w[48] = hc_byte_perm_S (w[14], w[15], selector); + w[47] = hc_byte_perm_S (w[13], w[14], selector); + w[46] = hc_byte_perm_S (w[12], w[13], selector); + w[45] = hc_byte_perm_S (w[11], w[12], selector); + w[44] = hc_byte_perm_S (w[10], w[11], selector); + w[43] = hc_byte_perm_S (w[ 9], w[10], selector); + w[42] = hc_byte_perm_S (w[ 8], w[ 9], selector); + w[41] = hc_byte_perm_S (w[ 7], w[ 8], selector); + w[40] = hc_byte_perm_S (w[ 6], w[ 7], selector); + w[39] = hc_byte_perm_S (w[ 5], w[ 6], selector); + w[38] = hc_byte_perm_S (w[ 4], w[ 5], selector); + w[37] = hc_byte_perm_S (w[ 3], w[ 4], selector); + w[36] = hc_byte_perm_S (w[ 2], w[ 3], selector); + w[35] = hc_byte_perm_S (w[ 1], w[ 2], selector); + w[34] = hc_byte_perm_S (w[ 0], w[ 1], selector); + w[33] = hc_byte_perm_S ( 0, w[ 0], selector); w[32] = 0; w[31] = 0; w[30] = 0; @@ -49036,36 +49036,36 @@ DECLSPEC void switch_buffer_by_offset_1x64_le_S (u32 *w, const u32 offset) break; case 34: - w[63] = __byte_perm_S (w[28], w[29], selector); - w[62] = __byte_perm_S (w[27], w[28], selector); - w[61] = __byte_perm_S (w[26], w[27], selector); - w[60] = __byte_perm_S (w[25], w[26], selector); - w[59] = __byte_perm_S (w[24], w[25], selector); - w[58] = __byte_perm_S (w[23], w[24], selector); - w[57] = __byte_perm_S (w[22], w[23], selector); - w[56] = __byte_perm_S (w[21], w[22], selector); - w[55] = __byte_perm_S (w[20], w[21], selector); - w[54] = __byte_perm_S (w[19], w[20], selector); - w[53] = __byte_perm_S (w[18], w[19], selector); - w[52] = __byte_perm_S (w[17], w[18], selector); - w[51] = __byte_perm_S (w[16], w[17], selector); - w[50] = __byte_perm_S (w[15], w[16], selector); - w[49] = __byte_perm_S (w[14], w[15], selector); - w[48] = __byte_perm_S (w[13], w[14], selector); - w[47] = __byte_perm_S (w[12], w[13], selector); - w[46] = __byte_perm_S (w[11], w[12], selector); - w[45] = __byte_perm_S (w[10], w[11], selector); - w[44] = __byte_perm_S (w[ 9], w[10], selector); - w[43] = __byte_perm_S (w[ 8], w[ 9], selector); - w[42] = __byte_perm_S (w[ 7], w[ 8], selector); - w[41] = __byte_perm_S (w[ 6], w[ 7], selector); - w[40] = __byte_perm_S (w[ 5], w[ 6], selector); - w[39] = __byte_perm_S (w[ 4], w[ 5], selector); - w[38] = __byte_perm_S (w[ 3], w[ 4], selector); - w[37] = __byte_perm_S (w[ 2], w[ 3], selector); - w[36] = __byte_perm_S (w[ 1], w[ 2], selector); - w[35] = __byte_perm_S (w[ 0], w[ 1], selector); - w[34] = __byte_perm_S ( 0, w[ 0], selector); + w[63] = hc_byte_perm_S (w[28], w[29], selector); + w[62] = hc_byte_perm_S (w[27], w[28], selector); + w[61] = hc_byte_perm_S (w[26], w[27], selector); + w[60] = hc_byte_perm_S (w[25], w[26], selector); + w[59] = hc_byte_perm_S (w[24], w[25], selector); + w[58] = hc_byte_perm_S (w[23], w[24], selector); + w[57] = hc_byte_perm_S (w[22], w[23], selector); + w[56] = hc_byte_perm_S (w[21], w[22], selector); + w[55] = hc_byte_perm_S (w[20], w[21], selector); + w[54] = hc_byte_perm_S (w[19], w[20], selector); + w[53] = hc_byte_perm_S (w[18], w[19], selector); + w[52] = hc_byte_perm_S (w[17], w[18], selector); + w[51] = hc_byte_perm_S (w[16], w[17], selector); + w[50] = hc_byte_perm_S (w[15], w[16], selector); + w[49] = hc_byte_perm_S (w[14], w[15], selector); + w[48] = hc_byte_perm_S (w[13], w[14], selector); + w[47] = hc_byte_perm_S (w[12], w[13], selector); + w[46] = hc_byte_perm_S (w[11], w[12], selector); + w[45] = hc_byte_perm_S (w[10], w[11], selector); + w[44] = hc_byte_perm_S (w[ 9], w[10], selector); + w[43] = hc_byte_perm_S (w[ 8], w[ 9], selector); + w[42] = hc_byte_perm_S (w[ 7], w[ 8], selector); + w[41] = hc_byte_perm_S (w[ 6], w[ 7], selector); + w[40] = hc_byte_perm_S (w[ 5], w[ 6], selector); + w[39] = hc_byte_perm_S (w[ 4], w[ 5], selector); + w[38] = hc_byte_perm_S (w[ 3], w[ 4], selector); + w[37] = hc_byte_perm_S (w[ 2], w[ 3], selector); + w[36] = hc_byte_perm_S (w[ 1], w[ 2], selector); + w[35] = hc_byte_perm_S (w[ 0], w[ 1], selector); + w[34] = hc_byte_perm_S ( 0, w[ 0], selector); w[33] = 0; w[32] = 0; w[31] = 0; @@ -49104,35 +49104,35 @@ DECLSPEC void switch_buffer_by_offset_1x64_le_S (u32 *w, const u32 offset) break; case 35: - w[63] = __byte_perm_S (w[27], w[28], selector); - w[62] = __byte_perm_S (w[26], w[27], selector); - w[61] = __byte_perm_S (w[25], w[26], selector); - w[60] = __byte_perm_S (w[24], w[25], selector); - w[59] = __byte_perm_S (w[23], w[24], selector); - w[58] = __byte_perm_S (w[22], w[23], selector); - w[57] = __byte_perm_S (w[21], w[22], selector); - w[56] = __byte_perm_S (w[20], w[21], selector); - w[55] = __byte_perm_S (w[19], w[20], selector); - w[54] = __byte_perm_S (w[18], w[19], selector); - w[53] = __byte_perm_S (w[17], w[18], selector); - w[52] = __byte_perm_S (w[16], w[17], selector); - w[51] = __byte_perm_S (w[15], w[16], selector); - w[50] = __byte_perm_S (w[14], w[15], selector); - w[49] = __byte_perm_S (w[13], w[14], selector); - w[48] = __byte_perm_S (w[12], w[13], selector); - w[47] = __byte_perm_S (w[11], w[12], selector); - w[46] = __byte_perm_S (w[10], w[11], selector); - w[45] = __byte_perm_S (w[ 9], w[10], selector); - w[44] = __byte_perm_S (w[ 8], w[ 9], selector); - w[43] = __byte_perm_S (w[ 7], w[ 8], selector); - w[42] = __byte_perm_S (w[ 6], w[ 7], selector); - w[41] = __byte_perm_S (w[ 5], w[ 6], selector); - w[40] = __byte_perm_S (w[ 4], w[ 5], selector); - w[39] = __byte_perm_S (w[ 3], w[ 4], selector); - w[38] = __byte_perm_S (w[ 2], w[ 3], selector); - w[37] = __byte_perm_S (w[ 1], w[ 2], selector); - w[36] = __byte_perm_S (w[ 0], w[ 1], selector); - w[35] = __byte_perm_S ( 0, w[ 0], selector); + w[63] = hc_byte_perm_S (w[27], w[28], selector); + w[62] = hc_byte_perm_S (w[26], w[27], selector); + w[61] = hc_byte_perm_S (w[25], w[26], selector); + w[60] = hc_byte_perm_S (w[24], w[25], selector); + w[59] = hc_byte_perm_S (w[23], w[24], selector); + w[58] = hc_byte_perm_S (w[22], w[23], selector); + w[57] = hc_byte_perm_S (w[21], w[22], selector); + w[56] = hc_byte_perm_S (w[20], w[21], selector); + w[55] = hc_byte_perm_S (w[19], w[20], selector); + w[54] = hc_byte_perm_S (w[18], w[19], selector); + w[53] = hc_byte_perm_S (w[17], w[18], selector); + w[52] = hc_byte_perm_S (w[16], w[17], selector); + w[51] = hc_byte_perm_S (w[15], w[16], selector); + w[50] = hc_byte_perm_S (w[14], w[15], selector); + w[49] = hc_byte_perm_S (w[13], w[14], selector); + w[48] = hc_byte_perm_S (w[12], w[13], selector); + w[47] = hc_byte_perm_S (w[11], w[12], selector); + w[46] = hc_byte_perm_S (w[10], w[11], selector); + w[45] = hc_byte_perm_S (w[ 9], w[10], selector); + w[44] = hc_byte_perm_S (w[ 8], w[ 9], selector); + w[43] = hc_byte_perm_S (w[ 7], w[ 8], selector); + w[42] = hc_byte_perm_S (w[ 6], w[ 7], selector); + w[41] = hc_byte_perm_S (w[ 5], w[ 6], selector); + w[40] = hc_byte_perm_S (w[ 4], w[ 5], selector); + w[39] = hc_byte_perm_S (w[ 3], w[ 4], selector); + w[38] = hc_byte_perm_S (w[ 2], w[ 3], selector); + w[37] = hc_byte_perm_S (w[ 1], w[ 2], selector); + w[36] = hc_byte_perm_S (w[ 0], w[ 1], selector); + w[35] = hc_byte_perm_S ( 0, w[ 0], selector); w[34] = 0; w[33] = 0; w[32] = 0; @@ -49172,34 +49172,34 @@ DECLSPEC void switch_buffer_by_offset_1x64_le_S (u32 *w, const u32 offset) break; case 36: - w[63] = __byte_perm_S (w[26], w[27], selector); - w[62] = __byte_perm_S (w[25], w[26], selector); - w[61] = __byte_perm_S (w[24], w[25], selector); - w[60] = __byte_perm_S (w[23], w[24], selector); - w[59] = __byte_perm_S (w[22], w[23], selector); - w[58] = __byte_perm_S (w[21], w[22], selector); - w[57] = __byte_perm_S (w[20], w[21], selector); - w[56] = __byte_perm_S (w[19], w[20], selector); - w[55] = __byte_perm_S (w[18], w[19], selector); - w[54] = __byte_perm_S (w[17], w[18], selector); - w[53] = __byte_perm_S (w[16], w[17], selector); - w[52] = __byte_perm_S (w[15], w[16], selector); - w[51] = __byte_perm_S (w[14], w[15], selector); - w[50] = __byte_perm_S (w[13], w[14], selector); - w[49] = __byte_perm_S (w[12], w[13], selector); - w[48] = __byte_perm_S (w[11], w[12], selector); - w[47] = __byte_perm_S (w[10], w[11], selector); - w[46] = __byte_perm_S (w[ 9], w[10], selector); - w[45] = __byte_perm_S (w[ 8], w[ 9], selector); - w[44] = __byte_perm_S (w[ 7], w[ 8], selector); - w[43] = __byte_perm_S (w[ 6], w[ 7], selector); - w[42] = __byte_perm_S (w[ 5], w[ 6], selector); - w[41] = __byte_perm_S (w[ 4], w[ 5], selector); - w[40] = __byte_perm_S (w[ 3], w[ 4], selector); - w[39] = __byte_perm_S (w[ 2], w[ 3], selector); - w[38] = __byte_perm_S (w[ 1], w[ 2], selector); - w[37] = __byte_perm_S (w[ 0], w[ 1], selector); - w[36] = __byte_perm_S ( 0, w[ 0], selector); + w[63] = hc_byte_perm_S (w[26], w[27], selector); + w[62] = hc_byte_perm_S (w[25], w[26], selector); + w[61] = hc_byte_perm_S (w[24], w[25], selector); + w[60] = hc_byte_perm_S (w[23], w[24], selector); + w[59] = hc_byte_perm_S (w[22], w[23], selector); + w[58] = hc_byte_perm_S (w[21], w[22], selector); + w[57] = hc_byte_perm_S (w[20], w[21], selector); + w[56] = hc_byte_perm_S (w[19], w[20], selector); + w[55] = hc_byte_perm_S (w[18], w[19], selector); + w[54] = hc_byte_perm_S (w[17], w[18], selector); + w[53] = hc_byte_perm_S (w[16], w[17], selector); + w[52] = hc_byte_perm_S (w[15], w[16], selector); + w[51] = hc_byte_perm_S (w[14], w[15], selector); + w[50] = hc_byte_perm_S (w[13], w[14], selector); + w[49] = hc_byte_perm_S (w[12], w[13], selector); + w[48] = hc_byte_perm_S (w[11], w[12], selector); + w[47] = hc_byte_perm_S (w[10], w[11], selector); + w[46] = hc_byte_perm_S (w[ 9], w[10], selector); + w[45] = hc_byte_perm_S (w[ 8], w[ 9], selector); + w[44] = hc_byte_perm_S (w[ 7], w[ 8], selector); + w[43] = hc_byte_perm_S (w[ 6], w[ 7], selector); + w[42] = hc_byte_perm_S (w[ 5], w[ 6], selector); + w[41] = hc_byte_perm_S (w[ 4], w[ 5], selector); + w[40] = hc_byte_perm_S (w[ 3], w[ 4], selector); + w[39] = hc_byte_perm_S (w[ 2], w[ 3], selector); + w[38] = hc_byte_perm_S (w[ 1], w[ 2], selector); + w[37] = hc_byte_perm_S (w[ 0], w[ 1], selector); + w[36] = hc_byte_perm_S ( 0, w[ 0], selector); w[35] = 0; w[34] = 0; w[33] = 0; @@ -49240,33 +49240,33 @@ DECLSPEC void switch_buffer_by_offset_1x64_le_S (u32 *w, const u32 offset) break; case 37: - w[63] = __byte_perm_S (w[25], w[26], selector); - w[62] = __byte_perm_S (w[24], w[25], selector); - w[61] = __byte_perm_S (w[23], w[24], selector); - w[60] = __byte_perm_S (w[22], w[23], selector); - w[59] = __byte_perm_S (w[21], w[22], selector); - w[58] = __byte_perm_S (w[20], w[21], selector); - w[57] = __byte_perm_S (w[19], w[20], selector); - w[56] = __byte_perm_S (w[18], w[19], selector); - w[55] = __byte_perm_S (w[17], w[18], selector); - w[54] = __byte_perm_S (w[16], w[17], selector); - w[53] = __byte_perm_S (w[15], w[16], selector); - w[52] = __byte_perm_S (w[14], w[15], selector); - w[51] = __byte_perm_S (w[13], w[14], selector); - w[50] = __byte_perm_S (w[12], w[13], selector); - w[49] = __byte_perm_S (w[11], w[12], selector); - w[48] = __byte_perm_S (w[10], w[11], selector); - w[47] = __byte_perm_S (w[ 9], w[10], selector); - w[46] = __byte_perm_S (w[ 8], w[ 9], selector); - w[45] = __byte_perm_S (w[ 7], w[ 8], selector); - w[44] = __byte_perm_S (w[ 6], w[ 7], selector); - w[43] = __byte_perm_S (w[ 5], w[ 6], selector); - w[42] = __byte_perm_S (w[ 4], w[ 5], selector); - w[41] = __byte_perm_S (w[ 3], w[ 4], selector); - w[40] = __byte_perm_S (w[ 2], w[ 3], selector); - w[39] = __byte_perm_S (w[ 1], w[ 2], selector); - w[38] = __byte_perm_S (w[ 0], w[ 1], selector); - w[37] = __byte_perm_S ( 0, w[ 0], selector); + w[63] = hc_byte_perm_S (w[25], w[26], selector); + w[62] = hc_byte_perm_S (w[24], w[25], selector); + w[61] = hc_byte_perm_S (w[23], w[24], selector); + w[60] = hc_byte_perm_S (w[22], w[23], selector); + w[59] = hc_byte_perm_S (w[21], w[22], selector); + w[58] = hc_byte_perm_S (w[20], w[21], selector); + w[57] = hc_byte_perm_S (w[19], w[20], selector); + w[56] = hc_byte_perm_S (w[18], w[19], selector); + w[55] = hc_byte_perm_S (w[17], w[18], selector); + w[54] = hc_byte_perm_S (w[16], w[17], selector); + w[53] = hc_byte_perm_S (w[15], w[16], selector); + w[52] = hc_byte_perm_S (w[14], w[15], selector); + w[51] = hc_byte_perm_S (w[13], w[14], selector); + w[50] = hc_byte_perm_S (w[12], w[13], selector); + w[49] = hc_byte_perm_S (w[11], w[12], selector); + w[48] = hc_byte_perm_S (w[10], w[11], selector); + w[47] = hc_byte_perm_S (w[ 9], w[10], selector); + w[46] = hc_byte_perm_S (w[ 8], w[ 9], selector); + w[45] = hc_byte_perm_S (w[ 7], w[ 8], selector); + w[44] = hc_byte_perm_S (w[ 6], w[ 7], selector); + w[43] = hc_byte_perm_S (w[ 5], w[ 6], selector); + w[42] = hc_byte_perm_S (w[ 4], w[ 5], selector); + w[41] = hc_byte_perm_S (w[ 3], w[ 4], selector); + w[40] = hc_byte_perm_S (w[ 2], w[ 3], selector); + w[39] = hc_byte_perm_S (w[ 1], w[ 2], selector); + w[38] = hc_byte_perm_S (w[ 0], w[ 1], selector); + w[37] = hc_byte_perm_S ( 0, w[ 0], selector); w[36] = 0; w[35] = 0; w[34] = 0; @@ -49308,32 +49308,32 @@ DECLSPEC void switch_buffer_by_offset_1x64_le_S (u32 *w, const u32 offset) break; case 38: - w[63] = __byte_perm_S (w[24], w[25], selector); - w[62] = __byte_perm_S (w[23], w[24], selector); - w[61] = __byte_perm_S (w[22], w[23], selector); - w[60] = __byte_perm_S (w[21], w[22], selector); - w[59] = __byte_perm_S (w[20], w[21], selector); - w[58] = __byte_perm_S (w[19], w[20], selector); - w[57] = __byte_perm_S (w[18], w[19], selector); - w[56] = __byte_perm_S (w[17], w[18], selector); - w[55] = __byte_perm_S (w[16], w[17], selector); - w[54] = __byte_perm_S (w[15], w[16], selector); - w[53] = __byte_perm_S (w[14], w[15], selector); - w[52] = __byte_perm_S (w[13], w[14], selector); - w[51] = __byte_perm_S (w[12], w[13], selector); - w[50] = __byte_perm_S (w[11], w[12], selector); - w[49] = __byte_perm_S (w[10], w[11], selector); - w[48] = __byte_perm_S (w[ 9], w[10], selector); - w[47] = __byte_perm_S (w[ 8], w[ 9], selector); - w[46] = __byte_perm_S (w[ 7], w[ 8], selector); - w[45] = __byte_perm_S (w[ 6], w[ 7], selector); - w[44] = __byte_perm_S (w[ 5], w[ 6], selector); - w[43] = __byte_perm_S (w[ 4], w[ 5], selector); - w[42] = __byte_perm_S (w[ 3], w[ 4], selector); - w[41] = __byte_perm_S (w[ 2], w[ 3], selector); - w[40] = __byte_perm_S (w[ 1], w[ 2], selector); - w[39] = __byte_perm_S (w[ 0], w[ 1], selector); - w[38] = __byte_perm_S ( 0, w[ 0], selector); + w[63] = hc_byte_perm_S (w[24], w[25], selector); + w[62] = hc_byte_perm_S (w[23], w[24], selector); + w[61] = hc_byte_perm_S (w[22], w[23], selector); + w[60] = hc_byte_perm_S (w[21], w[22], selector); + w[59] = hc_byte_perm_S (w[20], w[21], selector); + w[58] = hc_byte_perm_S (w[19], w[20], selector); + w[57] = hc_byte_perm_S (w[18], w[19], selector); + w[56] = hc_byte_perm_S (w[17], w[18], selector); + w[55] = hc_byte_perm_S (w[16], w[17], selector); + w[54] = hc_byte_perm_S (w[15], w[16], selector); + w[53] = hc_byte_perm_S (w[14], w[15], selector); + w[52] = hc_byte_perm_S (w[13], w[14], selector); + w[51] = hc_byte_perm_S (w[12], w[13], selector); + w[50] = hc_byte_perm_S (w[11], w[12], selector); + w[49] = hc_byte_perm_S (w[10], w[11], selector); + w[48] = hc_byte_perm_S (w[ 9], w[10], selector); + w[47] = hc_byte_perm_S (w[ 8], w[ 9], selector); + w[46] = hc_byte_perm_S (w[ 7], w[ 8], selector); + w[45] = hc_byte_perm_S (w[ 6], w[ 7], selector); + w[44] = hc_byte_perm_S (w[ 5], w[ 6], selector); + w[43] = hc_byte_perm_S (w[ 4], w[ 5], selector); + w[42] = hc_byte_perm_S (w[ 3], w[ 4], selector); + w[41] = hc_byte_perm_S (w[ 2], w[ 3], selector); + w[40] = hc_byte_perm_S (w[ 1], w[ 2], selector); + w[39] = hc_byte_perm_S (w[ 0], w[ 1], selector); + w[38] = hc_byte_perm_S ( 0, w[ 0], selector); w[37] = 0; w[36] = 0; w[35] = 0; @@ -49376,31 +49376,31 @@ DECLSPEC void switch_buffer_by_offset_1x64_le_S (u32 *w, const u32 offset) break; case 39: - w[63] = __byte_perm_S (w[23], w[24], selector); - w[62] = __byte_perm_S (w[22], w[23], selector); - w[61] = __byte_perm_S (w[21], w[22], selector); - w[60] = __byte_perm_S (w[20], w[21], selector); - w[59] = __byte_perm_S (w[19], w[20], selector); - w[58] = __byte_perm_S (w[18], w[19], selector); - w[57] = __byte_perm_S (w[17], w[18], selector); - w[56] = __byte_perm_S (w[16], w[17], selector); - w[55] = __byte_perm_S (w[15], w[16], selector); - w[54] = __byte_perm_S (w[14], w[15], selector); - w[53] = __byte_perm_S (w[13], w[14], selector); - w[52] = __byte_perm_S (w[12], w[13], selector); - w[51] = __byte_perm_S (w[11], w[12], selector); - w[50] = __byte_perm_S (w[10], w[11], selector); - w[49] = __byte_perm_S (w[ 9], w[10], selector); - w[48] = __byte_perm_S (w[ 8], w[ 9], selector); - w[47] = __byte_perm_S (w[ 7], w[ 8], selector); - w[46] = __byte_perm_S (w[ 6], w[ 7], selector); - w[45] = __byte_perm_S (w[ 5], w[ 6], selector); - w[44] = __byte_perm_S (w[ 4], w[ 5], selector); - w[43] = __byte_perm_S (w[ 3], w[ 4], selector); - w[42] = __byte_perm_S (w[ 2], w[ 3], selector); - w[41] = __byte_perm_S (w[ 1], w[ 2], selector); - w[40] = __byte_perm_S (w[ 0], w[ 1], selector); - w[39] = __byte_perm_S ( 0, w[ 0], selector); + w[63] = hc_byte_perm_S (w[23], w[24], selector); + w[62] = hc_byte_perm_S (w[22], w[23], selector); + w[61] = hc_byte_perm_S (w[21], w[22], selector); + w[60] = hc_byte_perm_S (w[20], w[21], selector); + w[59] = hc_byte_perm_S (w[19], w[20], selector); + w[58] = hc_byte_perm_S (w[18], w[19], selector); + w[57] = hc_byte_perm_S (w[17], w[18], selector); + w[56] = hc_byte_perm_S (w[16], w[17], selector); + w[55] = hc_byte_perm_S (w[15], w[16], selector); + w[54] = hc_byte_perm_S (w[14], w[15], selector); + w[53] = hc_byte_perm_S (w[13], w[14], selector); + w[52] = hc_byte_perm_S (w[12], w[13], selector); + w[51] = hc_byte_perm_S (w[11], w[12], selector); + w[50] = hc_byte_perm_S (w[10], w[11], selector); + w[49] = hc_byte_perm_S (w[ 9], w[10], selector); + w[48] = hc_byte_perm_S (w[ 8], w[ 9], selector); + w[47] = hc_byte_perm_S (w[ 7], w[ 8], selector); + w[46] = hc_byte_perm_S (w[ 6], w[ 7], selector); + w[45] = hc_byte_perm_S (w[ 5], w[ 6], selector); + w[44] = hc_byte_perm_S (w[ 4], w[ 5], selector); + w[43] = hc_byte_perm_S (w[ 3], w[ 4], selector); + w[42] = hc_byte_perm_S (w[ 2], w[ 3], selector); + w[41] = hc_byte_perm_S (w[ 1], w[ 2], selector); + w[40] = hc_byte_perm_S (w[ 0], w[ 1], selector); + w[39] = hc_byte_perm_S ( 0, w[ 0], selector); w[38] = 0; w[37] = 0; w[36] = 0; @@ -49444,30 +49444,30 @@ DECLSPEC void switch_buffer_by_offset_1x64_le_S (u32 *w, const u32 offset) break; case 40: - w[63] = __byte_perm_S (w[22], w[23], selector); - w[62] = __byte_perm_S (w[21], w[22], selector); - w[61] = __byte_perm_S (w[20], w[21], selector); - w[60] = __byte_perm_S (w[19], w[20], selector); - w[59] = __byte_perm_S (w[18], w[19], selector); - w[58] = __byte_perm_S (w[17], w[18], selector); - w[57] = __byte_perm_S (w[16], w[17], selector); - w[56] = __byte_perm_S (w[15], w[16], selector); - w[55] = __byte_perm_S (w[14], w[15], selector); - w[54] = __byte_perm_S (w[13], w[14], selector); - w[53] = __byte_perm_S (w[12], w[13], selector); - w[52] = __byte_perm_S (w[11], w[12], selector); - w[51] = __byte_perm_S (w[10], w[11], selector); - w[50] = __byte_perm_S (w[ 9], w[10], selector); - w[49] = __byte_perm_S (w[ 8], w[ 9], selector); - w[48] = __byte_perm_S (w[ 7], w[ 8], selector); - w[47] = __byte_perm_S (w[ 6], w[ 7], selector); - w[46] = __byte_perm_S (w[ 5], w[ 6], selector); - w[45] = __byte_perm_S (w[ 4], w[ 5], selector); - w[44] = __byte_perm_S (w[ 3], w[ 4], selector); - w[43] = __byte_perm_S (w[ 2], w[ 3], selector); - w[42] = __byte_perm_S (w[ 1], w[ 2], selector); - w[41] = __byte_perm_S (w[ 0], w[ 1], selector); - w[40] = __byte_perm_S ( 0, w[ 0], selector); + w[63] = hc_byte_perm_S (w[22], w[23], selector); + w[62] = hc_byte_perm_S (w[21], w[22], selector); + w[61] = hc_byte_perm_S (w[20], w[21], selector); + w[60] = hc_byte_perm_S (w[19], w[20], selector); + w[59] = hc_byte_perm_S (w[18], w[19], selector); + w[58] = hc_byte_perm_S (w[17], w[18], selector); + w[57] = hc_byte_perm_S (w[16], w[17], selector); + w[56] = hc_byte_perm_S (w[15], w[16], selector); + w[55] = hc_byte_perm_S (w[14], w[15], selector); + w[54] = hc_byte_perm_S (w[13], w[14], selector); + w[53] = hc_byte_perm_S (w[12], w[13], selector); + w[52] = hc_byte_perm_S (w[11], w[12], selector); + w[51] = hc_byte_perm_S (w[10], w[11], selector); + w[50] = hc_byte_perm_S (w[ 9], w[10], selector); + w[49] = hc_byte_perm_S (w[ 8], w[ 9], selector); + w[48] = hc_byte_perm_S (w[ 7], w[ 8], selector); + w[47] = hc_byte_perm_S (w[ 6], w[ 7], selector); + w[46] = hc_byte_perm_S (w[ 5], w[ 6], selector); + w[45] = hc_byte_perm_S (w[ 4], w[ 5], selector); + w[44] = hc_byte_perm_S (w[ 3], w[ 4], selector); + w[43] = hc_byte_perm_S (w[ 2], w[ 3], selector); + w[42] = hc_byte_perm_S (w[ 1], w[ 2], selector); + w[41] = hc_byte_perm_S (w[ 0], w[ 1], selector); + w[40] = hc_byte_perm_S ( 0, w[ 0], selector); w[39] = 0; w[38] = 0; w[37] = 0; @@ -49512,29 +49512,29 @@ DECLSPEC void switch_buffer_by_offset_1x64_le_S (u32 *w, const u32 offset) break; case 41: - w[63] = __byte_perm_S (w[21], w[22], selector); - w[62] = __byte_perm_S (w[20], w[21], selector); - w[61] = __byte_perm_S (w[19], w[20], selector); - w[60] = __byte_perm_S (w[18], w[19], selector); - w[59] = __byte_perm_S (w[17], w[18], selector); - w[58] = __byte_perm_S (w[16], w[17], selector); - w[57] = __byte_perm_S (w[15], w[16], selector); - w[56] = __byte_perm_S (w[14], w[15], selector); - w[55] = __byte_perm_S (w[13], w[14], selector); - w[54] = __byte_perm_S (w[12], w[13], selector); - w[53] = __byte_perm_S (w[11], w[12], selector); - w[52] = __byte_perm_S (w[10], w[11], selector); - w[51] = __byte_perm_S (w[ 9], w[10], selector); - w[50] = __byte_perm_S (w[ 8], w[ 9], selector); - w[49] = __byte_perm_S (w[ 7], w[ 8], selector); - w[48] = __byte_perm_S (w[ 6], w[ 7], selector); - w[47] = __byte_perm_S (w[ 5], w[ 6], selector); - w[46] = __byte_perm_S (w[ 4], w[ 5], selector); - w[45] = __byte_perm_S (w[ 3], w[ 4], selector); - w[44] = __byte_perm_S (w[ 2], w[ 3], selector); - w[43] = __byte_perm_S (w[ 1], w[ 2], selector); - w[42] = __byte_perm_S (w[ 0], w[ 1], selector); - w[41] = __byte_perm_S ( 0, w[ 0], selector); + w[63] = hc_byte_perm_S (w[21], w[22], selector); + w[62] = hc_byte_perm_S (w[20], w[21], selector); + w[61] = hc_byte_perm_S (w[19], w[20], selector); + w[60] = hc_byte_perm_S (w[18], w[19], selector); + w[59] = hc_byte_perm_S (w[17], w[18], selector); + w[58] = hc_byte_perm_S (w[16], w[17], selector); + w[57] = hc_byte_perm_S (w[15], w[16], selector); + w[56] = hc_byte_perm_S (w[14], w[15], selector); + w[55] = hc_byte_perm_S (w[13], w[14], selector); + w[54] = hc_byte_perm_S (w[12], w[13], selector); + w[53] = hc_byte_perm_S (w[11], w[12], selector); + w[52] = hc_byte_perm_S (w[10], w[11], selector); + w[51] = hc_byte_perm_S (w[ 9], w[10], selector); + w[50] = hc_byte_perm_S (w[ 8], w[ 9], selector); + w[49] = hc_byte_perm_S (w[ 7], w[ 8], selector); + w[48] = hc_byte_perm_S (w[ 6], w[ 7], selector); + w[47] = hc_byte_perm_S (w[ 5], w[ 6], selector); + w[46] = hc_byte_perm_S (w[ 4], w[ 5], selector); + w[45] = hc_byte_perm_S (w[ 3], w[ 4], selector); + w[44] = hc_byte_perm_S (w[ 2], w[ 3], selector); + w[43] = hc_byte_perm_S (w[ 1], w[ 2], selector); + w[42] = hc_byte_perm_S (w[ 0], w[ 1], selector); + w[41] = hc_byte_perm_S ( 0, w[ 0], selector); w[40] = 0; w[39] = 0; w[38] = 0; @@ -49580,28 +49580,28 @@ DECLSPEC void switch_buffer_by_offset_1x64_le_S (u32 *w, const u32 offset) break; case 42: - w[63] = __byte_perm_S (w[20], w[21], selector); - w[62] = __byte_perm_S (w[19], w[20], selector); - w[61] = __byte_perm_S (w[18], w[19], selector); - w[60] = __byte_perm_S (w[17], w[18], selector); - w[59] = __byte_perm_S (w[16], w[17], selector); - w[58] = __byte_perm_S (w[15], w[16], selector); - w[57] = __byte_perm_S (w[14], w[15], selector); - w[56] = __byte_perm_S (w[13], w[14], selector); - w[55] = __byte_perm_S (w[12], w[13], selector); - w[54] = __byte_perm_S (w[11], w[12], selector); - w[53] = __byte_perm_S (w[10], w[11], selector); - w[52] = __byte_perm_S (w[ 9], w[10], selector); - w[51] = __byte_perm_S (w[ 8], w[ 9], selector); - w[50] = __byte_perm_S (w[ 7], w[ 8], selector); - w[49] = __byte_perm_S (w[ 6], w[ 7], selector); - w[48] = __byte_perm_S (w[ 5], w[ 6], selector); - w[47] = __byte_perm_S (w[ 4], w[ 5], selector); - w[46] = __byte_perm_S (w[ 3], w[ 4], selector); - w[45] = __byte_perm_S (w[ 2], w[ 3], selector); - w[44] = __byte_perm_S (w[ 1], w[ 2], selector); - w[43] = __byte_perm_S (w[ 0], w[ 1], selector); - w[42] = __byte_perm_S ( 0, w[ 0], selector); + w[63] = hc_byte_perm_S (w[20], w[21], selector); + w[62] = hc_byte_perm_S (w[19], w[20], selector); + w[61] = hc_byte_perm_S (w[18], w[19], selector); + w[60] = hc_byte_perm_S (w[17], w[18], selector); + w[59] = hc_byte_perm_S (w[16], w[17], selector); + w[58] = hc_byte_perm_S (w[15], w[16], selector); + w[57] = hc_byte_perm_S (w[14], w[15], selector); + w[56] = hc_byte_perm_S (w[13], w[14], selector); + w[55] = hc_byte_perm_S (w[12], w[13], selector); + w[54] = hc_byte_perm_S (w[11], w[12], selector); + w[53] = hc_byte_perm_S (w[10], w[11], selector); + w[52] = hc_byte_perm_S (w[ 9], w[10], selector); + w[51] = hc_byte_perm_S (w[ 8], w[ 9], selector); + w[50] = hc_byte_perm_S (w[ 7], w[ 8], selector); + w[49] = hc_byte_perm_S (w[ 6], w[ 7], selector); + w[48] = hc_byte_perm_S (w[ 5], w[ 6], selector); + w[47] = hc_byte_perm_S (w[ 4], w[ 5], selector); + w[46] = hc_byte_perm_S (w[ 3], w[ 4], selector); + w[45] = hc_byte_perm_S (w[ 2], w[ 3], selector); + w[44] = hc_byte_perm_S (w[ 1], w[ 2], selector); + w[43] = hc_byte_perm_S (w[ 0], w[ 1], selector); + w[42] = hc_byte_perm_S ( 0, w[ 0], selector); w[41] = 0; w[40] = 0; w[39] = 0; @@ -49648,27 +49648,27 @@ DECLSPEC void switch_buffer_by_offset_1x64_le_S (u32 *w, const u32 offset) break; case 43: - w[63] = __byte_perm_S (w[19], w[20], selector); - w[62] = __byte_perm_S (w[18], w[19], selector); - w[61] = __byte_perm_S (w[17], w[18], selector); - w[60] = __byte_perm_S (w[16], w[17], selector); - w[59] = __byte_perm_S (w[15], w[16], selector); - w[58] = __byte_perm_S (w[14], w[15], selector); - w[57] = __byte_perm_S (w[13], w[14], selector); - w[56] = __byte_perm_S (w[12], w[13], selector); - w[55] = __byte_perm_S (w[11], w[12], selector); - w[54] = __byte_perm_S (w[10], w[11], selector); - w[53] = __byte_perm_S (w[ 9], w[10], selector); - w[52] = __byte_perm_S (w[ 8], w[ 9], selector); - w[51] = __byte_perm_S (w[ 7], w[ 8], selector); - w[50] = __byte_perm_S (w[ 6], w[ 7], selector); - w[49] = __byte_perm_S (w[ 5], w[ 6], selector); - w[48] = __byte_perm_S (w[ 4], w[ 5], selector); - w[47] = __byte_perm_S (w[ 3], w[ 4], selector); - w[46] = __byte_perm_S (w[ 2], w[ 3], selector); - w[45] = __byte_perm_S (w[ 1], w[ 2], selector); - w[44] = __byte_perm_S (w[ 0], w[ 1], selector); - w[43] = __byte_perm_S ( 0, w[ 0], selector); + w[63] = hc_byte_perm_S (w[19], w[20], selector); + w[62] = hc_byte_perm_S (w[18], w[19], selector); + w[61] = hc_byte_perm_S (w[17], w[18], selector); + w[60] = hc_byte_perm_S (w[16], w[17], selector); + w[59] = hc_byte_perm_S (w[15], w[16], selector); + w[58] = hc_byte_perm_S (w[14], w[15], selector); + w[57] = hc_byte_perm_S (w[13], w[14], selector); + w[56] = hc_byte_perm_S (w[12], w[13], selector); + w[55] = hc_byte_perm_S (w[11], w[12], selector); + w[54] = hc_byte_perm_S (w[10], w[11], selector); + w[53] = hc_byte_perm_S (w[ 9], w[10], selector); + w[52] = hc_byte_perm_S (w[ 8], w[ 9], selector); + w[51] = hc_byte_perm_S (w[ 7], w[ 8], selector); + w[50] = hc_byte_perm_S (w[ 6], w[ 7], selector); + w[49] = hc_byte_perm_S (w[ 5], w[ 6], selector); + w[48] = hc_byte_perm_S (w[ 4], w[ 5], selector); + w[47] = hc_byte_perm_S (w[ 3], w[ 4], selector); + w[46] = hc_byte_perm_S (w[ 2], w[ 3], selector); + w[45] = hc_byte_perm_S (w[ 1], w[ 2], selector); + w[44] = hc_byte_perm_S (w[ 0], w[ 1], selector); + w[43] = hc_byte_perm_S ( 0, w[ 0], selector); w[42] = 0; w[41] = 0; w[40] = 0; @@ -49716,26 +49716,26 @@ DECLSPEC void switch_buffer_by_offset_1x64_le_S (u32 *w, const u32 offset) break; case 44: - w[63] = __byte_perm_S (w[18], w[19], selector); - w[62] = __byte_perm_S (w[17], w[18], selector); - w[61] = __byte_perm_S (w[16], w[17], selector); - w[60] = __byte_perm_S (w[15], w[16], selector); - w[59] = __byte_perm_S (w[14], w[15], selector); - w[58] = __byte_perm_S (w[13], w[14], selector); - w[57] = __byte_perm_S (w[12], w[13], selector); - w[56] = __byte_perm_S (w[11], w[12], selector); - w[55] = __byte_perm_S (w[10], w[11], selector); - w[54] = __byte_perm_S (w[ 9], w[10], selector); - w[53] = __byte_perm_S (w[ 8], w[ 9], selector); - w[52] = __byte_perm_S (w[ 7], w[ 8], selector); - w[51] = __byte_perm_S (w[ 6], w[ 7], selector); - w[50] = __byte_perm_S (w[ 5], w[ 6], selector); - w[49] = __byte_perm_S (w[ 4], w[ 5], selector); - w[48] = __byte_perm_S (w[ 3], w[ 4], selector); - w[47] = __byte_perm_S (w[ 2], w[ 3], selector); - w[46] = __byte_perm_S (w[ 1], w[ 2], selector); - w[45] = __byte_perm_S (w[ 0], w[ 1], selector); - w[44] = __byte_perm_S ( 0, w[ 0], selector); + w[63] = hc_byte_perm_S (w[18], w[19], selector); + w[62] = hc_byte_perm_S (w[17], w[18], selector); + w[61] = hc_byte_perm_S (w[16], w[17], selector); + w[60] = hc_byte_perm_S (w[15], w[16], selector); + w[59] = hc_byte_perm_S (w[14], w[15], selector); + w[58] = hc_byte_perm_S (w[13], w[14], selector); + w[57] = hc_byte_perm_S (w[12], w[13], selector); + w[56] = hc_byte_perm_S (w[11], w[12], selector); + w[55] = hc_byte_perm_S (w[10], w[11], selector); + w[54] = hc_byte_perm_S (w[ 9], w[10], selector); + w[53] = hc_byte_perm_S (w[ 8], w[ 9], selector); + w[52] = hc_byte_perm_S (w[ 7], w[ 8], selector); + w[51] = hc_byte_perm_S (w[ 6], w[ 7], selector); + w[50] = hc_byte_perm_S (w[ 5], w[ 6], selector); + w[49] = hc_byte_perm_S (w[ 4], w[ 5], selector); + w[48] = hc_byte_perm_S (w[ 3], w[ 4], selector); + w[47] = hc_byte_perm_S (w[ 2], w[ 3], selector); + w[46] = hc_byte_perm_S (w[ 1], w[ 2], selector); + w[45] = hc_byte_perm_S (w[ 0], w[ 1], selector); + w[44] = hc_byte_perm_S ( 0, w[ 0], selector); w[43] = 0; w[42] = 0; w[41] = 0; @@ -49784,25 +49784,25 @@ DECLSPEC void switch_buffer_by_offset_1x64_le_S (u32 *w, const u32 offset) break; case 45: - w[63] = __byte_perm_S (w[17], w[18], selector); - w[62] = __byte_perm_S (w[16], w[17], selector); - w[61] = __byte_perm_S (w[15], w[16], selector); - w[60] = __byte_perm_S (w[14], w[15], selector); - w[59] = __byte_perm_S (w[13], w[14], selector); - w[58] = __byte_perm_S (w[12], w[13], selector); - w[57] = __byte_perm_S (w[11], w[12], selector); - w[56] = __byte_perm_S (w[10], w[11], selector); - w[55] = __byte_perm_S (w[ 9], w[10], selector); - w[54] = __byte_perm_S (w[ 8], w[ 9], selector); - w[53] = __byte_perm_S (w[ 7], w[ 8], selector); - w[52] = __byte_perm_S (w[ 6], w[ 7], selector); - w[51] = __byte_perm_S (w[ 5], w[ 6], selector); - w[50] = __byte_perm_S (w[ 4], w[ 5], selector); - w[49] = __byte_perm_S (w[ 3], w[ 4], selector); - w[48] = __byte_perm_S (w[ 2], w[ 3], selector); - w[47] = __byte_perm_S (w[ 1], w[ 2], selector); - w[46] = __byte_perm_S (w[ 0], w[ 1], selector); - w[45] = __byte_perm_S ( 0, w[ 0], selector); + w[63] = hc_byte_perm_S (w[17], w[18], selector); + w[62] = hc_byte_perm_S (w[16], w[17], selector); + w[61] = hc_byte_perm_S (w[15], w[16], selector); + w[60] = hc_byte_perm_S (w[14], w[15], selector); + w[59] = hc_byte_perm_S (w[13], w[14], selector); + w[58] = hc_byte_perm_S (w[12], w[13], selector); + w[57] = hc_byte_perm_S (w[11], w[12], selector); + w[56] = hc_byte_perm_S (w[10], w[11], selector); + w[55] = hc_byte_perm_S (w[ 9], w[10], selector); + w[54] = hc_byte_perm_S (w[ 8], w[ 9], selector); + w[53] = hc_byte_perm_S (w[ 7], w[ 8], selector); + w[52] = hc_byte_perm_S (w[ 6], w[ 7], selector); + w[51] = hc_byte_perm_S (w[ 5], w[ 6], selector); + w[50] = hc_byte_perm_S (w[ 4], w[ 5], selector); + w[49] = hc_byte_perm_S (w[ 3], w[ 4], selector); + w[48] = hc_byte_perm_S (w[ 2], w[ 3], selector); + w[47] = hc_byte_perm_S (w[ 1], w[ 2], selector); + w[46] = hc_byte_perm_S (w[ 0], w[ 1], selector); + w[45] = hc_byte_perm_S ( 0, w[ 0], selector); w[44] = 0; w[43] = 0; w[42] = 0; @@ -49852,24 +49852,24 @@ DECLSPEC void switch_buffer_by_offset_1x64_le_S (u32 *w, const u32 offset) break; case 46: - w[63] = __byte_perm_S (w[16], w[17], selector); - w[62] = __byte_perm_S (w[15], w[16], selector); - w[61] = __byte_perm_S (w[14], w[15], selector); - w[60] = __byte_perm_S (w[13], w[14], selector); - w[59] = __byte_perm_S (w[12], w[13], selector); - w[58] = __byte_perm_S (w[11], w[12], selector); - w[57] = __byte_perm_S (w[10], w[11], selector); - w[56] = __byte_perm_S (w[ 9], w[10], selector); - w[55] = __byte_perm_S (w[ 8], w[ 9], selector); - w[54] = __byte_perm_S (w[ 7], w[ 8], selector); - w[53] = __byte_perm_S (w[ 6], w[ 7], selector); - w[52] = __byte_perm_S (w[ 5], w[ 6], selector); - w[51] = __byte_perm_S (w[ 4], w[ 5], selector); - w[50] = __byte_perm_S (w[ 3], w[ 4], selector); - w[49] = __byte_perm_S (w[ 2], w[ 3], selector); - w[48] = __byte_perm_S (w[ 1], w[ 2], selector); - w[47] = __byte_perm_S (w[ 0], w[ 1], selector); - w[46] = __byte_perm_S ( 0, w[ 0], selector); + w[63] = hc_byte_perm_S (w[16], w[17], selector); + w[62] = hc_byte_perm_S (w[15], w[16], selector); + w[61] = hc_byte_perm_S (w[14], w[15], selector); + w[60] = hc_byte_perm_S (w[13], w[14], selector); + w[59] = hc_byte_perm_S (w[12], w[13], selector); + w[58] = hc_byte_perm_S (w[11], w[12], selector); + w[57] = hc_byte_perm_S (w[10], w[11], selector); + w[56] = hc_byte_perm_S (w[ 9], w[10], selector); + w[55] = hc_byte_perm_S (w[ 8], w[ 9], selector); + w[54] = hc_byte_perm_S (w[ 7], w[ 8], selector); + w[53] = hc_byte_perm_S (w[ 6], w[ 7], selector); + w[52] = hc_byte_perm_S (w[ 5], w[ 6], selector); + w[51] = hc_byte_perm_S (w[ 4], w[ 5], selector); + w[50] = hc_byte_perm_S (w[ 3], w[ 4], selector); + w[49] = hc_byte_perm_S (w[ 2], w[ 3], selector); + w[48] = hc_byte_perm_S (w[ 1], w[ 2], selector); + w[47] = hc_byte_perm_S (w[ 0], w[ 1], selector); + w[46] = hc_byte_perm_S ( 0, w[ 0], selector); w[45] = 0; w[44] = 0; w[43] = 0; @@ -49920,23 +49920,23 @@ DECLSPEC void switch_buffer_by_offset_1x64_le_S (u32 *w, const u32 offset) break; case 47: - w[63] = __byte_perm_S (w[15], w[16], selector); - w[62] = __byte_perm_S (w[14], w[15], selector); - w[61] = __byte_perm_S (w[13], w[14], selector); - w[60] = __byte_perm_S (w[12], w[13], selector); - w[59] = __byte_perm_S (w[11], w[12], selector); - w[58] = __byte_perm_S (w[10], w[11], selector); - w[57] = __byte_perm_S (w[ 9], w[10], selector); - w[56] = __byte_perm_S (w[ 8], w[ 9], selector); - w[55] = __byte_perm_S (w[ 7], w[ 8], selector); - w[54] = __byte_perm_S (w[ 6], w[ 7], selector); - w[53] = __byte_perm_S (w[ 5], w[ 6], selector); - w[52] = __byte_perm_S (w[ 4], w[ 5], selector); - w[51] = __byte_perm_S (w[ 3], w[ 4], selector); - w[50] = __byte_perm_S (w[ 2], w[ 3], selector); - w[49] = __byte_perm_S (w[ 1], w[ 2], selector); - w[48] = __byte_perm_S (w[ 0], w[ 1], selector); - w[47] = __byte_perm_S ( 0, w[ 0], selector); + w[63] = hc_byte_perm_S (w[15], w[16], selector); + w[62] = hc_byte_perm_S (w[14], w[15], selector); + w[61] = hc_byte_perm_S (w[13], w[14], selector); + w[60] = hc_byte_perm_S (w[12], w[13], selector); + w[59] = hc_byte_perm_S (w[11], w[12], selector); + w[58] = hc_byte_perm_S (w[10], w[11], selector); + w[57] = hc_byte_perm_S (w[ 9], w[10], selector); + w[56] = hc_byte_perm_S (w[ 8], w[ 9], selector); + w[55] = hc_byte_perm_S (w[ 7], w[ 8], selector); + w[54] = hc_byte_perm_S (w[ 6], w[ 7], selector); + w[53] = hc_byte_perm_S (w[ 5], w[ 6], selector); + w[52] = hc_byte_perm_S (w[ 4], w[ 5], selector); + w[51] = hc_byte_perm_S (w[ 3], w[ 4], selector); + w[50] = hc_byte_perm_S (w[ 2], w[ 3], selector); + w[49] = hc_byte_perm_S (w[ 1], w[ 2], selector); + w[48] = hc_byte_perm_S (w[ 0], w[ 1], selector); + w[47] = hc_byte_perm_S ( 0, w[ 0], selector); w[46] = 0; w[45] = 0; w[44] = 0; @@ -49988,22 +49988,22 @@ DECLSPEC void switch_buffer_by_offset_1x64_le_S (u32 *w, const u32 offset) break; case 48: - w[63] = __byte_perm_S (w[14], w[15], selector); - w[62] = __byte_perm_S (w[13], w[14], selector); - w[61] = __byte_perm_S (w[12], w[13], selector); - w[60] = __byte_perm_S (w[11], w[12], selector); - w[59] = __byte_perm_S (w[10], w[11], selector); - w[58] = __byte_perm_S (w[ 9], w[10], selector); - w[57] = __byte_perm_S (w[ 8], w[ 9], selector); - w[56] = __byte_perm_S (w[ 7], w[ 8], selector); - w[55] = __byte_perm_S (w[ 6], w[ 7], selector); - w[54] = __byte_perm_S (w[ 5], w[ 6], selector); - w[53] = __byte_perm_S (w[ 4], w[ 5], selector); - w[52] = __byte_perm_S (w[ 3], w[ 4], selector); - w[51] = __byte_perm_S (w[ 2], w[ 3], selector); - w[50] = __byte_perm_S (w[ 1], w[ 2], selector); - w[49] = __byte_perm_S (w[ 0], w[ 1], selector); - w[48] = __byte_perm_S ( 0, w[ 0], selector); + w[63] = hc_byte_perm_S (w[14], w[15], selector); + w[62] = hc_byte_perm_S (w[13], w[14], selector); + w[61] = hc_byte_perm_S (w[12], w[13], selector); + w[60] = hc_byte_perm_S (w[11], w[12], selector); + w[59] = hc_byte_perm_S (w[10], w[11], selector); + w[58] = hc_byte_perm_S (w[ 9], w[10], selector); + w[57] = hc_byte_perm_S (w[ 8], w[ 9], selector); + w[56] = hc_byte_perm_S (w[ 7], w[ 8], selector); + w[55] = hc_byte_perm_S (w[ 6], w[ 7], selector); + w[54] = hc_byte_perm_S (w[ 5], w[ 6], selector); + w[53] = hc_byte_perm_S (w[ 4], w[ 5], selector); + w[52] = hc_byte_perm_S (w[ 3], w[ 4], selector); + w[51] = hc_byte_perm_S (w[ 2], w[ 3], selector); + w[50] = hc_byte_perm_S (w[ 1], w[ 2], selector); + w[49] = hc_byte_perm_S (w[ 0], w[ 1], selector); + w[48] = hc_byte_perm_S ( 0, w[ 0], selector); w[47] = 0; w[46] = 0; w[45] = 0; @@ -50056,21 +50056,21 @@ DECLSPEC void switch_buffer_by_offset_1x64_le_S (u32 *w, const u32 offset) break; case 49: - w[63] = __byte_perm_S (w[13], w[14], selector); - w[62] = __byte_perm_S (w[12], w[13], selector); - w[61] = __byte_perm_S (w[11], w[12], selector); - w[60] = __byte_perm_S (w[10], w[11], selector); - w[59] = __byte_perm_S (w[ 9], w[10], selector); - w[58] = __byte_perm_S (w[ 8], w[ 9], selector); - w[57] = __byte_perm_S (w[ 7], w[ 8], selector); - w[56] = __byte_perm_S (w[ 6], w[ 7], selector); - w[55] = __byte_perm_S (w[ 5], w[ 6], selector); - w[54] = __byte_perm_S (w[ 4], w[ 5], selector); - w[53] = __byte_perm_S (w[ 3], w[ 4], selector); - w[52] = __byte_perm_S (w[ 2], w[ 3], selector); - w[51] = __byte_perm_S (w[ 1], w[ 2], selector); - w[50] = __byte_perm_S (w[ 0], w[ 1], selector); - w[49] = __byte_perm_S ( 0, w[ 0], selector); + w[63] = hc_byte_perm_S (w[13], w[14], selector); + w[62] = hc_byte_perm_S (w[12], w[13], selector); + w[61] = hc_byte_perm_S (w[11], w[12], selector); + w[60] = hc_byte_perm_S (w[10], w[11], selector); + w[59] = hc_byte_perm_S (w[ 9], w[10], selector); + w[58] = hc_byte_perm_S (w[ 8], w[ 9], selector); + w[57] = hc_byte_perm_S (w[ 7], w[ 8], selector); + w[56] = hc_byte_perm_S (w[ 6], w[ 7], selector); + w[55] = hc_byte_perm_S (w[ 5], w[ 6], selector); + w[54] = hc_byte_perm_S (w[ 4], w[ 5], selector); + w[53] = hc_byte_perm_S (w[ 3], w[ 4], selector); + w[52] = hc_byte_perm_S (w[ 2], w[ 3], selector); + w[51] = hc_byte_perm_S (w[ 1], w[ 2], selector); + w[50] = hc_byte_perm_S (w[ 0], w[ 1], selector); + w[49] = hc_byte_perm_S ( 0, w[ 0], selector); w[48] = 0; w[47] = 0; w[46] = 0; @@ -50124,20 +50124,20 @@ DECLSPEC void switch_buffer_by_offset_1x64_le_S (u32 *w, const u32 offset) break; case 50: - w[63] = __byte_perm_S (w[12], w[13], selector); - w[62] = __byte_perm_S (w[11], w[12], selector); - w[61] = __byte_perm_S (w[10], w[11], selector); - w[60] = __byte_perm_S (w[ 9], w[10], selector); - w[59] = __byte_perm_S (w[ 8], w[ 9], selector); - w[58] = __byte_perm_S (w[ 7], w[ 8], selector); - w[57] = __byte_perm_S (w[ 6], w[ 7], selector); - w[56] = __byte_perm_S (w[ 5], w[ 6], selector); - w[55] = __byte_perm_S (w[ 4], w[ 5], selector); - w[54] = __byte_perm_S (w[ 3], w[ 4], selector); - w[53] = __byte_perm_S (w[ 2], w[ 3], selector); - w[52] = __byte_perm_S (w[ 1], w[ 2], selector); - w[51] = __byte_perm_S (w[ 0], w[ 1], selector); - w[50] = __byte_perm_S ( 0, w[ 0], selector); + w[63] = hc_byte_perm_S (w[12], w[13], selector); + w[62] = hc_byte_perm_S (w[11], w[12], selector); + w[61] = hc_byte_perm_S (w[10], w[11], selector); + w[60] = hc_byte_perm_S (w[ 9], w[10], selector); + w[59] = hc_byte_perm_S (w[ 8], w[ 9], selector); + w[58] = hc_byte_perm_S (w[ 7], w[ 8], selector); + w[57] = hc_byte_perm_S (w[ 6], w[ 7], selector); + w[56] = hc_byte_perm_S (w[ 5], w[ 6], selector); + w[55] = hc_byte_perm_S (w[ 4], w[ 5], selector); + w[54] = hc_byte_perm_S (w[ 3], w[ 4], selector); + w[53] = hc_byte_perm_S (w[ 2], w[ 3], selector); + w[52] = hc_byte_perm_S (w[ 1], w[ 2], selector); + w[51] = hc_byte_perm_S (w[ 0], w[ 1], selector); + w[50] = hc_byte_perm_S ( 0, w[ 0], selector); w[49] = 0; w[48] = 0; w[47] = 0; @@ -50192,19 +50192,19 @@ DECLSPEC void switch_buffer_by_offset_1x64_le_S (u32 *w, const u32 offset) break; case 51: - w[63] = __byte_perm_S (w[11], w[12], selector); - w[62] = __byte_perm_S (w[10], w[11], selector); - w[61] = __byte_perm_S (w[ 9], w[10], selector); - w[60] = __byte_perm_S (w[ 8], w[ 9], selector); - w[59] = __byte_perm_S (w[ 7], w[ 8], selector); - w[58] = __byte_perm_S (w[ 6], w[ 7], selector); - w[57] = __byte_perm_S (w[ 5], w[ 6], selector); - w[56] = __byte_perm_S (w[ 4], w[ 5], selector); - w[55] = __byte_perm_S (w[ 3], w[ 4], selector); - w[54] = __byte_perm_S (w[ 2], w[ 3], selector); - w[53] = __byte_perm_S (w[ 1], w[ 2], selector); - w[52] = __byte_perm_S (w[ 0], w[ 1], selector); - w[51] = __byte_perm_S ( 0, w[ 0], selector); + w[63] = hc_byte_perm_S (w[11], w[12], selector); + w[62] = hc_byte_perm_S (w[10], w[11], selector); + w[61] = hc_byte_perm_S (w[ 9], w[10], selector); + w[60] = hc_byte_perm_S (w[ 8], w[ 9], selector); + w[59] = hc_byte_perm_S (w[ 7], w[ 8], selector); + w[58] = hc_byte_perm_S (w[ 6], w[ 7], selector); + w[57] = hc_byte_perm_S (w[ 5], w[ 6], selector); + w[56] = hc_byte_perm_S (w[ 4], w[ 5], selector); + w[55] = hc_byte_perm_S (w[ 3], w[ 4], selector); + w[54] = hc_byte_perm_S (w[ 2], w[ 3], selector); + w[53] = hc_byte_perm_S (w[ 1], w[ 2], selector); + w[52] = hc_byte_perm_S (w[ 0], w[ 1], selector); + w[51] = hc_byte_perm_S ( 0, w[ 0], selector); w[50] = 0; w[49] = 0; w[48] = 0; @@ -50260,18 +50260,18 @@ DECLSPEC void switch_buffer_by_offset_1x64_le_S (u32 *w, const u32 offset) break; case 52: - w[63] = __byte_perm_S (w[10], w[11], selector); - w[62] = __byte_perm_S (w[ 9], w[10], selector); - w[61] = __byte_perm_S (w[ 8], w[ 9], selector); - w[60] = __byte_perm_S (w[ 7], w[ 8], selector); - w[59] = __byte_perm_S (w[ 6], w[ 7], selector); - w[58] = __byte_perm_S (w[ 5], w[ 6], selector); - w[57] = __byte_perm_S (w[ 4], w[ 5], selector); - w[56] = __byte_perm_S (w[ 3], w[ 4], selector); - w[55] = __byte_perm_S (w[ 2], w[ 3], selector); - w[54] = __byte_perm_S (w[ 1], w[ 2], selector); - w[53] = __byte_perm_S (w[ 0], w[ 1], selector); - w[52] = __byte_perm_S ( 0, w[ 0], selector); + w[63] = hc_byte_perm_S (w[10], w[11], selector); + w[62] = hc_byte_perm_S (w[ 9], w[10], selector); + w[61] = hc_byte_perm_S (w[ 8], w[ 9], selector); + w[60] = hc_byte_perm_S (w[ 7], w[ 8], selector); + w[59] = hc_byte_perm_S (w[ 6], w[ 7], selector); + w[58] = hc_byte_perm_S (w[ 5], w[ 6], selector); + w[57] = hc_byte_perm_S (w[ 4], w[ 5], selector); + w[56] = hc_byte_perm_S (w[ 3], w[ 4], selector); + w[55] = hc_byte_perm_S (w[ 2], w[ 3], selector); + w[54] = hc_byte_perm_S (w[ 1], w[ 2], selector); + w[53] = hc_byte_perm_S (w[ 0], w[ 1], selector); + w[52] = hc_byte_perm_S ( 0, w[ 0], selector); w[51] = 0; w[50] = 0; w[49] = 0; @@ -50328,17 +50328,17 @@ DECLSPEC void switch_buffer_by_offset_1x64_le_S (u32 *w, const u32 offset) break; case 53: - w[63] = __byte_perm_S (w[ 9], w[10], selector); - w[62] = __byte_perm_S (w[ 8], w[ 9], selector); - w[61] = __byte_perm_S (w[ 7], w[ 8], selector); - w[60] = __byte_perm_S (w[ 6], w[ 7], selector); - w[59] = __byte_perm_S (w[ 5], w[ 6], selector); - w[58] = __byte_perm_S (w[ 4], w[ 5], selector); - w[57] = __byte_perm_S (w[ 3], w[ 4], selector); - w[56] = __byte_perm_S (w[ 2], w[ 3], selector); - w[55] = __byte_perm_S (w[ 1], w[ 2], selector); - w[54] = __byte_perm_S (w[ 0], w[ 1], selector); - w[53] = __byte_perm_S ( 0, w[ 0], selector); + w[63] = hc_byte_perm_S (w[ 9], w[10], selector); + w[62] = hc_byte_perm_S (w[ 8], w[ 9], selector); + w[61] = hc_byte_perm_S (w[ 7], w[ 8], selector); + w[60] = hc_byte_perm_S (w[ 6], w[ 7], selector); + w[59] = hc_byte_perm_S (w[ 5], w[ 6], selector); + w[58] = hc_byte_perm_S (w[ 4], w[ 5], selector); + w[57] = hc_byte_perm_S (w[ 3], w[ 4], selector); + w[56] = hc_byte_perm_S (w[ 2], w[ 3], selector); + w[55] = hc_byte_perm_S (w[ 1], w[ 2], selector); + w[54] = hc_byte_perm_S (w[ 0], w[ 1], selector); + w[53] = hc_byte_perm_S ( 0, w[ 0], selector); w[52] = 0; w[51] = 0; w[50] = 0; @@ -50396,16 +50396,16 @@ DECLSPEC void switch_buffer_by_offset_1x64_le_S (u32 *w, const u32 offset) break; case 54: - w[63] = __byte_perm_S (w[ 8], w[ 9], selector); - w[62] = __byte_perm_S (w[ 7], w[ 8], selector); - w[61] = __byte_perm_S (w[ 6], w[ 7], selector); - w[60] = __byte_perm_S (w[ 5], w[ 6], selector); - w[59] = __byte_perm_S (w[ 4], w[ 5], selector); - w[58] = __byte_perm_S (w[ 3], w[ 4], selector); - w[57] = __byte_perm_S (w[ 2], w[ 3], selector); - w[56] = __byte_perm_S (w[ 1], w[ 2], selector); - w[55] = __byte_perm_S (w[ 0], w[ 1], selector); - w[54] = __byte_perm_S ( 0, w[ 0], selector); + w[63] = hc_byte_perm_S (w[ 8], w[ 9], selector); + w[62] = hc_byte_perm_S (w[ 7], w[ 8], selector); + w[61] = hc_byte_perm_S (w[ 6], w[ 7], selector); + w[60] = hc_byte_perm_S (w[ 5], w[ 6], selector); + w[59] = hc_byte_perm_S (w[ 4], w[ 5], selector); + w[58] = hc_byte_perm_S (w[ 3], w[ 4], selector); + w[57] = hc_byte_perm_S (w[ 2], w[ 3], selector); + w[56] = hc_byte_perm_S (w[ 1], w[ 2], selector); + w[55] = hc_byte_perm_S (w[ 0], w[ 1], selector); + w[54] = hc_byte_perm_S ( 0, w[ 0], selector); w[53] = 0; w[52] = 0; w[51] = 0; @@ -50464,15 +50464,15 @@ DECLSPEC void switch_buffer_by_offset_1x64_le_S (u32 *w, const u32 offset) break; case 55: - w[63] = __byte_perm_S (w[ 7], w[ 8], selector); - w[62] = __byte_perm_S (w[ 6], w[ 7], selector); - w[61] = __byte_perm_S (w[ 5], w[ 6], selector); - w[60] = __byte_perm_S (w[ 4], w[ 5], selector); - w[59] = __byte_perm_S (w[ 3], w[ 4], selector); - w[58] = __byte_perm_S (w[ 2], w[ 3], selector); - w[57] = __byte_perm_S (w[ 1], w[ 2], selector); - w[56] = __byte_perm_S (w[ 0], w[ 1], selector); - w[55] = __byte_perm_S ( 0, w[ 0], selector); + w[63] = hc_byte_perm_S (w[ 7], w[ 8], selector); + w[62] = hc_byte_perm_S (w[ 6], w[ 7], selector); + w[61] = hc_byte_perm_S (w[ 5], w[ 6], selector); + w[60] = hc_byte_perm_S (w[ 4], w[ 5], selector); + w[59] = hc_byte_perm_S (w[ 3], w[ 4], selector); + w[58] = hc_byte_perm_S (w[ 2], w[ 3], selector); + w[57] = hc_byte_perm_S (w[ 1], w[ 2], selector); + w[56] = hc_byte_perm_S (w[ 0], w[ 1], selector); + w[55] = hc_byte_perm_S ( 0, w[ 0], selector); w[54] = 0; w[53] = 0; w[52] = 0; @@ -50532,14 +50532,14 @@ DECLSPEC void switch_buffer_by_offset_1x64_le_S (u32 *w, const u32 offset) break; case 56: - w[63] = __byte_perm_S (w[ 6], w[ 7], selector); - w[62] = __byte_perm_S (w[ 5], w[ 6], selector); - w[61] = __byte_perm_S (w[ 4], w[ 5], selector); - w[60] = __byte_perm_S (w[ 3], w[ 4], selector); - w[59] = __byte_perm_S (w[ 2], w[ 3], selector); - w[58] = __byte_perm_S (w[ 1], w[ 2], selector); - w[57] = __byte_perm_S (w[ 0], w[ 1], selector); - w[56] = __byte_perm_S ( 0, w[ 0], selector); + w[63] = hc_byte_perm_S (w[ 6], w[ 7], selector); + w[62] = hc_byte_perm_S (w[ 5], w[ 6], selector); + w[61] = hc_byte_perm_S (w[ 4], w[ 5], selector); + w[60] = hc_byte_perm_S (w[ 3], w[ 4], selector); + w[59] = hc_byte_perm_S (w[ 2], w[ 3], selector); + w[58] = hc_byte_perm_S (w[ 1], w[ 2], selector); + w[57] = hc_byte_perm_S (w[ 0], w[ 1], selector); + w[56] = hc_byte_perm_S ( 0, w[ 0], selector); w[55] = 0; w[54] = 0; w[53] = 0; @@ -50600,13 +50600,13 @@ DECLSPEC void switch_buffer_by_offset_1x64_le_S (u32 *w, const u32 offset) break; case 57: - w[63] = __byte_perm_S (w[ 5], w[ 6], selector); - w[62] = __byte_perm_S (w[ 4], w[ 5], selector); - w[61] = __byte_perm_S (w[ 3], w[ 4], selector); - w[60] = __byte_perm_S (w[ 2], w[ 3], selector); - w[59] = __byte_perm_S (w[ 1], w[ 2], selector); - w[58] = __byte_perm_S (w[ 0], w[ 1], selector); - w[57] = __byte_perm_S ( 0, w[ 0], selector); + w[63] = hc_byte_perm_S (w[ 5], w[ 6], selector); + w[62] = hc_byte_perm_S (w[ 4], w[ 5], selector); + w[61] = hc_byte_perm_S (w[ 3], w[ 4], selector); + w[60] = hc_byte_perm_S (w[ 2], w[ 3], selector); + w[59] = hc_byte_perm_S (w[ 1], w[ 2], selector); + w[58] = hc_byte_perm_S (w[ 0], w[ 1], selector); + w[57] = hc_byte_perm_S ( 0, w[ 0], selector); w[56] = 0; w[55] = 0; w[54] = 0; @@ -50668,12 +50668,12 @@ DECLSPEC void switch_buffer_by_offset_1x64_le_S (u32 *w, const u32 offset) break; case 58: - w[63] = __byte_perm_S (w[ 4], w[ 5], selector); - w[62] = __byte_perm_S (w[ 3], w[ 4], selector); - w[61] = __byte_perm_S (w[ 2], w[ 3], selector); - w[60] = __byte_perm_S (w[ 1], w[ 2], selector); - w[59] = __byte_perm_S (w[ 0], w[ 1], selector); - w[58] = __byte_perm_S ( 0, w[ 0], selector); + w[63] = hc_byte_perm_S (w[ 4], w[ 5], selector); + w[62] = hc_byte_perm_S (w[ 3], w[ 4], selector); + w[61] = hc_byte_perm_S (w[ 2], w[ 3], selector); + w[60] = hc_byte_perm_S (w[ 1], w[ 2], selector); + w[59] = hc_byte_perm_S (w[ 0], w[ 1], selector); + w[58] = hc_byte_perm_S ( 0, w[ 0], selector); w[57] = 0; w[56] = 0; w[55] = 0; @@ -50736,11 +50736,11 @@ DECLSPEC void switch_buffer_by_offset_1x64_le_S (u32 *w, const u32 offset) break; case 59: - w[63] = __byte_perm_S (w[ 3], w[ 4], selector); - w[62] = __byte_perm_S (w[ 2], w[ 3], selector); - w[61] = __byte_perm_S (w[ 1], w[ 2], selector); - w[60] = __byte_perm_S (w[ 0], w[ 1], selector); - w[59] = __byte_perm_S ( 0, w[ 0], selector); + w[63] = hc_byte_perm_S (w[ 3], w[ 4], selector); + w[62] = hc_byte_perm_S (w[ 2], w[ 3], selector); + w[61] = hc_byte_perm_S (w[ 1], w[ 2], selector); + w[60] = hc_byte_perm_S (w[ 0], w[ 1], selector); + w[59] = hc_byte_perm_S ( 0, w[ 0], selector); w[58] = 0; w[57] = 0; w[56] = 0; @@ -50804,10 +50804,10 @@ DECLSPEC void switch_buffer_by_offset_1x64_le_S (u32 *w, const u32 offset) break; case 60: - w[63] = __byte_perm_S (w[ 2], w[ 3], selector); - w[62] = __byte_perm_S (w[ 1], w[ 2], selector); - w[61] = __byte_perm_S (w[ 0], w[ 1], selector); - w[60] = __byte_perm_S ( 0, w[ 0], selector); + w[63] = hc_byte_perm_S (w[ 2], w[ 3], selector); + w[62] = hc_byte_perm_S (w[ 1], w[ 2], selector); + w[61] = hc_byte_perm_S (w[ 0], w[ 1], selector); + w[60] = hc_byte_perm_S ( 0, w[ 0], selector); w[59] = 0; w[58] = 0; w[57] = 0; @@ -50872,9 +50872,9 @@ DECLSPEC void switch_buffer_by_offset_1x64_le_S (u32 *w, const u32 offset) break; case 61: - w[63] = __byte_perm_S (w[ 1], w[ 2], selector); - w[62] = __byte_perm_S (w[ 0], w[ 1], selector); - w[61] = __byte_perm_S ( 0, w[ 0], selector); + w[63] = hc_byte_perm_S (w[ 1], w[ 2], selector); + w[62] = hc_byte_perm_S (w[ 0], w[ 1], selector); + w[61] = hc_byte_perm_S ( 0, w[ 0], selector); w[60] = 0; w[59] = 0; w[58] = 0; @@ -50940,8 +50940,8 @@ DECLSPEC void switch_buffer_by_offset_1x64_le_S (u32 *w, const u32 offset) break; case 62: - w[63] = __byte_perm_S (w[ 0], w[ 1], selector); - w[62] = __byte_perm_S ( 0, w[ 0], selector); + w[63] = hc_byte_perm_S (w[ 0], w[ 1], selector); + w[62] = hc_byte_perm_S ( 0, w[ 0], selector); w[61] = 0; w[60] = 0; w[59] = 0; @@ -51008,7 +51008,7 @@ DECLSPEC void switch_buffer_by_offset_1x64_le_S (u32 *w, const u32 offset) break; case 63: - w[63] = __byte_perm_S ( 0, w[ 0], selector); + w[63] = hc_byte_perm_S ( 0, w[ 0], selector); w[62] = 0; w[61] = 0; w[60] = 0; @@ -51086,271 +51086,271 @@ DECLSPEC void switch_buffer_by_offset_1x64_be_S (u32 *w, const u32 offset) switch (offset_switch) { case 0: - w[63] = amd_bytealign_S (w[62], w[63], offset); - w[62] = amd_bytealign_S (w[61], w[62], offset); - w[61] = amd_bytealign_S (w[60], w[61], offset); - w[60] = amd_bytealign_S (w[59], w[60], offset); - w[59] = amd_bytealign_S (w[58], w[59], offset); - w[58] = amd_bytealign_S (w[57], w[58], offset); - w[57] = amd_bytealign_S (w[56], w[57], offset); - w[56] = amd_bytealign_S (w[55], w[56], offset); - w[55] = amd_bytealign_S (w[54], w[55], offset); - w[54] = amd_bytealign_S (w[53], w[54], offset); - w[53] = amd_bytealign_S (w[52], w[53], offset); - w[52] = amd_bytealign_S (w[51], w[52], offset); - w[51] = amd_bytealign_S (w[50], w[51], offset); - w[50] = amd_bytealign_S (w[49], w[50], offset); - w[49] = amd_bytealign_S (w[48], w[49], offset); - w[48] = amd_bytealign_S (w[47], w[48], offset); - w[47] = amd_bytealign_S (w[46], w[47], offset); - w[46] = amd_bytealign_S (w[45], w[46], offset); - w[45] = amd_bytealign_S (w[44], w[45], offset); - w[44] = amd_bytealign_S (w[43], w[44], offset); - w[43] = amd_bytealign_S (w[42], w[43], offset); - w[42] = amd_bytealign_S (w[41], w[42], offset); - w[41] = amd_bytealign_S (w[40], w[41], offset); - w[40] = amd_bytealign_S (w[39], w[40], offset); - w[39] = amd_bytealign_S (w[38], w[39], offset); - w[38] = amd_bytealign_S (w[37], w[38], offset); - w[37] = amd_bytealign_S (w[36], w[37], offset); - w[36] = amd_bytealign_S (w[35], w[36], offset); - w[35] = amd_bytealign_S (w[34], w[35], offset); - w[34] = amd_bytealign_S (w[33], w[34], offset); - w[33] = amd_bytealign_S (w[32], w[33], offset); - w[32] = amd_bytealign_S (w[31], w[32], offset); - w[31] = amd_bytealign_S (w[30], w[31], offset); - w[30] = amd_bytealign_S (w[29], w[30], offset); - w[29] = amd_bytealign_S (w[28], w[29], offset); - w[28] = amd_bytealign_S (w[27], w[28], offset); - w[27] = amd_bytealign_S (w[26], w[27], offset); - w[26] = amd_bytealign_S (w[25], w[26], offset); - w[25] = amd_bytealign_S (w[24], w[25], offset); - w[24] = amd_bytealign_S (w[23], w[24], offset); - w[23] = amd_bytealign_S (w[22], w[23], offset); - w[22] = amd_bytealign_S (w[21], w[22], offset); - w[21] = amd_bytealign_S (w[20], w[21], offset); - w[20] = amd_bytealign_S (w[19], w[20], offset); - w[19] = amd_bytealign_S (w[18], w[19], offset); - w[18] = amd_bytealign_S (w[17], w[18], offset); - w[17] = amd_bytealign_S (w[16], w[17], offset); - w[16] = amd_bytealign_S (w[15], w[16], offset); - w[15] = amd_bytealign_S (w[14], w[15], offset); - w[14] = amd_bytealign_S (w[13], w[14], offset); - w[13] = amd_bytealign_S (w[12], w[13], offset); - w[12] = amd_bytealign_S (w[11], w[12], offset); - w[11] = amd_bytealign_S (w[10], w[11], offset); - w[10] = amd_bytealign_S (w[ 9], w[10], offset); - w[ 9] = amd_bytealign_S (w[ 8], w[ 9], offset); - w[ 8] = amd_bytealign_S (w[ 7], w[ 8], offset); - w[ 7] = amd_bytealign_S (w[ 6], w[ 7], offset); - w[ 6] = amd_bytealign_S (w[ 5], w[ 6], offset); - w[ 5] = amd_bytealign_S (w[ 4], w[ 5], offset); - w[ 4] = amd_bytealign_S (w[ 3], w[ 4], offset); - w[ 3] = amd_bytealign_S (w[ 2], w[ 3], offset); - w[ 2] = amd_bytealign_S (w[ 1], w[ 2], offset); - w[ 1] = amd_bytealign_S (w[ 0], w[ 1], offset); - w[ 0] = amd_bytealign_S ( 0, w[ 0], offset); + w[63] = hc_bytealign_S (w[62], w[63], offset); + w[62] = hc_bytealign_S (w[61], w[62], offset); + w[61] = hc_bytealign_S (w[60], w[61], offset); + w[60] = hc_bytealign_S (w[59], w[60], offset); + w[59] = hc_bytealign_S (w[58], w[59], offset); + w[58] = hc_bytealign_S (w[57], w[58], offset); + w[57] = hc_bytealign_S (w[56], w[57], offset); + w[56] = hc_bytealign_S (w[55], w[56], offset); + w[55] = hc_bytealign_S (w[54], w[55], offset); + w[54] = hc_bytealign_S (w[53], w[54], offset); + w[53] = hc_bytealign_S (w[52], w[53], offset); + w[52] = hc_bytealign_S (w[51], w[52], offset); + w[51] = hc_bytealign_S (w[50], w[51], offset); + w[50] = hc_bytealign_S (w[49], w[50], offset); + w[49] = hc_bytealign_S (w[48], w[49], offset); + w[48] = hc_bytealign_S (w[47], w[48], offset); + w[47] = hc_bytealign_S (w[46], w[47], offset); + w[46] = hc_bytealign_S (w[45], w[46], offset); + w[45] = hc_bytealign_S (w[44], w[45], offset); + w[44] = hc_bytealign_S (w[43], w[44], offset); + w[43] = hc_bytealign_S (w[42], w[43], offset); + w[42] = hc_bytealign_S (w[41], w[42], offset); + w[41] = hc_bytealign_S (w[40], w[41], offset); + w[40] = hc_bytealign_S (w[39], w[40], offset); + w[39] = hc_bytealign_S (w[38], w[39], offset); + w[38] = hc_bytealign_S (w[37], w[38], offset); + w[37] = hc_bytealign_S (w[36], w[37], offset); + w[36] = hc_bytealign_S (w[35], w[36], offset); + w[35] = hc_bytealign_S (w[34], w[35], offset); + w[34] = hc_bytealign_S (w[33], w[34], offset); + w[33] = hc_bytealign_S (w[32], w[33], offset); + w[32] = hc_bytealign_S (w[31], w[32], offset); + w[31] = hc_bytealign_S (w[30], w[31], offset); + w[30] = hc_bytealign_S (w[29], w[30], offset); + w[29] = hc_bytealign_S (w[28], w[29], offset); + w[28] = hc_bytealign_S (w[27], w[28], offset); + w[27] = hc_bytealign_S (w[26], w[27], offset); + w[26] = hc_bytealign_S (w[25], w[26], offset); + w[25] = hc_bytealign_S (w[24], w[25], offset); + w[24] = hc_bytealign_S (w[23], w[24], offset); + w[23] = hc_bytealign_S (w[22], w[23], offset); + w[22] = hc_bytealign_S (w[21], w[22], offset); + w[21] = hc_bytealign_S (w[20], w[21], offset); + w[20] = hc_bytealign_S (w[19], w[20], offset); + w[19] = hc_bytealign_S (w[18], w[19], offset); + w[18] = hc_bytealign_S (w[17], w[18], offset); + w[17] = hc_bytealign_S (w[16], w[17], offset); + w[16] = hc_bytealign_S (w[15], w[16], offset); + w[15] = hc_bytealign_S (w[14], w[15], offset); + w[14] = hc_bytealign_S (w[13], w[14], offset); + w[13] = hc_bytealign_S (w[12], w[13], offset); + w[12] = hc_bytealign_S (w[11], w[12], offset); + w[11] = hc_bytealign_S (w[10], w[11], offset); + w[10] = hc_bytealign_S (w[ 9], w[10], offset); + w[ 9] = hc_bytealign_S (w[ 8], w[ 9], offset); + w[ 8] = hc_bytealign_S (w[ 7], w[ 8], offset); + w[ 7] = hc_bytealign_S (w[ 6], w[ 7], offset); + w[ 6] = hc_bytealign_S (w[ 5], w[ 6], offset); + w[ 5] = hc_bytealign_S (w[ 4], w[ 5], offset); + w[ 4] = hc_bytealign_S (w[ 3], w[ 4], offset); + w[ 3] = hc_bytealign_S (w[ 2], w[ 3], offset); + w[ 2] = hc_bytealign_S (w[ 1], w[ 2], offset); + w[ 1] = hc_bytealign_S (w[ 0], w[ 1], offset); + w[ 0] = hc_bytealign_S ( 0, w[ 0], offset); break; case 1: - w[63] = amd_bytealign_S (w[61], w[62], offset); - w[62] = amd_bytealign_S (w[60], w[61], offset); - w[61] = amd_bytealign_S (w[59], w[60], offset); - w[60] = amd_bytealign_S (w[58], w[59], offset); - w[59] = amd_bytealign_S (w[57], w[58], offset); - w[58] = amd_bytealign_S (w[56], w[57], offset); - w[57] = amd_bytealign_S (w[55], w[56], offset); - w[56] = amd_bytealign_S (w[54], w[55], offset); - w[55] = amd_bytealign_S (w[53], w[54], offset); - w[54] = amd_bytealign_S (w[52], w[53], offset); - w[53] = amd_bytealign_S (w[51], w[52], offset); - w[52] = amd_bytealign_S (w[50], w[51], offset); - w[51] = amd_bytealign_S (w[49], w[50], offset); - w[50] = amd_bytealign_S (w[48], w[49], offset); - w[49] = amd_bytealign_S (w[47], w[48], offset); - w[48] = amd_bytealign_S (w[46], w[47], offset); - w[47] = amd_bytealign_S (w[45], w[46], offset); - w[46] = amd_bytealign_S (w[44], w[45], offset); - w[45] = amd_bytealign_S (w[43], w[44], offset); - w[44] = amd_bytealign_S (w[42], w[43], offset); - w[43] = amd_bytealign_S (w[41], w[42], offset); - w[42] = amd_bytealign_S (w[40], w[41], offset); - w[41] = amd_bytealign_S (w[39], w[40], offset); - w[40] = amd_bytealign_S (w[38], w[39], offset); - w[39] = amd_bytealign_S (w[37], w[38], offset); - w[38] = amd_bytealign_S (w[36], w[37], offset); - w[37] = amd_bytealign_S (w[35], w[36], offset); - w[36] = amd_bytealign_S (w[34], w[35], offset); - w[35] = amd_bytealign_S (w[33], w[34], offset); - w[34] = amd_bytealign_S (w[32], w[33], offset); - w[33] = amd_bytealign_S (w[31], w[32], offset); - w[32] = amd_bytealign_S (w[30], w[31], offset); - w[31] = amd_bytealign_S (w[29], w[30], offset); - w[30] = amd_bytealign_S (w[28], w[29], offset); - w[29] = amd_bytealign_S (w[27], w[28], offset); - w[28] = amd_bytealign_S (w[26], w[27], offset); - w[27] = amd_bytealign_S (w[25], w[26], offset); - w[26] = amd_bytealign_S (w[24], w[25], offset); - w[25] = amd_bytealign_S (w[23], w[24], offset); - w[24] = amd_bytealign_S (w[22], w[23], offset); - w[23] = amd_bytealign_S (w[21], w[22], offset); - w[22] = amd_bytealign_S (w[20], w[21], offset); - w[21] = amd_bytealign_S (w[19], w[20], offset); - w[20] = amd_bytealign_S (w[18], w[19], offset); - w[19] = amd_bytealign_S (w[17], w[18], offset); - w[18] = amd_bytealign_S (w[16], w[17], offset); - w[17] = amd_bytealign_S (w[15], w[16], offset); - w[16] = amd_bytealign_S (w[14], w[15], offset); - w[15] = amd_bytealign_S (w[13], w[14], offset); - w[14] = amd_bytealign_S (w[12], w[13], offset); - w[13] = amd_bytealign_S (w[11], w[12], offset); - w[12] = amd_bytealign_S (w[10], w[11], offset); - w[11] = amd_bytealign_S (w[ 9], w[10], offset); - w[10] = amd_bytealign_S (w[ 8], w[ 9], offset); - w[ 9] = amd_bytealign_S (w[ 7], w[ 8], offset); - w[ 8] = amd_bytealign_S (w[ 6], w[ 7], offset); - w[ 7] = amd_bytealign_S (w[ 5], w[ 6], offset); - w[ 6] = amd_bytealign_S (w[ 4], w[ 5], offset); - w[ 5] = amd_bytealign_S (w[ 3], w[ 4], offset); - w[ 4] = amd_bytealign_S (w[ 2], w[ 3], offset); - w[ 3] = amd_bytealign_S (w[ 1], w[ 2], offset); - w[ 2] = amd_bytealign_S (w[ 0], w[ 1], offset); - w[ 1] = amd_bytealign_S ( 0, w[ 0], offset); + w[63] = hc_bytealign_S (w[61], w[62], offset); + w[62] = hc_bytealign_S (w[60], w[61], offset); + w[61] = hc_bytealign_S (w[59], w[60], offset); + w[60] = hc_bytealign_S (w[58], w[59], offset); + w[59] = hc_bytealign_S (w[57], w[58], offset); + w[58] = hc_bytealign_S (w[56], w[57], offset); + w[57] = hc_bytealign_S (w[55], w[56], offset); + w[56] = hc_bytealign_S (w[54], w[55], offset); + w[55] = hc_bytealign_S (w[53], w[54], offset); + w[54] = hc_bytealign_S (w[52], w[53], offset); + w[53] = hc_bytealign_S (w[51], w[52], offset); + w[52] = hc_bytealign_S (w[50], w[51], offset); + w[51] = hc_bytealign_S (w[49], w[50], offset); + w[50] = hc_bytealign_S (w[48], w[49], offset); + w[49] = hc_bytealign_S (w[47], w[48], offset); + w[48] = hc_bytealign_S (w[46], w[47], offset); + w[47] = hc_bytealign_S (w[45], w[46], offset); + w[46] = hc_bytealign_S (w[44], w[45], offset); + w[45] = hc_bytealign_S (w[43], w[44], offset); + w[44] = hc_bytealign_S (w[42], w[43], offset); + w[43] = hc_bytealign_S (w[41], w[42], offset); + w[42] = hc_bytealign_S (w[40], w[41], offset); + w[41] = hc_bytealign_S (w[39], w[40], offset); + w[40] = hc_bytealign_S (w[38], w[39], offset); + w[39] = hc_bytealign_S (w[37], w[38], offset); + w[38] = hc_bytealign_S (w[36], w[37], offset); + w[37] = hc_bytealign_S (w[35], w[36], offset); + w[36] = hc_bytealign_S (w[34], w[35], offset); + w[35] = hc_bytealign_S (w[33], w[34], offset); + w[34] = hc_bytealign_S (w[32], w[33], offset); + w[33] = hc_bytealign_S (w[31], w[32], offset); + w[32] = hc_bytealign_S (w[30], w[31], offset); + w[31] = hc_bytealign_S (w[29], w[30], offset); + w[30] = hc_bytealign_S (w[28], w[29], offset); + w[29] = hc_bytealign_S (w[27], w[28], offset); + w[28] = hc_bytealign_S (w[26], w[27], offset); + w[27] = hc_bytealign_S (w[25], w[26], offset); + w[26] = hc_bytealign_S (w[24], w[25], offset); + w[25] = hc_bytealign_S (w[23], w[24], offset); + w[24] = hc_bytealign_S (w[22], w[23], offset); + w[23] = hc_bytealign_S (w[21], w[22], offset); + w[22] = hc_bytealign_S (w[20], w[21], offset); + w[21] = hc_bytealign_S (w[19], w[20], offset); + w[20] = hc_bytealign_S (w[18], w[19], offset); + w[19] = hc_bytealign_S (w[17], w[18], offset); + w[18] = hc_bytealign_S (w[16], w[17], offset); + w[17] = hc_bytealign_S (w[15], w[16], offset); + w[16] = hc_bytealign_S (w[14], w[15], offset); + w[15] = hc_bytealign_S (w[13], w[14], offset); + w[14] = hc_bytealign_S (w[12], w[13], offset); + w[13] = hc_bytealign_S (w[11], w[12], offset); + w[12] = hc_bytealign_S (w[10], w[11], offset); + w[11] = hc_bytealign_S (w[ 9], w[10], offset); + w[10] = hc_bytealign_S (w[ 8], w[ 9], offset); + w[ 9] = hc_bytealign_S (w[ 7], w[ 8], offset); + w[ 8] = hc_bytealign_S (w[ 6], w[ 7], offset); + w[ 7] = hc_bytealign_S (w[ 5], w[ 6], offset); + w[ 6] = hc_bytealign_S (w[ 4], w[ 5], offset); + w[ 5] = hc_bytealign_S (w[ 3], w[ 4], offset); + w[ 4] = hc_bytealign_S (w[ 2], w[ 3], offset); + w[ 3] = hc_bytealign_S (w[ 1], w[ 2], offset); + w[ 2] = hc_bytealign_S (w[ 0], w[ 1], offset); + w[ 1] = hc_bytealign_S ( 0, w[ 0], offset); w[ 0] = 0; break; case 2: - w[63] = amd_bytealign_S (w[60], w[61], offset); - w[62] = amd_bytealign_S (w[59], w[60], offset); - w[61] = amd_bytealign_S (w[58], w[59], offset); - w[60] = amd_bytealign_S (w[57], w[58], offset); - w[59] = amd_bytealign_S (w[56], w[57], offset); - w[58] = amd_bytealign_S (w[55], w[56], offset); - w[57] = amd_bytealign_S (w[54], w[55], offset); - w[56] = amd_bytealign_S (w[53], w[54], offset); - w[55] = amd_bytealign_S (w[52], w[53], offset); - w[54] = amd_bytealign_S (w[51], w[52], offset); - w[53] = amd_bytealign_S (w[50], w[51], offset); - w[52] = amd_bytealign_S (w[49], w[50], offset); - w[51] = amd_bytealign_S (w[48], w[49], offset); - w[50] = amd_bytealign_S (w[47], w[48], offset); - w[49] = amd_bytealign_S (w[46], w[47], offset); - w[48] = amd_bytealign_S (w[45], w[46], offset); - w[47] = amd_bytealign_S (w[44], w[45], offset); - w[46] = amd_bytealign_S (w[43], w[44], offset); - w[45] = amd_bytealign_S (w[42], w[43], offset); - w[44] = amd_bytealign_S (w[41], w[42], offset); - w[43] = amd_bytealign_S (w[40], w[41], offset); - w[42] = amd_bytealign_S (w[39], w[40], offset); - w[41] = amd_bytealign_S (w[38], w[39], offset); - w[40] = amd_bytealign_S (w[37], w[38], offset); - w[39] = amd_bytealign_S (w[36], w[37], offset); - w[38] = amd_bytealign_S (w[35], w[36], offset); - w[37] = amd_bytealign_S (w[34], w[35], offset); - w[36] = amd_bytealign_S (w[33], w[34], offset); - w[35] = amd_bytealign_S (w[32], w[33], offset); - w[34] = amd_bytealign_S (w[31], w[32], offset); - w[33] = amd_bytealign_S (w[30], w[31], offset); - w[32] = amd_bytealign_S (w[29], w[30], offset); - w[31] = amd_bytealign_S (w[28], w[29], offset); - w[30] = amd_bytealign_S (w[27], w[28], offset); - w[29] = amd_bytealign_S (w[26], w[27], offset); - w[28] = amd_bytealign_S (w[25], w[26], offset); - w[27] = amd_bytealign_S (w[24], w[25], offset); - w[26] = amd_bytealign_S (w[23], w[24], offset); - w[25] = amd_bytealign_S (w[22], w[23], offset); - w[24] = amd_bytealign_S (w[21], w[22], offset); - w[23] = amd_bytealign_S (w[20], w[21], offset); - w[22] = amd_bytealign_S (w[19], w[20], offset); - w[21] = amd_bytealign_S (w[18], w[19], offset); - w[20] = amd_bytealign_S (w[17], w[18], offset); - w[19] = amd_bytealign_S (w[16], w[17], offset); - w[18] = amd_bytealign_S (w[15], w[16], offset); - w[17] = amd_bytealign_S (w[14], w[15], offset); - w[16] = amd_bytealign_S (w[13], w[14], offset); - w[15] = amd_bytealign_S (w[12], w[13], offset); - w[14] = amd_bytealign_S (w[11], w[12], offset); - w[13] = amd_bytealign_S (w[10], w[11], offset); - w[12] = amd_bytealign_S (w[ 9], w[10], offset); - w[11] = amd_bytealign_S (w[ 8], w[ 9], offset); - w[10] = amd_bytealign_S (w[ 7], w[ 8], offset); - w[ 9] = amd_bytealign_S (w[ 6], w[ 7], offset); - w[ 8] = amd_bytealign_S (w[ 5], w[ 6], offset); - w[ 7] = amd_bytealign_S (w[ 4], w[ 5], offset); - w[ 6] = amd_bytealign_S (w[ 3], w[ 4], offset); - w[ 5] = amd_bytealign_S (w[ 2], w[ 3], offset); - w[ 4] = amd_bytealign_S (w[ 1], w[ 2], offset); - w[ 3] = amd_bytealign_S (w[ 0], w[ 1], offset); - w[ 2] = amd_bytealign_S ( 0, w[ 0], offset); + w[63] = hc_bytealign_S (w[60], w[61], offset); + w[62] = hc_bytealign_S (w[59], w[60], offset); + w[61] = hc_bytealign_S (w[58], w[59], offset); + w[60] = hc_bytealign_S (w[57], w[58], offset); + w[59] = hc_bytealign_S (w[56], w[57], offset); + w[58] = hc_bytealign_S (w[55], w[56], offset); + w[57] = hc_bytealign_S (w[54], w[55], offset); + w[56] = hc_bytealign_S (w[53], w[54], offset); + w[55] = hc_bytealign_S (w[52], w[53], offset); + w[54] = hc_bytealign_S (w[51], w[52], offset); + w[53] = hc_bytealign_S (w[50], w[51], offset); + w[52] = hc_bytealign_S (w[49], w[50], offset); + w[51] = hc_bytealign_S (w[48], w[49], offset); + w[50] = hc_bytealign_S (w[47], w[48], offset); + w[49] = hc_bytealign_S (w[46], w[47], offset); + w[48] = hc_bytealign_S (w[45], w[46], offset); + w[47] = hc_bytealign_S (w[44], w[45], offset); + w[46] = hc_bytealign_S (w[43], w[44], offset); + w[45] = hc_bytealign_S (w[42], w[43], offset); + w[44] = hc_bytealign_S (w[41], w[42], offset); + w[43] = hc_bytealign_S (w[40], w[41], offset); + w[42] = hc_bytealign_S (w[39], w[40], offset); + w[41] = hc_bytealign_S (w[38], w[39], offset); + w[40] = hc_bytealign_S (w[37], w[38], offset); + w[39] = hc_bytealign_S (w[36], w[37], offset); + w[38] = hc_bytealign_S (w[35], w[36], offset); + w[37] = hc_bytealign_S (w[34], w[35], offset); + w[36] = hc_bytealign_S (w[33], w[34], offset); + w[35] = hc_bytealign_S (w[32], w[33], offset); + w[34] = hc_bytealign_S (w[31], w[32], offset); + w[33] = hc_bytealign_S (w[30], w[31], offset); + w[32] = hc_bytealign_S (w[29], w[30], offset); + w[31] = hc_bytealign_S (w[28], w[29], offset); + w[30] = hc_bytealign_S (w[27], w[28], offset); + w[29] = hc_bytealign_S (w[26], w[27], offset); + w[28] = hc_bytealign_S (w[25], w[26], offset); + w[27] = hc_bytealign_S (w[24], w[25], offset); + w[26] = hc_bytealign_S (w[23], w[24], offset); + w[25] = hc_bytealign_S (w[22], w[23], offset); + w[24] = hc_bytealign_S (w[21], w[22], offset); + w[23] = hc_bytealign_S (w[20], w[21], offset); + w[22] = hc_bytealign_S (w[19], w[20], offset); + w[21] = hc_bytealign_S (w[18], w[19], offset); + w[20] = hc_bytealign_S (w[17], w[18], offset); + w[19] = hc_bytealign_S (w[16], w[17], offset); + w[18] = hc_bytealign_S (w[15], w[16], offset); + w[17] = hc_bytealign_S (w[14], w[15], offset); + w[16] = hc_bytealign_S (w[13], w[14], offset); + w[15] = hc_bytealign_S (w[12], w[13], offset); + w[14] = hc_bytealign_S (w[11], w[12], offset); + w[13] = hc_bytealign_S (w[10], w[11], offset); + w[12] = hc_bytealign_S (w[ 9], w[10], offset); + w[11] = hc_bytealign_S (w[ 8], w[ 9], offset); + w[10] = hc_bytealign_S (w[ 7], w[ 8], offset); + w[ 9] = hc_bytealign_S (w[ 6], w[ 7], offset); + w[ 8] = hc_bytealign_S (w[ 5], w[ 6], offset); + w[ 7] = hc_bytealign_S (w[ 4], w[ 5], offset); + w[ 6] = hc_bytealign_S (w[ 3], w[ 4], offset); + w[ 5] = hc_bytealign_S (w[ 2], w[ 3], offset); + w[ 4] = hc_bytealign_S (w[ 1], w[ 2], offset); + w[ 3] = hc_bytealign_S (w[ 0], w[ 1], offset); + w[ 2] = hc_bytealign_S ( 0, w[ 0], offset); w[ 1] = 0; w[ 0] = 0; break; case 3: - w[63] = amd_bytealign_S (w[59], w[60], offset); - w[62] = amd_bytealign_S (w[58], w[59], offset); - w[61] = amd_bytealign_S (w[57], w[58], offset); - w[60] = amd_bytealign_S (w[56], w[57], offset); - w[59] = amd_bytealign_S (w[55], w[56], offset); - w[58] = amd_bytealign_S (w[54], w[55], offset); - w[57] = amd_bytealign_S (w[53], w[54], offset); - w[56] = amd_bytealign_S (w[52], w[53], offset); - w[55] = amd_bytealign_S (w[51], w[52], offset); - w[54] = amd_bytealign_S (w[50], w[51], offset); - w[53] = amd_bytealign_S (w[49], w[50], offset); - w[52] = amd_bytealign_S (w[48], w[49], offset); - w[51] = amd_bytealign_S (w[47], w[48], offset); - w[50] = amd_bytealign_S (w[46], w[47], offset); - w[49] = amd_bytealign_S (w[45], w[46], offset); - w[48] = amd_bytealign_S (w[44], w[45], offset); - w[47] = amd_bytealign_S (w[43], w[44], offset); - w[46] = amd_bytealign_S (w[42], w[43], offset); - w[45] = amd_bytealign_S (w[41], w[42], offset); - w[44] = amd_bytealign_S (w[40], w[41], offset); - w[43] = amd_bytealign_S (w[39], w[40], offset); - w[42] = amd_bytealign_S (w[38], w[39], offset); - w[41] = amd_bytealign_S (w[37], w[38], offset); - w[40] = amd_bytealign_S (w[36], w[37], offset); - w[39] = amd_bytealign_S (w[35], w[36], offset); - w[38] = amd_bytealign_S (w[34], w[35], offset); - w[37] = amd_bytealign_S (w[33], w[34], offset); - w[36] = amd_bytealign_S (w[32], w[33], offset); - w[35] = amd_bytealign_S (w[31], w[32], offset); - w[34] = amd_bytealign_S (w[30], w[31], offset); - w[33] = amd_bytealign_S (w[29], w[30], offset); - w[32] = amd_bytealign_S (w[28], w[29], offset); - w[31] = amd_bytealign_S (w[27], w[28], offset); - w[30] = amd_bytealign_S (w[26], w[27], offset); - w[29] = amd_bytealign_S (w[25], w[26], offset); - w[28] = amd_bytealign_S (w[24], w[25], offset); - w[27] = amd_bytealign_S (w[23], w[24], offset); - w[26] = amd_bytealign_S (w[22], w[23], offset); - w[25] = amd_bytealign_S (w[21], w[22], offset); - w[24] = amd_bytealign_S (w[20], w[21], offset); - w[23] = amd_bytealign_S (w[19], w[20], offset); - w[22] = amd_bytealign_S (w[18], w[19], offset); - w[21] = amd_bytealign_S (w[17], w[18], offset); - w[20] = amd_bytealign_S (w[16], w[17], offset); - w[19] = amd_bytealign_S (w[15], w[16], offset); - w[18] = amd_bytealign_S (w[14], w[15], offset); - w[17] = amd_bytealign_S (w[13], w[14], offset); - w[16] = amd_bytealign_S (w[12], w[13], offset); - w[15] = amd_bytealign_S (w[11], w[12], offset); - w[14] = amd_bytealign_S (w[10], w[11], offset); - w[13] = amd_bytealign_S (w[ 9], w[10], offset); - w[12] = amd_bytealign_S (w[ 8], w[ 9], offset); - w[11] = amd_bytealign_S (w[ 7], w[ 8], offset); - w[10] = amd_bytealign_S (w[ 6], w[ 7], offset); - w[ 9] = amd_bytealign_S (w[ 5], w[ 6], offset); - w[ 8] = amd_bytealign_S (w[ 4], w[ 5], offset); - w[ 7] = amd_bytealign_S (w[ 3], w[ 4], offset); - w[ 6] = amd_bytealign_S (w[ 2], w[ 3], offset); - w[ 5] = amd_bytealign_S (w[ 1], w[ 2], offset); - w[ 4] = amd_bytealign_S (w[ 0], w[ 1], offset); - w[ 3] = amd_bytealign_S ( 0, w[ 0], offset); + w[63] = hc_bytealign_S (w[59], w[60], offset); + w[62] = hc_bytealign_S (w[58], w[59], offset); + w[61] = hc_bytealign_S (w[57], w[58], offset); + w[60] = hc_bytealign_S (w[56], w[57], offset); + w[59] = hc_bytealign_S (w[55], w[56], offset); + w[58] = hc_bytealign_S (w[54], w[55], offset); + w[57] = hc_bytealign_S (w[53], w[54], offset); + w[56] = hc_bytealign_S (w[52], w[53], offset); + w[55] = hc_bytealign_S (w[51], w[52], offset); + w[54] = hc_bytealign_S (w[50], w[51], offset); + w[53] = hc_bytealign_S (w[49], w[50], offset); + w[52] = hc_bytealign_S (w[48], w[49], offset); + w[51] = hc_bytealign_S (w[47], w[48], offset); + w[50] = hc_bytealign_S (w[46], w[47], offset); + w[49] = hc_bytealign_S (w[45], w[46], offset); + w[48] = hc_bytealign_S (w[44], w[45], offset); + w[47] = hc_bytealign_S (w[43], w[44], offset); + w[46] = hc_bytealign_S (w[42], w[43], offset); + w[45] = hc_bytealign_S (w[41], w[42], offset); + w[44] = hc_bytealign_S (w[40], w[41], offset); + w[43] = hc_bytealign_S (w[39], w[40], offset); + w[42] = hc_bytealign_S (w[38], w[39], offset); + w[41] = hc_bytealign_S (w[37], w[38], offset); + w[40] = hc_bytealign_S (w[36], w[37], offset); + w[39] = hc_bytealign_S (w[35], w[36], offset); + w[38] = hc_bytealign_S (w[34], w[35], offset); + w[37] = hc_bytealign_S (w[33], w[34], offset); + w[36] = hc_bytealign_S (w[32], w[33], offset); + w[35] = hc_bytealign_S (w[31], w[32], offset); + w[34] = hc_bytealign_S (w[30], w[31], offset); + w[33] = hc_bytealign_S (w[29], w[30], offset); + w[32] = hc_bytealign_S (w[28], w[29], offset); + w[31] = hc_bytealign_S (w[27], w[28], offset); + w[30] = hc_bytealign_S (w[26], w[27], offset); + w[29] = hc_bytealign_S (w[25], w[26], offset); + w[28] = hc_bytealign_S (w[24], w[25], offset); + w[27] = hc_bytealign_S (w[23], w[24], offset); + w[26] = hc_bytealign_S (w[22], w[23], offset); + w[25] = hc_bytealign_S (w[21], w[22], offset); + w[24] = hc_bytealign_S (w[20], w[21], offset); + w[23] = hc_bytealign_S (w[19], w[20], offset); + w[22] = hc_bytealign_S (w[18], w[19], offset); + w[21] = hc_bytealign_S (w[17], w[18], offset); + w[20] = hc_bytealign_S (w[16], w[17], offset); + w[19] = hc_bytealign_S (w[15], w[16], offset); + w[18] = hc_bytealign_S (w[14], w[15], offset); + w[17] = hc_bytealign_S (w[13], w[14], offset); + w[16] = hc_bytealign_S (w[12], w[13], offset); + w[15] = hc_bytealign_S (w[11], w[12], offset); + w[14] = hc_bytealign_S (w[10], w[11], offset); + w[13] = hc_bytealign_S (w[ 9], w[10], offset); + w[12] = hc_bytealign_S (w[ 8], w[ 9], offset); + w[11] = hc_bytealign_S (w[ 7], w[ 8], offset); + w[10] = hc_bytealign_S (w[ 6], w[ 7], offset); + w[ 9] = hc_bytealign_S (w[ 5], w[ 6], offset); + w[ 8] = hc_bytealign_S (w[ 4], w[ 5], offset); + w[ 7] = hc_bytealign_S (w[ 3], w[ 4], offset); + w[ 6] = hc_bytealign_S (w[ 2], w[ 3], offset); + w[ 5] = hc_bytealign_S (w[ 1], w[ 2], offset); + w[ 4] = hc_bytealign_S (w[ 0], w[ 1], offset); + w[ 3] = hc_bytealign_S ( 0, w[ 0], offset); w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; @@ -51358,66 +51358,66 @@ DECLSPEC void switch_buffer_by_offset_1x64_be_S (u32 *w, const u32 offset) break; case 4: - w[63] = amd_bytealign_S (w[58], w[59], offset); - w[62] = amd_bytealign_S (w[57], w[58], offset); - w[61] = amd_bytealign_S (w[56], w[57], offset); - w[60] = amd_bytealign_S (w[55], w[56], offset); - w[59] = amd_bytealign_S (w[54], w[55], offset); - w[58] = amd_bytealign_S (w[53], w[54], offset); - w[57] = amd_bytealign_S (w[52], w[53], offset); - w[56] = amd_bytealign_S (w[51], w[52], offset); - w[55] = amd_bytealign_S (w[50], w[51], offset); - w[54] = amd_bytealign_S (w[49], w[50], offset); - w[53] = amd_bytealign_S (w[48], w[49], offset); - w[52] = amd_bytealign_S (w[47], w[48], offset); - w[51] = amd_bytealign_S (w[46], w[47], offset); - w[50] = amd_bytealign_S (w[45], w[46], offset); - w[49] = amd_bytealign_S (w[44], w[45], offset); - w[48] = amd_bytealign_S (w[43], w[44], offset); - w[47] = amd_bytealign_S (w[42], w[43], offset); - w[46] = amd_bytealign_S (w[41], w[42], offset); - w[45] = amd_bytealign_S (w[40], w[41], offset); - w[44] = amd_bytealign_S (w[39], w[40], offset); - w[43] = amd_bytealign_S (w[38], w[39], offset); - w[42] = amd_bytealign_S (w[37], w[38], offset); - w[41] = amd_bytealign_S (w[36], w[37], offset); - w[40] = amd_bytealign_S (w[35], w[36], offset); - w[39] = amd_bytealign_S (w[34], w[35], offset); - w[38] = amd_bytealign_S (w[33], w[34], offset); - w[37] = amd_bytealign_S (w[32], w[33], offset); - w[36] = amd_bytealign_S (w[31], w[32], offset); - w[35] = amd_bytealign_S (w[30], w[31], offset); - w[34] = amd_bytealign_S (w[29], w[30], offset); - w[33] = amd_bytealign_S (w[28], w[29], offset); - w[32] = amd_bytealign_S (w[27], w[28], offset); - w[31] = amd_bytealign_S (w[26], w[27], offset); - w[30] = amd_bytealign_S (w[25], w[26], offset); - w[29] = amd_bytealign_S (w[24], w[25], offset); - w[28] = amd_bytealign_S (w[23], w[24], offset); - w[27] = amd_bytealign_S (w[22], w[23], offset); - w[26] = amd_bytealign_S (w[21], w[22], offset); - w[25] = amd_bytealign_S (w[20], w[21], offset); - w[24] = amd_bytealign_S (w[19], w[20], offset); - w[23] = amd_bytealign_S (w[18], w[19], offset); - w[22] = amd_bytealign_S (w[17], w[18], offset); - w[21] = amd_bytealign_S (w[16], w[17], offset); - w[20] = amd_bytealign_S (w[15], w[16], offset); - w[19] = amd_bytealign_S (w[14], w[15], offset); - w[18] = amd_bytealign_S (w[13], w[14], offset); - w[17] = amd_bytealign_S (w[12], w[13], offset); - w[16] = amd_bytealign_S (w[11], w[12], offset); - w[15] = amd_bytealign_S (w[10], w[11], offset); - w[14] = amd_bytealign_S (w[ 9], w[10], offset); - w[13] = amd_bytealign_S (w[ 8], w[ 9], offset); - w[12] = amd_bytealign_S (w[ 7], w[ 8], offset); - w[11] = amd_bytealign_S (w[ 6], w[ 7], offset); - w[10] = amd_bytealign_S (w[ 5], w[ 6], offset); - w[ 9] = amd_bytealign_S (w[ 4], w[ 5], offset); - w[ 8] = amd_bytealign_S (w[ 3], w[ 4], offset); - w[ 7] = amd_bytealign_S (w[ 2], w[ 3], offset); - w[ 6] = amd_bytealign_S (w[ 1], w[ 2], offset); - w[ 5] = amd_bytealign_S (w[ 0], w[ 1], offset); - w[ 4] = amd_bytealign_S ( 0, w[ 0], offset); + w[63] = hc_bytealign_S (w[58], w[59], offset); + w[62] = hc_bytealign_S (w[57], w[58], offset); + w[61] = hc_bytealign_S (w[56], w[57], offset); + w[60] = hc_bytealign_S (w[55], w[56], offset); + w[59] = hc_bytealign_S (w[54], w[55], offset); + w[58] = hc_bytealign_S (w[53], w[54], offset); + w[57] = hc_bytealign_S (w[52], w[53], offset); + w[56] = hc_bytealign_S (w[51], w[52], offset); + w[55] = hc_bytealign_S (w[50], w[51], offset); + w[54] = hc_bytealign_S (w[49], w[50], offset); + w[53] = hc_bytealign_S (w[48], w[49], offset); + w[52] = hc_bytealign_S (w[47], w[48], offset); + w[51] = hc_bytealign_S (w[46], w[47], offset); + w[50] = hc_bytealign_S (w[45], w[46], offset); + w[49] = hc_bytealign_S (w[44], w[45], offset); + w[48] = hc_bytealign_S (w[43], w[44], offset); + w[47] = hc_bytealign_S (w[42], w[43], offset); + w[46] = hc_bytealign_S (w[41], w[42], offset); + w[45] = hc_bytealign_S (w[40], w[41], offset); + w[44] = hc_bytealign_S (w[39], w[40], offset); + w[43] = hc_bytealign_S (w[38], w[39], offset); + w[42] = hc_bytealign_S (w[37], w[38], offset); + w[41] = hc_bytealign_S (w[36], w[37], offset); + w[40] = hc_bytealign_S (w[35], w[36], offset); + w[39] = hc_bytealign_S (w[34], w[35], offset); + w[38] = hc_bytealign_S (w[33], w[34], offset); + w[37] = hc_bytealign_S (w[32], w[33], offset); + w[36] = hc_bytealign_S (w[31], w[32], offset); + w[35] = hc_bytealign_S (w[30], w[31], offset); + w[34] = hc_bytealign_S (w[29], w[30], offset); + w[33] = hc_bytealign_S (w[28], w[29], offset); + w[32] = hc_bytealign_S (w[27], w[28], offset); + w[31] = hc_bytealign_S (w[26], w[27], offset); + w[30] = hc_bytealign_S (w[25], w[26], offset); + w[29] = hc_bytealign_S (w[24], w[25], offset); + w[28] = hc_bytealign_S (w[23], w[24], offset); + w[27] = hc_bytealign_S (w[22], w[23], offset); + w[26] = hc_bytealign_S (w[21], w[22], offset); + w[25] = hc_bytealign_S (w[20], w[21], offset); + w[24] = hc_bytealign_S (w[19], w[20], offset); + w[23] = hc_bytealign_S (w[18], w[19], offset); + w[22] = hc_bytealign_S (w[17], w[18], offset); + w[21] = hc_bytealign_S (w[16], w[17], offset); + w[20] = hc_bytealign_S (w[15], w[16], offset); + w[19] = hc_bytealign_S (w[14], w[15], offset); + w[18] = hc_bytealign_S (w[13], w[14], offset); + w[17] = hc_bytealign_S (w[12], w[13], offset); + w[16] = hc_bytealign_S (w[11], w[12], offset); + w[15] = hc_bytealign_S (w[10], w[11], offset); + w[14] = hc_bytealign_S (w[ 9], w[10], offset); + w[13] = hc_bytealign_S (w[ 8], w[ 9], offset); + w[12] = hc_bytealign_S (w[ 7], w[ 8], offset); + w[11] = hc_bytealign_S (w[ 6], w[ 7], offset); + w[10] = hc_bytealign_S (w[ 5], w[ 6], offset); + w[ 9] = hc_bytealign_S (w[ 4], w[ 5], offset); + w[ 8] = hc_bytealign_S (w[ 3], w[ 4], offset); + w[ 7] = hc_bytealign_S (w[ 2], w[ 3], offset); + w[ 6] = hc_bytealign_S (w[ 1], w[ 2], offset); + w[ 5] = hc_bytealign_S (w[ 0], w[ 1], offset); + w[ 4] = hc_bytealign_S ( 0, w[ 0], offset); w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; @@ -51426,65 +51426,65 @@ DECLSPEC void switch_buffer_by_offset_1x64_be_S (u32 *w, const u32 offset) break; case 5: - w[63] = amd_bytealign_S (w[57], w[58], offset); - w[62] = amd_bytealign_S (w[56], w[57], offset); - w[61] = amd_bytealign_S (w[55], w[56], offset); - w[60] = amd_bytealign_S (w[54], w[55], offset); - w[59] = amd_bytealign_S (w[53], w[54], offset); - w[58] = amd_bytealign_S (w[52], w[53], offset); - w[57] = amd_bytealign_S (w[51], w[52], offset); - w[56] = amd_bytealign_S (w[50], w[51], offset); - w[55] = amd_bytealign_S (w[49], w[50], offset); - w[54] = amd_bytealign_S (w[48], w[49], offset); - w[53] = amd_bytealign_S (w[47], w[48], offset); - w[52] = amd_bytealign_S (w[46], w[47], offset); - w[51] = amd_bytealign_S (w[45], w[46], offset); - w[50] = amd_bytealign_S (w[44], w[45], offset); - w[49] = amd_bytealign_S (w[43], w[44], offset); - w[48] = amd_bytealign_S (w[42], w[43], offset); - w[47] = amd_bytealign_S (w[41], w[42], offset); - w[46] = amd_bytealign_S (w[40], w[41], offset); - w[45] = amd_bytealign_S (w[39], w[40], offset); - w[44] = amd_bytealign_S (w[38], w[39], offset); - w[43] = amd_bytealign_S (w[37], w[38], offset); - w[42] = amd_bytealign_S (w[36], w[37], offset); - w[41] = amd_bytealign_S (w[35], w[36], offset); - w[40] = amd_bytealign_S (w[34], w[35], offset); - w[39] = amd_bytealign_S (w[33], w[34], offset); - w[38] = amd_bytealign_S (w[32], w[33], offset); - w[37] = amd_bytealign_S (w[31], w[32], offset); - w[36] = amd_bytealign_S (w[30], w[31], offset); - w[35] = amd_bytealign_S (w[29], w[30], offset); - w[34] = amd_bytealign_S (w[28], w[29], offset); - w[33] = amd_bytealign_S (w[27], w[28], offset); - w[32] = amd_bytealign_S (w[26], w[27], offset); - w[31] = amd_bytealign_S (w[25], w[26], offset); - w[30] = amd_bytealign_S (w[24], w[25], offset); - w[29] = amd_bytealign_S (w[23], w[24], offset); - w[28] = amd_bytealign_S (w[22], w[23], offset); - w[27] = amd_bytealign_S (w[21], w[22], offset); - w[26] = amd_bytealign_S (w[20], w[21], offset); - w[25] = amd_bytealign_S (w[19], w[20], offset); - w[24] = amd_bytealign_S (w[18], w[19], offset); - w[23] = amd_bytealign_S (w[17], w[18], offset); - w[22] = amd_bytealign_S (w[16], w[17], offset); - w[21] = amd_bytealign_S (w[15], w[16], offset); - w[20] = amd_bytealign_S (w[14], w[15], offset); - w[19] = amd_bytealign_S (w[13], w[14], offset); - w[18] = amd_bytealign_S (w[12], w[13], offset); - w[17] = amd_bytealign_S (w[11], w[12], offset); - w[16] = amd_bytealign_S (w[10], w[11], offset); - w[15] = amd_bytealign_S (w[ 9], w[10], offset); - w[14] = amd_bytealign_S (w[ 8], w[ 9], offset); - w[13] = amd_bytealign_S (w[ 7], w[ 8], offset); - w[12] = amd_bytealign_S (w[ 6], w[ 7], offset); - w[11] = amd_bytealign_S (w[ 5], w[ 6], offset); - w[10] = amd_bytealign_S (w[ 4], w[ 5], offset); - w[ 9] = amd_bytealign_S (w[ 3], w[ 4], offset); - w[ 8] = amd_bytealign_S (w[ 2], w[ 3], offset); - w[ 7] = amd_bytealign_S (w[ 1], w[ 2], offset); - w[ 6] = amd_bytealign_S (w[ 0], w[ 1], offset); - w[ 5] = amd_bytealign_S ( 0, w[ 0], offset); + w[63] = hc_bytealign_S (w[57], w[58], offset); + w[62] = hc_bytealign_S (w[56], w[57], offset); + w[61] = hc_bytealign_S (w[55], w[56], offset); + w[60] = hc_bytealign_S (w[54], w[55], offset); + w[59] = hc_bytealign_S (w[53], w[54], offset); + w[58] = hc_bytealign_S (w[52], w[53], offset); + w[57] = hc_bytealign_S (w[51], w[52], offset); + w[56] = hc_bytealign_S (w[50], w[51], offset); + w[55] = hc_bytealign_S (w[49], w[50], offset); + w[54] = hc_bytealign_S (w[48], w[49], offset); + w[53] = hc_bytealign_S (w[47], w[48], offset); + w[52] = hc_bytealign_S (w[46], w[47], offset); + w[51] = hc_bytealign_S (w[45], w[46], offset); + w[50] = hc_bytealign_S (w[44], w[45], offset); + w[49] = hc_bytealign_S (w[43], w[44], offset); + w[48] = hc_bytealign_S (w[42], w[43], offset); + w[47] = hc_bytealign_S (w[41], w[42], offset); + w[46] = hc_bytealign_S (w[40], w[41], offset); + w[45] = hc_bytealign_S (w[39], w[40], offset); + w[44] = hc_bytealign_S (w[38], w[39], offset); + w[43] = hc_bytealign_S (w[37], w[38], offset); + w[42] = hc_bytealign_S (w[36], w[37], offset); + w[41] = hc_bytealign_S (w[35], w[36], offset); + w[40] = hc_bytealign_S (w[34], w[35], offset); + w[39] = hc_bytealign_S (w[33], w[34], offset); + w[38] = hc_bytealign_S (w[32], w[33], offset); + w[37] = hc_bytealign_S (w[31], w[32], offset); + w[36] = hc_bytealign_S (w[30], w[31], offset); + w[35] = hc_bytealign_S (w[29], w[30], offset); + w[34] = hc_bytealign_S (w[28], w[29], offset); + w[33] = hc_bytealign_S (w[27], w[28], offset); + w[32] = hc_bytealign_S (w[26], w[27], offset); + w[31] = hc_bytealign_S (w[25], w[26], offset); + w[30] = hc_bytealign_S (w[24], w[25], offset); + w[29] = hc_bytealign_S (w[23], w[24], offset); + w[28] = hc_bytealign_S (w[22], w[23], offset); + w[27] = hc_bytealign_S (w[21], w[22], offset); + w[26] = hc_bytealign_S (w[20], w[21], offset); + w[25] = hc_bytealign_S (w[19], w[20], offset); + w[24] = hc_bytealign_S (w[18], w[19], offset); + w[23] = hc_bytealign_S (w[17], w[18], offset); + w[22] = hc_bytealign_S (w[16], w[17], offset); + w[21] = hc_bytealign_S (w[15], w[16], offset); + w[20] = hc_bytealign_S (w[14], w[15], offset); + w[19] = hc_bytealign_S (w[13], w[14], offset); + w[18] = hc_bytealign_S (w[12], w[13], offset); + w[17] = hc_bytealign_S (w[11], w[12], offset); + w[16] = hc_bytealign_S (w[10], w[11], offset); + w[15] = hc_bytealign_S (w[ 9], w[10], offset); + w[14] = hc_bytealign_S (w[ 8], w[ 9], offset); + w[13] = hc_bytealign_S (w[ 7], w[ 8], offset); + w[12] = hc_bytealign_S (w[ 6], w[ 7], offset); + w[11] = hc_bytealign_S (w[ 5], w[ 6], offset); + w[10] = hc_bytealign_S (w[ 4], w[ 5], offset); + w[ 9] = hc_bytealign_S (w[ 3], w[ 4], offset); + w[ 8] = hc_bytealign_S (w[ 2], w[ 3], offset); + w[ 7] = hc_bytealign_S (w[ 1], w[ 2], offset); + w[ 6] = hc_bytealign_S (w[ 0], w[ 1], offset); + w[ 5] = hc_bytealign_S ( 0, w[ 0], offset); w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; @@ -51494,64 +51494,64 @@ DECLSPEC void switch_buffer_by_offset_1x64_be_S (u32 *w, const u32 offset) break; case 6: - w[63] = amd_bytealign_S (w[56], w[57], offset); - w[62] = amd_bytealign_S (w[55], w[56], offset); - w[61] = amd_bytealign_S (w[54], w[55], offset); - w[60] = amd_bytealign_S (w[53], w[54], offset); - w[59] = amd_bytealign_S (w[52], w[53], offset); - w[58] = amd_bytealign_S (w[51], w[52], offset); - w[57] = amd_bytealign_S (w[50], w[51], offset); - w[56] = amd_bytealign_S (w[49], w[50], offset); - w[55] = amd_bytealign_S (w[48], w[49], offset); - w[54] = amd_bytealign_S (w[47], w[48], offset); - w[53] = amd_bytealign_S (w[46], w[47], offset); - w[52] = amd_bytealign_S (w[45], w[46], offset); - w[51] = amd_bytealign_S (w[44], w[45], offset); - w[50] = amd_bytealign_S (w[43], w[44], offset); - w[49] = amd_bytealign_S (w[42], w[43], offset); - w[48] = amd_bytealign_S (w[41], w[42], offset); - w[47] = amd_bytealign_S (w[40], w[41], offset); - w[46] = amd_bytealign_S (w[39], w[40], offset); - w[45] = amd_bytealign_S (w[38], w[39], offset); - w[44] = amd_bytealign_S (w[37], w[38], offset); - w[43] = amd_bytealign_S (w[36], w[37], offset); - w[42] = amd_bytealign_S (w[35], w[36], offset); - w[41] = amd_bytealign_S (w[34], w[35], offset); - w[40] = amd_bytealign_S (w[33], w[34], offset); - w[39] = amd_bytealign_S (w[32], w[33], offset); - w[38] = amd_bytealign_S (w[31], w[32], offset); - w[37] = amd_bytealign_S (w[30], w[31], offset); - w[36] = amd_bytealign_S (w[29], w[30], offset); - w[35] = amd_bytealign_S (w[28], w[29], offset); - w[34] = amd_bytealign_S (w[27], w[28], offset); - w[33] = amd_bytealign_S (w[26], w[27], offset); - w[32] = amd_bytealign_S (w[25], w[26], offset); - w[31] = amd_bytealign_S (w[24], w[25], offset); - w[30] = amd_bytealign_S (w[23], w[24], offset); - w[29] = amd_bytealign_S (w[22], w[23], offset); - w[28] = amd_bytealign_S (w[21], w[22], offset); - w[27] = amd_bytealign_S (w[20], w[21], offset); - w[26] = amd_bytealign_S (w[19], w[20], offset); - w[25] = amd_bytealign_S (w[18], w[19], offset); - w[24] = amd_bytealign_S (w[17], w[18], offset); - w[23] = amd_bytealign_S (w[16], w[17], offset); - w[22] = amd_bytealign_S (w[15], w[16], offset); - w[21] = amd_bytealign_S (w[14], w[15], offset); - w[20] = amd_bytealign_S (w[13], w[14], offset); - w[19] = amd_bytealign_S (w[12], w[13], offset); - w[18] = amd_bytealign_S (w[11], w[12], offset); - w[17] = amd_bytealign_S (w[10], w[11], offset); - w[16] = amd_bytealign_S (w[ 9], w[10], offset); - w[15] = amd_bytealign_S (w[ 8], w[ 9], offset); - w[14] = amd_bytealign_S (w[ 7], w[ 8], offset); - w[13] = amd_bytealign_S (w[ 6], w[ 7], offset); - w[12] = amd_bytealign_S (w[ 5], w[ 6], offset); - w[11] = amd_bytealign_S (w[ 4], w[ 5], offset); - w[10] = amd_bytealign_S (w[ 3], w[ 4], offset); - w[ 9] = amd_bytealign_S (w[ 2], w[ 3], offset); - w[ 8] = amd_bytealign_S (w[ 1], w[ 2], offset); - w[ 7] = amd_bytealign_S (w[ 0], w[ 1], offset); - w[ 6] = amd_bytealign_S ( 0, w[ 0], offset); + w[63] = hc_bytealign_S (w[56], w[57], offset); + w[62] = hc_bytealign_S (w[55], w[56], offset); + w[61] = hc_bytealign_S (w[54], w[55], offset); + w[60] = hc_bytealign_S (w[53], w[54], offset); + w[59] = hc_bytealign_S (w[52], w[53], offset); + w[58] = hc_bytealign_S (w[51], w[52], offset); + w[57] = hc_bytealign_S (w[50], w[51], offset); + w[56] = hc_bytealign_S (w[49], w[50], offset); + w[55] = hc_bytealign_S (w[48], w[49], offset); + w[54] = hc_bytealign_S (w[47], w[48], offset); + w[53] = hc_bytealign_S (w[46], w[47], offset); + w[52] = hc_bytealign_S (w[45], w[46], offset); + w[51] = hc_bytealign_S (w[44], w[45], offset); + w[50] = hc_bytealign_S (w[43], w[44], offset); + w[49] = hc_bytealign_S (w[42], w[43], offset); + w[48] = hc_bytealign_S (w[41], w[42], offset); + w[47] = hc_bytealign_S (w[40], w[41], offset); + w[46] = hc_bytealign_S (w[39], w[40], offset); + w[45] = hc_bytealign_S (w[38], w[39], offset); + w[44] = hc_bytealign_S (w[37], w[38], offset); + w[43] = hc_bytealign_S (w[36], w[37], offset); + w[42] = hc_bytealign_S (w[35], w[36], offset); + w[41] = hc_bytealign_S (w[34], w[35], offset); + w[40] = hc_bytealign_S (w[33], w[34], offset); + w[39] = hc_bytealign_S (w[32], w[33], offset); + w[38] = hc_bytealign_S (w[31], w[32], offset); + w[37] = hc_bytealign_S (w[30], w[31], offset); + w[36] = hc_bytealign_S (w[29], w[30], offset); + w[35] = hc_bytealign_S (w[28], w[29], offset); + w[34] = hc_bytealign_S (w[27], w[28], offset); + w[33] = hc_bytealign_S (w[26], w[27], offset); + w[32] = hc_bytealign_S (w[25], w[26], offset); + w[31] = hc_bytealign_S (w[24], w[25], offset); + w[30] = hc_bytealign_S (w[23], w[24], offset); + w[29] = hc_bytealign_S (w[22], w[23], offset); + w[28] = hc_bytealign_S (w[21], w[22], offset); + w[27] = hc_bytealign_S (w[20], w[21], offset); + w[26] = hc_bytealign_S (w[19], w[20], offset); + w[25] = hc_bytealign_S (w[18], w[19], offset); + w[24] = hc_bytealign_S (w[17], w[18], offset); + w[23] = hc_bytealign_S (w[16], w[17], offset); + w[22] = hc_bytealign_S (w[15], w[16], offset); + w[21] = hc_bytealign_S (w[14], w[15], offset); + w[20] = hc_bytealign_S (w[13], w[14], offset); + w[19] = hc_bytealign_S (w[12], w[13], offset); + w[18] = hc_bytealign_S (w[11], w[12], offset); + w[17] = hc_bytealign_S (w[10], w[11], offset); + w[16] = hc_bytealign_S (w[ 9], w[10], offset); + w[15] = hc_bytealign_S (w[ 8], w[ 9], offset); + w[14] = hc_bytealign_S (w[ 7], w[ 8], offset); + w[13] = hc_bytealign_S (w[ 6], w[ 7], offset); + w[12] = hc_bytealign_S (w[ 5], w[ 6], offset); + w[11] = hc_bytealign_S (w[ 4], w[ 5], offset); + w[10] = hc_bytealign_S (w[ 3], w[ 4], offset); + w[ 9] = hc_bytealign_S (w[ 2], w[ 3], offset); + w[ 8] = hc_bytealign_S (w[ 1], w[ 2], offset); + w[ 7] = hc_bytealign_S (w[ 0], w[ 1], offset); + w[ 6] = hc_bytealign_S ( 0, w[ 0], offset); w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; @@ -51562,63 +51562,63 @@ DECLSPEC void switch_buffer_by_offset_1x64_be_S (u32 *w, const u32 offset) break; case 7: - w[63] = amd_bytealign_S (w[55], w[56], offset); - w[62] = amd_bytealign_S (w[54], w[55], offset); - w[61] = amd_bytealign_S (w[53], w[54], offset); - w[60] = amd_bytealign_S (w[52], w[53], offset); - w[59] = amd_bytealign_S (w[51], w[52], offset); - w[58] = amd_bytealign_S (w[50], w[51], offset); - w[57] = amd_bytealign_S (w[49], w[50], offset); - w[56] = amd_bytealign_S (w[48], w[49], offset); - w[55] = amd_bytealign_S (w[47], w[48], offset); - w[54] = amd_bytealign_S (w[46], w[47], offset); - w[53] = amd_bytealign_S (w[45], w[46], offset); - w[52] = amd_bytealign_S (w[44], w[45], offset); - w[51] = amd_bytealign_S (w[43], w[44], offset); - w[50] = amd_bytealign_S (w[42], w[43], offset); - w[49] = amd_bytealign_S (w[41], w[42], offset); - w[48] = amd_bytealign_S (w[40], w[41], offset); - w[47] = amd_bytealign_S (w[39], w[40], offset); - w[46] = amd_bytealign_S (w[38], w[39], offset); - w[45] = amd_bytealign_S (w[37], w[38], offset); - w[44] = amd_bytealign_S (w[36], w[37], offset); - w[43] = amd_bytealign_S (w[35], w[36], offset); - w[42] = amd_bytealign_S (w[34], w[35], offset); - w[41] = amd_bytealign_S (w[33], w[34], offset); - w[40] = amd_bytealign_S (w[32], w[33], offset); - w[39] = amd_bytealign_S (w[31], w[32], offset); - w[38] = amd_bytealign_S (w[30], w[31], offset); - w[37] = amd_bytealign_S (w[29], w[30], offset); - w[36] = amd_bytealign_S (w[28], w[29], offset); - w[35] = amd_bytealign_S (w[27], w[28], offset); - w[34] = amd_bytealign_S (w[26], w[27], offset); - w[33] = amd_bytealign_S (w[25], w[26], offset); - w[32] = amd_bytealign_S (w[24], w[25], offset); - w[31] = amd_bytealign_S (w[23], w[24], offset); - w[30] = amd_bytealign_S (w[22], w[23], offset); - w[29] = amd_bytealign_S (w[21], w[22], offset); - w[28] = amd_bytealign_S (w[20], w[21], offset); - w[27] = amd_bytealign_S (w[19], w[20], offset); - w[26] = amd_bytealign_S (w[18], w[19], offset); - w[25] = amd_bytealign_S (w[17], w[18], offset); - w[24] = amd_bytealign_S (w[16], w[17], offset); - w[23] = amd_bytealign_S (w[15], w[16], offset); - w[22] = amd_bytealign_S (w[14], w[15], offset); - w[21] = amd_bytealign_S (w[13], w[14], offset); - w[20] = amd_bytealign_S (w[12], w[13], offset); - w[19] = amd_bytealign_S (w[11], w[12], offset); - w[18] = amd_bytealign_S (w[10], w[11], offset); - w[17] = amd_bytealign_S (w[ 9], w[10], offset); - w[16] = amd_bytealign_S (w[ 8], w[ 9], offset); - w[15] = amd_bytealign_S (w[ 7], w[ 8], offset); - w[14] = amd_bytealign_S (w[ 6], w[ 7], offset); - w[13] = amd_bytealign_S (w[ 5], w[ 6], offset); - w[12] = amd_bytealign_S (w[ 4], w[ 5], offset); - w[11] = amd_bytealign_S (w[ 3], w[ 4], offset); - w[10] = amd_bytealign_S (w[ 2], w[ 3], offset); - w[ 9] = amd_bytealign_S (w[ 1], w[ 2], offset); - w[ 8] = amd_bytealign_S (w[ 0], w[ 1], offset); - w[ 7] = amd_bytealign_S ( 0, w[ 0], offset); + w[63] = hc_bytealign_S (w[55], w[56], offset); + w[62] = hc_bytealign_S (w[54], w[55], offset); + w[61] = hc_bytealign_S (w[53], w[54], offset); + w[60] = hc_bytealign_S (w[52], w[53], offset); + w[59] = hc_bytealign_S (w[51], w[52], offset); + w[58] = hc_bytealign_S (w[50], w[51], offset); + w[57] = hc_bytealign_S (w[49], w[50], offset); + w[56] = hc_bytealign_S (w[48], w[49], offset); + w[55] = hc_bytealign_S (w[47], w[48], offset); + w[54] = hc_bytealign_S (w[46], w[47], offset); + w[53] = hc_bytealign_S (w[45], w[46], offset); + w[52] = hc_bytealign_S (w[44], w[45], offset); + w[51] = hc_bytealign_S (w[43], w[44], offset); + w[50] = hc_bytealign_S (w[42], w[43], offset); + w[49] = hc_bytealign_S (w[41], w[42], offset); + w[48] = hc_bytealign_S (w[40], w[41], offset); + w[47] = hc_bytealign_S (w[39], w[40], offset); + w[46] = hc_bytealign_S (w[38], w[39], offset); + w[45] = hc_bytealign_S (w[37], w[38], offset); + w[44] = hc_bytealign_S (w[36], w[37], offset); + w[43] = hc_bytealign_S (w[35], w[36], offset); + w[42] = hc_bytealign_S (w[34], w[35], offset); + w[41] = hc_bytealign_S (w[33], w[34], offset); + w[40] = hc_bytealign_S (w[32], w[33], offset); + w[39] = hc_bytealign_S (w[31], w[32], offset); + w[38] = hc_bytealign_S (w[30], w[31], offset); + w[37] = hc_bytealign_S (w[29], w[30], offset); + w[36] = hc_bytealign_S (w[28], w[29], offset); + w[35] = hc_bytealign_S (w[27], w[28], offset); + w[34] = hc_bytealign_S (w[26], w[27], offset); + w[33] = hc_bytealign_S (w[25], w[26], offset); + w[32] = hc_bytealign_S (w[24], w[25], offset); + w[31] = hc_bytealign_S (w[23], w[24], offset); + w[30] = hc_bytealign_S (w[22], w[23], offset); + w[29] = hc_bytealign_S (w[21], w[22], offset); + w[28] = hc_bytealign_S (w[20], w[21], offset); + w[27] = hc_bytealign_S (w[19], w[20], offset); + w[26] = hc_bytealign_S (w[18], w[19], offset); + w[25] = hc_bytealign_S (w[17], w[18], offset); + w[24] = hc_bytealign_S (w[16], w[17], offset); + w[23] = hc_bytealign_S (w[15], w[16], offset); + w[22] = hc_bytealign_S (w[14], w[15], offset); + w[21] = hc_bytealign_S (w[13], w[14], offset); + w[20] = hc_bytealign_S (w[12], w[13], offset); + w[19] = hc_bytealign_S (w[11], w[12], offset); + w[18] = hc_bytealign_S (w[10], w[11], offset); + w[17] = hc_bytealign_S (w[ 9], w[10], offset); + w[16] = hc_bytealign_S (w[ 8], w[ 9], offset); + w[15] = hc_bytealign_S (w[ 7], w[ 8], offset); + w[14] = hc_bytealign_S (w[ 6], w[ 7], offset); + w[13] = hc_bytealign_S (w[ 5], w[ 6], offset); + w[12] = hc_bytealign_S (w[ 4], w[ 5], offset); + w[11] = hc_bytealign_S (w[ 3], w[ 4], offset); + w[10] = hc_bytealign_S (w[ 2], w[ 3], offset); + w[ 9] = hc_bytealign_S (w[ 1], w[ 2], offset); + w[ 8] = hc_bytealign_S (w[ 0], w[ 1], offset); + w[ 7] = hc_bytealign_S ( 0, w[ 0], offset); w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; @@ -51630,62 +51630,62 @@ DECLSPEC void switch_buffer_by_offset_1x64_be_S (u32 *w, const u32 offset) break; case 8: - w[63] = amd_bytealign_S (w[54], w[55], offset); - w[62] = amd_bytealign_S (w[53], w[54], offset); - w[61] = amd_bytealign_S (w[52], w[53], offset); - w[60] = amd_bytealign_S (w[51], w[52], offset); - w[59] = amd_bytealign_S (w[50], w[51], offset); - w[58] = amd_bytealign_S (w[49], w[50], offset); - w[57] = amd_bytealign_S (w[48], w[49], offset); - w[56] = amd_bytealign_S (w[47], w[48], offset); - w[55] = amd_bytealign_S (w[46], w[47], offset); - w[54] = amd_bytealign_S (w[45], w[46], offset); - w[53] = amd_bytealign_S (w[44], w[45], offset); - w[52] = amd_bytealign_S (w[43], w[44], offset); - w[51] = amd_bytealign_S (w[42], w[43], offset); - w[50] = amd_bytealign_S (w[41], w[42], offset); - w[49] = amd_bytealign_S (w[40], w[41], offset); - w[48] = amd_bytealign_S (w[39], w[40], offset); - w[47] = amd_bytealign_S (w[38], w[39], offset); - w[46] = amd_bytealign_S (w[37], w[38], offset); - w[45] = amd_bytealign_S (w[36], w[37], offset); - w[44] = amd_bytealign_S (w[35], w[36], offset); - w[43] = amd_bytealign_S (w[34], w[35], offset); - w[42] = amd_bytealign_S (w[33], w[34], offset); - w[41] = amd_bytealign_S (w[32], w[33], offset); - w[40] = amd_bytealign_S (w[31], w[32], offset); - w[39] = amd_bytealign_S (w[30], w[31], offset); - w[38] = amd_bytealign_S (w[29], w[30], offset); - w[37] = amd_bytealign_S (w[28], w[29], offset); - w[36] = amd_bytealign_S (w[27], w[28], offset); - w[35] = amd_bytealign_S (w[26], w[27], offset); - w[34] = amd_bytealign_S (w[25], w[26], offset); - w[33] = amd_bytealign_S (w[24], w[25], offset); - w[32] = amd_bytealign_S (w[23], w[24], offset); - w[31] = amd_bytealign_S (w[22], w[23], offset); - w[30] = amd_bytealign_S (w[21], w[22], offset); - w[29] = amd_bytealign_S (w[20], w[21], offset); - w[28] = amd_bytealign_S (w[19], w[20], offset); - w[27] = amd_bytealign_S (w[18], w[19], offset); - w[26] = amd_bytealign_S (w[17], w[18], offset); - w[25] = amd_bytealign_S (w[16], w[17], offset); - w[24] = amd_bytealign_S (w[15], w[16], offset); - w[23] = amd_bytealign_S (w[14], w[15], offset); - w[22] = amd_bytealign_S (w[13], w[14], offset); - w[21] = amd_bytealign_S (w[12], w[13], offset); - w[20] = amd_bytealign_S (w[11], w[12], offset); - w[19] = amd_bytealign_S (w[10], w[11], offset); - w[18] = amd_bytealign_S (w[ 9], w[10], offset); - w[17] = amd_bytealign_S (w[ 8], w[ 9], offset); - w[16] = amd_bytealign_S (w[ 7], w[ 8], offset); - w[15] = amd_bytealign_S (w[ 6], w[ 7], offset); - w[14] = amd_bytealign_S (w[ 5], w[ 6], offset); - w[13] = amd_bytealign_S (w[ 4], w[ 5], offset); - w[12] = amd_bytealign_S (w[ 3], w[ 4], offset); - w[11] = amd_bytealign_S (w[ 2], w[ 3], offset); - w[10] = amd_bytealign_S (w[ 1], w[ 2], offset); - w[ 9] = amd_bytealign_S (w[ 0], w[ 1], offset); - w[ 8] = amd_bytealign_S ( 0, w[ 0], offset); + w[63] = hc_bytealign_S (w[54], w[55], offset); + w[62] = hc_bytealign_S (w[53], w[54], offset); + w[61] = hc_bytealign_S (w[52], w[53], offset); + w[60] = hc_bytealign_S (w[51], w[52], offset); + w[59] = hc_bytealign_S (w[50], w[51], offset); + w[58] = hc_bytealign_S (w[49], w[50], offset); + w[57] = hc_bytealign_S (w[48], w[49], offset); + w[56] = hc_bytealign_S (w[47], w[48], offset); + w[55] = hc_bytealign_S (w[46], w[47], offset); + w[54] = hc_bytealign_S (w[45], w[46], offset); + w[53] = hc_bytealign_S (w[44], w[45], offset); + w[52] = hc_bytealign_S (w[43], w[44], offset); + w[51] = hc_bytealign_S (w[42], w[43], offset); + w[50] = hc_bytealign_S (w[41], w[42], offset); + w[49] = hc_bytealign_S (w[40], w[41], offset); + w[48] = hc_bytealign_S (w[39], w[40], offset); + w[47] = hc_bytealign_S (w[38], w[39], offset); + w[46] = hc_bytealign_S (w[37], w[38], offset); + w[45] = hc_bytealign_S (w[36], w[37], offset); + w[44] = hc_bytealign_S (w[35], w[36], offset); + w[43] = hc_bytealign_S (w[34], w[35], offset); + w[42] = hc_bytealign_S (w[33], w[34], offset); + w[41] = hc_bytealign_S (w[32], w[33], offset); + w[40] = hc_bytealign_S (w[31], w[32], offset); + w[39] = hc_bytealign_S (w[30], w[31], offset); + w[38] = hc_bytealign_S (w[29], w[30], offset); + w[37] = hc_bytealign_S (w[28], w[29], offset); + w[36] = hc_bytealign_S (w[27], w[28], offset); + w[35] = hc_bytealign_S (w[26], w[27], offset); + w[34] = hc_bytealign_S (w[25], w[26], offset); + w[33] = hc_bytealign_S (w[24], w[25], offset); + w[32] = hc_bytealign_S (w[23], w[24], offset); + w[31] = hc_bytealign_S (w[22], w[23], offset); + w[30] = hc_bytealign_S (w[21], w[22], offset); + w[29] = hc_bytealign_S (w[20], w[21], offset); + w[28] = hc_bytealign_S (w[19], w[20], offset); + w[27] = hc_bytealign_S (w[18], w[19], offset); + w[26] = hc_bytealign_S (w[17], w[18], offset); + w[25] = hc_bytealign_S (w[16], w[17], offset); + w[24] = hc_bytealign_S (w[15], w[16], offset); + w[23] = hc_bytealign_S (w[14], w[15], offset); + w[22] = hc_bytealign_S (w[13], w[14], offset); + w[21] = hc_bytealign_S (w[12], w[13], offset); + w[20] = hc_bytealign_S (w[11], w[12], offset); + w[19] = hc_bytealign_S (w[10], w[11], offset); + w[18] = hc_bytealign_S (w[ 9], w[10], offset); + w[17] = hc_bytealign_S (w[ 8], w[ 9], offset); + w[16] = hc_bytealign_S (w[ 7], w[ 8], offset); + w[15] = hc_bytealign_S (w[ 6], w[ 7], offset); + w[14] = hc_bytealign_S (w[ 5], w[ 6], offset); + w[13] = hc_bytealign_S (w[ 4], w[ 5], offset); + w[12] = hc_bytealign_S (w[ 3], w[ 4], offset); + w[11] = hc_bytealign_S (w[ 2], w[ 3], offset); + w[10] = hc_bytealign_S (w[ 1], w[ 2], offset); + w[ 9] = hc_bytealign_S (w[ 0], w[ 1], offset); + w[ 8] = hc_bytealign_S ( 0, w[ 0], offset); w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; @@ -51698,61 +51698,61 @@ DECLSPEC void switch_buffer_by_offset_1x64_be_S (u32 *w, const u32 offset) break; case 9: - w[63] = amd_bytealign_S (w[53], w[54], offset); - w[62] = amd_bytealign_S (w[52], w[53], offset); - w[61] = amd_bytealign_S (w[51], w[52], offset); - w[60] = amd_bytealign_S (w[50], w[51], offset); - w[59] = amd_bytealign_S (w[49], w[50], offset); - w[58] = amd_bytealign_S (w[48], w[49], offset); - w[57] = amd_bytealign_S (w[47], w[48], offset); - w[56] = amd_bytealign_S (w[46], w[47], offset); - w[55] = amd_bytealign_S (w[45], w[46], offset); - w[54] = amd_bytealign_S (w[44], w[45], offset); - w[53] = amd_bytealign_S (w[43], w[44], offset); - w[52] = amd_bytealign_S (w[42], w[43], offset); - w[51] = amd_bytealign_S (w[41], w[42], offset); - w[50] = amd_bytealign_S (w[40], w[41], offset); - w[49] = amd_bytealign_S (w[39], w[40], offset); - w[48] = amd_bytealign_S (w[38], w[39], offset); - w[47] = amd_bytealign_S (w[37], w[38], offset); - w[46] = amd_bytealign_S (w[36], w[37], offset); - w[45] = amd_bytealign_S (w[35], w[36], offset); - w[44] = amd_bytealign_S (w[34], w[35], offset); - w[43] = amd_bytealign_S (w[33], w[34], offset); - w[42] = amd_bytealign_S (w[32], w[33], offset); - w[41] = amd_bytealign_S (w[31], w[32], offset); - w[40] = amd_bytealign_S (w[30], w[31], offset); - w[39] = amd_bytealign_S (w[29], w[30], offset); - w[38] = amd_bytealign_S (w[28], w[29], offset); - w[37] = amd_bytealign_S (w[27], w[28], offset); - w[36] = amd_bytealign_S (w[26], w[27], offset); - w[35] = amd_bytealign_S (w[25], w[26], offset); - w[34] = amd_bytealign_S (w[24], w[25], offset); - w[33] = amd_bytealign_S (w[23], w[24], offset); - w[32] = amd_bytealign_S (w[22], w[23], offset); - w[31] = amd_bytealign_S (w[21], w[22], offset); - w[30] = amd_bytealign_S (w[20], w[21], offset); - w[29] = amd_bytealign_S (w[19], w[20], offset); - w[28] = amd_bytealign_S (w[18], w[19], offset); - w[27] = amd_bytealign_S (w[17], w[18], offset); - w[26] = amd_bytealign_S (w[16], w[17], offset); - w[25] = amd_bytealign_S (w[15], w[16], offset); - w[24] = amd_bytealign_S (w[14], w[15], offset); - w[23] = amd_bytealign_S (w[13], w[14], offset); - w[22] = amd_bytealign_S (w[12], w[13], offset); - w[21] = amd_bytealign_S (w[11], w[12], offset); - w[20] = amd_bytealign_S (w[10], w[11], offset); - w[19] = amd_bytealign_S (w[ 9], w[10], offset); - w[18] = amd_bytealign_S (w[ 8], w[ 9], offset); - w[17] = amd_bytealign_S (w[ 7], w[ 8], offset); - w[16] = amd_bytealign_S (w[ 6], w[ 7], offset); - w[15] = amd_bytealign_S (w[ 5], w[ 6], offset); - w[14] = amd_bytealign_S (w[ 4], w[ 5], offset); - w[13] = amd_bytealign_S (w[ 3], w[ 4], offset); - w[12] = amd_bytealign_S (w[ 2], w[ 3], offset); - w[11] = amd_bytealign_S (w[ 1], w[ 2], offset); - w[10] = amd_bytealign_S (w[ 0], w[ 1], offset); - w[ 9] = amd_bytealign_S ( 0, w[ 0], offset); + w[63] = hc_bytealign_S (w[53], w[54], offset); + w[62] = hc_bytealign_S (w[52], w[53], offset); + w[61] = hc_bytealign_S (w[51], w[52], offset); + w[60] = hc_bytealign_S (w[50], w[51], offset); + w[59] = hc_bytealign_S (w[49], w[50], offset); + w[58] = hc_bytealign_S (w[48], w[49], offset); + w[57] = hc_bytealign_S (w[47], w[48], offset); + w[56] = hc_bytealign_S (w[46], w[47], offset); + w[55] = hc_bytealign_S (w[45], w[46], offset); + w[54] = hc_bytealign_S (w[44], w[45], offset); + w[53] = hc_bytealign_S (w[43], w[44], offset); + w[52] = hc_bytealign_S (w[42], w[43], offset); + w[51] = hc_bytealign_S (w[41], w[42], offset); + w[50] = hc_bytealign_S (w[40], w[41], offset); + w[49] = hc_bytealign_S (w[39], w[40], offset); + w[48] = hc_bytealign_S (w[38], w[39], offset); + w[47] = hc_bytealign_S (w[37], w[38], offset); + w[46] = hc_bytealign_S (w[36], w[37], offset); + w[45] = hc_bytealign_S (w[35], w[36], offset); + w[44] = hc_bytealign_S (w[34], w[35], offset); + w[43] = hc_bytealign_S (w[33], w[34], offset); + w[42] = hc_bytealign_S (w[32], w[33], offset); + w[41] = hc_bytealign_S (w[31], w[32], offset); + w[40] = hc_bytealign_S (w[30], w[31], offset); + w[39] = hc_bytealign_S (w[29], w[30], offset); + w[38] = hc_bytealign_S (w[28], w[29], offset); + w[37] = hc_bytealign_S (w[27], w[28], offset); + w[36] = hc_bytealign_S (w[26], w[27], offset); + w[35] = hc_bytealign_S (w[25], w[26], offset); + w[34] = hc_bytealign_S (w[24], w[25], offset); + w[33] = hc_bytealign_S (w[23], w[24], offset); + w[32] = hc_bytealign_S (w[22], w[23], offset); + w[31] = hc_bytealign_S (w[21], w[22], offset); + w[30] = hc_bytealign_S (w[20], w[21], offset); + w[29] = hc_bytealign_S (w[19], w[20], offset); + w[28] = hc_bytealign_S (w[18], w[19], offset); + w[27] = hc_bytealign_S (w[17], w[18], offset); + w[26] = hc_bytealign_S (w[16], w[17], offset); + w[25] = hc_bytealign_S (w[15], w[16], offset); + w[24] = hc_bytealign_S (w[14], w[15], offset); + w[23] = hc_bytealign_S (w[13], w[14], offset); + w[22] = hc_bytealign_S (w[12], w[13], offset); + w[21] = hc_bytealign_S (w[11], w[12], offset); + w[20] = hc_bytealign_S (w[10], w[11], offset); + w[19] = hc_bytealign_S (w[ 9], w[10], offset); + w[18] = hc_bytealign_S (w[ 8], w[ 9], offset); + w[17] = hc_bytealign_S (w[ 7], w[ 8], offset); + w[16] = hc_bytealign_S (w[ 6], w[ 7], offset); + w[15] = hc_bytealign_S (w[ 5], w[ 6], offset); + w[14] = hc_bytealign_S (w[ 4], w[ 5], offset); + w[13] = hc_bytealign_S (w[ 3], w[ 4], offset); + w[12] = hc_bytealign_S (w[ 2], w[ 3], offset); + w[11] = hc_bytealign_S (w[ 1], w[ 2], offset); + w[10] = hc_bytealign_S (w[ 0], w[ 1], offset); + w[ 9] = hc_bytealign_S ( 0, w[ 0], offset); w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; @@ -51766,60 +51766,60 @@ DECLSPEC void switch_buffer_by_offset_1x64_be_S (u32 *w, const u32 offset) break; case 10: - w[63] = amd_bytealign_S (w[52], w[53], offset); - w[62] = amd_bytealign_S (w[51], w[52], offset); - w[61] = amd_bytealign_S (w[50], w[51], offset); - w[60] = amd_bytealign_S (w[49], w[50], offset); - w[59] = amd_bytealign_S (w[48], w[49], offset); - w[58] = amd_bytealign_S (w[47], w[48], offset); - w[57] = amd_bytealign_S (w[46], w[47], offset); - w[56] = amd_bytealign_S (w[45], w[46], offset); - w[55] = amd_bytealign_S (w[44], w[45], offset); - w[54] = amd_bytealign_S (w[43], w[44], offset); - w[53] = amd_bytealign_S (w[42], w[43], offset); - w[52] = amd_bytealign_S (w[41], w[42], offset); - w[51] = amd_bytealign_S (w[40], w[41], offset); - w[50] = amd_bytealign_S (w[39], w[40], offset); - w[49] = amd_bytealign_S (w[38], w[39], offset); - w[48] = amd_bytealign_S (w[37], w[38], offset); - w[47] = amd_bytealign_S (w[36], w[37], offset); - w[46] = amd_bytealign_S (w[35], w[36], offset); - w[45] = amd_bytealign_S (w[34], w[35], offset); - w[44] = amd_bytealign_S (w[33], w[34], offset); - w[43] = amd_bytealign_S (w[32], w[33], offset); - w[42] = amd_bytealign_S (w[31], w[32], offset); - w[41] = amd_bytealign_S (w[30], w[31], offset); - w[40] = amd_bytealign_S (w[29], w[30], offset); - w[39] = amd_bytealign_S (w[28], w[29], offset); - w[38] = amd_bytealign_S (w[27], w[28], offset); - w[37] = amd_bytealign_S (w[26], w[27], offset); - w[36] = amd_bytealign_S (w[25], w[26], offset); - w[35] = amd_bytealign_S (w[24], w[25], offset); - w[34] = amd_bytealign_S (w[23], w[24], offset); - w[33] = amd_bytealign_S (w[22], w[23], offset); - w[32] = amd_bytealign_S (w[21], w[22], offset); - w[31] = amd_bytealign_S (w[20], w[21], offset); - w[30] = amd_bytealign_S (w[19], w[20], offset); - w[29] = amd_bytealign_S (w[18], w[19], offset); - w[28] = amd_bytealign_S (w[17], w[18], offset); - w[27] = amd_bytealign_S (w[16], w[17], offset); - w[26] = amd_bytealign_S (w[15], w[16], offset); - w[25] = amd_bytealign_S (w[14], w[15], offset); - w[24] = amd_bytealign_S (w[13], w[14], offset); - w[23] = amd_bytealign_S (w[12], w[13], offset); - w[22] = amd_bytealign_S (w[11], w[12], offset); - w[21] = amd_bytealign_S (w[10], w[11], offset); - w[20] = amd_bytealign_S (w[ 9], w[10], offset); - w[19] = amd_bytealign_S (w[ 8], w[ 9], offset); - w[18] = amd_bytealign_S (w[ 7], w[ 8], offset); - w[17] = amd_bytealign_S (w[ 6], w[ 7], offset); - w[16] = amd_bytealign_S (w[ 5], w[ 6], offset); - w[15] = amd_bytealign_S (w[ 4], w[ 5], offset); - w[14] = amd_bytealign_S (w[ 3], w[ 4], offset); - w[13] = amd_bytealign_S (w[ 2], w[ 3], offset); - w[12] = amd_bytealign_S (w[ 1], w[ 2], offset); - w[11] = amd_bytealign_S (w[ 0], w[ 1], offset); - w[10] = amd_bytealign_S ( 0, w[ 0], offset); + w[63] = hc_bytealign_S (w[52], w[53], offset); + w[62] = hc_bytealign_S (w[51], w[52], offset); + w[61] = hc_bytealign_S (w[50], w[51], offset); + w[60] = hc_bytealign_S (w[49], w[50], offset); + w[59] = hc_bytealign_S (w[48], w[49], offset); + w[58] = hc_bytealign_S (w[47], w[48], offset); + w[57] = hc_bytealign_S (w[46], w[47], offset); + w[56] = hc_bytealign_S (w[45], w[46], offset); + w[55] = hc_bytealign_S (w[44], w[45], offset); + w[54] = hc_bytealign_S (w[43], w[44], offset); + w[53] = hc_bytealign_S (w[42], w[43], offset); + w[52] = hc_bytealign_S (w[41], w[42], offset); + w[51] = hc_bytealign_S (w[40], w[41], offset); + w[50] = hc_bytealign_S (w[39], w[40], offset); + w[49] = hc_bytealign_S (w[38], w[39], offset); + w[48] = hc_bytealign_S (w[37], w[38], offset); + w[47] = hc_bytealign_S (w[36], w[37], offset); + w[46] = hc_bytealign_S (w[35], w[36], offset); + w[45] = hc_bytealign_S (w[34], w[35], offset); + w[44] = hc_bytealign_S (w[33], w[34], offset); + w[43] = hc_bytealign_S (w[32], w[33], offset); + w[42] = hc_bytealign_S (w[31], w[32], offset); + w[41] = hc_bytealign_S (w[30], w[31], offset); + w[40] = hc_bytealign_S (w[29], w[30], offset); + w[39] = hc_bytealign_S (w[28], w[29], offset); + w[38] = hc_bytealign_S (w[27], w[28], offset); + w[37] = hc_bytealign_S (w[26], w[27], offset); + w[36] = hc_bytealign_S (w[25], w[26], offset); + w[35] = hc_bytealign_S (w[24], w[25], offset); + w[34] = hc_bytealign_S (w[23], w[24], offset); + w[33] = hc_bytealign_S (w[22], w[23], offset); + w[32] = hc_bytealign_S (w[21], w[22], offset); + w[31] = hc_bytealign_S (w[20], w[21], offset); + w[30] = hc_bytealign_S (w[19], w[20], offset); + w[29] = hc_bytealign_S (w[18], w[19], offset); + w[28] = hc_bytealign_S (w[17], w[18], offset); + w[27] = hc_bytealign_S (w[16], w[17], offset); + w[26] = hc_bytealign_S (w[15], w[16], offset); + w[25] = hc_bytealign_S (w[14], w[15], offset); + w[24] = hc_bytealign_S (w[13], w[14], offset); + w[23] = hc_bytealign_S (w[12], w[13], offset); + w[22] = hc_bytealign_S (w[11], w[12], offset); + w[21] = hc_bytealign_S (w[10], w[11], offset); + w[20] = hc_bytealign_S (w[ 9], w[10], offset); + w[19] = hc_bytealign_S (w[ 8], w[ 9], offset); + w[18] = hc_bytealign_S (w[ 7], w[ 8], offset); + w[17] = hc_bytealign_S (w[ 6], w[ 7], offset); + w[16] = hc_bytealign_S (w[ 5], w[ 6], offset); + w[15] = hc_bytealign_S (w[ 4], w[ 5], offset); + w[14] = hc_bytealign_S (w[ 3], w[ 4], offset); + w[13] = hc_bytealign_S (w[ 2], w[ 3], offset); + w[12] = hc_bytealign_S (w[ 1], w[ 2], offset); + w[11] = hc_bytealign_S (w[ 0], w[ 1], offset); + w[10] = hc_bytealign_S ( 0, w[ 0], offset); w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; @@ -51834,59 +51834,59 @@ DECLSPEC void switch_buffer_by_offset_1x64_be_S (u32 *w, const u32 offset) break; case 11: - w[63] = amd_bytealign_S (w[51], w[52], offset); - w[62] = amd_bytealign_S (w[50], w[51], offset); - w[61] = amd_bytealign_S (w[49], w[50], offset); - w[60] = amd_bytealign_S (w[48], w[49], offset); - w[59] = amd_bytealign_S (w[47], w[48], offset); - w[58] = amd_bytealign_S (w[46], w[47], offset); - w[57] = amd_bytealign_S (w[45], w[46], offset); - w[56] = amd_bytealign_S (w[44], w[45], offset); - w[55] = amd_bytealign_S (w[43], w[44], offset); - w[54] = amd_bytealign_S (w[42], w[43], offset); - w[53] = amd_bytealign_S (w[41], w[42], offset); - w[52] = amd_bytealign_S (w[40], w[41], offset); - w[51] = amd_bytealign_S (w[39], w[40], offset); - w[50] = amd_bytealign_S (w[38], w[39], offset); - w[49] = amd_bytealign_S (w[37], w[38], offset); - w[48] = amd_bytealign_S (w[36], w[37], offset); - w[47] = amd_bytealign_S (w[35], w[36], offset); - w[46] = amd_bytealign_S (w[34], w[35], offset); - w[45] = amd_bytealign_S (w[33], w[34], offset); - w[44] = amd_bytealign_S (w[32], w[33], offset); - w[43] = amd_bytealign_S (w[31], w[32], offset); - w[42] = amd_bytealign_S (w[30], w[31], offset); - w[41] = amd_bytealign_S (w[29], w[30], offset); - w[40] = amd_bytealign_S (w[28], w[29], offset); - w[39] = amd_bytealign_S (w[27], w[28], offset); - w[38] = amd_bytealign_S (w[26], w[27], offset); - w[37] = amd_bytealign_S (w[25], w[26], offset); - w[36] = amd_bytealign_S (w[24], w[25], offset); - w[35] = amd_bytealign_S (w[23], w[24], offset); - w[34] = amd_bytealign_S (w[22], w[23], offset); - w[33] = amd_bytealign_S (w[21], w[22], offset); - w[32] = amd_bytealign_S (w[20], w[21], offset); - w[31] = amd_bytealign_S (w[19], w[20], offset); - w[30] = amd_bytealign_S (w[18], w[19], offset); - w[29] = amd_bytealign_S (w[17], w[18], offset); - w[28] = amd_bytealign_S (w[16], w[17], offset); - w[27] = amd_bytealign_S (w[15], w[16], offset); - w[26] = amd_bytealign_S (w[14], w[15], offset); - w[25] = amd_bytealign_S (w[13], w[14], offset); - w[24] = amd_bytealign_S (w[12], w[13], offset); - w[23] = amd_bytealign_S (w[11], w[12], offset); - w[22] = amd_bytealign_S (w[10], w[11], offset); - w[21] = amd_bytealign_S (w[ 9], w[10], offset); - w[20] = amd_bytealign_S (w[ 8], w[ 9], offset); - w[19] = amd_bytealign_S (w[ 7], w[ 8], offset); - w[18] = amd_bytealign_S (w[ 6], w[ 7], offset); - w[17] = amd_bytealign_S (w[ 5], w[ 6], offset); - w[16] = amd_bytealign_S (w[ 4], w[ 5], offset); - w[15] = amd_bytealign_S (w[ 3], w[ 4], offset); - w[14] = amd_bytealign_S (w[ 2], w[ 3], offset); - w[13] = amd_bytealign_S (w[ 1], w[ 2], offset); - w[12] = amd_bytealign_S (w[ 0], w[ 1], offset); - w[11] = amd_bytealign_S ( 0, w[ 0], offset); + w[63] = hc_bytealign_S (w[51], w[52], offset); + w[62] = hc_bytealign_S (w[50], w[51], offset); + w[61] = hc_bytealign_S (w[49], w[50], offset); + w[60] = hc_bytealign_S (w[48], w[49], offset); + w[59] = hc_bytealign_S (w[47], w[48], offset); + w[58] = hc_bytealign_S (w[46], w[47], offset); + w[57] = hc_bytealign_S (w[45], w[46], offset); + w[56] = hc_bytealign_S (w[44], w[45], offset); + w[55] = hc_bytealign_S (w[43], w[44], offset); + w[54] = hc_bytealign_S (w[42], w[43], offset); + w[53] = hc_bytealign_S (w[41], w[42], offset); + w[52] = hc_bytealign_S (w[40], w[41], offset); + w[51] = hc_bytealign_S (w[39], w[40], offset); + w[50] = hc_bytealign_S (w[38], w[39], offset); + w[49] = hc_bytealign_S (w[37], w[38], offset); + w[48] = hc_bytealign_S (w[36], w[37], offset); + w[47] = hc_bytealign_S (w[35], w[36], offset); + w[46] = hc_bytealign_S (w[34], w[35], offset); + w[45] = hc_bytealign_S (w[33], w[34], offset); + w[44] = hc_bytealign_S (w[32], w[33], offset); + w[43] = hc_bytealign_S (w[31], w[32], offset); + w[42] = hc_bytealign_S (w[30], w[31], offset); + w[41] = hc_bytealign_S (w[29], w[30], offset); + w[40] = hc_bytealign_S (w[28], w[29], offset); + w[39] = hc_bytealign_S (w[27], w[28], offset); + w[38] = hc_bytealign_S (w[26], w[27], offset); + w[37] = hc_bytealign_S (w[25], w[26], offset); + w[36] = hc_bytealign_S (w[24], w[25], offset); + w[35] = hc_bytealign_S (w[23], w[24], offset); + w[34] = hc_bytealign_S (w[22], w[23], offset); + w[33] = hc_bytealign_S (w[21], w[22], offset); + w[32] = hc_bytealign_S (w[20], w[21], offset); + w[31] = hc_bytealign_S (w[19], w[20], offset); + w[30] = hc_bytealign_S (w[18], w[19], offset); + w[29] = hc_bytealign_S (w[17], w[18], offset); + w[28] = hc_bytealign_S (w[16], w[17], offset); + w[27] = hc_bytealign_S (w[15], w[16], offset); + w[26] = hc_bytealign_S (w[14], w[15], offset); + w[25] = hc_bytealign_S (w[13], w[14], offset); + w[24] = hc_bytealign_S (w[12], w[13], offset); + w[23] = hc_bytealign_S (w[11], w[12], offset); + w[22] = hc_bytealign_S (w[10], w[11], offset); + w[21] = hc_bytealign_S (w[ 9], w[10], offset); + w[20] = hc_bytealign_S (w[ 8], w[ 9], offset); + w[19] = hc_bytealign_S (w[ 7], w[ 8], offset); + w[18] = hc_bytealign_S (w[ 6], w[ 7], offset); + w[17] = hc_bytealign_S (w[ 5], w[ 6], offset); + w[16] = hc_bytealign_S (w[ 4], w[ 5], offset); + w[15] = hc_bytealign_S (w[ 3], w[ 4], offset); + w[14] = hc_bytealign_S (w[ 2], w[ 3], offset); + w[13] = hc_bytealign_S (w[ 1], w[ 2], offset); + w[12] = hc_bytealign_S (w[ 0], w[ 1], offset); + w[11] = hc_bytealign_S ( 0, w[ 0], offset); w[10] = 0; w[ 9] = 0; w[ 8] = 0; @@ -51902,58 +51902,58 @@ DECLSPEC void switch_buffer_by_offset_1x64_be_S (u32 *w, const u32 offset) break; case 12: - w[63] = amd_bytealign_S (w[50], w[51], offset); - w[62] = amd_bytealign_S (w[49], w[50], offset); - w[61] = amd_bytealign_S (w[48], w[49], offset); - w[60] = amd_bytealign_S (w[47], w[48], offset); - w[59] = amd_bytealign_S (w[46], w[47], offset); - w[58] = amd_bytealign_S (w[45], w[46], offset); - w[57] = amd_bytealign_S (w[44], w[45], offset); - w[56] = amd_bytealign_S (w[43], w[44], offset); - w[55] = amd_bytealign_S (w[42], w[43], offset); - w[54] = amd_bytealign_S (w[41], w[42], offset); - w[53] = amd_bytealign_S (w[40], w[41], offset); - w[52] = amd_bytealign_S (w[39], w[40], offset); - w[51] = amd_bytealign_S (w[38], w[39], offset); - w[50] = amd_bytealign_S (w[37], w[38], offset); - w[49] = amd_bytealign_S (w[36], w[37], offset); - w[48] = amd_bytealign_S (w[35], w[36], offset); - w[47] = amd_bytealign_S (w[34], w[35], offset); - w[46] = amd_bytealign_S (w[33], w[34], offset); - w[45] = amd_bytealign_S (w[32], w[33], offset); - w[44] = amd_bytealign_S (w[31], w[32], offset); - w[43] = amd_bytealign_S (w[30], w[31], offset); - w[42] = amd_bytealign_S (w[29], w[30], offset); - w[41] = amd_bytealign_S (w[28], w[29], offset); - w[40] = amd_bytealign_S (w[27], w[28], offset); - w[39] = amd_bytealign_S (w[26], w[27], offset); - w[38] = amd_bytealign_S (w[25], w[26], offset); - w[37] = amd_bytealign_S (w[24], w[25], offset); - w[36] = amd_bytealign_S (w[23], w[24], offset); - w[35] = amd_bytealign_S (w[22], w[23], offset); - w[34] = amd_bytealign_S (w[21], w[22], offset); - w[33] = amd_bytealign_S (w[20], w[21], offset); - w[32] = amd_bytealign_S (w[19], w[20], offset); - w[31] = amd_bytealign_S (w[18], w[19], offset); - w[30] = amd_bytealign_S (w[17], w[18], offset); - w[29] = amd_bytealign_S (w[16], w[17], offset); - w[28] = amd_bytealign_S (w[15], w[16], offset); - w[27] = amd_bytealign_S (w[14], w[15], offset); - w[26] = amd_bytealign_S (w[13], w[14], offset); - w[25] = amd_bytealign_S (w[12], w[13], offset); - w[24] = amd_bytealign_S (w[11], w[12], offset); - w[23] = amd_bytealign_S (w[10], w[11], offset); - w[22] = amd_bytealign_S (w[ 9], w[10], offset); - w[21] = amd_bytealign_S (w[ 8], w[ 9], offset); - w[20] = amd_bytealign_S (w[ 7], w[ 8], offset); - w[19] = amd_bytealign_S (w[ 6], w[ 7], offset); - w[18] = amd_bytealign_S (w[ 5], w[ 6], offset); - w[17] = amd_bytealign_S (w[ 4], w[ 5], offset); - w[16] = amd_bytealign_S (w[ 3], w[ 4], offset); - w[15] = amd_bytealign_S (w[ 2], w[ 3], offset); - w[14] = amd_bytealign_S (w[ 1], w[ 2], offset); - w[13] = amd_bytealign_S (w[ 0], w[ 1], offset); - w[12] = amd_bytealign_S ( 0, w[ 0], offset); + w[63] = hc_bytealign_S (w[50], w[51], offset); + w[62] = hc_bytealign_S (w[49], w[50], offset); + w[61] = hc_bytealign_S (w[48], w[49], offset); + w[60] = hc_bytealign_S (w[47], w[48], offset); + w[59] = hc_bytealign_S (w[46], w[47], offset); + w[58] = hc_bytealign_S (w[45], w[46], offset); + w[57] = hc_bytealign_S (w[44], w[45], offset); + w[56] = hc_bytealign_S (w[43], w[44], offset); + w[55] = hc_bytealign_S (w[42], w[43], offset); + w[54] = hc_bytealign_S (w[41], w[42], offset); + w[53] = hc_bytealign_S (w[40], w[41], offset); + w[52] = hc_bytealign_S (w[39], w[40], offset); + w[51] = hc_bytealign_S (w[38], w[39], offset); + w[50] = hc_bytealign_S (w[37], w[38], offset); + w[49] = hc_bytealign_S (w[36], w[37], offset); + w[48] = hc_bytealign_S (w[35], w[36], offset); + w[47] = hc_bytealign_S (w[34], w[35], offset); + w[46] = hc_bytealign_S (w[33], w[34], offset); + w[45] = hc_bytealign_S (w[32], w[33], offset); + w[44] = hc_bytealign_S (w[31], w[32], offset); + w[43] = hc_bytealign_S (w[30], w[31], offset); + w[42] = hc_bytealign_S (w[29], w[30], offset); + w[41] = hc_bytealign_S (w[28], w[29], offset); + w[40] = hc_bytealign_S (w[27], w[28], offset); + w[39] = hc_bytealign_S (w[26], w[27], offset); + w[38] = hc_bytealign_S (w[25], w[26], offset); + w[37] = hc_bytealign_S (w[24], w[25], offset); + w[36] = hc_bytealign_S (w[23], w[24], offset); + w[35] = hc_bytealign_S (w[22], w[23], offset); + w[34] = hc_bytealign_S (w[21], w[22], offset); + w[33] = hc_bytealign_S (w[20], w[21], offset); + w[32] = hc_bytealign_S (w[19], w[20], offset); + w[31] = hc_bytealign_S (w[18], w[19], offset); + w[30] = hc_bytealign_S (w[17], w[18], offset); + w[29] = hc_bytealign_S (w[16], w[17], offset); + w[28] = hc_bytealign_S (w[15], w[16], offset); + w[27] = hc_bytealign_S (w[14], w[15], offset); + w[26] = hc_bytealign_S (w[13], w[14], offset); + w[25] = hc_bytealign_S (w[12], w[13], offset); + w[24] = hc_bytealign_S (w[11], w[12], offset); + w[23] = hc_bytealign_S (w[10], w[11], offset); + w[22] = hc_bytealign_S (w[ 9], w[10], offset); + w[21] = hc_bytealign_S (w[ 8], w[ 9], offset); + w[20] = hc_bytealign_S (w[ 7], w[ 8], offset); + w[19] = hc_bytealign_S (w[ 6], w[ 7], offset); + w[18] = hc_bytealign_S (w[ 5], w[ 6], offset); + w[17] = hc_bytealign_S (w[ 4], w[ 5], offset); + w[16] = hc_bytealign_S (w[ 3], w[ 4], offset); + w[15] = hc_bytealign_S (w[ 2], w[ 3], offset); + w[14] = hc_bytealign_S (w[ 1], w[ 2], offset); + w[13] = hc_bytealign_S (w[ 0], w[ 1], offset); + w[12] = hc_bytealign_S ( 0, w[ 0], offset); w[11] = 0; w[10] = 0; w[ 9] = 0; @@ -51970,57 +51970,57 @@ DECLSPEC void switch_buffer_by_offset_1x64_be_S (u32 *w, const u32 offset) break; case 13: - w[63] = amd_bytealign_S (w[49], w[50], offset); - w[62] = amd_bytealign_S (w[48], w[49], offset); - w[61] = amd_bytealign_S (w[47], w[48], offset); - w[60] = amd_bytealign_S (w[46], w[47], offset); - w[59] = amd_bytealign_S (w[45], w[46], offset); - w[58] = amd_bytealign_S (w[44], w[45], offset); - w[57] = amd_bytealign_S (w[43], w[44], offset); - w[56] = amd_bytealign_S (w[42], w[43], offset); - w[55] = amd_bytealign_S (w[41], w[42], offset); - w[54] = amd_bytealign_S (w[40], w[41], offset); - w[53] = amd_bytealign_S (w[39], w[40], offset); - w[52] = amd_bytealign_S (w[38], w[39], offset); - w[51] = amd_bytealign_S (w[37], w[38], offset); - w[50] = amd_bytealign_S (w[36], w[37], offset); - w[49] = amd_bytealign_S (w[35], w[36], offset); - w[48] = amd_bytealign_S (w[34], w[35], offset); - w[47] = amd_bytealign_S (w[33], w[34], offset); - w[46] = amd_bytealign_S (w[32], w[33], offset); - w[45] = amd_bytealign_S (w[31], w[32], offset); - w[44] = amd_bytealign_S (w[30], w[31], offset); - w[43] = amd_bytealign_S (w[29], w[30], offset); - w[42] = amd_bytealign_S (w[28], w[29], offset); - w[41] = amd_bytealign_S (w[27], w[28], offset); - w[40] = amd_bytealign_S (w[26], w[27], offset); - w[39] = amd_bytealign_S (w[25], w[26], offset); - w[38] = amd_bytealign_S (w[24], w[25], offset); - w[37] = amd_bytealign_S (w[23], w[24], offset); - w[36] = amd_bytealign_S (w[22], w[23], offset); - w[35] = amd_bytealign_S (w[21], w[22], offset); - w[34] = amd_bytealign_S (w[20], w[21], offset); - w[33] = amd_bytealign_S (w[19], w[20], offset); - w[32] = amd_bytealign_S (w[18], w[19], offset); - w[31] = amd_bytealign_S (w[17], w[18], offset); - w[30] = amd_bytealign_S (w[16], w[17], offset); - w[29] = amd_bytealign_S (w[15], w[16], offset); - w[28] = amd_bytealign_S (w[14], w[15], offset); - w[27] = amd_bytealign_S (w[13], w[14], offset); - w[26] = amd_bytealign_S (w[12], w[13], offset); - w[25] = amd_bytealign_S (w[11], w[12], offset); - w[24] = amd_bytealign_S (w[10], w[11], offset); - w[23] = amd_bytealign_S (w[ 9], w[10], offset); - w[22] = amd_bytealign_S (w[ 8], w[ 9], offset); - w[21] = amd_bytealign_S (w[ 7], w[ 8], offset); - w[20] = amd_bytealign_S (w[ 6], w[ 7], offset); - w[19] = amd_bytealign_S (w[ 5], w[ 6], offset); - w[18] = amd_bytealign_S (w[ 4], w[ 5], offset); - w[17] = amd_bytealign_S (w[ 3], w[ 4], offset); - w[16] = amd_bytealign_S (w[ 2], w[ 3], offset); - w[15] = amd_bytealign_S (w[ 1], w[ 2], offset); - w[14] = amd_bytealign_S (w[ 0], w[ 1], offset); - w[13] = amd_bytealign_S ( 0, w[ 0], offset); + w[63] = hc_bytealign_S (w[49], w[50], offset); + w[62] = hc_bytealign_S (w[48], w[49], offset); + w[61] = hc_bytealign_S (w[47], w[48], offset); + w[60] = hc_bytealign_S (w[46], w[47], offset); + w[59] = hc_bytealign_S (w[45], w[46], offset); + w[58] = hc_bytealign_S (w[44], w[45], offset); + w[57] = hc_bytealign_S (w[43], w[44], offset); + w[56] = hc_bytealign_S (w[42], w[43], offset); + w[55] = hc_bytealign_S (w[41], w[42], offset); + w[54] = hc_bytealign_S (w[40], w[41], offset); + w[53] = hc_bytealign_S (w[39], w[40], offset); + w[52] = hc_bytealign_S (w[38], w[39], offset); + w[51] = hc_bytealign_S (w[37], w[38], offset); + w[50] = hc_bytealign_S (w[36], w[37], offset); + w[49] = hc_bytealign_S (w[35], w[36], offset); + w[48] = hc_bytealign_S (w[34], w[35], offset); + w[47] = hc_bytealign_S (w[33], w[34], offset); + w[46] = hc_bytealign_S (w[32], w[33], offset); + w[45] = hc_bytealign_S (w[31], w[32], offset); + w[44] = hc_bytealign_S (w[30], w[31], offset); + w[43] = hc_bytealign_S (w[29], w[30], offset); + w[42] = hc_bytealign_S (w[28], w[29], offset); + w[41] = hc_bytealign_S (w[27], w[28], offset); + w[40] = hc_bytealign_S (w[26], w[27], offset); + w[39] = hc_bytealign_S (w[25], w[26], offset); + w[38] = hc_bytealign_S (w[24], w[25], offset); + w[37] = hc_bytealign_S (w[23], w[24], offset); + w[36] = hc_bytealign_S (w[22], w[23], offset); + w[35] = hc_bytealign_S (w[21], w[22], offset); + w[34] = hc_bytealign_S (w[20], w[21], offset); + w[33] = hc_bytealign_S (w[19], w[20], offset); + w[32] = hc_bytealign_S (w[18], w[19], offset); + w[31] = hc_bytealign_S (w[17], w[18], offset); + w[30] = hc_bytealign_S (w[16], w[17], offset); + w[29] = hc_bytealign_S (w[15], w[16], offset); + w[28] = hc_bytealign_S (w[14], w[15], offset); + w[27] = hc_bytealign_S (w[13], w[14], offset); + w[26] = hc_bytealign_S (w[12], w[13], offset); + w[25] = hc_bytealign_S (w[11], w[12], offset); + w[24] = hc_bytealign_S (w[10], w[11], offset); + w[23] = hc_bytealign_S (w[ 9], w[10], offset); + w[22] = hc_bytealign_S (w[ 8], w[ 9], offset); + w[21] = hc_bytealign_S (w[ 7], w[ 8], offset); + w[20] = hc_bytealign_S (w[ 6], w[ 7], offset); + w[19] = hc_bytealign_S (w[ 5], w[ 6], offset); + w[18] = hc_bytealign_S (w[ 4], w[ 5], offset); + w[17] = hc_bytealign_S (w[ 3], w[ 4], offset); + w[16] = hc_bytealign_S (w[ 2], w[ 3], offset); + w[15] = hc_bytealign_S (w[ 1], w[ 2], offset); + w[14] = hc_bytealign_S (w[ 0], w[ 1], offset); + w[13] = hc_bytealign_S ( 0, w[ 0], offset); w[12] = 0; w[11] = 0; w[10] = 0; @@ -52038,56 +52038,56 @@ DECLSPEC void switch_buffer_by_offset_1x64_be_S (u32 *w, const u32 offset) break; case 14: - w[63] = amd_bytealign_S (w[48], w[49], offset); - w[62] = amd_bytealign_S (w[47], w[48], offset); - w[61] = amd_bytealign_S (w[46], w[47], offset); - w[60] = amd_bytealign_S (w[45], w[46], offset); - w[59] = amd_bytealign_S (w[44], w[45], offset); - w[58] = amd_bytealign_S (w[43], w[44], offset); - w[57] = amd_bytealign_S (w[42], w[43], offset); - w[56] = amd_bytealign_S (w[41], w[42], offset); - w[55] = amd_bytealign_S (w[40], w[41], offset); - w[54] = amd_bytealign_S (w[39], w[40], offset); - w[53] = amd_bytealign_S (w[38], w[39], offset); - w[52] = amd_bytealign_S (w[37], w[38], offset); - w[51] = amd_bytealign_S (w[36], w[37], offset); - w[50] = amd_bytealign_S (w[35], w[36], offset); - w[49] = amd_bytealign_S (w[34], w[35], offset); - w[48] = amd_bytealign_S (w[33], w[34], offset); - w[47] = amd_bytealign_S (w[32], w[33], offset); - w[46] = amd_bytealign_S (w[31], w[32], offset); - w[45] = amd_bytealign_S (w[30], w[31], offset); - w[44] = amd_bytealign_S (w[29], w[30], offset); - w[43] = amd_bytealign_S (w[28], w[29], offset); - w[42] = amd_bytealign_S (w[27], w[28], offset); - w[41] = amd_bytealign_S (w[26], w[27], offset); - w[40] = amd_bytealign_S (w[25], w[26], offset); - w[39] = amd_bytealign_S (w[24], w[25], offset); - w[38] = amd_bytealign_S (w[23], w[24], offset); - w[37] = amd_bytealign_S (w[22], w[23], offset); - w[36] = amd_bytealign_S (w[21], w[22], offset); - w[35] = amd_bytealign_S (w[20], w[21], offset); - w[34] = amd_bytealign_S (w[19], w[20], offset); - w[33] = amd_bytealign_S (w[18], w[19], offset); - w[32] = amd_bytealign_S (w[17], w[18], offset); - w[31] = amd_bytealign_S (w[16], w[17], offset); - w[30] = amd_bytealign_S (w[15], w[16], offset); - w[29] = amd_bytealign_S (w[14], w[15], offset); - w[28] = amd_bytealign_S (w[13], w[14], offset); - w[27] = amd_bytealign_S (w[12], w[13], offset); - w[26] = amd_bytealign_S (w[11], w[12], offset); - w[25] = amd_bytealign_S (w[10], w[11], offset); - w[24] = amd_bytealign_S (w[ 9], w[10], offset); - w[23] = amd_bytealign_S (w[ 8], w[ 9], offset); - w[22] = amd_bytealign_S (w[ 7], w[ 8], offset); - w[21] = amd_bytealign_S (w[ 6], w[ 7], offset); - w[20] = amd_bytealign_S (w[ 5], w[ 6], offset); - w[19] = amd_bytealign_S (w[ 4], w[ 5], offset); - w[18] = amd_bytealign_S (w[ 3], w[ 4], offset); - w[17] = amd_bytealign_S (w[ 2], w[ 3], offset); - w[16] = amd_bytealign_S (w[ 1], w[ 2], offset); - w[15] = amd_bytealign_S (w[ 0], w[ 1], offset); - w[14] = amd_bytealign_S ( 0, w[ 0], offset); + w[63] = hc_bytealign_S (w[48], w[49], offset); + w[62] = hc_bytealign_S (w[47], w[48], offset); + w[61] = hc_bytealign_S (w[46], w[47], offset); + w[60] = hc_bytealign_S (w[45], w[46], offset); + w[59] = hc_bytealign_S (w[44], w[45], offset); + w[58] = hc_bytealign_S (w[43], w[44], offset); + w[57] = hc_bytealign_S (w[42], w[43], offset); + w[56] = hc_bytealign_S (w[41], w[42], offset); + w[55] = hc_bytealign_S (w[40], w[41], offset); + w[54] = hc_bytealign_S (w[39], w[40], offset); + w[53] = hc_bytealign_S (w[38], w[39], offset); + w[52] = hc_bytealign_S (w[37], w[38], offset); + w[51] = hc_bytealign_S (w[36], w[37], offset); + w[50] = hc_bytealign_S (w[35], w[36], offset); + w[49] = hc_bytealign_S (w[34], w[35], offset); + w[48] = hc_bytealign_S (w[33], w[34], offset); + w[47] = hc_bytealign_S (w[32], w[33], offset); + w[46] = hc_bytealign_S (w[31], w[32], offset); + w[45] = hc_bytealign_S (w[30], w[31], offset); + w[44] = hc_bytealign_S (w[29], w[30], offset); + w[43] = hc_bytealign_S (w[28], w[29], offset); + w[42] = hc_bytealign_S (w[27], w[28], offset); + w[41] = hc_bytealign_S (w[26], w[27], offset); + w[40] = hc_bytealign_S (w[25], w[26], offset); + w[39] = hc_bytealign_S (w[24], w[25], offset); + w[38] = hc_bytealign_S (w[23], w[24], offset); + w[37] = hc_bytealign_S (w[22], w[23], offset); + w[36] = hc_bytealign_S (w[21], w[22], offset); + w[35] = hc_bytealign_S (w[20], w[21], offset); + w[34] = hc_bytealign_S (w[19], w[20], offset); + w[33] = hc_bytealign_S (w[18], w[19], offset); + w[32] = hc_bytealign_S (w[17], w[18], offset); + w[31] = hc_bytealign_S (w[16], w[17], offset); + w[30] = hc_bytealign_S (w[15], w[16], offset); + w[29] = hc_bytealign_S (w[14], w[15], offset); + w[28] = hc_bytealign_S (w[13], w[14], offset); + w[27] = hc_bytealign_S (w[12], w[13], offset); + w[26] = hc_bytealign_S (w[11], w[12], offset); + w[25] = hc_bytealign_S (w[10], w[11], offset); + w[24] = hc_bytealign_S (w[ 9], w[10], offset); + w[23] = hc_bytealign_S (w[ 8], w[ 9], offset); + w[22] = hc_bytealign_S (w[ 7], w[ 8], offset); + w[21] = hc_bytealign_S (w[ 6], w[ 7], offset); + w[20] = hc_bytealign_S (w[ 5], w[ 6], offset); + w[19] = hc_bytealign_S (w[ 4], w[ 5], offset); + w[18] = hc_bytealign_S (w[ 3], w[ 4], offset); + w[17] = hc_bytealign_S (w[ 2], w[ 3], offset); + w[16] = hc_bytealign_S (w[ 1], w[ 2], offset); + w[15] = hc_bytealign_S (w[ 0], w[ 1], offset); + w[14] = hc_bytealign_S ( 0, w[ 0], offset); w[13] = 0; w[12] = 0; w[11] = 0; @@ -52106,55 +52106,55 @@ DECLSPEC void switch_buffer_by_offset_1x64_be_S (u32 *w, const u32 offset) break; case 15: - w[63] = amd_bytealign_S (w[47], w[48], offset); - w[62] = amd_bytealign_S (w[46], w[47], offset); - w[61] = amd_bytealign_S (w[45], w[46], offset); - w[60] = amd_bytealign_S (w[44], w[45], offset); - w[59] = amd_bytealign_S (w[43], w[44], offset); - w[58] = amd_bytealign_S (w[42], w[43], offset); - w[57] = amd_bytealign_S (w[41], w[42], offset); - w[56] = amd_bytealign_S (w[40], w[41], offset); - w[55] = amd_bytealign_S (w[39], w[40], offset); - w[54] = amd_bytealign_S (w[38], w[39], offset); - w[53] = amd_bytealign_S (w[37], w[38], offset); - w[52] = amd_bytealign_S (w[36], w[37], offset); - w[51] = amd_bytealign_S (w[35], w[36], offset); - w[50] = amd_bytealign_S (w[34], w[35], offset); - w[49] = amd_bytealign_S (w[33], w[34], offset); - w[48] = amd_bytealign_S (w[32], w[33], offset); - w[47] = amd_bytealign_S (w[31], w[32], offset); - w[46] = amd_bytealign_S (w[30], w[31], offset); - w[45] = amd_bytealign_S (w[29], w[30], offset); - w[44] = amd_bytealign_S (w[28], w[29], offset); - w[43] = amd_bytealign_S (w[27], w[28], offset); - w[42] = amd_bytealign_S (w[26], w[27], offset); - w[41] = amd_bytealign_S (w[25], w[26], offset); - w[40] = amd_bytealign_S (w[24], w[25], offset); - w[39] = amd_bytealign_S (w[23], w[24], offset); - w[38] = amd_bytealign_S (w[22], w[23], offset); - w[37] = amd_bytealign_S (w[21], w[22], offset); - w[36] = amd_bytealign_S (w[20], w[21], offset); - w[35] = amd_bytealign_S (w[19], w[20], offset); - w[34] = amd_bytealign_S (w[18], w[19], offset); - w[33] = amd_bytealign_S (w[17], w[18], offset); - w[32] = amd_bytealign_S (w[16], w[17], offset); - w[31] = amd_bytealign_S (w[15], w[16], offset); - w[30] = amd_bytealign_S (w[14], w[15], offset); - w[29] = amd_bytealign_S (w[13], w[14], offset); - w[28] = amd_bytealign_S (w[12], w[13], offset); - w[27] = amd_bytealign_S (w[11], w[12], offset); - w[26] = amd_bytealign_S (w[10], w[11], offset); - w[25] = amd_bytealign_S (w[ 9], w[10], offset); - w[24] = amd_bytealign_S (w[ 8], w[ 9], offset); - w[23] = amd_bytealign_S (w[ 7], w[ 8], offset); - w[22] = amd_bytealign_S (w[ 6], w[ 7], offset); - w[21] = amd_bytealign_S (w[ 5], w[ 6], offset); - w[20] = amd_bytealign_S (w[ 4], w[ 5], offset); - w[19] = amd_bytealign_S (w[ 3], w[ 4], offset); - w[18] = amd_bytealign_S (w[ 2], w[ 3], offset); - w[17] = amd_bytealign_S (w[ 1], w[ 2], offset); - w[16] = amd_bytealign_S (w[ 0], w[ 1], offset); - w[15] = amd_bytealign_S ( 0, w[ 0], offset); + w[63] = hc_bytealign_S (w[47], w[48], offset); + w[62] = hc_bytealign_S (w[46], w[47], offset); + w[61] = hc_bytealign_S (w[45], w[46], offset); + w[60] = hc_bytealign_S (w[44], w[45], offset); + w[59] = hc_bytealign_S (w[43], w[44], offset); + w[58] = hc_bytealign_S (w[42], w[43], offset); + w[57] = hc_bytealign_S (w[41], w[42], offset); + w[56] = hc_bytealign_S (w[40], w[41], offset); + w[55] = hc_bytealign_S (w[39], w[40], offset); + w[54] = hc_bytealign_S (w[38], w[39], offset); + w[53] = hc_bytealign_S (w[37], w[38], offset); + w[52] = hc_bytealign_S (w[36], w[37], offset); + w[51] = hc_bytealign_S (w[35], w[36], offset); + w[50] = hc_bytealign_S (w[34], w[35], offset); + w[49] = hc_bytealign_S (w[33], w[34], offset); + w[48] = hc_bytealign_S (w[32], w[33], offset); + w[47] = hc_bytealign_S (w[31], w[32], offset); + w[46] = hc_bytealign_S (w[30], w[31], offset); + w[45] = hc_bytealign_S (w[29], w[30], offset); + w[44] = hc_bytealign_S (w[28], w[29], offset); + w[43] = hc_bytealign_S (w[27], w[28], offset); + w[42] = hc_bytealign_S (w[26], w[27], offset); + w[41] = hc_bytealign_S (w[25], w[26], offset); + w[40] = hc_bytealign_S (w[24], w[25], offset); + w[39] = hc_bytealign_S (w[23], w[24], offset); + w[38] = hc_bytealign_S (w[22], w[23], offset); + w[37] = hc_bytealign_S (w[21], w[22], offset); + w[36] = hc_bytealign_S (w[20], w[21], offset); + w[35] = hc_bytealign_S (w[19], w[20], offset); + w[34] = hc_bytealign_S (w[18], w[19], offset); + w[33] = hc_bytealign_S (w[17], w[18], offset); + w[32] = hc_bytealign_S (w[16], w[17], offset); + w[31] = hc_bytealign_S (w[15], w[16], offset); + w[30] = hc_bytealign_S (w[14], w[15], offset); + w[29] = hc_bytealign_S (w[13], w[14], offset); + w[28] = hc_bytealign_S (w[12], w[13], offset); + w[27] = hc_bytealign_S (w[11], w[12], offset); + w[26] = hc_bytealign_S (w[10], w[11], offset); + w[25] = hc_bytealign_S (w[ 9], w[10], offset); + w[24] = hc_bytealign_S (w[ 8], w[ 9], offset); + w[23] = hc_bytealign_S (w[ 7], w[ 8], offset); + w[22] = hc_bytealign_S (w[ 6], w[ 7], offset); + w[21] = hc_bytealign_S (w[ 5], w[ 6], offset); + w[20] = hc_bytealign_S (w[ 4], w[ 5], offset); + w[19] = hc_bytealign_S (w[ 3], w[ 4], offset); + w[18] = hc_bytealign_S (w[ 2], w[ 3], offset); + w[17] = hc_bytealign_S (w[ 1], w[ 2], offset); + w[16] = hc_bytealign_S (w[ 0], w[ 1], offset); + w[15] = hc_bytealign_S ( 0, w[ 0], offset); w[14] = 0; w[13] = 0; w[12] = 0; @@ -52174,54 +52174,54 @@ DECLSPEC void switch_buffer_by_offset_1x64_be_S (u32 *w, const u32 offset) break; case 16: - w[63] = amd_bytealign_S (w[46], w[47], offset); - w[62] = amd_bytealign_S (w[45], w[46], offset); - w[61] = amd_bytealign_S (w[44], w[45], offset); - w[60] = amd_bytealign_S (w[43], w[44], offset); - w[59] = amd_bytealign_S (w[42], w[43], offset); - w[58] = amd_bytealign_S (w[41], w[42], offset); - w[57] = amd_bytealign_S (w[40], w[41], offset); - w[56] = amd_bytealign_S (w[39], w[40], offset); - w[55] = amd_bytealign_S (w[38], w[39], offset); - w[54] = amd_bytealign_S (w[37], w[38], offset); - w[53] = amd_bytealign_S (w[36], w[37], offset); - w[52] = amd_bytealign_S (w[35], w[36], offset); - w[51] = amd_bytealign_S (w[34], w[35], offset); - w[50] = amd_bytealign_S (w[33], w[34], offset); - w[49] = amd_bytealign_S (w[32], w[33], offset); - w[48] = amd_bytealign_S (w[31], w[32], offset); - w[47] = amd_bytealign_S (w[30], w[31], offset); - w[46] = amd_bytealign_S (w[29], w[30], offset); - w[45] = amd_bytealign_S (w[28], w[29], offset); - w[44] = amd_bytealign_S (w[27], w[28], offset); - w[43] = amd_bytealign_S (w[26], w[27], offset); - w[42] = amd_bytealign_S (w[25], w[26], offset); - w[41] = amd_bytealign_S (w[24], w[25], offset); - w[40] = amd_bytealign_S (w[23], w[24], offset); - w[39] = amd_bytealign_S (w[22], w[23], offset); - w[38] = amd_bytealign_S (w[21], w[22], offset); - w[37] = amd_bytealign_S (w[20], w[21], offset); - w[36] = amd_bytealign_S (w[19], w[20], offset); - w[35] = amd_bytealign_S (w[18], w[19], offset); - w[34] = amd_bytealign_S (w[17], w[18], offset); - w[33] = amd_bytealign_S (w[16], w[17], offset); - w[32] = amd_bytealign_S (w[15], w[16], offset); - w[31] = amd_bytealign_S (w[14], w[15], offset); - w[30] = amd_bytealign_S (w[13], w[14], offset); - w[29] = amd_bytealign_S (w[12], w[13], offset); - w[28] = amd_bytealign_S (w[11], w[12], offset); - w[27] = amd_bytealign_S (w[10], w[11], offset); - w[26] = amd_bytealign_S (w[ 9], w[10], offset); - w[25] = amd_bytealign_S (w[ 8], w[ 9], offset); - w[24] = amd_bytealign_S (w[ 7], w[ 8], offset); - w[23] = amd_bytealign_S (w[ 6], w[ 7], offset); - w[22] = amd_bytealign_S (w[ 5], w[ 6], offset); - w[21] = amd_bytealign_S (w[ 4], w[ 5], offset); - w[20] = amd_bytealign_S (w[ 3], w[ 4], offset); - w[19] = amd_bytealign_S (w[ 2], w[ 3], offset); - w[18] = amd_bytealign_S (w[ 1], w[ 2], offset); - w[17] = amd_bytealign_S (w[ 0], w[ 1], offset); - w[16] = amd_bytealign_S ( 0, w[ 0], offset); + w[63] = hc_bytealign_S (w[46], w[47], offset); + w[62] = hc_bytealign_S (w[45], w[46], offset); + w[61] = hc_bytealign_S (w[44], w[45], offset); + w[60] = hc_bytealign_S (w[43], w[44], offset); + w[59] = hc_bytealign_S (w[42], w[43], offset); + w[58] = hc_bytealign_S (w[41], w[42], offset); + w[57] = hc_bytealign_S (w[40], w[41], offset); + w[56] = hc_bytealign_S (w[39], w[40], offset); + w[55] = hc_bytealign_S (w[38], w[39], offset); + w[54] = hc_bytealign_S (w[37], w[38], offset); + w[53] = hc_bytealign_S (w[36], w[37], offset); + w[52] = hc_bytealign_S (w[35], w[36], offset); + w[51] = hc_bytealign_S (w[34], w[35], offset); + w[50] = hc_bytealign_S (w[33], w[34], offset); + w[49] = hc_bytealign_S (w[32], w[33], offset); + w[48] = hc_bytealign_S (w[31], w[32], offset); + w[47] = hc_bytealign_S (w[30], w[31], offset); + w[46] = hc_bytealign_S (w[29], w[30], offset); + w[45] = hc_bytealign_S (w[28], w[29], offset); + w[44] = hc_bytealign_S (w[27], w[28], offset); + w[43] = hc_bytealign_S (w[26], w[27], offset); + w[42] = hc_bytealign_S (w[25], w[26], offset); + w[41] = hc_bytealign_S (w[24], w[25], offset); + w[40] = hc_bytealign_S (w[23], w[24], offset); + w[39] = hc_bytealign_S (w[22], w[23], offset); + w[38] = hc_bytealign_S (w[21], w[22], offset); + w[37] = hc_bytealign_S (w[20], w[21], offset); + w[36] = hc_bytealign_S (w[19], w[20], offset); + w[35] = hc_bytealign_S (w[18], w[19], offset); + w[34] = hc_bytealign_S (w[17], w[18], offset); + w[33] = hc_bytealign_S (w[16], w[17], offset); + w[32] = hc_bytealign_S (w[15], w[16], offset); + w[31] = hc_bytealign_S (w[14], w[15], offset); + w[30] = hc_bytealign_S (w[13], w[14], offset); + w[29] = hc_bytealign_S (w[12], w[13], offset); + w[28] = hc_bytealign_S (w[11], w[12], offset); + w[27] = hc_bytealign_S (w[10], w[11], offset); + w[26] = hc_bytealign_S (w[ 9], w[10], offset); + w[25] = hc_bytealign_S (w[ 8], w[ 9], offset); + w[24] = hc_bytealign_S (w[ 7], w[ 8], offset); + w[23] = hc_bytealign_S (w[ 6], w[ 7], offset); + w[22] = hc_bytealign_S (w[ 5], w[ 6], offset); + w[21] = hc_bytealign_S (w[ 4], w[ 5], offset); + w[20] = hc_bytealign_S (w[ 3], w[ 4], offset); + w[19] = hc_bytealign_S (w[ 2], w[ 3], offset); + w[18] = hc_bytealign_S (w[ 1], w[ 2], offset); + w[17] = hc_bytealign_S (w[ 0], w[ 1], offset); + w[16] = hc_bytealign_S ( 0, w[ 0], offset); w[15] = 0; w[14] = 0; w[13] = 0; @@ -52242,53 +52242,53 @@ DECLSPEC void switch_buffer_by_offset_1x64_be_S (u32 *w, const u32 offset) break; case 17: - w[63] = amd_bytealign_S (w[45], w[46], offset); - w[62] = amd_bytealign_S (w[44], w[45], offset); - w[61] = amd_bytealign_S (w[43], w[44], offset); - w[60] = amd_bytealign_S (w[42], w[43], offset); - w[59] = amd_bytealign_S (w[41], w[42], offset); - w[58] = amd_bytealign_S (w[40], w[41], offset); - w[57] = amd_bytealign_S (w[39], w[40], offset); - w[56] = amd_bytealign_S (w[38], w[39], offset); - w[55] = amd_bytealign_S (w[37], w[38], offset); - w[54] = amd_bytealign_S (w[36], w[37], offset); - w[53] = amd_bytealign_S (w[35], w[36], offset); - w[52] = amd_bytealign_S (w[34], w[35], offset); - w[51] = amd_bytealign_S (w[33], w[34], offset); - w[50] = amd_bytealign_S (w[32], w[33], offset); - w[49] = amd_bytealign_S (w[31], w[32], offset); - w[48] = amd_bytealign_S (w[30], w[31], offset); - w[47] = amd_bytealign_S (w[29], w[30], offset); - w[46] = amd_bytealign_S (w[28], w[29], offset); - w[45] = amd_bytealign_S (w[27], w[28], offset); - w[44] = amd_bytealign_S (w[26], w[27], offset); - w[43] = amd_bytealign_S (w[25], w[26], offset); - w[42] = amd_bytealign_S (w[24], w[25], offset); - w[41] = amd_bytealign_S (w[23], w[24], offset); - w[40] = amd_bytealign_S (w[22], w[23], offset); - w[39] = amd_bytealign_S (w[21], w[22], offset); - w[38] = amd_bytealign_S (w[20], w[21], offset); - w[37] = amd_bytealign_S (w[19], w[20], offset); - w[36] = amd_bytealign_S (w[18], w[19], offset); - w[35] = amd_bytealign_S (w[17], w[18], offset); - w[34] = amd_bytealign_S (w[16], w[17], offset); - w[33] = amd_bytealign_S (w[15], w[16], offset); - w[32] = amd_bytealign_S (w[14], w[15], offset); - w[31] = amd_bytealign_S (w[13], w[14], offset); - w[30] = amd_bytealign_S (w[12], w[13], offset); - w[29] = amd_bytealign_S (w[11], w[12], offset); - w[28] = amd_bytealign_S (w[10], w[11], offset); - w[27] = amd_bytealign_S (w[ 9], w[10], offset); - w[26] = amd_bytealign_S (w[ 8], w[ 9], offset); - w[25] = amd_bytealign_S (w[ 7], w[ 8], offset); - w[24] = amd_bytealign_S (w[ 6], w[ 7], offset); - w[23] = amd_bytealign_S (w[ 5], w[ 6], offset); - w[22] = amd_bytealign_S (w[ 4], w[ 5], offset); - w[21] = amd_bytealign_S (w[ 3], w[ 4], offset); - w[20] = amd_bytealign_S (w[ 2], w[ 3], offset); - w[19] = amd_bytealign_S (w[ 1], w[ 2], offset); - w[18] = amd_bytealign_S (w[ 0], w[ 1], offset); - w[17] = amd_bytealign_S ( 0, w[ 0], offset); + w[63] = hc_bytealign_S (w[45], w[46], offset); + w[62] = hc_bytealign_S (w[44], w[45], offset); + w[61] = hc_bytealign_S (w[43], w[44], offset); + w[60] = hc_bytealign_S (w[42], w[43], offset); + w[59] = hc_bytealign_S (w[41], w[42], offset); + w[58] = hc_bytealign_S (w[40], w[41], offset); + w[57] = hc_bytealign_S (w[39], w[40], offset); + w[56] = hc_bytealign_S (w[38], w[39], offset); + w[55] = hc_bytealign_S (w[37], w[38], offset); + w[54] = hc_bytealign_S (w[36], w[37], offset); + w[53] = hc_bytealign_S (w[35], w[36], offset); + w[52] = hc_bytealign_S (w[34], w[35], offset); + w[51] = hc_bytealign_S (w[33], w[34], offset); + w[50] = hc_bytealign_S (w[32], w[33], offset); + w[49] = hc_bytealign_S (w[31], w[32], offset); + w[48] = hc_bytealign_S (w[30], w[31], offset); + w[47] = hc_bytealign_S (w[29], w[30], offset); + w[46] = hc_bytealign_S (w[28], w[29], offset); + w[45] = hc_bytealign_S (w[27], w[28], offset); + w[44] = hc_bytealign_S (w[26], w[27], offset); + w[43] = hc_bytealign_S (w[25], w[26], offset); + w[42] = hc_bytealign_S (w[24], w[25], offset); + w[41] = hc_bytealign_S (w[23], w[24], offset); + w[40] = hc_bytealign_S (w[22], w[23], offset); + w[39] = hc_bytealign_S (w[21], w[22], offset); + w[38] = hc_bytealign_S (w[20], w[21], offset); + w[37] = hc_bytealign_S (w[19], w[20], offset); + w[36] = hc_bytealign_S (w[18], w[19], offset); + w[35] = hc_bytealign_S (w[17], w[18], offset); + w[34] = hc_bytealign_S (w[16], w[17], offset); + w[33] = hc_bytealign_S (w[15], w[16], offset); + w[32] = hc_bytealign_S (w[14], w[15], offset); + w[31] = hc_bytealign_S (w[13], w[14], offset); + w[30] = hc_bytealign_S (w[12], w[13], offset); + w[29] = hc_bytealign_S (w[11], w[12], offset); + w[28] = hc_bytealign_S (w[10], w[11], offset); + w[27] = hc_bytealign_S (w[ 9], w[10], offset); + w[26] = hc_bytealign_S (w[ 8], w[ 9], offset); + w[25] = hc_bytealign_S (w[ 7], w[ 8], offset); + w[24] = hc_bytealign_S (w[ 6], w[ 7], offset); + w[23] = hc_bytealign_S (w[ 5], w[ 6], offset); + w[22] = hc_bytealign_S (w[ 4], w[ 5], offset); + w[21] = hc_bytealign_S (w[ 3], w[ 4], offset); + w[20] = hc_bytealign_S (w[ 2], w[ 3], offset); + w[19] = hc_bytealign_S (w[ 1], w[ 2], offset); + w[18] = hc_bytealign_S (w[ 0], w[ 1], offset); + w[17] = hc_bytealign_S ( 0, w[ 0], offset); w[16] = 0; w[15] = 0; w[14] = 0; @@ -52310,52 +52310,52 @@ DECLSPEC void switch_buffer_by_offset_1x64_be_S (u32 *w, const u32 offset) break; case 18: - w[63] = amd_bytealign_S (w[44], w[45], offset); - w[62] = amd_bytealign_S (w[43], w[44], offset); - w[61] = amd_bytealign_S (w[42], w[43], offset); - w[60] = amd_bytealign_S (w[41], w[42], offset); - w[59] = amd_bytealign_S (w[40], w[41], offset); - w[58] = amd_bytealign_S (w[39], w[40], offset); - w[57] = amd_bytealign_S (w[38], w[39], offset); - w[56] = amd_bytealign_S (w[37], w[38], offset); - w[55] = amd_bytealign_S (w[36], w[37], offset); - w[54] = amd_bytealign_S (w[35], w[36], offset); - w[53] = amd_bytealign_S (w[34], w[35], offset); - w[52] = amd_bytealign_S (w[33], w[34], offset); - w[51] = amd_bytealign_S (w[32], w[33], offset); - w[50] = amd_bytealign_S (w[31], w[32], offset); - w[49] = amd_bytealign_S (w[30], w[31], offset); - w[48] = amd_bytealign_S (w[29], w[30], offset); - w[47] = amd_bytealign_S (w[28], w[29], offset); - w[46] = amd_bytealign_S (w[27], w[28], offset); - w[45] = amd_bytealign_S (w[26], w[27], offset); - w[44] = amd_bytealign_S (w[25], w[26], offset); - w[43] = amd_bytealign_S (w[24], w[25], offset); - w[42] = amd_bytealign_S (w[23], w[24], offset); - w[41] = amd_bytealign_S (w[22], w[23], offset); - w[40] = amd_bytealign_S (w[21], w[22], offset); - w[39] = amd_bytealign_S (w[20], w[21], offset); - w[38] = amd_bytealign_S (w[19], w[20], offset); - w[37] = amd_bytealign_S (w[18], w[19], offset); - w[36] = amd_bytealign_S (w[17], w[18], offset); - w[35] = amd_bytealign_S (w[16], w[17], offset); - w[34] = amd_bytealign_S (w[15], w[16], offset); - w[33] = amd_bytealign_S (w[14], w[15], offset); - w[32] = amd_bytealign_S (w[13], w[14], offset); - w[31] = amd_bytealign_S (w[12], w[13], offset); - w[30] = amd_bytealign_S (w[11], w[12], offset); - w[29] = amd_bytealign_S (w[10], w[11], offset); - w[28] = amd_bytealign_S (w[ 9], w[10], offset); - w[27] = amd_bytealign_S (w[ 8], w[ 9], offset); - w[26] = amd_bytealign_S (w[ 7], w[ 8], offset); - w[25] = amd_bytealign_S (w[ 6], w[ 7], offset); - w[24] = amd_bytealign_S (w[ 5], w[ 6], offset); - w[23] = amd_bytealign_S (w[ 4], w[ 5], offset); - w[22] = amd_bytealign_S (w[ 3], w[ 4], offset); - w[21] = amd_bytealign_S (w[ 2], w[ 3], offset); - w[20] = amd_bytealign_S (w[ 1], w[ 2], offset); - w[19] = amd_bytealign_S (w[ 0], w[ 1], offset); - w[18] = amd_bytealign_S ( 0, w[ 0], offset); + w[63] = hc_bytealign_S (w[44], w[45], offset); + w[62] = hc_bytealign_S (w[43], w[44], offset); + w[61] = hc_bytealign_S (w[42], w[43], offset); + w[60] = hc_bytealign_S (w[41], w[42], offset); + w[59] = hc_bytealign_S (w[40], w[41], offset); + w[58] = hc_bytealign_S (w[39], w[40], offset); + w[57] = hc_bytealign_S (w[38], w[39], offset); + w[56] = hc_bytealign_S (w[37], w[38], offset); + w[55] = hc_bytealign_S (w[36], w[37], offset); + w[54] = hc_bytealign_S (w[35], w[36], offset); + w[53] = hc_bytealign_S (w[34], w[35], offset); + w[52] = hc_bytealign_S (w[33], w[34], offset); + w[51] = hc_bytealign_S (w[32], w[33], offset); + w[50] = hc_bytealign_S (w[31], w[32], offset); + w[49] = hc_bytealign_S (w[30], w[31], offset); + w[48] = hc_bytealign_S (w[29], w[30], offset); + w[47] = hc_bytealign_S (w[28], w[29], offset); + w[46] = hc_bytealign_S (w[27], w[28], offset); + w[45] = hc_bytealign_S (w[26], w[27], offset); + w[44] = hc_bytealign_S (w[25], w[26], offset); + w[43] = hc_bytealign_S (w[24], w[25], offset); + w[42] = hc_bytealign_S (w[23], w[24], offset); + w[41] = hc_bytealign_S (w[22], w[23], offset); + w[40] = hc_bytealign_S (w[21], w[22], offset); + w[39] = hc_bytealign_S (w[20], w[21], offset); + w[38] = hc_bytealign_S (w[19], w[20], offset); + w[37] = hc_bytealign_S (w[18], w[19], offset); + w[36] = hc_bytealign_S (w[17], w[18], offset); + w[35] = hc_bytealign_S (w[16], w[17], offset); + w[34] = hc_bytealign_S (w[15], w[16], offset); + w[33] = hc_bytealign_S (w[14], w[15], offset); + w[32] = hc_bytealign_S (w[13], w[14], offset); + w[31] = hc_bytealign_S (w[12], w[13], offset); + w[30] = hc_bytealign_S (w[11], w[12], offset); + w[29] = hc_bytealign_S (w[10], w[11], offset); + w[28] = hc_bytealign_S (w[ 9], w[10], offset); + w[27] = hc_bytealign_S (w[ 8], w[ 9], offset); + w[26] = hc_bytealign_S (w[ 7], w[ 8], offset); + w[25] = hc_bytealign_S (w[ 6], w[ 7], offset); + w[24] = hc_bytealign_S (w[ 5], w[ 6], offset); + w[23] = hc_bytealign_S (w[ 4], w[ 5], offset); + w[22] = hc_bytealign_S (w[ 3], w[ 4], offset); + w[21] = hc_bytealign_S (w[ 2], w[ 3], offset); + w[20] = hc_bytealign_S (w[ 1], w[ 2], offset); + w[19] = hc_bytealign_S (w[ 0], w[ 1], offset); + w[18] = hc_bytealign_S ( 0, w[ 0], offset); w[17] = 0; w[16] = 0; w[15] = 0; @@ -52378,51 +52378,51 @@ DECLSPEC void switch_buffer_by_offset_1x64_be_S (u32 *w, const u32 offset) break; case 19: - w[63] = amd_bytealign_S (w[43], w[44], offset); - w[62] = amd_bytealign_S (w[42], w[43], offset); - w[61] = amd_bytealign_S (w[41], w[42], offset); - w[60] = amd_bytealign_S (w[40], w[41], offset); - w[59] = amd_bytealign_S (w[39], w[40], offset); - w[58] = amd_bytealign_S (w[38], w[39], offset); - w[57] = amd_bytealign_S (w[37], w[38], offset); - w[56] = amd_bytealign_S (w[36], w[37], offset); - w[55] = amd_bytealign_S (w[35], w[36], offset); - w[54] = amd_bytealign_S (w[34], w[35], offset); - w[53] = amd_bytealign_S (w[33], w[34], offset); - w[52] = amd_bytealign_S (w[32], w[33], offset); - w[51] = amd_bytealign_S (w[31], w[32], offset); - w[50] = amd_bytealign_S (w[30], w[31], offset); - w[49] = amd_bytealign_S (w[29], w[30], offset); - w[48] = amd_bytealign_S (w[28], w[29], offset); - w[47] = amd_bytealign_S (w[27], w[28], offset); - w[46] = amd_bytealign_S (w[26], w[27], offset); - w[45] = amd_bytealign_S (w[25], w[26], offset); - w[44] = amd_bytealign_S (w[24], w[25], offset); - w[43] = amd_bytealign_S (w[23], w[24], offset); - w[42] = amd_bytealign_S (w[22], w[23], offset); - w[41] = amd_bytealign_S (w[21], w[22], offset); - w[40] = amd_bytealign_S (w[20], w[21], offset); - w[39] = amd_bytealign_S (w[19], w[20], offset); - w[38] = amd_bytealign_S (w[18], w[19], offset); - w[37] = amd_bytealign_S (w[17], w[18], offset); - w[36] = amd_bytealign_S (w[16], w[17], offset); - w[35] = amd_bytealign_S (w[15], w[16], offset); - w[34] = amd_bytealign_S (w[14], w[15], offset); - w[33] = amd_bytealign_S (w[13], w[14], offset); - w[32] = amd_bytealign_S (w[12], w[13], offset); - w[31] = amd_bytealign_S (w[11], w[12], offset); - w[30] = amd_bytealign_S (w[10], w[11], offset); - w[29] = amd_bytealign_S (w[ 9], w[10], offset); - w[28] = amd_bytealign_S (w[ 8], w[ 9], offset); - w[27] = amd_bytealign_S (w[ 7], w[ 8], offset); - w[26] = amd_bytealign_S (w[ 6], w[ 7], offset); - w[25] = amd_bytealign_S (w[ 5], w[ 6], offset); - w[24] = amd_bytealign_S (w[ 4], w[ 5], offset); - w[23] = amd_bytealign_S (w[ 3], w[ 4], offset); - w[22] = amd_bytealign_S (w[ 2], w[ 3], offset); - w[21] = amd_bytealign_S (w[ 1], w[ 2], offset); - w[20] = amd_bytealign_S (w[ 0], w[ 1], offset); - w[19] = amd_bytealign_S ( 0, w[ 0], offset); + w[63] = hc_bytealign_S (w[43], w[44], offset); + w[62] = hc_bytealign_S (w[42], w[43], offset); + w[61] = hc_bytealign_S (w[41], w[42], offset); + w[60] = hc_bytealign_S (w[40], w[41], offset); + w[59] = hc_bytealign_S (w[39], w[40], offset); + w[58] = hc_bytealign_S (w[38], w[39], offset); + w[57] = hc_bytealign_S (w[37], w[38], offset); + w[56] = hc_bytealign_S (w[36], w[37], offset); + w[55] = hc_bytealign_S (w[35], w[36], offset); + w[54] = hc_bytealign_S (w[34], w[35], offset); + w[53] = hc_bytealign_S (w[33], w[34], offset); + w[52] = hc_bytealign_S (w[32], w[33], offset); + w[51] = hc_bytealign_S (w[31], w[32], offset); + w[50] = hc_bytealign_S (w[30], w[31], offset); + w[49] = hc_bytealign_S (w[29], w[30], offset); + w[48] = hc_bytealign_S (w[28], w[29], offset); + w[47] = hc_bytealign_S (w[27], w[28], offset); + w[46] = hc_bytealign_S (w[26], w[27], offset); + w[45] = hc_bytealign_S (w[25], w[26], offset); + w[44] = hc_bytealign_S (w[24], w[25], offset); + w[43] = hc_bytealign_S (w[23], w[24], offset); + w[42] = hc_bytealign_S (w[22], w[23], offset); + w[41] = hc_bytealign_S (w[21], w[22], offset); + w[40] = hc_bytealign_S (w[20], w[21], offset); + w[39] = hc_bytealign_S (w[19], w[20], offset); + w[38] = hc_bytealign_S (w[18], w[19], offset); + w[37] = hc_bytealign_S (w[17], w[18], offset); + w[36] = hc_bytealign_S (w[16], w[17], offset); + w[35] = hc_bytealign_S (w[15], w[16], offset); + w[34] = hc_bytealign_S (w[14], w[15], offset); + w[33] = hc_bytealign_S (w[13], w[14], offset); + w[32] = hc_bytealign_S (w[12], w[13], offset); + w[31] = hc_bytealign_S (w[11], w[12], offset); + w[30] = hc_bytealign_S (w[10], w[11], offset); + w[29] = hc_bytealign_S (w[ 9], w[10], offset); + w[28] = hc_bytealign_S (w[ 8], w[ 9], offset); + w[27] = hc_bytealign_S (w[ 7], w[ 8], offset); + w[26] = hc_bytealign_S (w[ 6], w[ 7], offset); + w[25] = hc_bytealign_S (w[ 5], w[ 6], offset); + w[24] = hc_bytealign_S (w[ 4], w[ 5], offset); + w[23] = hc_bytealign_S (w[ 3], w[ 4], offset); + w[22] = hc_bytealign_S (w[ 2], w[ 3], offset); + w[21] = hc_bytealign_S (w[ 1], w[ 2], offset); + w[20] = hc_bytealign_S (w[ 0], w[ 1], offset); + w[19] = hc_bytealign_S ( 0, w[ 0], offset); w[18] = 0; w[17] = 0; w[16] = 0; @@ -52446,50 +52446,50 @@ DECLSPEC void switch_buffer_by_offset_1x64_be_S (u32 *w, const u32 offset) break; case 20: - w[63] = amd_bytealign_S (w[42], w[43], offset); - w[62] = amd_bytealign_S (w[41], w[42], offset); - w[61] = amd_bytealign_S (w[40], w[41], offset); - w[60] = amd_bytealign_S (w[39], w[40], offset); - w[59] = amd_bytealign_S (w[38], w[39], offset); - w[58] = amd_bytealign_S (w[37], w[38], offset); - w[57] = amd_bytealign_S (w[36], w[37], offset); - w[56] = amd_bytealign_S (w[35], w[36], offset); - w[55] = amd_bytealign_S (w[34], w[35], offset); - w[54] = amd_bytealign_S (w[33], w[34], offset); - w[53] = amd_bytealign_S (w[32], w[33], offset); - w[52] = amd_bytealign_S (w[31], w[32], offset); - w[51] = amd_bytealign_S (w[30], w[31], offset); - w[50] = amd_bytealign_S (w[29], w[30], offset); - w[49] = amd_bytealign_S (w[28], w[29], offset); - w[48] = amd_bytealign_S (w[27], w[28], offset); - w[47] = amd_bytealign_S (w[26], w[27], offset); - w[46] = amd_bytealign_S (w[25], w[26], offset); - w[45] = amd_bytealign_S (w[24], w[25], offset); - w[44] = amd_bytealign_S (w[23], w[24], offset); - w[43] = amd_bytealign_S (w[22], w[23], offset); - w[42] = amd_bytealign_S (w[21], w[22], offset); - w[41] = amd_bytealign_S (w[20], w[21], offset); - w[40] = amd_bytealign_S (w[19], w[20], offset); - w[39] = amd_bytealign_S (w[18], w[19], offset); - w[38] = amd_bytealign_S (w[17], w[18], offset); - w[37] = amd_bytealign_S (w[16], w[17], offset); - w[36] = amd_bytealign_S (w[15], w[16], offset); - w[35] = amd_bytealign_S (w[14], w[15], offset); - w[34] = amd_bytealign_S (w[13], w[14], offset); - w[33] = amd_bytealign_S (w[12], w[13], offset); - w[32] = amd_bytealign_S (w[11], w[12], offset); - w[31] = amd_bytealign_S (w[10], w[11], offset); - w[30] = amd_bytealign_S (w[ 9], w[10], offset); - w[29] = amd_bytealign_S (w[ 8], w[ 9], offset); - w[28] = amd_bytealign_S (w[ 7], w[ 8], offset); - w[27] = amd_bytealign_S (w[ 6], w[ 7], offset); - w[26] = amd_bytealign_S (w[ 5], w[ 6], offset); - w[25] = amd_bytealign_S (w[ 4], w[ 5], offset); - w[24] = amd_bytealign_S (w[ 3], w[ 4], offset); - w[23] = amd_bytealign_S (w[ 2], w[ 3], offset); - w[22] = amd_bytealign_S (w[ 1], w[ 2], offset); - w[21] = amd_bytealign_S (w[ 0], w[ 1], offset); - w[20] = amd_bytealign_S ( 0, w[ 0], offset); + w[63] = hc_bytealign_S (w[42], w[43], offset); + w[62] = hc_bytealign_S (w[41], w[42], offset); + w[61] = hc_bytealign_S (w[40], w[41], offset); + w[60] = hc_bytealign_S (w[39], w[40], offset); + w[59] = hc_bytealign_S (w[38], w[39], offset); + w[58] = hc_bytealign_S (w[37], w[38], offset); + w[57] = hc_bytealign_S (w[36], w[37], offset); + w[56] = hc_bytealign_S (w[35], w[36], offset); + w[55] = hc_bytealign_S (w[34], w[35], offset); + w[54] = hc_bytealign_S (w[33], w[34], offset); + w[53] = hc_bytealign_S (w[32], w[33], offset); + w[52] = hc_bytealign_S (w[31], w[32], offset); + w[51] = hc_bytealign_S (w[30], w[31], offset); + w[50] = hc_bytealign_S (w[29], w[30], offset); + w[49] = hc_bytealign_S (w[28], w[29], offset); + w[48] = hc_bytealign_S (w[27], w[28], offset); + w[47] = hc_bytealign_S (w[26], w[27], offset); + w[46] = hc_bytealign_S (w[25], w[26], offset); + w[45] = hc_bytealign_S (w[24], w[25], offset); + w[44] = hc_bytealign_S (w[23], w[24], offset); + w[43] = hc_bytealign_S (w[22], w[23], offset); + w[42] = hc_bytealign_S (w[21], w[22], offset); + w[41] = hc_bytealign_S (w[20], w[21], offset); + w[40] = hc_bytealign_S (w[19], w[20], offset); + w[39] = hc_bytealign_S (w[18], w[19], offset); + w[38] = hc_bytealign_S (w[17], w[18], offset); + w[37] = hc_bytealign_S (w[16], w[17], offset); + w[36] = hc_bytealign_S (w[15], w[16], offset); + w[35] = hc_bytealign_S (w[14], w[15], offset); + w[34] = hc_bytealign_S (w[13], w[14], offset); + w[33] = hc_bytealign_S (w[12], w[13], offset); + w[32] = hc_bytealign_S (w[11], w[12], offset); + w[31] = hc_bytealign_S (w[10], w[11], offset); + w[30] = hc_bytealign_S (w[ 9], w[10], offset); + w[29] = hc_bytealign_S (w[ 8], w[ 9], offset); + w[28] = hc_bytealign_S (w[ 7], w[ 8], offset); + w[27] = hc_bytealign_S (w[ 6], w[ 7], offset); + w[26] = hc_bytealign_S (w[ 5], w[ 6], offset); + w[25] = hc_bytealign_S (w[ 4], w[ 5], offset); + w[24] = hc_bytealign_S (w[ 3], w[ 4], offset); + w[23] = hc_bytealign_S (w[ 2], w[ 3], offset); + w[22] = hc_bytealign_S (w[ 1], w[ 2], offset); + w[21] = hc_bytealign_S (w[ 0], w[ 1], offset); + w[20] = hc_bytealign_S ( 0, w[ 0], offset); w[19] = 0; w[18] = 0; w[17] = 0; @@ -52514,49 +52514,49 @@ DECLSPEC void switch_buffer_by_offset_1x64_be_S (u32 *w, const u32 offset) break; case 21: - w[63] = amd_bytealign_S (w[41], w[42], offset); - w[62] = amd_bytealign_S (w[40], w[41], offset); - w[61] = amd_bytealign_S (w[39], w[40], offset); - w[60] = amd_bytealign_S (w[38], w[39], offset); - w[59] = amd_bytealign_S (w[37], w[38], offset); - w[58] = amd_bytealign_S (w[36], w[37], offset); - w[57] = amd_bytealign_S (w[35], w[36], offset); - w[56] = amd_bytealign_S (w[34], w[35], offset); - w[55] = amd_bytealign_S (w[33], w[34], offset); - w[54] = amd_bytealign_S (w[32], w[33], offset); - w[53] = amd_bytealign_S (w[31], w[32], offset); - w[52] = amd_bytealign_S (w[30], w[31], offset); - w[51] = amd_bytealign_S (w[29], w[30], offset); - w[50] = amd_bytealign_S (w[28], w[29], offset); - w[49] = amd_bytealign_S (w[27], w[28], offset); - w[48] = amd_bytealign_S (w[26], w[27], offset); - w[47] = amd_bytealign_S (w[25], w[26], offset); - w[46] = amd_bytealign_S (w[24], w[25], offset); - w[45] = amd_bytealign_S (w[23], w[24], offset); - w[44] = amd_bytealign_S (w[22], w[23], offset); - w[43] = amd_bytealign_S (w[21], w[22], offset); - w[42] = amd_bytealign_S (w[20], w[21], offset); - w[41] = amd_bytealign_S (w[19], w[20], offset); - w[40] = amd_bytealign_S (w[18], w[19], offset); - w[39] = amd_bytealign_S (w[17], w[18], offset); - w[38] = amd_bytealign_S (w[16], w[17], offset); - w[37] = amd_bytealign_S (w[15], w[16], offset); - w[36] = amd_bytealign_S (w[14], w[15], offset); - w[35] = amd_bytealign_S (w[13], w[14], offset); - w[34] = amd_bytealign_S (w[12], w[13], offset); - w[33] = amd_bytealign_S (w[11], w[12], offset); - w[32] = amd_bytealign_S (w[10], w[11], offset); - w[31] = amd_bytealign_S (w[ 9], w[10], offset); - w[30] = amd_bytealign_S (w[ 8], w[ 9], offset); - w[29] = amd_bytealign_S (w[ 7], w[ 8], offset); - w[28] = amd_bytealign_S (w[ 6], w[ 7], offset); - w[27] = amd_bytealign_S (w[ 5], w[ 6], offset); - w[26] = amd_bytealign_S (w[ 4], w[ 5], offset); - w[25] = amd_bytealign_S (w[ 3], w[ 4], offset); - w[24] = amd_bytealign_S (w[ 2], w[ 3], offset); - w[23] = amd_bytealign_S (w[ 1], w[ 2], offset); - w[22] = amd_bytealign_S (w[ 0], w[ 1], offset); - w[21] = amd_bytealign_S ( 0, w[ 0], offset); + w[63] = hc_bytealign_S (w[41], w[42], offset); + w[62] = hc_bytealign_S (w[40], w[41], offset); + w[61] = hc_bytealign_S (w[39], w[40], offset); + w[60] = hc_bytealign_S (w[38], w[39], offset); + w[59] = hc_bytealign_S (w[37], w[38], offset); + w[58] = hc_bytealign_S (w[36], w[37], offset); + w[57] = hc_bytealign_S (w[35], w[36], offset); + w[56] = hc_bytealign_S (w[34], w[35], offset); + w[55] = hc_bytealign_S (w[33], w[34], offset); + w[54] = hc_bytealign_S (w[32], w[33], offset); + w[53] = hc_bytealign_S (w[31], w[32], offset); + w[52] = hc_bytealign_S (w[30], w[31], offset); + w[51] = hc_bytealign_S (w[29], w[30], offset); + w[50] = hc_bytealign_S (w[28], w[29], offset); + w[49] = hc_bytealign_S (w[27], w[28], offset); + w[48] = hc_bytealign_S (w[26], w[27], offset); + w[47] = hc_bytealign_S (w[25], w[26], offset); + w[46] = hc_bytealign_S (w[24], w[25], offset); + w[45] = hc_bytealign_S (w[23], w[24], offset); + w[44] = hc_bytealign_S (w[22], w[23], offset); + w[43] = hc_bytealign_S (w[21], w[22], offset); + w[42] = hc_bytealign_S (w[20], w[21], offset); + w[41] = hc_bytealign_S (w[19], w[20], offset); + w[40] = hc_bytealign_S (w[18], w[19], offset); + w[39] = hc_bytealign_S (w[17], w[18], offset); + w[38] = hc_bytealign_S (w[16], w[17], offset); + w[37] = hc_bytealign_S (w[15], w[16], offset); + w[36] = hc_bytealign_S (w[14], w[15], offset); + w[35] = hc_bytealign_S (w[13], w[14], offset); + w[34] = hc_bytealign_S (w[12], w[13], offset); + w[33] = hc_bytealign_S (w[11], w[12], offset); + w[32] = hc_bytealign_S (w[10], w[11], offset); + w[31] = hc_bytealign_S (w[ 9], w[10], offset); + w[30] = hc_bytealign_S (w[ 8], w[ 9], offset); + w[29] = hc_bytealign_S (w[ 7], w[ 8], offset); + w[28] = hc_bytealign_S (w[ 6], w[ 7], offset); + w[27] = hc_bytealign_S (w[ 5], w[ 6], offset); + w[26] = hc_bytealign_S (w[ 4], w[ 5], offset); + w[25] = hc_bytealign_S (w[ 3], w[ 4], offset); + w[24] = hc_bytealign_S (w[ 2], w[ 3], offset); + w[23] = hc_bytealign_S (w[ 1], w[ 2], offset); + w[22] = hc_bytealign_S (w[ 0], w[ 1], offset); + w[21] = hc_bytealign_S ( 0, w[ 0], offset); w[20] = 0; w[19] = 0; w[18] = 0; @@ -52582,48 +52582,48 @@ DECLSPEC void switch_buffer_by_offset_1x64_be_S (u32 *w, const u32 offset) break; case 22: - w[63] = amd_bytealign_S (w[40], w[41], offset); - w[62] = amd_bytealign_S (w[39], w[40], offset); - w[61] = amd_bytealign_S (w[38], w[39], offset); - w[60] = amd_bytealign_S (w[37], w[38], offset); - w[59] = amd_bytealign_S (w[36], w[37], offset); - w[58] = amd_bytealign_S (w[35], w[36], offset); - w[57] = amd_bytealign_S (w[34], w[35], offset); - w[56] = amd_bytealign_S (w[33], w[34], offset); - w[55] = amd_bytealign_S (w[32], w[33], offset); - w[54] = amd_bytealign_S (w[31], w[32], offset); - w[53] = amd_bytealign_S (w[30], w[31], offset); - w[52] = amd_bytealign_S (w[29], w[30], offset); - w[51] = amd_bytealign_S (w[28], w[29], offset); - w[50] = amd_bytealign_S (w[27], w[28], offset); - w[49] = amd_bytealign_S (w[26], w[27], offset); - w[48] = amd_bytealign_S (w[25], w[26], offset); - w[47] = amd_bytealign_S (w[24], w[25], offset); - w[46] = amd_bytealign_S (w[23], w[24], offset); - w[45] = amd_bytealign_S (w[22], w[23], offset); - w[44] = amd_bytealign_S (w[21], w[22], offset); - w[43] = amd_bytealign_S (w[20], w[21], offset); - w[42] = amd_bytealign_S (w[19], w[20], offset); - w[41] = amd_bytealign_S (w[18], w[19], offset); - w[40] = amd_bytealign_S (w[17], w[18], offset); - w[39] = amd_bytealign_S (w[16], w[17], offset); - w[38] = amd_bytealign_S (w[15], w[16], offset); - w[37] = amd_bytealign_S (w[14], w[15], offset); - w[36] = amd_bytealign_S (w[13], w[14], offset); - w[35] = amd_bytealign_S (w[12], w[13], offset); - w[34] = amd_bytealign_S (w[11], w[12], offset); - w[33] = amd_bytealign_S (w[10], w[11], offset); - w[32] = amd_bytealign_S (w[ 9], w[10], offset); - w[31] = amd_bytealign_S (w[ 8], w[ 9], offset); - w[30] = amd_bytealign_S (w[ 7], w[ 8], offset); - w[29] = amd_bytealign_S (w[ 6], w[ 7], offset); - w[28] = amd_bytealign_S (w[ 5], w[ 6], offset); - w[27] = amd_bytealign_S (w[ 4], w[ 5], offset); - w[26] = amd_bytealign_S (w[ 3], w[ 4], offset); - w[25] = amd_bytealign_S (w[ 2], w[ 3], offset); - w[24] = amd_bytealign_S (w[ 1], w[ 2], offset); - w[23] = amd_bytealign_S (w[ 0], w[ 1], offset); - w[22] = amd_bytealign_S ( 0, w[ 0], offset); + w[63] = hc_bytealign_S (w[40], w[41], offset); + w[62] = hc_bytealign_S (w[39], w[40], offset); + w[61] = hc_bytealign_S (w[38], w[39], offset); + w[60] = hc_bytealign_S (w[37], w[38], offset); + w[59] = hc_bytealign_S (w[36], w[37], offset); + w[58] = hc_bytealign_S (w[35], w[36], offset); + w[57] = hc_bytealign_S (w[34], w[35], offset); + w[56] = hc_bytealign_S (w[33], w[34], offset); + w[55] = hc_bytealign_S (w[32], w[33], offset); + w[54] = hc_bytealign_S (w[31], w[32], offset); + w[53] = hc_bytealign_S (w[30], w[31], offset); + w[52] = hc_bytealign_S (w[29], w[30], offset); + w[51] = hc_bytealign_S (w[28], w[29], offset); + w[50] = hc_bytealign_S (w[27], w[28], offset); + w[49] = hc_bytealign_S (w[26], w[27], offset); + w[48] = hc_bytealign_S (w[25], w[26], offset); + w[47] = hc_bytealign_S (w[24], w[25], offset); + w[46] = hc_bytealign_S (w[23], w[24], offset); + w[45] = hc_bytealign_S (w[22], w[23], offset); + w[44] = hc_bytealign_S (w[21], w[22], offset); + w[43] = hc_bytealign_S (w[20], w[21], offset); + w[42] = hc_bytealign_S (w[19], w[20], offset); + w[41] = hc_bytealign_S (w[18], w[19], offset); + w[40] = hc_bytealign_S (w[17], w[18], offset); + w[39] = hc_bytealign_S (w[16], w[17], offset); + w[38] = hc_bytealign_S (w[15], w[16], offset); + w[37] = hc_bytealign_S (w[14], w[15], offset); + w[36] = hc_bytealign_S (w[13], w[14], offset); + w[35] = hc_bytealign_S (w[12], w[13], offset); + w[34] = hc_bytealign_S (w[11], w[12], offset); + w[33] = hc_bytealign_S (w[10], w[11], offset); + w[32] = hc_bytealign_S (w[ 9], w[10], offset); + w[31] = hc_bytealign_S (w[ 8], w[ 9], offset); + w[30] = hc_bytealign_S (w[ 7], w[ 8], offset); + w[29] = hc_bytealign_S (w[ 6], w[ 7], offset); + w[28] = hc_bytealign_S (w[ 5], w[ 6], offset); + w[27] = hc_bytealign_S (w[ 4], w[ 5], offset); + w[26] = hc_bytealign_S (w[ 3], w[ 4], offset); + w[25] = hc_bytealign_S (w[ 2], w[ 3], offset); + w[24] = hc_bytealign_S (w[ 1], w[ 2], offset); + w[23] = hc_bytealign_S (w[ 0], w[ 1], offset); + w[22] = hc_bytealign_S ( 0, w[ 0], offset); w[21] = 0; w[20] = 0; w[19] = 0; @@ -52650,47 +52650,47 @@ DECLSPEC void switch_buffer_by_offset_1x64_be_S (u32 *w, const u32 offset) break; case 23: - w[63] = amd_bytealign_S (w[39], w[40], offset); - w[62] = amd_bytealign_S (w[38], w[39], offset); - w[61] = amd_bytealign_S (w[37], w[38], offset); - w[60] = amd_bytealign_S (w[36], w[37], offset); - w[59] = amd_bytealign_S (w[35], w[36], offset); - w[58] = amd_bytealign_S (w[34], w[35], offset); - w[57] = amd_bytealign_S (w[33], w[34], offset); - w[56] = amd_bytealign_S (w[32], w[33], offset); - w[55] = amd_bytealign_S (w[31], w[32], offset); - w[54] = amd_bytealign_S (w[30], w[31], offset); - w[53] = amd_bytealign_S (w[29], w[30], offset); - w[52] = amd_bytealign_S (w[28], w[29], offset); - w[51] = amd_bytealign_S (w[27], w[28], offset); - w[50] = amd_bytealign_S (w[26], w[27], offset); - w[49] = amd_bytealign_S (w[25], w[26], offset); - w[48] = amd_bytealign_S (w[24], w[25], offset); - w[47] = amd_bytealign_S (w[23], w[24], offset); - w[46] = amd_bytealign_S (w[22], w[23], offset); - w[45] = amd_bytealign_S (w[21], w[22], offset); - w[44] = amd_bytealign_S (w[20], w[21], offset); - w[43] = amd_bytealign_S (w[19], w[20], offset); - w[42] = amd_bytealign_S (w[18], w[19], offset); - w[41] = amd_bytealign_S (w[17], w[18], offset); - w[40] = amd_bytealign_S (w[16], w[17], offset); - w[39] = amd_bytealign_S (w[15], w[16], offset); - w[38] = amd_bytealign_S (w[14], w[15], offset); - w[37] = amd_bytealign_S (w[13], w[14], offset); - w[36] = amd_bytealign_S (w[12], w[13], offset); - w[35] = amd_bytealign_S (w[11], w[12], offset); - w[34] = amd_bytealign_S (w[10], w[11], offset); - w[33] = amd_bytealign_S (w[ 9], w[10], offset); - w[32] = amd_bytealign_S (w[ 8], w[ 9], offset); - w[31] = amd_bytealign_S (w[ 7], w[ 8], offset); - w[30] = amd_bytealign_S (w[ 6], w[ 7], offset); - w[29] = amd_bytealign_S (w[ 5], w[ 6], offset); - w[28] = amd_bytealign_S (w[ 4], w[ 5], offset); - w[27] = amd_bytealign_S (w[ 3], w[ 4], offset); - w[26] = amd_bytealign_S (w[ 2], w[ 3], offset); - w[25] = amd_bytealign_S (w[ 1], w[ 2], offset); - w[24] = amd_bytealign_S (w[ 0], w[ 1], offset); - w[23] = amd_bytealign_S ( 0, w[ 0], offset); + w[63] = hc_bytealign_S (w[39], w[40], offset); + w[62] = hc_bytealign_S (w[38], w[39], offset); + w[61] = hc_bytealign_S (w[37], w[38], offset); + w[60] = hc_bytealign_S (w[36], w[37], offset); + w[59] = hc_bytealign_S (w[35], w[36], offset); + w[58] = hc_bytealign_S (w[34], w[35], offset); + w[57] = hc_bytealign_S (w[33], w[34], offset); + w[56] = hc_bytealign_S (w[32], w[33], offset); + w[55] = hc_bytealign_S (w[31], w[32], offset); + w[54] = hc_bytealign_S (w[30], w[31], offset); + w[53] = hc_bytealign_S (w[29], w[30], offset); + w[52] = hc_bytealign_S (w[28], w[29], offset); + w[51] = hc_bytealign_S (w[27], w[28], offset); + w[50] = hc_bytealign_S (w[26], w[27], offset); + w[49] = hc_bytealign_S (w[25], w[26], offset); + w[48] = hc_bytealign_S (w[24], w[25], offset); + w[47] = hc_bytealign_S (w[23], w[24], offset); + w[46] = hc_bytealign_S (w[22], w[23], offset); + w[45] = hc_bytealign_S (w[21], w[22], offset); + w[44] = hc_bytealign_S (w[20], w[21], offset); + w[43] = hc_bytealign_S (w[19], w[20], offset); + w[42] = hc_bytealign_S (w[18], w[19], offset); + w[41] = hc_bytealign_S (w[17], w[18], offset); + w[40] = hc_bytealign_S (w[16], w[17], offset); + w[39] = hc_bytealign_S (w[15], w[16], offset); + w[38] = hc_bytealign_S (w[14], w[15], offset); + w[37] = hc_bytealign_S (w[13], w[14], offset); + w[36] = hc_bytealign_S (w[12], w[13], offset); + w[35] = hc_bytealign_S (w[11], w[12], offset); + w[34] = hc_bytealign_S (w[10], w[11], offset); + w[33] = hc_bytealign_S (w[ 9], w[10], offset); + w[32] = hc_bytealign_S (w[ 8], w[ 9], offset); + w[31] = hc_bytealign_S (w[ 7], w[ 8], offset); + w[30] = hc_bytealign_S (w[ 6], w[ 7], offset); + w[29] = hc_bytealign_S (w[ 5], w[ 6], offset); + w[28] = hc_bytealign_S (w[ 4], w[ 5], offset); + w[27] = hc_bytealign_S (w[ 3], w[ 4], offset); + w[26] = hc_bytealign_S (w[ 2], w[ 3], offset); + w[25] = hc_bytealign_S (w[ 1], w[ 2], offset); + w[24] = hc_bytealign_S (w[ 0], w[ 1], offset); + w[23] = hc_bytealign_S ( 0, w[ 0], offset); w[22] = 0; w[21] = 0; w[20] = 0; @@ -52718,46 +52718,46 @@ DECLSPEC void switch_buffer_by_offset_1x64_be_S (u32 *w, const u32 offset) break; case 24: - w[63] = amd_bytealign_S (w[38], w[39], offset); - w[62] = amd_bytealign_S (w[37], w[38], offset); - w[61] = amd_bytealign_S (w[36], w[37], offset); - w[60] = amd_bytealign_S (w[35], w[36], offset); - w[59] = amd_bytealign_S (w[34], w[35], offset); - w[58] = amd_bytealign_S (w[33], w[34], offset); - w[57] = amd_bytealign_S (w[32], w[33], offset); - w[56] = amd_bytealign_S (w[31], w[32], offset); - w[55] = amd_bytealign_S (w[30], w[31], offset); - w[54] = amd_bytealign_S (w[29], w[30], offset); - w[53] = amd_bytealign_S (w[28], w[29], offset); - w[52] = amd_bytealign_S (w[27], w[28], offset); - w[51] = amd_bytealign_S (w[26], w[27], offset); - w[50] = amd_bytealign_S (w[25], w[26], offset); - w[49] = amd_bytealign_S (w[24], w[25], offset); - w[48] = amd_bytealign_S (w[23], w[24], offset); - w[47] = amd_bytealign_S (w[22], w[23], offset); - w[46] = amd_bytealign_S (w[21], w[22], offset); - w[45] = amd_bytealign_S (w[20], w[21], offset); - w[44] = amd_bytealign_S (w[19], w[20], offset); - w[43] = amd_bytealign_S (w[18], w[19], offset); - w[42] = amd_bytealign_S (w[17], w[18], offset); - w[41] = amd_bytealign_S (w[16], w[17], offset); - w[40] = amd_bytealign_S (w[15], w[16], offset); - w[39] = amd_bytealign_S (w[14], w[15], offset); - w[38] = amd_bytealign_S (w[13], w[14], offset); - w[37] = amd_bytealign_S (w[12], w[13], offset); - w[36] = amd_bytealign_S (w[11], w[12], offset); - w[35] = amd_bytealign_S (w[10], w[11], offset); - w[34] = amd_bytealign_S (w[ 9], w[10], offset); - w[33] = amd_bytealign_S (w[ 8], w[ 9], offset); - w[32] = amd_bytealign_S (w[ 7], w[ 8], offset); - w[31] = amd_bytealign_S (w[ 6], w[ 7], offset); - w[30] = amd_bytealign_S (w[ 5], w[ 6], offset); - w[29] = amd_bytealign_S (w[ 4], w[ 5], offset); - w[28] = amd_bytealign_S (w[ 3], w[ 4], offset); - w[27] = amd_bytealign_S (w[ 2], w[ 3], offset); - w[26] = amd_bytealign_S (w[ 1], w[ 2], offset); - w[25] = amd_bytealign_S (w[ 0], w[ 1], offset); - w[24] = amd_bytealign_S ( 0, w[ 0], offset); + w[63] = hc_bytealign_S (w[38], w[39], offset); + w[62] = hc_bytealign_S (w[37], w[38], offset); + w[61] = hc_bytealign_S (w[36], w[37], offset); + w[60] = hc_bytealign_S (w[35], w[36], offset); + w[59] = hc_bytealign_S (w[34], w[35], offset); + w[58] = hc_bytealign_S (w[33], w[34], offset); + w[57] = hc_bytealign_S (w[32], w[33], offset); + w[56] = hc_bytealign_S (w[31], w[32], offset); + w[55] = hc_bytealign_S (w[30], w[31], offset); + w[54] = hc_bytealign_S (w[29], w[30], offset); + w[53] = hc_bytealign_S (w[28], w[29], offset); + w[52] = hc_bytealign_S (w[27], w[28], offset); + w[51] = hc_bytealign_S (w[26], w[27], offset); + w[50] = hc_bytealign_S (w[25], w[26], offset); + w[49] = hc_bytealign_S (w[24], w[25], offset); + w[48] = hc_bytealign_S (w[23], w[24], offset); + w[47] = hc_bytealign_S (w[22], w[23], offset); + w[46] = hc_bytealign_S (w[21], w[22], offset); + w[45] = hc_bytealign_S (w[20], w[21], offset); + w[44] = hc_bytealign_S (w[19], w[20], offset); + w[43] = hc_bytealign_S (w[18], w[19], offset); + w[42] = hc_bytealign_S (w[17], w[18], offset); + w[41] = hc_bytealign_S (w[16], w[17], offset); + w[40] = hc_bytealign_S (w[15], w[16], offset); + w[39] = hc_bytealign_S (w[14], w[15], offset); + w[38] = hc_bytealign_S (w[13], w[14], offset); + w[37] = hc_bytealign_S (w[12], w[13], offset); + w[36] = hc_bytealign_S (w[11], w[12], offset); + w[35] = hc_bytealign_S (w[10], w[11], offset); + w[34] = hc_bytealign_S (w[ 9], w[10], offset); + w[33] = hc_bytealign_S (w[ 8], w[ 9], offset); + w[32] = hc_bytealign_S (w[ 7], w[ 8], offset); + w[31] = hc_bytealign_S (w[ 6], w[ 7], offset); + w[30] = hc_bytealign_S (w[ 5], w[ 6], offset); + w[29] = hc_bytealign_S (w[ 4], w[ 5], offset); + w[28] = hc_bytealign_S (w[ 3], w[ 4], offset); + w[27] = hc_bytealign_S (w[ 2], w[ 3], offset); + w[26] = hc_bytealign_S (w[ 1], w[ 2], offset); + w[25] = hc_bytealign_S (w[ 0], w[ 1], offset); + w[24] = hc_bytealign_S ( 0, w[ 0], offset); w[23] = 0; w[22] = 0; w[21] = 0; @@ -52786,45 +52786,45 @@ DECLSPEC void switch_buffer_by_offset_1x64_be_S (u32 *w, const u32 offset) break; case 25: - w[63] = amd_bytealign_S (w[37], w[38], offset); - w[62] = amd_bytealign_S (w[36], w[37], offset); - w[61] = amd_bytealign_S (w[35], w[36], offset); - w[60] = amd_bytealign_S (w[34], w[35], offset); - w[59] = amd_bytealign_S (w[33], w[34], offset); - w[58] = amd_bytealign_S (w[32], w[33], offset); - w[57] = amd_bytealign_S (w[31], w[32], offset); - w[56] = amd_bytealign_S (w[30], w[31], offset); - w[55] = amd_bytealign_S (w[29], w[30], offset); - w[54] = amd_bytealign_S (w[28], w[29], offset); - w[53] = amd_bytealign_S (w[27], w[28], offset); - w[52] = amd_bytealign_S (w[26], w[27], offset); - w[51] = amd_bytealign_S (w[25], w[26], offset); - w[50] = amd_bytealign_S (w[24], w[25], offset); - w[49] = amd_bytealign_S (w[23], w[24], offset); - w[48] = amd_bytealign_S (w[22], w[23], offset); - w[47] = amd_bytealign_S (w[21], w[22], offset); - w[46] = amd_bytealign_S (w[20], w[21], offset); - w[45] = amd_bytealign_S (w[19], w[20], offset); - w[44] = amd_bytealign_S (w[18], w[19], offset); - w[43] = amd_bytealign_S (w[17], w[18], offset); - w[42] = amd_bytealign_S (w[16], w[17], offset); - w[41] = amd_bytealign_S (w[15], w[16], offset); - w[40] = amd_bytealign_S (w[14], w[15], offset); - w[39] = amd_bytealign_S (w[13], w[14], offset); - w[38] = amd_bytealign_S (w[12], w[13], offset); - w[37] = amd_bytealign_S (w[11], w[12], offset); - w[36] = amd_bytealign_S (w[10], w[11], offset); - w[35] = amd_bytealign_S (w[ 9], w[10], offset); - w[34] = amd_bytealign_S (w[ 8], w[ 9], offset); - w[33] = amd_bytealign_S (w[ 7], w[ 8], offset); - w[32] = amd_bytealign_S (w[ 6], w[ 7], offset); - w[31] = amd_bytealign_S (w[ 5], w[ 6], offset); - w[30] = amd_bytealign_S (w[ 4], w[ 5], offset); - w[29] = amd_bytealign_S (w[ 3], w[ 4], offset); - w[28] = amd_bytealign_S (w[ 2], w[ 3], offset); - w[27] = amd_bytealign_S (w[ 1], w[ 2], offset); - w[26] = amd_bytealign_S (w[ 0], w[ 1], offset); - w[25] = amd_bytealign_S ( 0, w[ 0], offset); + w[63] = hc_bytealign_S (w[37], w[38], offset); + w[62] = hc_bytealign_S (w[36], w[37], offset); + w[61] = hc_bytealign_S (w[35], w[36], offset); + w[60] = hc_bytealign_S (w[34], w[35], offset); + w[59] = hc_bytealign_S (w[33], w[34], offset); + w[58] = hc_bytealign_S (w[32], w[33], offset); + w[57] = hc_bytealign_S (w[31], w[32], offset); + w[56] = hc_bytealign_S (w[30], w[31], offset); + w[55] = hc_bytealign_S (w[29], w[30], offset); + w[54] = hc_bytealign_S (w[28], w[29], offset); + w[53] = hc_bytealign_S (w[27], w[28], offset); + w[52] = hc_bytealign_S (w[26], w[27], offset); + w[51] = hc_bytealign_S (w[25], w[26], offset); + w[50] = hc_bytealign_S (w[24], w[25], offset); + w[49] = hc_bytealign_S (w[23], w[24], offset); + w[48] = hc_bytealign_S (w[22], w[23], offset); + w[47] = hc_bytealign_S (w[21], w[22], offset); + w[46] = hc_bytealign_S (w[20], w[21], offset); + w[45] = hc_bytealign_S (w[19], w[20], offset); + w[44] = hc_bytealign_S (w[18], w[19], offset); + w[43] = hc_bytealign_S (w[17], w[18], offset); + w[42] = hc_bytealign_S (w[16], w[17], offset); + w[41] = hc_bytealign_S (w[15], w[16], offset); + w[40] = hc_bytealign_S (w[14], w[15], offset); + w[39] = hc_bytealign_S (w[13], w[14], offset); + w[38] = hc_bytealign_S (w[12], w[13], offset); + w[37] = hc_bytealign_S (w[11], w[12], offset); + w[36] = hc_bytealign_S (w[10], w[11], offset); + w[35] = hc_bytealign_S (w[ 9], w[10], offset); + w[34] = hc_bytealign_S (w[ 8], w[ 9], offset); + w[33] = hc_bytealign_S (w[ 7], w[ 8], offset); + w[32] = hc_bytealign_S (w[ 6], w[ 7], offset); + w[31] = hc_bytealign_S (w[ 5], w[ 6], offset); + w[30] = hc_bytealign_S (w[ 4], w[ 5], offset); + w[29] = hc_bytealign_S (w[ 3], w[ 4], offset); + w[28] = hc_bytealign_S (w[ 2], w[ 3], offset); + w[27] = hc_bytealign_S (w[ 1], w[ 2], offset); + w[26] = hc_bytealign_S (w[ 0], w[ 1], offset); + w[25] = hc_bytealign_S ( 0, w[ 0], offset); w[24] = 0; w[23] = 0; w[22] = 0; @@ -52854,44 +52854,44 @@ DECLSPEC void switch_buffer_by_offset_1x64_be_S (u32 *w, const u32 offset) break; case 26: - w[63] = amd_bytealign_S (w[36], w[37], offset); - w[62] = amd_bytealign_S (w[35], w[36], offset); - w[61] = amd_bytealign_S (w[34], w[35], offset); - w[60] = amd_bytealign_S (w[33], w[34], offset); - w[59] = amd_bytealign_S (w[32], w[33], offset); - w[58] = amd_bytealign_S (w[31], w[32], offset); - w[57] = amd_bytealign_S (w[30], w[31], offset); - w[56] = amd_bytealign_S (w[29], w[30], offset); - w[55] = amd_bytealign_S (w[28], w[29], offset); - w[54] = amd_bytealign_S (w[27], w[28], offset); - w[53] = amd_bytealign_S (w[26], w[27], offset); - w[52] = amd_bytealign_S (w[25], w[26], offset); - w[51] = amd_bytealign_S (w[24], w[25], offset); - w[50] = amd_bytealign_S (w[23], w[24], offset); - w[49] = amd_bytealign_S (w[22], w[23], offset); - w[48] = amd_bytealign_S (w[21], w[22], offset); - w[47] = amd_bytealign_S (w[20], w[21], offset); - w[46] = amd_bytealign_S (w[19], w[20], offset); - w[45] = amd_bytealign_S (w[18], w[19], offset); - w[44] = amd_bytealign_S (w[17], w[18], offset); - w[43] = amd_bytealign_S (w[16], w[17], offset); - w[42] = amd_bytealign_S (w[15], w[16], offset); - w[41] = amd_bytealign_S (w[14], w[15], offset); - w[40] = amd_bytealign_S (w[13], w[14], offset); - w[39] = amd_bytealign_S (w[12], w[13], offset); - w[38] = amd_bytealign_S (w[11], w[12], offset); - w[37] = amd_bytealign_S (w[10], w[11], offset); - w[36] = amd_bytealign_S (w[ 9], w[10], offset); - w[35] = amd_bytealign_S (w[ 8], w[ 9], offset); - w[34] = amd_bytealign_S (w[ 7], w[ 8], offset); - w[33] = amd_bytealign_S (w[ 6], w[ 7], offset); - w[32] = amd_bytealign_S (w[ 5], w[ 6], offset); - w[31] = amd_bytealign_S (w[ 4], w[ 5], offset); - w[30] = amd_bytealign_S (w[ 3], w[ 4], offset); - w[29] = amd_bytealign_S (w[ 2], w[ 3], offset); - w[28] = amd_bytealign_S (w[ 1], w[ 2], offset); - w[27] = amd_bytealign_S (w[ 0], w[ 1], offset); - w[26] = amd_bytealign_S ( 0, w[ 0], offset); + w[63] = hc_bytealign_S (w[36], w[37], offset); + w[62] = hc_bytealign_S (w[35], w[36], offset); + w[61] = hc_bytealign_S (w[34], w[35], offset); + w[60] = hc_bytealign_S (w[33], w[34], offset); + w[59] = hc_bytealign_S (w[32], w[33], offset); + w[58] = hc_bytealign_S (w[31], w[32], offset); + w[57] = hc_bytealign_S (w[30], w[31], offset); + w[56] = hc_bytealign_S (w[29], w[30], offset); + w[55] = hc_bytealign_S (w[28], w[29], offset); + w[54] = hc_bytealign_S (w[27], w[28], offset); + w[53] = hc_bytealign_S (w[26], w[27], offset); + w[52] = hc_bytealign_S (w[25], w[26], offset); + w[51] = hc_bytealign_S (w[24], w[25], offset); + w[50] = hc_bytealign_S (w[23], w[24], offset); + w[49] = hc_bytealign_S (w[22], w[23], offset); + w[48] = hc_bytealign_S (w[21], w[22], offset); + w[47] = hc_bytealign_S (w[20], w[21], offset); + w[46] = hc_bytealign_S (w[19], w[20], offset); + w[45] = hc_bytealign_S (w[18], w[19], offset); + w[44] = hc_bytealign_S (w[17], w[18], offset); + w[43] = hc_bytealign_S (w[16], w[17], offset); + w[42] = hc_bytealign_S (w[15], w[16], offset); + w[41] = hc_bytealign_S (w[14], w[15], offset); + w[40] = hc_bytealign_S (w[13], w[14], offset); + w[39] = hc_bytealign_S (w[12], w[13], offset); + w[38] = hc_bytealign_S (w[11], w[12], offset); + w[37] = hc_bytealign_S (w[10], w[11], offset); + w[36] = hc_bytealign_S (w[ 9], w[10], offset); + w[35] = hc_bytealign_S (w[ 8], w[ 9], offset); + w[34] = hc_bytealign_S (w[ 7], w[ 8], offset); + w[33] = hc_bytealign_S (w[ 6], w[ 7], offset); + w[32] = hc_bytealign_S (w[ 5], w[ 6], offset); + w[31] = hc_bytealign_S (w[ 4], w[ 5], offset); + w[30] = hc_bytealign_S (w[ 3], w[ 4], offset); + w[29] = hc_bytealign_S (w[ 2], w[ 3], offset); + w[28] = hc_bytealign_S (w[ 1], w[ 2], offset); + w[27] = hc_bytealign_S (w[ 0], w[ 1], offset); + w[26] = hc_bytealign_S ( 0, w[ 0], offset); w[25] = 0; w[24] = 0; w[23] = 0; @@ -52922,43 +52922,43 @@ DECLSPEC void switch_buffer_by_offset_1x64_be_S (u32 *w, const u32 offset) break; case 27: - w[63] = amd_bytealign_S (w[35], w[36], offset); - w[62] = amd_bytealign_S (w[34], w[35], offset); - w[61] = amd_bytealign_S (w[33], w[34], offset); - w[60] = amd_bytealign_S (w[32], w[33], offset); - w[59] = amd_bytealign_S (w[31], w[32], offset); - w[58] = amd_bytealign_S (w[30], w[31], offset); - w[57] = amd_bytealign_S (w[29], w[30], offset); - w[56] = amd_bytealign_S (w[28], w[29], offset); - w[55] = amd_bytealign_S (w[27], w[28], offset); - w[54] = amd_bytealign_S (w[26], w[27], offset); - w[53] = amd_bytealign_S (w[25], w[26], offset); - w[52] = amd_bytealign_S (w[24], w[25], offset); - w[51] = amd_bytealign_S (w[23], w[24], offset); - w[50] = amd_bytealign_S (w[22], w[23], offset); - w[49] = amd_bytealign_S (w[21], w[22], offset); - w[48] = amd_bytealign_S (w[20], w[21], offset); - w[47] = amd_bytealign_S (w[19], w[20], offset); - w[46] = amd_bytealign_S (w[18], w[19], offset); - w[45] = amd_bytealign_S (w[17], w[18], offset); - w[44] = amd_bytealign_S (w[16], w[17], offset); - w[43] = amd_bytealign_S (w[15], w[16], offset); - w[42] = amd_bytealign_S (w[14], w[15], offset); - w[41] = amd_bytealign_S (w[13], w[14], offset); - w[40] = amd_bytealign_S (w[12], w[13], offset); - w[39] = amd_bytealign_S (w[11], w[12], offset); - w[38] = amd_bytealign_S (w[10], w[11], offset); - w[37] = amd_bytealign_S (w[ 9], w[10], offset); - w[36] = amd_bytealign_S (w[ 8], w[ 9], offset); - w[35] = amd_bytealign_S (w[ 7], w[ 8], offset); - w[34] = amd_bytealign_S (w[ 6], w[ 7], offset); - w[33] = amd_bytealign_S (w[ 5], w[ 6], offset); - w[32] = amd_bytealign_S (w[ 4], w[ 5], offset); - w[31] = amd_bytealign_S (w[ 3], w[ 4], offset); - w[30] = amd_bytealign_S (w[ 2], w[ 3], offset); - w[29] = amd_bytealign_S (w[ 1], w[ 2], offset); - w[28] = amd_bytealign_S (w[ 0], w[ 1], offset); - w[27] = amd_bytealign_S ( 0, w[ 0], offset); + w[63] = hc_bytealign_S (w[35], w[36], offset); + w[62] = hc_bytealign_S (w[34], w[35], offset); + w[61] = hc_bytealign_S (w[33], w[34], offset); + w[60] = hc_bytealign_S (w[32], w[33], offset); + w[59] = hc_bytealign_S (w[31], w[32], offset); + w[58] = hc_bytealign_S (w[30], w[31], offset); + w[57] = hc_bytealign_S (w[29], w[30], offset); + w[56] = hc_bytealign_S (w[28], w[29], offset); + w[55] = hc_bytealign_S (w[27], w[28], offset); + w[54] = hc_bytealign_S (w[26], w[27], offset); + w[53] = hc_bytealign_S (w[25], w[26], offset); + w[52] = hc_bytealign_S (w[24], w[25], offset); + w[51] = hc_bytealign_S (w[23], w[24], offset); + w[50] = hc_bytealign_S (w[22], w[23], offset); + w[49] = hc_bytealign_S (w[21], w[22], offset); + w[48] = hc_bytealign_S (w[20], w[21], offset); + w[47] = hc_bytealign_S (w[19], w[20], offset); + w[46] = hc_bytealign_S (w[18], w[19], offset); + w[45] = hc_bytealign_S (w[17], w[18], offset); + w[44] = hc_bytealign_S (w[16], w[17], offset); + w[43] = hc_bytealign_S (w[15], w[16], offset); + w[42] = hc_bytealign_S (w[14], w[15], offset); + w[41] = hc_bytealign_S (w[13], w[14], offset); + w[40] = hc_bytealign_S (w[12], w[13], offset); + w[39] = hc_bytealign_S (w[11], w[12], offset); + w[38] = hc_bytealign_S (w[10], w[11], offset); + w[37] = hc_bytealign_S (w[ 9], w[10], offset); + w[36] = hc_bytealign_S (w[ 8], w[ 9], offset); + w[35] = hc_bytealign_S (w[ 7], w[ 8], offset); + w[34] = hc_bytealign_S (w[ 6], w[ 7], offset); + w[33] = hc_bytealign_S (w[ 5], w[ 6], offset); + w[32] = hc_bytealign_S (w[ 4], w[ 5], offset); + w[31] = hc_bytealign_S (w[ 3], w[ 4], offset); + w[30] = hc_bytealign_S (w[ 2], w[ 3], offset); + w[29] = hc_bytealign_S (w[ 1], w[ 2], offset); + w[28] = hc_bytealign_S (w[ 0], w[ 1], offset); + w[27] = hc_bytealign_S ( 0, w[ 0], offset); w[26] = 0; w[25] = 0; w[24] = 0; @@ -52990,42 +52990,42 @@ DECLSPEC void switch_buffer_by_offset_1x64_be_S (u32 *w, const u32 offset) break; case 28: - w[63] = amd_bytealign_S (w[34], w[35], offset); - w[62] = amd_bytealign_S (w[33], w[34], offset); - w[61] = amd_bytealign_S (w[32], w[33], offset); - w[60] = amd_bytealign_S (w[31], w[32], offset); - w[59] = amd_bytealign_S (w[30], w[31], offset); - w[58] = amd_bytealign_S (w[29], w[30], offset); - w[57] = amd_bytealign_S (w[28], w[29], offset); - w[56] = amd_bytealign_S (w[27], w[28], offset); - w[55] = amd_bytealign_S (w[26], w[27], offset); - w[54] = amd_bytealign_S (w[25], w[26], offset); - w[53] = amd_bytealign_S (w[24], w[25], offset); - w[52] = amd_bytealign_S (w[23], w[24], offset); - w[51] = amd_bytealign_S (w[22], w[23], offset); - w[50] = amd_bytealign_S (w[21], w[22], offset); - w[49] = amd_bytealign_S (w[20], w[21], offset); - w[48] = amd_bytealign_S (w[19], w[20], offset); - w[47] = amd_bytealign_S (w[18], w[19], offset); - w[46] = amd_bytealign_S (w[17], w[18], offset); - w[45] = amd_bytealign_S (w[16], w[17], offset); - w[44] = amd_bytealign_S (w[15], w[16], offset); - w[43] = amd_bytealign_S (w[14], w[15], offset); - w[42] = amd_bytealign_S (w[13], w[14], offset); - w[41] = amd_bytealign_S (w[12], w[13], offset); - w[40] = amd_bytealign_S (w[11], w[12], offset); - w[39] = amd_bytealign_S (w[10], w[11], offset); - w[38] = amd_bytealign_S (w[ 9], w[10], offset); - w[37] = amd_bytealign_S (w[ 8], w[ 9], offset); - w[36] = amd_bytealign_S (w[ 7], w[ 8], offset); - w[35] = amd_bytealign_S (w[ 6], w[ 7], offset); - w[34] = amd_bytealign_S (w[ 5], w[ 6], offset); - w[33] = amd_bytealign_S (w[ 4], w[ 5], offset); - w[32] = amd_bytealign_S (w[ 3], w[ 4], offset); - w[31] = amd_bytealign_S (w[ 2], w[ 3], offset); - w[30] = amd_bytealign_S (w[ 1], w[ 2], offset); - w[29] = amd_bytealign_S (w[ 0], w[ 1], offset); - w[28] = amd_bytealign_S ( 0, w[ 0], offset); + w[63] = hc_bytealign_S (w[34], w[35], offset); + w[62] = hc_bytealign_S (w[33], w[34], offset); + w[61] = hc_bytealign_S (w[32], w[33], offset); + w[60] = hc_bytealign_S (w[31], w[32], offset); + w[59] = hc_bytealign_S (w[30], w[31], offset); + w[58] = hc_bytealign_S (w[29], w[30], offset); + w[57] = hc_bytealign_S (w[28], w[29], offset); + w[56] = hc_bytealign_S (w[27], w[28], offset); + w[55] = hc_bytealign_S (w[26], w[27], offset); + w[54] = hc_bytealign_S (w[25], w[26], offset); + w[53] = hc_bytealign_S (w[24], w[25], offset); + w[52] = hc_bytealign_S (w[23], w[24], offset); + w[51] = hc_bytealign_S (w[22], w[23], offset); + w[50] = hc_bytealign_S (w[21], w[22], offset); + w[49] = hc_bytealign_S (w[20], w[21], offset); + w[48] = hc_bytealign_S (w[19], w[20], offset); + w[47] = hc_bytealign_S (w[18], w[19], offset); + w[46] = hc_bytealign_S (w[17], w[18], offset); + w[45] = hc_bytealign_S (w[16], w[17], offset); + w[44] = hc_bytealign_S (w[15], w[16], offset); + w[43] = hc_bytealign_S (w[14], w[15], offset); + w[42] = hc_bytealign_S (w[13], w[14], offset); + w[41] = hc_bytealign_S (w[12], w[13], offset); + w[40] = hc_bytealign_S (w[11], w[12], offset); + w[39] = hc_bytealign_S (w[10], w[11], offset); + w[38] = hc_bytealign_S (w[ 9], w[10], offset); + w[37] = hc_bytealign_S (w[ 8], w[ 9], offset); + w[36] = hc_bytealign_S (w[ 7], w[ 8], offset); + w[35] = hc_bytealign_S (w[ 6], w[ 7], offset); + w[34] = hc_bytealign_S (w[ 5], w[ 6], offset); + w[33] = hc_bytealign_S (w[ 4], w[ 5], offset); + w[32] = hc_bytealign_S (w[ 3], w[ 4], offset); + w[31] = hc_bytealign_S (w[ 2], w[ 3], offset); + w[30] = hc_bytealign_S (w[ 1], w[ 2], offset); + w[29] = hc_bytealign_S (w[ 0], w[ 1], offset); + w[28] = hc_bytealign_S ( 0, w[ 0], offset); w[27] = 0; w[26] = 0; w[25] = 0; @@ -53058,41 +53058,41 @@ DECLSPEC void switch_buffer_by_offset_1x64_be_S (u32 *w, const u32 offset) break; case 29: - w[63] = amd_bytealign_S (w[33], w[34], offset); - w[62] = amd_bytealign_S (w[32], w[33], offset); - w[61] = amd_bytealign_S (w[31], w[32], offset); - w[60] = amd_bytealign_S (w[30], w[31], offset); - w[59] = amd_bytealign_S (w[29], w[30], offset); - w[58] = amd_bytealign_S (w[28], w[29], offset); - w[57] = amd_bytealign_S (w[27], w[28], offset); - w[56] = amd_bytealign_S (w[26], w[27], offset); - w[55] = amd_bytealign_S (w[25], w[26], offset); - w[54] = amd_bytealign_S (w[24], w[25], offset); - w[53] = amd_bytealign_S (w[23], w[24], offset); - w[52] = amd_bytealign_S (w[22], w[23], offset); - w[51] = amd_bytealign_S (w[21], w[22], offset); - w[50] = amd_bytealign_S (w[20], w[21], offset); - w[49] = amd_bytealign_S (w[19], w[20], offset); - w[48] = amd_bytealign_S (w[18], w[19], offset); - w[47] = amd_bytealign_S (w[17], w[18], offset); - w[46] = amd_bytealign_S (w[16], w[17], offset); - w[45] = amd_bytealign_S (w[15], w[16], offset); - w[44] = amd_bytealign_S (w[14], w[15], offset); - w[43] = amd_bytealign_S (w[13], w[14], offset); - w[42] = amd_bytealign_S (w[12], w[13], offset); - w[41] = amd_bytealign_S (w[11], w[12], offset); - w[40] = amd_bytealign_S (w[10], w[11], offset); - w[39] = amd_bytealign_S (w[ 9], w[10], offset); - w[38] = amd_bytealign_S (w[ 8], w[ 9], offset); - w[37] = amd_bytealign_S (w[ 7], w[ 8], offset); - w[36] = amd_bytealign_S (w[ 6], w[ 7], offset); - w[35] = amd_bytealign_S (w[ 5], w[ 6], offset); - w[34] = amd_bytealign_S (w[ 4], w[ 5], offset); - w[33] = amd_bytealign_S (w[ 3], w[ 4], offset); - w[32] = amd_bytealign_S (w[ 2], w[ 3], offset); - w[31] = amd_bytealign_S (w[ 1], w[ 2], offset); - w[30] = amd_bytealign_S (w[ 0], w[ 1], offset); - w[29] = amd_bytealign_S ( 0, w[ 0], offset); + w[63] = hc_bytealign_S (w[33], w[34], offset); + w[62] = hc_bytealign_S (w[32], w[33], offset); + w[61] = hc_bytealign_S (w[31], w[32], offset); + w[60] = hc_bytealign_S (w[30], w[31], offset); + w[59] = hc_bytealign_S (w[29], w[30], offset); + w[58] = hc_bytealign_S (w[28], w[29], offset); + w[57] = hc_bytealign_S (w[27], w[28], offset); + w[56] = hc_bytealign_S (w[26], w[27], offset); + w[55] = hc_bytealign_S (w[25], w[26], offset); + w[54] = hc_bytealign_S (w[24], w[25], offset); + w[53] = hc_bytealign_S (w[23], w[24], offset); + w[52] = hc_bytealign_S (w[22], w[23], offset); + w[51] = hc_bytealign_S (w[21], w[22], offset); + w[50] = hc_bytealign_S (w[20], w[21], offset); + w[49] = hc_bytealign_S (w[19], w[20], offset); + w[48] = hc_bytealign_S (w[18], w[19], offset); + w[47] = hc_bytealign_S (w[17], w[18], offset); + w[46] = hc_bytealign_S (w[16], w[17], offset); + w[45] = hc_bytealign_S (w[15], w[16], offset); + w[44] = hc_bytealign_S (w[14], w[15], offset); + w[43] = hc_bytealign_S (w[13], w[14], offset); + w[42] = hc_bytealign_S (w[12], w[13], offset); + w[41] = hc_bytealign_S (w[11], w[12], offset); + w[40] = hc_bytealign_S (w[10], w[11], offset); + w[39] = hc_bytealign_S (w[ 9], w[10], offset); + w[38] = hc_bytealign_S (w[ 8], w[ 9], offset); + w[37] = hc_bytealign_S (w[ 7], w[ 8], offset); + w[36] = hc_bytealign_S (w[ 6], w[ 7], offset); + w[35] = hc_bytealign_S (w[ 5], w[ 6], offset); + w[34] = hc_bytealign_S (w[ 4], w[ 5], offset); + w[33] = hc_bytealign_S (w[ 3], w[ 4], offset); + w[32] = hc_bytealign_S (w[ 2], w[ 3], offset); + w[31] = hc_bytealign_S (w[ 1], w[ 2], offset); + w[30] = hc_bytealign_S (w[ 0], w[ 1], offset); + w[29] = hc_bytealign_S ( 0, w[ 0], offset); w[28] = 0; w[27] = 0; w[26] = 0; @@ -53126,40 +53126,40 @@ DECLSPEC void switch_buffer_by_offset_1x64_be_S (u32 *w, const u32 offset) break; case 30: - w[63] = amd_bytealign_S (w[32], w[33], offset); - w[62] = amd_bytealign_S (w[31], w[32], offset); - w[61] = amd_bytealign_S (w[30], w[31], offset); - w[60] = amd_bytealign_S (w[29], w[30], offset); - w[59] = amd_bytealign_S (w[28], w[29], offset); - w[58] = amd_bytealign_S (w[27], w[28], offset); - w[57] = amd_bytealign_S (w[26], w[27], offset); - w[56] = amd_bytealign_S (w[25], w[26], offset); - w[55] = amd_bytealign_S (w[24], w[25], offset); - w[54] = amd_bytealign_S (w[23], w[24], offset); - w[53] = amd_bytealign_S (w[22], w[23], offset); - w[52] = amd_bytealign_S (w[21], w[22], offset); - w[51] = amd_bytealign_S (w[20], w[21], offset); - w[50] = amd_bytealign_S (w[19], w[20], offset); - w[49] = amd_bytealign_S (w[18], w[19], offset); - w[48] = amd_bytealign_S (w[17], w[18], offset); - w[47] = amd_bytealign_S (w[16], w[17], offset); - w[46] = amd_bytealign_S (w[15], w[16], offset); - w[45] = amd_bytealign_S (w[14], w[15], offset); - w[44] = amd_bytealign_S (w[13], w[14], offset); - w[43] = amd_bytealign_S (w[12], w[13], offset); - w[42] = amd_bytealign_S (w[11], w[12], offset); - w[41] = amd_bytealign_S (w[10], w[11], offset); - w[40] = amd_bytealign_S (w[ 9], w[10], offset); - w[39] = amd_bytealign_S (w[ 8], w[ 9], offset); - w[38] = amd_bytealign_S (w[ 7], w[ 8], offset); - w[37] = amd_bytealign_S (w[ 6], w[ 7], offset); - w[36] = amd_bytealign_S (w[ 5], w[ 6], offset); - w[35] = amd_bytealign_S (w[ 4], w[ 5], offset); - w[34] = amd_bytealign_S (w[ 3], w[ 4], offset); - w[33] = amd_bytealign_S (w[ 2], w[ 3], offset); - w[32] = amd_bytealign_S (w[ 1], w[ 2], offset); - w[31] = amd_bytealign_S (w[ 0], w[ 1], offset); - w[30] = amd_bytealign_S ( 0, w[ 0], offset); + w[63] = hc_bytealign_S (w[32], w[33], offset); + w[62] = hc_bytealign_S (w[31], w[32], offset); + w[61] = hc_bytealign_S (w[30], w[31], offset); + w[60] = hc_bytealign_S (w[29], w[30], offset); + w[59] = hc_bytealign_S (w[28], w[29], offset); + w[58] = hc_bytealign_S (w[27], w[28], offset); + w[57] = hc_bytealign_S (w[26], w[27], offset); + w[56] = hc_bytealign_S (w[25], w[26], offset); + w[55] = hc_bytealign_S (w[24], w[25], offset); + w[54] = hc_bytealign_S (w[23], w[24], offset); + w[53] = hc_bytealign_S (w[22], w[23], offset); + w[52] = hc_bytealign_S (w[21], w[22], offset); + w[51] = hc_bytealign_S (w[20], w[21], offset); + w[50] = hc_bytealign_S (w[19], w[20], offset); + w[49] = hc_bytealign_S (w[18], w[19], offset); + w[48] = hc_bytealign_S (w[17], w[18], offset); + w[47] = hc_bytealign_S (w[16], w[17], offset); + w[46] = hc_bytealign_S (w[15], w[16], offset); + w[45] = hc_bytealign_S (w[14], w[15], offset); + w[44] = hc_bytealign_S (w[13], w[14], offset); + w[43] = hc_bytealign_S (w[12], w[13], offset); + w[42] = hc_bytealign_S (w[11], w[12], offset); + w[41] = hc_bytealign_S (w[10], w[11], offset); + w[40] = hc_bytealign_S (w[ 9], w[10], offset); + w[39] = hc_bytealign_S (w[ 8], w[ 9], offset); + w[38] = hc_bytealign_S (w[ 7], w[ 8], offset); + w[37] = hc_bytealign_S (w[ 6], w[ 7], offset); + w[36] = hc_bytealign_S (w[ 5], w[ 6], offset); + w[35] = hc_bytealign_S (w[ 4], w[ 5], offset); + w[34] = hc_bytealign_S (w[ 3], w[ 4], offset); + w[33] = hc_bytealign_S (w[ 2], w[ 3], offset); + w[32] = hc_bytealign_S (w[ 1], w[ 2], offset); + w[31] = hc_bytealign_S (w[ 0], w[ 1], offset); + w[30] = hc_bytealign_S ( 0, w[ 0], offset); w[29] = 0; w[28] = 0; w[27] = 0; @@ -53194,39 +53194,39 @@ DECLSPEC void switch_buffer_by_offset_1x64_be_S (u32 *w, const u32 offset) break; case 31: - w[63] = amd_bytealign_S (w[31], w[32], offset); - w[62] = amd_bytealign_S (w[30], w[31], offset); - w[61] = amd_bytealign_S (w[29], w[30], offset); - w[60] = amd_bytealign_S (w[28], w[29], offset); - w[59] = amd_bytealign_S (w[27], w[28], offset); - w[58] = amd_bytealign_S (w[26], w[27], offset); - w[57] = amd_bytealign_S (w[25], w[26], offset); - w[56] = amd_bytealign_S (w[24], w[25], offset); - w[55] = amd_bytealign_S (w[23], w[24], offset); - w[54] = amd_bytealign_S (w[22], w[23], offset); - w[53] = amd_bytealign_S (w[21], w[22], offset); - w[52] = amd_bytealign_S (w[20], w[21], offset); - w[51] = amd_bytealign_S (w[19], w[20], offset); - w[50] = amd_bytealign_S (w[18], w[19], offset); - w[49] = amd_bytealign_S (w[17], w[18], offset); - w[48] = amd_bytealign_S (w[16], w[17], offset); - w[47] = amd_bytealign_S (w[15], w[16], offset); - w[46] = amd_bytealign_S (w[14], w[15], offset); - w[45] = amd_bytealign_S (w[13], w[14], offset); - w[44] = amd_bytealign_S (w[12], w[13], offset); - w[43] = amd_bytealign_S (w[11], w[12], offset); - w[42] = amd_bytealign_S (w[10], w[11], offset); - w[41] = amd_bytealign_S (w[ 9], w[10], offset); - w[40] = amd_bytealign_S (w[ 8], w[ 9], offset); - w[39] = amd_bytealign_S (w[ 7], w[ 8], offset); - w[38] = amd_bytealign_S (w[ 6], w[ 7], offset); - w[37] = amd_bytealign_S (w[ 5], w[ 6], offset); - w[36] = amd_bytealign_S (w[ 4], w[ 5], offset); - w[35] = amd_bytealign_S (w[ 3], w[ 4], offset); - w[34] = amd_bytealign_S (w[ 2], w[ 3], offset); - w[33] = amd_bytealign_S (w[ 1], w[ 2], offset); - w[32] = amd_bytealign_S (w[ 0], w[ 1], offset); - w[31] = amd_bytealign_S ( 0, w[ 0], offset); + w[63] = hc_bytealign_S (w[31], w[32], offset); + w[62] = hc_bytealign_S (w[30], w[31], offset); + w[61] = hc_bytealign_S (w[29], w[30], offset); + w[60] = hc_bytealign_S (w[28], w[29], offset); + w[59] = hc_bytealign_S (w[27], w[28], offset); + w[58] = hc_bytealign_S (w[26], w[27], offset); + w[57] = hc_bytealign_S (w[25], w[26], offset); + w[56] = hc_bytealign_S (w[24], w[25], offset); + w[55] = hc_bytealign_S (w[23], w[24], offset); + w[54] = hc_bytealign_S (w[22], w[23], offset); + w[53] = hc_bytealign_S (w[21], w[22], offset); + w[52] = hc_bytealign_S (w[20], w[21], offset); + w[51] = hc_bytealign_S (w[19], w[20], offset); + w[50] = hc_bytealign_S (w[18], w[19], offset); + w[49] = hc_bytealign_S (w[17], w[18], offset); + w[48] = hc_bytealign_S (w[16], w[17], offset); + w[47] = hc_bytealign_S (w[15], w[16], offset); + w[46] = hc_bytealign_S (w[14], w[15], offset); + w[45] = hc_bytealign_S (w[13], w[14], offset); + w[44] = hc_bytealign_S (w[12], w[13], offset); + w[43] = hc_bytealign_S (w[11], w[12], offset); + w[42] = hc_bytealign_S (w[10], w[11], offset); + w[41] = hc_bytealign_S (w[ 9], w[10], offset); + w[40] = hc_bytealign_S (w[ 8], w[ 9], offset); + w[39] = hc_bytealign_S (w[ 7], w[ 8], offset); + w[38] = hc_bytealign_S (w[ 6], w[ 7], offset); + w[37] = hc_bytealign_S (w[ 5], w[ 6], offset); + w[36] = hc_bytealign_S (w[ 4], w[ 5], offset); + w[35] = hc_bytealign_S (w[ 3], w[ 4], offset); + w[34] = hc_bytealign_S (w[ 2], w[ 3], offset); + w[33] = hc_bytealign_S (w[ 1], w[ 2], offset); + w[32] = hc_bytealign_S (w[ 0], w[ 1], offset); + w[31] = hc_bytealign_S ( 0, w[ 0], offset); w[30] = 0; w[29] = 0; w[28] = 0; @@ -53262,38 +53262,38 @@ DECLSPEC void switch_buffer_by_offset_1x64_be_S (u32 *w, const u32 offset) break; case 32: - w[63] = amd_bytealign_S (w[30], w[31], offset); - w[62] = amd_bytealign_S (w[29], w[30], offset); - w[61] = amd_bytealign_S (w[28], w[29], offset); - w[60] = amd_bytealign_S (w[27], w[28], offset); - w[59] = amd_bytealign_S (w[26], w[27], offset); - w[58] = amd_bytealign_S (w[25], w[26], offset); - w[57] = amd_bytealign_S (w[24], w[25], offset); - w[56] = amd_bytealign_S (w[23], w[24], offset); - w[55] = amd_bytealign_S (w[22], w[23], offset); - w[54] = amd_bytealign_S (w[21], w[22], offset); - w[53] = amd_bytealign_S (w[20], w[21], offset); - w[52] = amd_bytealign_S (w[19], w[20], offset); - w[51] = amd_bytealign_S (w[18], w[19], offset); - w[50] = amd_bytealign_S (w[17], w[18], offset); - w[49] = amd_bytealign_S (w[16], w[17], offset); - w[48] = amd_bytealign_S (w[15], w[16], offset); - w[47] = amd_bytealign_S (w[14], w[15], offset); - w[46] = amd_bytealign_S (w[13], w[14], offset); - w[45] = amd_bytealign_S (w[12], w[13], offset); - w[44] = amd_bytealign_S (w[11], w[12], offset); - w[43] = amd_bytealign_S (w[10], w[11], offset); - w[42] = amd_bytealign_S (w[ 9], w[10], offset); - w[41] = amd_bytealign_S (w[ 8], w[ 9], offset); - w[40] = amd_bytealign_S (w[ 7], w[ 8], offset); - w[39] = amd_bytealign_S (w[ 6], w[ 7], offset); - w[38] = amd_bytealign_S (w[ 5], w[ 6], offset); - w[37] = amd_bytealign_S (w[ 4], w[ 5], offset); - w[36] = amd_bytealign_S (w[ 3], w[ 4], offset); - w[35] = amd_bytealign_S (w[ 2], w[ 3], offset); - w[34] = amd_bytealign_S (w[ 1], w[ 2], offset); - w[33] = amd_bytealign_S (w[ 0], w[ 1], offset); - w[32] = amd_bytealign_S ( 0, w[ 0], offset); + w[63] = hc_bytealign_S (w[30], w[31], offset); + w[62] = hc_bytealign_S (w[29], w[30], offset); + w[61] = hc_bytealign_S (w[28], w[29], offset); + w[60] = hc_bytealign_S (w[27], w[28], offset); + w[59] = hc_bytealign_S (w[26], w[27], offset); + w[58] = hc_bytealign_S (w[25], w[26], offset); + w[57] = hc_bytealign_S (w[24], w[25], offset); + w[56] = hc_bytealign_S (w[23], w[24], offset); + w[55] = hc_bytealign_S (w[22], w[23], offset); + w[54] = hc_bytealign_S (w[21], w[22], offset); + w[53] = hc_bytealign_S (w[20], w[21], offset); + w[52] = hc_bytealign_S (w[19], w[20], offset); + w[51] = hc_bytealign_S (w[18], w[19], offset); + w[50] = hc_bytealign_S (w[17], w[18], offset); + w[49] = hc_bytealign_S (w[16], w[17], offset); + w[48] = hc_bytealign_S (w[15], w[16], offset); + w[47] = hc_bytealign_S (w[14], w[15], offset); + w[46] = hc_bytealign_S (w[13], w[14], offset); + w[45] = hc_bytealign_S (w[12], w[13], offset); + w[44] = hc_bytealign_S (w[11], w[12], offset); + w[43] = hc_bytealign_S (w[10], w[11], offset); + w[42] = hc_bytealign_S (w[ 9], w[10], offset); + w[41] = hc_bytealign_S (w[ 8], w[ 9], offset); + w[40] = hc_bytealign_S (w[ 7], w[ 8], offset); + w[39] = hc_bytealign_S (w[ 6], w[ 7], offset); + w[38] = hc_bytealign_S (w[ 5], w[ 6], offset); + w[37] = hc_bytealign_S (w[ 4], w[ 5], offset); + w[36] = hc_bytealign_S (w[ 3], w[ 4], offset); + w[35] = hc_bytealign_S (w[ 2], w[ 3], offset); + w[34] = hc_bytealign_S (w[ 1], w[ 2], offset); + w[33] = hc_bytealign_S (w[ 0], w[ 1], offset); + w[32] = hc_bytealign_S ( 0, w[ 0], offset); w[31] = 0; w[30] = 0; w[29] = 0; @@ -53330,37 +53330,37 @@ DECLSPEC void switch_buffer_by_offset_1x64_be_S (u32 *w, const u32 offset) break; case 33: - w[63] = amd_bytealign_S (w[29], w[30], offset); - w[62] = amd_bytealign_S (w[28], w[29], offset); - w[61] = amd_bytealign_S (w[27], w[28], offset); - w[60] = amd_bytealign_S (w[26], w[27], offset); - w[59] = amd_bytealign_S (w[25], w[26], offset); - w[58] = amd_bytealign_S (w[24], w[25], offset); - w[57] = amd_bytealign_S (w[23], w[24], offset); - w[56] = amd_bytealign_S (w[22], w[23], offset); - w[55] = amd_bytealign_S (w[21], w[22], offset); - w[54] = amd_bytealign_S (w[20], w[21], offset); - w[53] = amd_bytealign_S (w[19], w[20], offset); - w[52] = amd_bytealign_S (w[18], w[19], offset); - w[51] = amd_bytealign_S (w[17], w[18], offset); - w[50] = amd_bytealign_S (w[16], w[17], offset); - w[49] = amd_bytealign_S (w[15], w[16], offset); - w[48] = amd_bytealign_S (w[14], w[15], offset); - w[47] = amd_bytealign_S (w[13], w[14], offset); - w[46] = amd_bytealign_S (w[12], w[13], offset); - w[45] = amd_bytealign_S (w[11], w[12], offset); - w[44] = amd_bytealign_S (w[10], w[11], offset); - w[43] = amd_bytealign_S (w[ 9], w[10], offset); - w[42] = amd_bytealign_S (w[ 8], w[ 9], offset); - w[41] = amd_bytealign_S (w[ 7], w[ 8], offset); - w[40] = amd_bytealign_S (w[ 6], w[ 7], offset); - w[39] = amd_bytealign_S (w[ 5], w[ 6], offset); - w[38] = amd_bytealign_S (w[ 4], w[ 5], offset); - w[37] = amd_bytealign_S (w[ 3], w[ 4], offset); - w[36] = amd_bytealign_S (w[ 2], w[ 3], offset); - w[35] = amd_bytealign_S (w[ 1], w[ 2], offset); - w[34] = amd_bytealign_S (w[ 0], w[ 1], offset); - w[33] = amd_bytealign_S ( 0, w[ 0], offset); + w[63] = hc_bytealign_S (w[29], w[30], offset); + w[62] = hc_bytealign_S (w[28], w[29], offset); + w[61] = hc_bytealign_S (w[27], w[28], offset); + w[60] = hc_bytealign_S (w[26], w[27], offset); + w[59] = hc_bytealign_S (w[25], w[26], offset); + w[58] = hc_bytealign_S (w[24], w[25], offset); + w[57] = hc_bytealign_S (w[23], w[24], offset); + w[56] = hc_bytealign_S (w[22], w[23], offset); + w[55] = hc_bytealign_S (w[21], w[22], offset); + w[54] = hc_bytealign_S (w[20], w[21], offset); + w[53] = hc_bytealign_S (w[19], w[20], offset); + w[52] = hc_bytealign_S (w[18], w[19], offset); + w[51] = hc_bytealign_S (w[17], w[18], offset); + w[50] = hc_bytealign_S (w[16], w[17], offset); + w[49] = hc_bytealign_S (w[15], w[16], offset); + w[48] = hc_bytealign_S (w[14], w[15], offset); + w[47] = hc_bytealign_S (w[13], w[14], offset); + w[46] = hc_bytealign_S (w[12], w[13], offset); + w[45] = hc_bytealign_S (w[11], w[12], offset); + w[44] = hc_bytealign_S (w[10], w[11], offset); + w[43] = hc_bytealign_S (w[ 9], w[10], offset); + w[42] = hc_bytealign_S (w[ 8], w[ 9], offset); + w[41] = hc_bytealign_S (w[ 7], w[ 8], offset); + w[40] = hc_bytealign_S (w[ 6], w[ 7], offset); + w[39] = hc_bytealign_S (w[ 5], w[ 6], offset); + w[38] = hc_bytealign_S (w[ 4], w[ 5], offset); + w[37] = hc_bytealign_S (w[ 3], w[ 4], offset); + w[36] = hc_bytealign_S (w[ 2], w[ 3], offset); + w[35] = hc_bytealign_S (w[ 1], w[ 2], offset); + w[34] = hc_bytealign_S (w[ 0], w[ 1], offset); + w[33] = hc_bytealign_S ( 0, w[ 0], offset); w[32] = 0; w[31] = 0; w[30] = 0; @@ -53398,36 +53398,36 @@ DECLSPEC void switch_buffer_by_offset_1x64_be_S (u32 *w, const u32 offset) break; case 34: - w[63] = amd_bytealign_S (w[28], w[29], offset); - w[62] = amd_bytealign_S (w[27], w[28], offset); - w[61] = amd_bytealign_S (w[26], w[27], offset); - w[60] = amd_bytealign_S (w[25], w[26], offset); - w[59] = amd_bytealign_S (w[24], w[25], offset); - w[58] = amd_bytealign_S (w[23], w[24], offset); - w[57] = amd_bytealign_S (w[22], w[23], offset); - w[56] = amd_bytealign_S (w[21], w[22], offset); - w[55] = amd_bytealign_S (w[20], w[21], offset); - w[54] = amd_bytealign_S (w[19], w[20], offset); - w[53] = amd_bytealign_S (w[18], w[19], offset); - w[52] = amd_bytealign_S (w[17], w[18], offset); - w[51] = amd_bytealign_S (w[16], w[17], offset); - w[50] = amd_bytealign_S (w[15], w[16], offset); - w[49] = amd_bytealign_S (w[14], w[15], offset); - w[48] = amd_bytealign_S (w[13], w[14], offset); - w[47] = amd_bytealign_S (w[12], w[13], offset); - w[46] = amd_bytealign_S (w[11], w[12], offset); - w[45] = amd_bytealign_S (w[10], w[11], offset); - w[44] = amd_bytealign_S (w[ 9], w[10], offset); - w[43] = amd_bytealign_S (w[ 8], w[ 9], offset); - w[42] = amd_bytealign_S (w[ 7], w[ 8], offset); - w[41] = amd_bytealign_S (w[ 6], w[ 7], offset); - w[40] = amd_bytealign_S (w[ 5], w[ 6], offset); - w[39] = amd_bytealign_S (w[ 4], w[ 5], offset); - w[38] = amd_bytealign_S (w[ 3], w[ 4], offset); - w[37] = amd_bytealign_S (w[ 2], w[ 3], offset); - w[36] = amd_bytealign_S (w[ 1], w[ 2], offset); - w[35] = amd_bytealign_S (w[ 0], w[ 1], offset); - w[34] = amd_bytealign_S ( 0, w[ 0], offset); + w[63] = hc_bytealign_S (w[28], w[29], offset); + w[62] = hc_bytealign_S (w[27], w[28], offset); + w[61] = hc_bytealign_S (w[26], w[27], offset); + w[60] = hc_bytealign_S (w[25], w[26], offset); + w[59] = hc_bytealign_S (w[24], w[25], offset); + w[58] = hc_bytealign_S (w[23], w[24], offset); + w[57] = hc_bytealign_S (w[22], w[23], offset); + w[56] = hc_bytealign_S (w[21], w[22], offset); + w[55] = hc_bytealign_S (w[20], w[21], offset); + w[54] = hc_bytealign_S (w[19], w[20], offset); + w[53] = hc_bytealign_S (w[18], w[19], offset); + w[52] = hc_bytealign_S (w[17], w[18], offset); + w[51] = hc_bytealign_S (w[16], w[17], offset); + w[50] = hc_bytealign_S (w[15], w[16], offset); + w[49] = hc_bytealign_S (w[14], w[15], offset); + w[48] = hc_bytealign_S (w[13], w[14], offset); + w[47] = hc_bytealign_S (w[12], w[13], offset); + w[46] = hc_bytealign_S (w[11], w[12], offset); + w[45] = hc_bytealign_S (w[10], w[11], offset); + w[44] = hc_bytealign_S (w[ 9], w[10], offset); + w[43] = hc_bytealign_S (w[ 8], w[ 9], offset); + w[42] = hc_bytealign_S (w[ 7], w[ 8], offset); + w[41] = hc_bytealign_S (w[ 6], w[ 7], offset); + w[40] = hc_bytealign_S (w[ 5], w[ 6], offset); + w[39] = hc_bytealign_S (w[ 4], w[ 5], offset); + w[38] = hc_bytealign_S (w[ 3], w[ 4], offset); + w[37] = hc_bytealign_S (w[ 2], w[ 3], offset); + w[36] = hc_bytealign_S (w[ 1], w[ 2], offset); + w[35] = hc_bytealign_S (w[ 0], w[ 1], offset); + w[34] = hc_bytealign_S ( 0, w[ 0], offset); w[33] = 0; w[32] = 0; w[31] = 0; @@ -53466,35 +53466,35 @@ DECLSPEC void switch_buffer_by_offset_1x64_be_S (u32 *w, const u32 offset) break; case 35: - w[63] = amd_bytealign_S (w[27], w[28], offset); - w[62] = amd_bytealign_S (w[26], w[27], offset); - w[61] = amd_bytealign_S (w[25], w[26], offset); - w[60] = amd_bytealign_S (w[24], w[25], offset); - w[59] = amd_bytealign_S (w[23], w[24], offset); - w[58] = amd_bytealign_S (w[22], w[23], offset); - w[57] = amd_bytealign_S (w[21], w[22], offset); - w[56] = amd_bytealign_S (w[20], w[21], offset); - w[55] = amd_bytealign_S (w[19], w[20], offset); - w[54] = amd_bytealign_S (w[18], w[19], offset); - w[53] = amd_bytealign_S (w[17], w[18], offset); - w[52] = amd_bytealign_S (w[16], w[17], offset); - w[51] = amd_bytealign_S (w[15], w[16], offset); - w[50] = amd_bytealign_S (w[14], w[15], offset); - w[49] = amd_bytealign_S (w[13], w[14], offset); - w[48] = amd_bytealign_S (w[12], w[13], offset); - w[47] = amd_bytealign_S (w[11], w[12], offset); - w[46] = amd_bytealign_S (w[10], w[11], offset); - w[45] = amd_bytealign_S (w[ 9], w[10], offset); - w[44] = amd_bytealign_S (w[ 8], w[ 9], offset); - w[43] = amd_bytealign_S (w[ 7], w[ 8], offset); - w[42] = amd_bytealign_S (w[ 6], w[ 7], offset); - w[41] = amd_bytealign_S (w[ 5], w[ 6], offset); - w[40] = amd_bytealign_S (w[ 4], w[ 5], offset); - w[39] = amd_bytealign_S (w[ 3], w[ 4], offset); - w[38] = amd_bytealign_S (w[ 2], w[ 3], offset); - w[37] = amd_bytealign_S (w[ 1], w[ 2], offset); - w[36] = amd_bytealign_S (w[ 0], w[ 1], offset); - w[35] = amd_bytealign_S ( 0, w[ 0], offset); + w[63] = hc_bytealign_S (w[27], w[28], offset); + w[62] = hc_bytealign_S (w[26], w[27], offset); + w[61] = hc_bytealign_S (w[25], w[26], offset); + w[60] = hc_bytealign_S (w[24], w[25], offset); + w[59] = hc_bytealign_S (w[23], w[24], offset); + w[58] = hc_bytealign_S (w[22], w[23], offset); + w[57] = hc_bytealign_S (w[21], w[22], offset); + w[56] = hc_bytealign_S (w[20], w[21], offset); + w[55] = hc_bytealign_S (w[19], w[20], offset); + w[54] = hc_bytealign_S (w[18], w[19], offset); + w[53] = hc_bytealign_S (w[17], w[18], offset); + w[52] = hc_bytealign_S (w[16], w[17], offset); + w[51] = hc_bytealign_S (w[15], w[16], offset); + w[50] = hc_bytealign_S (w[14], w[15], offset); + w[49] = hc_bytealign_S (w[13], w[14], offset); + w[48] = hc_bytealign_S (w[12], w[13], offset); + w[47] = hc_bytealign_S (w[11], w[12], offset); + w[46] = hc_bytealign_S (w[10], w[11], offset); + w[45] = hc_bytealign_S (w[ 9], w[10], offset); + w[44] = hc_bytealign_S (w[ 8], w[ 9], offset); + w[43] = hc_bytealign_S (w[ 7], w[ 8], offset); + w[42] = hc_bytealign_S (w[ 6], w[ 7], offset); + w[41] = hc_bytealign_S (w[ 5], w[ 6], offset); + w[40] = hc_bytealign_S (w[ 4], w[ 5], offset); + w[39] = hc_bytealign_S (w[ 3], w[ 4], offset); + w[38] = hc_bytealign_S (w[ 2], w[ 3], offset); + w[37] = hc_bytealign_S (w[ 1], w[ 2], offset); + w[36] = hc_bytealign_S (w[ 0], w[ 1], offset); + w[35] = hc_bytealign_S ( 0, w[ 0], offset); w[34] = 0; w[33] = 0; w[32] = 0; @@ -53534,34 +53534,34 @@ DECLSPEC void switch_buffer_by_offset_1x64_be_S (u32 *w, const u32 offset) break; case 36: - w[63] = amd_bytealign_S (w[26], w[27], offset); - w[62] = amd_bytealign_S (w[25], w[26], offset); - w[61] = amd_bytealign_S (w[24], w[25], offset); - w[60] = amd_bytealign_S (w[23], w[24], offset); - w[59] = amd_bytealign_S (w[22], w[23], offset); - w[58] = amd_bytealign_S (w[21], w[22], offset); - w[57] = amd_bytealign_S (w[20], w[21], offset); - w[56] = amd_bytealign_S (w[19], w[20], offset); - w[55] = amd_bytealign_S (w[18], w[19], offset); - w[54] = amd_bytealign_S (w[17], w[18], offset); - w[53] = amd_bytealign_S (w[16], w[17], offset); - w[52] = amd_bytealign_S (w[15], w[16], offset); - w[51] = amd_bytealign_S (w[14], w[15], offset); - w[50] = amd_bytealign_S (w[13], w[14], offset); - w[49] = amd_bytealign_S (w[12], w[13], offset); - w[48] = amd_bytealign_S (w[11], w[12], offset); - w[47] = amd_bytealign_S (w[10], w[11], offset); - w[46] = amd_bytealign_S (w[ 9], w[10], offset); - w[45] = amd_bytealign_S (w[ 8], w[ 9], offset); - w[44] = amd_bytealign_S (w[ 7], w[ 8], offset); - w[43] = amd_bytealign_S (w[ 6], w[ 7], offset); - w[42] = amd_bytealign_S (w[ 5], w[ 6], offset); - w[41] = amd_bytealign_S (w[ 4], w[ 5], offset); - w[40] = amd_bytealign_S (w[ 3], w[ 4], offset); - w[39] = amd_bytealign_S (w[ 2], w[ 3], offset); - w[38] = amd_bytealign_S (w[ 1], w[ 2], offset); - w[37] = amd_bytealign_S (w[ 0], w[ 1], offset); - w[36] = amd_bytealign_S ( 0, w[ 0], offset); + w[63] = hc_bytealign_S (w[26], w[27], offset); + w[62] = hc_bytealign_S (w[25], w[26], offset); + w[61] = hc_bytealign_S (w[24], w[25], offset); + w[60] = hc_bytealign_S (w[23], w[24], offset); + w[59] = hc_bytealign_S (w[22], w[23], offset); + w[58] = hc_bytealign_S (w[21], w[22], offset); + w[57] = hc_bytealign_S (w[20], w[21], offset); + w[56] = hc_bytealign_S (w[19], w[20], offset); + w[55] = hc_bytealign_S (w[18], w[19], offset); + w[54] = hc_bytealign_S (w[17], w[18], offset); + w[53] = hc_bytealign_S (w[16], w[17], offset); + w[52] = hc_bytealign_S (w[15], w[16], offset); + w[51] = hc_bytealign_S (w[14], w[15], offset); + w[50] = hc_bytealign_S (w[13], w[14], offset); + w[49] = hc_bytealign_S (w[12], w[13], offset); + w[48] = hc_bytealign_S (w[11], w[12], offset); + w[47] = hc_bytealign_S (w[10], w[11], offset); + w[46] = hc_bytealign_S (w[ 9], w[10], offset); + w[45] = hc_bytealign_S (w[ 8], w[ 9], offset); + w[44] = hc_bytealign_S (w[ 7], w[ 8], offset); + w[43] = hc_bytealign_S (w[ 6], w[ 7], offset); + w[42] = hc_bytealign_S (w[ 5], w[ 6], offset); + w[41] = hc_bytealign_S (w[ 4], w[ 5], offset); + w[40] = hc_bytealign_S (w[ 3], w[ 4], offset); + w[39] = hc_bytealign_S (w[ 2], w[ 3], offset); + w[38] = hc_bytealign_S (w[ 1], w[ 2], offset); + w[37] = hc_bytealign_S (w[ 0], w[ 1], offset); + w[36] = hc_bytealign_S ( 0, w[ 0], offset); w[35] = 0; w[34] = 0; w[33] = 0; @@ -53602,33 +53602,33 @@ DECLSPEC void switch_buffer_by_offset_1x64_be_S (u32 *w, const u32 offset) break; case 37: - w[63] = amd_bytealign_S (w[25], w[26], offset); - w[62] = amd_bytealign_S (w[24], w[25], offset); - w[61] = amd_bytealign_S (w[23], w[24], offset); - w[60] = amd_bytealign_S (w[22], w[23], offset); - w[59] = amd_bytealign_S (w[21], w[22], offset); - w[58] = amd_bytealign_S (w[20], w[21], offset); - w[57] = amd_bytealign_S (w[19], w[20], offset); - w[56] = amd_bytealign_S (w[18], w[19], offset); - w[55] = amd_bytealign_S (w[17], w[18], offset); - w[54] = amd_bytealign_S (w[16], w[17], offset); - w[53] = amd_bytealign_S (w[15], w[16], offset); - w[52] = amd_bytealign_S (w[14], w[15], offset); - w[51] = amd_bytealign_S (w[13], w[14], offset); - w[50] = amd_bytealign_S (w[12], w[13], offset); - w[49] = amd_bytealign_S (w[11], w[12], offset); - w[48] = amd_bytealign_S (w[10], w[11], offset); - w[47] = amd_bytealign_S (w[ 9], w[10], offset); - w[46] = amd_bytealign_S (w[ 8], w[ 9], offset); - w[45] = amd_bytealign_S (w[ 7], w[ 8], offset); - w[44] = amd_bytealign_S (w[ 6], w[ 7], offset); - w[43] = amd_bytealign_S (w[ 5], w[ 6], offset); - w[42] = amd_bytealign_S (w[ 4], w[ 5], offset); - w[41] = amd_bytealign_S (w[ 3], w[ 4], offset); - w[40] = amd_bytealign_S (w[ 2], w[ 3], offset); - w[39] = amd_bytealign_S (w[ 1], w[ 2], offset); - w[38] = amd_bytealign_S (w[ 0], w[ 1], offset); - w[37] = amd_bytealign_S ( 0, w[ 0], offset); + w[63] = hc_bytealign_S (w[25], w[26], offset); + w[62] = hc_bytealign_S (w[24], w[25], offset); + w[61] = hc_bytealign_S (w[23], w[24], offset); + w[60] = hc_bytealign_S (w[22], w[23], offset); + w[59] = hc_bytealign_S (w[21], w[22], offset); + w[58] = hc_bytealign_S (w[20], w[21], offset); + w[57] = hc_bytealign_S (w[19], w[20], offset); + w[56] = hc_bytealign_S (w[18], w[19], offset); + w[55] = hc_bytealign_S (w[17], w[18], offset); + w[54] = hc_bytealign_S (w[16], w[17], offset); + w[53] = hc_bytealign_S (w[15], w[16], offset); + w[52] = hc_bytealign_S (w[14], w[15], offset); + w[51] = hc_bytealign_S (w[13], w[14], offset); + w[50] = hc_bytealign_S (w[12], w[13], offset); + w[49] = hc_bytealign_S (w[11], w[12], offset); + w[48] = hc_bytealign_S (w[10], w[11], offset); + w[47] = hc_bytealign_S (w[ 9], w[10], offset); + w[46] = hc_bytealign_S (w[ 8], w[ 9], offset); + w[45] = hc_bytealign_S (w[ 7], w[ 8], offset); + w[44] = hc_bytealign_S (w[ 6], w[ 7], offset); + w[43] = hc_bytealign_S (w[ 5], w[ 6], offset); + w[42] = hc_bytealign_S (w[ 4], w[ 5], offset); + w[41] = hc_bytealign_S (w[ 3], w[ 4], offset); + w[40] = hc_bytealign_S (w[ 2], w[ 3], offset); + w[39] = hc_bytealign_S (w[ 1], w[ 2], offset); + w[38] = hc_bytealign_S (w[ 0], w[ 1], offset); + w[37] = hc_bytealign_S ( 0, w[ 0], offset); w[36] = 0; w[35] = 0; w[34] = 0; @@ -53670,32 +53670,32 @@ DECLSPEC void switch_buffer_by_offset_1x64_be_S (u32 *w, const u32 offset) break; case 38: - w[63] = amd_bytealign_S (w[24], w[25], offset); - w[62] = amd_bytealign_S (w[23], w[24], offset); - w[61] = amd_bytealign_S (w[22], w[23], offset); - w[60] = amd_bytealign_S (w[21], w[22], offset); - w[59] = amd_bytealign_S (w[20], w[21], offset); - w[58] = amd_bytealign_S (w[19], w[20], offset); - w[57] = amd_bytealign_S (w[18], w[19], offset); - w[56] = amd_bytealign_S (w[17], w[18], offset); - w[55] = amd_bytealign_S (w[16], w[17], offset); - w[54] = amd_bytealign_S (w[15], w[16], offset); - w[53] = amd_bytealign_S (w[14], w[15], offset); - w[52] = amd_bytealign_S (w[13], w[14], offset); - w[51] = amd_bytealign_S (w[12], w[13], offset); - w[50] = amd_bytealign_S (w[11], w[12], offset); - w[49] = amd_bytealign_S (w[10], w[11], offset); - w[48] = amd_bytealign_S (w[ 9], w[10], offset); - w[47] = amd_bytealign_S (w[ 8], w[ 9], offset); - w[46] = amd_bytealign_S (w[ 7], w[ 8], offset); - w[45] = amd_bytealign_S (w[ 6], w[ 7], offset); - w[44] = amd_bytealign_S (w[ 5], w[ 6], offset); - w[43] = amd_bytealign_S (w[ 4], w[ 5], offset); - w[42] = amd_bytealign_S (w[ 3], w[ 4], offset); - w[41] = amd_bytealign_S (w[ 2], w[ 3], offset); - w[40] = amd_bytealign_S (w[ 1], w[ 2], offset); - w[39] = amd_bytealign_S (w[ 0], w[ 1], offset); - w[38] = amd_bytealign_S ( 0, w[ 0], offset); + w[63] = hc_bytealign_S (w[24], w[25], offset); + w[62] = hc_bytealign_S (w[23], w[24], offset); + w[61] = hc_bytealign_S (w[22], w[23], offset); + w[60] = hc_bytealign_S (w[21], w[22], offset); + w[59] = hc_bytealign_S (w[20], w[21], offset); + w[58] = hc_bytealign_S (w[19], w[20], offset); + w[57] = hc_bytealign_S (w[18], w[19], offset); + w[56] = hc_bytealign_S (w[17], w[18], offset); + w[55] = hc_bytealign_S (w[16], w[17], offset); + w[54] = hc_bytealign_S (w[15], w[16], offset); + w[53] = hc_bytealign_S (w[14], w[15], offset); + w[52] = hc_bytealign_S (w[13], w[14], offset); + w[51] = hc_bytealign_S (w[12], w[13], offset); + w[50] = hc_bytealign_S (w[11], w[12], offset); + w[49] = hc_bytealign_S (w[10], w[11], offset); + w[48] = hc_bytealign_S (w[ 9], w[10], offset); + w[47] = hc_bytealign_S (w[ 8], w[ 9], offset); + w[46] = hc_bytealign_S (w[ 7], w[ 8], offset); + w[45] = hc_bytealign_S (w[ 6], w[ 7], offset); + w[44] = hc_bytealign_S (w[ 5], w[ 6], offset); + w[43] = hc_bytealign_S (w[ 4], w[ 5], offset); + w[42] = hc_bytealign_S (w[ 3], w[ 4], offset); + w[41] = hc_bytealign_S (w[ 2], w[ 3], offset); + w[40] = hc_bytealign_S (w[ 1], w[ 2], offset); + w[39] = hc_bytealign_S (w[ 0], w[ 1], offset); + w[38] = hc_bytealign_S ( 0, w[ 0], offset); w[37] = 0; w[36] = 0; w[35] = 0; @@ -53738,31 +53738,31 @@ DECLSPEC void switch_buffer_by_offset_1x64_be_S (u32 *w, const u32 offset) break; case 39: - w[63] = amd_bytealign_S (w[23], w[24], offset); - w[62] = amd_bytealign_S (w[22], w[23], offset); - w[61] = amd_bytealign_S (w[21], w[22], offset); - w[60] = amd_bytealign_S (w[20], w[21], offset); - w[59] = amd_bytealign_S (w[19], w[20], offset); - w[58] = amd_bytealign_S (w[18], w[19], offset); - w[57] = amd_bytealign_S (w[17], w[18], offset); - w[56] = amd_bytealign_S (w[16], w[17], offset); - w[55] = amd_bytealign_S (w[15], w[16], offset); - w[54] = amd_bytealign_S (w[14], w[15], offset); - w[53] = amd_bytealign_S (w[13], w[14], offset); - w[52] = amd_bytealign_S (w[12], w[13], offset); - w[51] = amd_bytealign_S (w[11], w[12], offset); - w[50] = amd_bytealign_S (w[10], w[11], offset); - w[49] = amd_bytealign_S (w[ 9], w[10], offset); - w[48] = amd_bytealign_S (w[ 8], w[ 9], offset); - w[47] = amd_bytealign_S (w[ 7], w[ 8], offset); - w[46] = amd_bytealign_S (w[ 6], w[ 7], offset); - w[45] = amd_bytealign_S (w[ 5], w[ 6], offset); - w[44] = amd_bytealign_S (w[ 4], w[ 5], offset); - w[43] = amd_bytealign_S (w[ 3], w[ 4], offset); - w[42] = amd_bytealign_S (w[ 2], w[ 3], offset); - w[41] = amd_bytealign_S (w[ 1], w[ 2], offset); - w[40] = amd_bytealign_S (w[ 0], w[ 1], offset); - w[39] = amd_bytealign_S ( 0, w[ 0], offset); + w[63] = hc_bytealign_S (w[23], w[24], offset); + w[62] = hc_bytealign_S (w[22], w[23], offset); + w[61] = hc_bytealign_S (w[21], w[22], offset); + w[60] = hc_bytealign_S (w[20], w[21], offset); + w[59] = hc_bytealign_S (w[19], w[20], offset); + w[58] = hc_bytealign_S (w[18], w[19], offset); + w[57] = hc_bytealign_S (w[17], w[18], offset); + w[56] = hc_bytealign_S (w[16], w[17], offset); + w[55] = hc_bytealign_S (w[15], w[16], offset); + w[54] = hc_bytealign_S (w[14], w[15], offset); + w[53] = hc_bytealign_S (w[13], w[14], offset); + w[52] = hc_bytealign_S (w[12], w[13], offset); + w[51] = hc_bytealign_S (w[11], w[12], offset); + w[50] = hc_bytealign_S (w[10], w[11], offset); + w[49] = hc_bytealign_S (w[ 9], w[10], offset); + w[48] = hc_bytealign_S (w[ 8], w[ 9], offset); + w[47] = hc_bytealign_S (w[ 7], w[ 8], offset); + w[46] = hc_bytealign_S (w[ 6], w[ 7], offset); + w[45] = hc_bytealign_S (w[ 5], w[ 6], offset); + w[44] = hc_bytealign_S (w[ 4], w[ 5], offset); + w[43] = hc_bytealign_S (w[ 3], w[ 4], offset); + w[42] = hc_bytealign_S (w[ 2], w[ 3], offset); + w[41] = hc_bytealign_S (w[ 1], w[ 2], offset); + w[40] = hc_bytealign_S (w[ 0], w[ 1], offset); + w[39] = hc_bytealign_S ( 0, w[ 0], offset); w[38] = 0; w[37] = 0; w[36] = 0; @@ -53806,30 +53806,30 @@ DECLSPEC void switch_buffer_by_offset_1x64_be_S (u32 *w, const u32 offset) break; case 40: - w[63] = amd_bytealign_S (w[22], w[23], offset); - w[62] = amd_bytealign_S (w[21], w[22], offset); - w[61] = amd_bytealign_S (w[20], w[21], offset); - w[60] = amd_bytealign_S (w[19], w[20], offset); - w[59] = amd_bytealign_S (w[18], w[19], offset); - w[58] = amd_bytealign_S (w[17], w[18], offset); - w[57] = amd_bytealign_S (w[16], w[17], offset); - w[56] = amd_bytealign_S (w[15], w[16], offset); - w[55] = amd_bytealign_S (w[14], w[15], offset); - w[54] = amd_bytealign_S (w[13], w[14], offset); - w[53] = amd_bytealign_S (w[12], w[13], offset); - w[52] = amd_bytealign_S (w[11], w[12], offset); - w[51] = amd_bytealign_S (w[10], w[11], offset); - w[50] = amd_bytealign_S (w[ 9], w[10], offset); - w[49] = amd_bytealign_S (w[ 8], w[ 9], offset); - w[48] = amd_bytealign_S (w[ 7], w[ 8], offset); - w[47] = amd_bytealign_S (w[ 6], w[ 7], offset); - w[46] = amd_bytealign_S (w[ 5], w[ 6], offset); - w[45] = amd_bytealign_S (w[ 4], w[ 5], offset); - w[44] = amd_bytealign_S (w[ 3], w[ 4], offset); - w[43] = amd_bytealign_S (w[ 2], w[ 3], offset); - w[42] = amd_bytealign_S (w[ 1], w[ 2], offset); - w[41] = amd_bytealign_S (w[ 0], w[ 1], offset); - w[40] = amd_bytealign_S ( 0, w[ 0], offset); + w[63] = hc_bytealign_S (w[22], w[23], offset); + w[62] = hc_bytealign_S (w[21], w[22], offset); + w[61] = hc_bytealign_S (w[20], w[21], offset); + w[60] = hc_bytealign_S (w[19], w[20], offset); + w[59] = hc_bytealign_S (w[18], w[19], offset); + w[58] = hc_bytealign_S (w[17], w[18], offset); + w[57] = hc_bytealign_S (w[16], w[17], offset); + w[56] = hc_bytealign_S (w[15], w[16], offset); + w[55] = hc_bytealign_S (w[14], w[15], offset); + w[54] = hc_bytealign_S (w[13], w[14], offset); + w[53] = hc_bytealign_S (w[12], w[13], offset); + w[52] = hc_bytealign_S (w[11], w[12], offset); + w[51] = hc_bytealign_S (w[10], w[11], offset); + w[50] = hc_bytealign_S (w[ 9], w[10], offset); + w[49] = hc_bytealign_S (w[ 8], w[ 9], offset); + w[48] = hc_bytealign_S (w[ 7], w[ 8], offset); + w[47] = hc_bytealign_S (w[ 6], w[ 7], offset); + w[46] = hc_bytealign_S (w[ 5], w[ 6], offset); + w[45] = hc_bytealign_S (w[ 4], w[ 5], offset); + w[44] = hc_bytealign_S (w[ 3], w[ 4], offset); + w[43] = hc_bytealign_S (w[ 2], w[ 3], offset); + w[42] = hc_bytealign_S (w[ 1], w[ 2], offset); + w[41] = hc_bytealign_S (w[ 0], w[ 1], offset); + w[40] = hc_bytealign_S ( 0, w[ 0], offset); w[39] = 0; w[38] = 0; w[37] = 0; @@ -53874,29 +53874,29 @@ DECLSPEC void switch_buffer_by_offset_1x64_be_S (u32 *w, const u32 offset) break; case 41: - w[63] = amd_bytealign_S (w[21], w[22], offset); - w[62] = amd_bytealign_S (w[20], w[21], offset); - w[61] = amd_bytealign_S (w[19], w[20], offset); - w[60] = amd_bytealign_S (w[18], w[19], offset); - w[59] = amd_bytealign_S (w[17], w[18], offset); - w[58] = amd_bytealign_S (w[16], w[17], offset); - w[57] = amd_bytealign_S (w[15], w[16], offset); - w[56] = amd_bytealign_S (w[14], w[15], offset); - w[55] = amd_bytealign_S (w[13], w[14], offset); - w[54] = amd_bytealign_S (w[12], w[13], offset); - w[53] = amd_bytealign_S (w[11], w[12], offset); - w[52] = amd_bytealign_S (w[10], w[11], offset); - w[51] = amd_bytealign_S (w[ 9], w[10], offset); - w[50] = amd_bytealign_S (w[ 8], w[ 9], offset); - w[49] = amd_bytealign_S (w[ 7], w[ 8], offset); - w[48] = amd_bytealign_S (w[ 6], w[ 7], offset); - w[47] = amd_bytealign_S (w[ 5], w[ 6], offset); - w[46] = amd_bytealign_S (w[ 4], w[ 5], offset); - w[45] = amd_bytealign_S (w[ 3], w[ 4], offset); - w[44] = amd_bytealign_S (w[ 2], w[ 3], offset); - w[43] = amd_bytealign_S (w[ 1], w[ 2], offset); - w[42] = amd_bytealign_S (w[ 0], w[ 1], offset); - w[41] = amd_bytealign_S ( 0, w[ 0], offset); + w[63] = hc_bytealign_S (w[21], w[22], offset); + w[62] = hc_bytealign_S (w[20], w[21], offset); + w[61] = hc_bytealign_S (w[19], w[20], offset); + w[60] = hc_bytealign_S (w[18], w[19], offset); + w[59] = hc_bytealign_S (w[17], w[18], offset); + w[58] = hc_bytealign_S (w[16], w[17], offset); + w[57] = hc_bytealign_S (w[15], w[16], offset); + w[56] = hc_bytealign_S (w[14], w[15], offset); + w[55] = hc_bytealign_S (w[13], w[14], offset); + w[54] = hc_bytealign_S (w[12], w[13], offset); + w[53] = hc_bytealign_S (w[11], w[12], offset); + w[52] = hc_bytealign_S (w[10], w[11], offset); + w[51] = hc_bytealign_S (w[ 9], w[10], offset); + w[50] = hc_bytealign_S (w[ 8], w[ 9], offset); + w[49] = hc_bytealign_S (w[ 7], w[ 8], offset); + w[48] = hc_bytealign_S (w[ 6], w[ 7], offset); + w[47] = hc_bytealign_S (w[ 5], w[ 6], offset); + w[46] = hc_bytealign_S (w[ 4], w[ 5], offset); + w[45] = hc_bytealign_S (w[ 3], w[ 4], offset); + w[44] = hc_bytealign_S (w[ 2], w[ 3], offset); + w[43] = hc_bytealign_S (w[ 1], w[ 2], offset); + w[42] = hc_bytealign_S (w[ 0], w[ 1], offset); + w[41] = hc_bytealign_S ( 0, w[ 0], offset); w[40] = 0; w[39] = 0; w[38] = 0; @@ -53942,28 +53942,28 @@ DECLSPEC void switch_buffer_by_offset_1x64_be_S (u32 *w, const u32 offset) break; case 42: - w[63] = amd_bytealign_S (w[20], w[21], offset); - w[62] = amd_bytealign_S (w[19], w[20], offset); - w[61] = amd_bytealign_S (w[18], w[19], offset); - w[60] = amd_bytealign_S (w[17], w[18], offset); - w[59] = amd_bytealign_S (w[16], w[17], offset); - w[58] = amd_bytealign_S (w[15], w[16], offset); - w[57] = amd_bytealign_S (w[14], w[15], offset); - w[56] = amd_bytealign_S (w[13], w[14], offset); - w[55] = amd_bytealign_S (w[12], w[13], offset); - w[54] = amd_bytealign_S (w[11], w[12], offset); - w[53] = amd_bytealign_S (w[10], w[11], offset); - w[52] = amd_bytealign_S (w[ 9], w[10], offset); - w[51] = amd_bytealign_S (w[ 8], w[ 9], offset); - w[50] = amd_bytealign_S (w[ 7], w[ 8], offset); - w[49] = amd_bytealign_S (w[ 6], w[ 7], offset); - w[48] = amd_bytealign_S (w[ 5], w[ 6], offset); - w[47] = amd_bytealign_S (w[ 4], w[ 5], offset); - w[46] = amd_bytealign_S (w[ 3], w[ 4], offset); - w[45] = amd_bytealign_S (w[ 2], w[ 3], offset); - w[44] = amd_bytealign_S (w[ 1], w[ 2], offset); - w[43] = amd_bytealign_S (w[ 0], w[ 1], offset); - w[42] = amd_bytealign_S ( 0, w[ 0], offset); + w[63] = hc_bytealign_S (w[20], w[21], offset); + w[62] = hc_bytealign_S (w[19], w[20], offset); + w[61] = hc_bytealign_S (w[18], w[19], offset); + w[60] = hc_bytealign_S (w[17], w[18], offset); + w[59] = hc_bytealign_S (w[16], w[17], offset); + w[58] = hc_bytealign_S (w[15], w[16], offset); + w[57] = hc_bytealign_S (w[14], w[15], offset); + w[56] = hc_bytealign_S (w[13], w[14], offset); + w[55] = hc_bytealign_S (w[12], w[13], offset); + w[54] = hc_bytealign_S (w[11], w[12], offset); + w[53] = hc_bytealign_S (w[10], w[11], offset); + w[52] = hc_bytealign_S (w[ 9], w[10], offset); + w[51] = hc_bytealign_S (w[ 8], w[ 9], offset); + w[50] = hc_bytealign_S (w[ 7], w[ 8], offset); + w[49] = hc_bytealign_S (w[ 6], w[ 7], offset); + w[48] = hc_bytealign_S (w[ 5], w[ 6], offset); + w[47] = hc_bytealign_S (w[ 4], w[ 5], offset); + w[46] = hc_bytealign_S (w[ 3], w[ 4], offset); + w[45] = hc_bytealign_S (w[ 2], w[ 3], offset); + w[44] = hc_bytealign_S (w[ 1], w[ 2], offset); + w[43] = hc_bytealign_S (w[ 0], w[ 1], offset); + w[42] = hc_bytealign_S ( 0, w[ 0], offset); w[41] = 0; w[40] = 0; w[39] = 0; @@ -54010,27 +54010,27 @@ DECLSPEC void switch_buffer_by_offset_1x64_be_S (u32 *w, const u32 offset) break; case 43: - w[63] = amd_bytealign_S (w[19], w[20], offset); - w[62] = amd_bytealign_S (w[18], w[19], offset); - w[61] = amd_bytealign_S (w[17], w[18], offset); - w[60] = amd_bytealign_S (w[16], w[17], offset); - w[59] = amd_bytealign_S (w[15], w[16], offset); - w[58] = amd_bytealign_S (w[14], w[15], offset); - w[57] = amd_bytealign_S (w[13], w[14], offset); - w[56] = amd_bytealign_S (w[12], w[13], offset); - w[55] = amd_bytealign_S (w[11], w[12], offset); - w[54] = amd_bytealign_S (w[10], w[11], offset); - w[53] = amd_bytealign_S (w[ 9], w[10], offset); - w[52] = amd_bytealign_S (w[ 8], w[ 9], offset); - w[51] = amd_bytealign_S (w[ 7], w[ 8], offset); - w[50] = amd_bytealign_S (w[ 6], w[ 7], offset); - w[49] = amd_bytealign_S (w[ 5], w[ 6], offset); - w[48] = amd_bytealign_S (w[ 4], w[ 5], offset); - w[47] = amd_bytealign_S (w[ 3], w[ 4], offset); - w[46] = amd_bytealign_S (w[ 2], w[ 3], offset); - w[45] = amd_bytealign_S (w[ 1], w[ 2], offset); - w[44] = amd_bytealign_S (w[ 0], w[ 1], offset); - w[43] = amd_bytealign_S ( 0, w[ 0], offset); + w[63] = hc_bytealign_S (w[19], w[20], offset); + w[62] = hc_bytealign_S (w[18], w[19], offset); + w[61] = hc_bytealign_S (w[17], w[18], offset); + w[60] = hc_bytealign_S (w[16], w[17], offset); + w[59] = hc_bytealign_S (w[15], w[16], offset); + w[58] = hc_bytealign_S (w[14], w[15], offset); + w[57] = hc_bytealign_S (w[13], w[14], offset); + w[56] = hc_bytealign_S (w[12], w[13], offset); + w[55] = hc_bytealign_S (w[11], w[12], offset); + w[54] = hc_bytealign_S (w[10], w[11], offset); + w[53] = hc_bytealign_S (w[ 9], w[10], offset); + w[52] = hc_bytealign_S (w[ 8], w[ 9], offset); + w[51] = hc_bytealign_S (w[ 7], w[ 8], offset); + w[50] = hc_bytealign_S (w[ 6], w[ 7], offset); + w[49] = hc_bytealign_S (w[ 5], w[ 6], offset); + w[48] = hc_bytealign_S (w[ 4], w[ 5], offset); + w[47] = hc_bytealign_S (w[ 3], w[ 4], offset); + w[46] = hc_bytealign_S (w[ 2], w[ 3], offset); + w[45] = hc_bytealign_S (w[ 1], w[ 2], offset); + w[44] = hc_bytealign_S (w[ 0], w[ 1], offset); + w[43] = hc_bytealign_S ( 0, w[ 0], offset); w[42] = 0; w[41] = 0; w[40] = 0; @@ -54078,26 +54078,26 @@ DECLSPEC void switch_buffer_by_offset_1x64_be_S (u32 *w, const u32 offset) break; case 44: - w[63] = amd_bytealign_S (w[18], w[19], offset); - w[62] = amd_bytealign_S (w[17], w[18], offset); - w[61] = amd_bytealign_S (w[16], w[17], offset); - w[60] = amd_bytealign_S (w[15], w[16], offset); - w[59] = amd_bytealign_S (w[14], w[15], offset); - w[58] = amd_bytealign_S (w[13], w[14], offset); - w[57] = amd_bytealign_S (w[12], w[13], offset); - w[56] = amd_bytealign_S (w[11], w[12], offset); - w[55] = amd_bytealign_S (w[10], w[11], offset); - w[54] = amd_bytealign_S (w[ 9], w[10], offset); - w[53] = amd_bytealign_S (w[ 8], w[ 9], offset); - w[52] = amd_bytealign_S (w[ 7], w[ 8], offset); - w[51] = amd_bytealign_S (w[ 6], w[ 7], offset); - w[50] = amd_bytealign_S (w[ 5], w[ 6], offset); - w[49] = amd_bytealign_S (w[ 4], w[ 5], offset); - w[48] = amd_bytealign_S (w[ 3], w[ 4], offset); - w[47] = amd_bytealign_S (w[ 2], w[ 3], offset); - w[46] = amd_bytealign_S (w[ 1], w[ 2], offset); - w[45] = amd_bytealign_S (w[ 0], w[ 1], offset); - w[44] = amd_bytealign_S ( 0, w[ 0], offset); + w[63] = hc_bytealign_S (w[18], w[19], offset); + w[62] = hc_bytealign_S (w[17], w[18], offset); + w[61] = hc_bytealign_S (w[16], w[17], offset); + w[60] = hc_bytealign_S (w[15], w[16], offset); + w[59] = hc_bytealign_S (w[14], w[15], offset); + w[58] = hc_bytealign_S (w[13], w[14], offset); + w[57] = hc_bytealign_S (w[12], w[13], offset); + w[56] = hc_bytealign_S (w[11], w[12], offset); + w[55] = hc_bytealign_S (w[10], w[11], offset); + w[54] = hc_bytealign_S (w[ 9], w[10], offset); + w[53] = hc_bytealign_S (w[ 8], w[ 9], offset); + w[52] = hc_bytealign_S (w[ 7], w[ 8], offset); + w[51] = hc_bytealign_S (w[ 6], w[ 7], offset); + w[50] = hc_bytealign_S (w[ 5], w[ 6], offset); + w[49] = hc_bytealign_S (w[ 4], w[ 5], offset); + w[48] = hc_bytealign_S (w[ 3], w[ 4], offset); + w[47] = hc_bytealign_S (w[ 2], w[ 3], offset); + w[46] = hc_bytealign_S (w[ 1], w[ 2], offset); + w[45] = hc_bytealign_S (w[ 0], w[ 1], offset); + w[44] = hc_bytealign_S ( 0, w[ 0], offset); w[43] = 0; w[42] = 0; w[41] = 0; @@ -54146,25 +54146,25 @@ DECLSPEC void switch_buffer_by_offset_1x64_be_S (u32 *w, const u32 offset) break; case 45: - w[63] = amd_bytealign_S (w[17], w[18], offset); - w[62] = amd_bytealign_S (w[16], w[17], offset); - w[61] = amd_bytealign_S (w[15], w[16], offset); - w[60] = amd_bytealign_S (w[14], w[15], offset); - w[59] = amd_bytealign_S (w[13], w[14], offset); - w[58] = amd_bytealign_S (w[12], w[13], offset); - w[57] = amd_bytealign_S (w[11], w[12], offset); - w[56] = amd_bytealign_S (w[10], w[11], offset); - w[55] = amd_bytealign_S (w[ 9], w[10], offset); - w[54] = amd_bytealign_S (w[ 8], w[ 9], offset); - w[53] = amd_bytealign_S (w[ 7], w[ 8], offset); - w[52] = amd_bytealign_S (w[ 6], w[ 7], offset); - w[51] = amd_bytealign_S (w[ 5], w[ 6], offset); - w[50] = amd_bytealign_S (w[ 4], w[ 5], offset); - w[49] = amd_bytealign_S (w[ 3], w[ 4], offset); - w[48] = amd_bytealign_S (w[ 2], w[ 3], offset); - w[47] = amd_bytealign_S (w[ 1], w[ 2], offset); - w[46] = amd_bytealign_S (w[ 0], w[ 1], offset); - w[45] = amd_bytealign_S ( 0, w[ 0], offset); + w[63] = hc_bytealign_S (w[17], w[18], offset); + w[62] = hc_bytealign_S (w[16], w[17], offset); + w[61] = hc_bytealign_S (w[15], w[16], offset); + w[60] = hc_bytealign_S (w[14], w[15], offset); + w[59] = hc_bytealign_S (w[13], w[14], offset); + w[58] = hc_bytealign_S (w[12], w[13], offset); + w[57] = hc_bytealign_S (w[11], w[12], offset); + w[56] = hc_bytealign_S (w[10], w[11], offset); + w[55] = hc_bytealign_S (w[ 9], w[10], offset); + w[54] = hc_bytealign_S (w[ 8], w[ 9], offset); + w[53] = hc_bytealign_S (w[ 7], w[ 8], offset); + w[52] = hc_bytealign_S (w[ 6], w[ 7], offset); + w[51] = hc_bytealign_S (w[ 5], w[ 6], offset); + w[50] = hc_bytealign_S (w[ 4], w[ 5], offset); + w[49] = hc_bytealign_S (w[ 3], w[ 4], offset); + w[48] = hc_bytealign_S (w[ 2], w[ 3], offset); + w[47] = hc_bytealign_S (w[ 1], w[ 2], offset); + w[46] = hc_bytealign_S (w[ 0], w[ 1], offset); + w[45] = hc_bytealign_S ( 0, w[ 0], offset); w[44] = 0; w[43] = 0; w[42] = 0; @@ -54214,24 +54214,24 @@ DECLSPEC void switch_buffer_by_offset_1x64_be_S (u32 *w, const u32 offset) break; case 46: - w[63] = amd_bytealign_S (w[16], w[17], offset); - w[62] = amd_bytealign_S (w[15], w[16], offset); - w[61] = amd_bytealign_S (w[14], w[15], offset); - w[60] = amd_bytealign_S (w[13], w[14], offset); - w[59] = amd_bytealign_S (w[12], w[13], offset); - w[58] = amd_bytealign_S (w[11], w[12], offset); - w[57] = amd_bytealign_S (w[10], w[11], offset); - w[56] = amd_bytealign_S (w[ 9], w[10], offset); - w[55] = amd_bytealign_S (w[ 8], w[ 9], offset); - w[54] = amd_bytealign_S (w[ 7], w[ 8], offset); - w[53] = amd_bytealign_S (w[ 6], w[ 7], offset); - w[52] = amd_bytealign_S (w[ 5], w[ 6], offset); - w[51] = amd_bytealign_S (w[ 4], w[ 5], offset); - w[50] = amd_bytealign_S (w[ 3], w[ 4], offset); - w[49] = amd_bytealign_S (w[ 2], w[ 3], offset); - w[48] = amd_bytealign_S (w[ 1], w[ 2], offset); - w[47] = amd_bytealign_S (w[ 0], w[ 1], offset); - w[46] = amd_bytealign_S ( 0, w[ 0], offset); + w[63] = hc_bytealign_S (w[16], w[17], offset); + w[62] = hc_bytealign_S (w[15], w[16], offset); + w[61] = hc_bytealign_S (w[14], w[15], offset); + w[60] = hc_bytealign_S (w[13], w[14], offset); + w[59] = hc_bytealign_S (w[12], w[13], offset); + w[58] = hc_bytealign_S (w[11], w[12], offset); + w[57] = hc_bytealign_S (w[10], w[11], offset); + w[56] = hc_bytealign_S (w[ 9], w[10], offset); + w[55] = hc_bytealign_S (w[ 8], w[ 9], offset); + w[54] = hc_bytealign_S (w[ 7], w[ 8], offset); + w[53] = hc_bytealign_S (w[ 6], w[ 7], offset); + w[52] = hc_bytealign_S (w[ 5], w[ 6], offset); + w[51] = hc_bytealign_S (w[ 4], w[ 5], offset); + w[50] = hc_bytealign_S (w[ 3], w[ 4], offset); + w[49] = hc_bytealign_S (w[ 2], w[ 3], offset); + w[48] = hc_bytealign_S (w[ 1], w[ 2], offset); + w[47] = hc_bytealign_S (w[ 0], w[ 1], offset); + w[46] = hc_bytealign_S ( 0, w[ 0], offset); w[45] = 0; w[44] = 0; w[43] = 0; @@ -54282,23 +54282,23 @@ DECLSPEC void switch_buffer_by_offset_1x64_be_S (u32 *w, const u32 offset) break; case 47: - w[63] = amd_bytealign_S (w[15], w[16], offset); - w[62] = amd_bytealign_S (w[14], w[15], offset); - w[61] = amd_bytealign_S (w[13], w[14], offset); - w[60] = amd_bytealign_S (w[12], w[13], offset); - w[59] = amd_bytealign_S (w[11], w[12], offset); - w[58] = amd_bytealign_S (w[10], w[11], offset); - w[57] = amd_bytealign_S (w[ 9], w[10], offset); - w[56] = amd_bytealign_S (w[ 8], w[ 9], offset); - w[55] = amd_bytealign_S (w[ 7], w[ 8], offset); - w[54] = amd_bytealign_S (w[ 6], w[ 7], offset); - w[53] = amd_bytealign_S (w[ 5], w[ 6], offset); - w[52] = amd_bytealign_S (w[ 4], w[ 5], offset); - w[51] = amd_bytealign_S (w[ 3], w[ 4], offset); - w[50] = amd_bytealign_S (w[ 2], w[ 3], offset); - w[49] = amd_bytealign_S (w[ 1], w[ 2], offset); - w[48] = amd_bytealign_S (w[ 0], w[ 1], offset); - w[47] = amd_bytealign_S ( 0, w[ 0], offset); + w[63] = hc_bytealign_S (w[15], w[16], offset); + w[62] = hc_bytealign_S (w[14], w[15], offset); + w[61] = hc_bytealign_S (w[13], w[14], offset); + w[60] = hc_bytealign_S (w[12], w[13], offset); + w[59] = hc_bytealign_S (w[11], w[12], offset); + w[58] = hc_bytealign_S (w[10], w[11], offset); + w[57] = hc_bytealign_S (w[ 9], w[10], offset); + w[56] = hc_bytealign_S (w[ 8], w[ 9], offset); + w[55] = hc_bytealign_S (w[ 7], w[ 8], offset); + w[54] = hc_bytealign_S (w[ 6], w[ 7], offset); + w[53] = hc_bytealign_S (w[ 5], w[ 6], offset); + w[52] = hc_bytealign_S (w[ 4], w[ 5], offset); + w[51] = hc_bytealign_S (w[ 3], w[ 4], offset); + w[50] = hc_bytealign_S (w[ 2], w[ 3], offset); + w[49] = hc_bytealign_S (w[ 1], w[ 2], offset); + w[48] = hc_bytealign_S (w[ 0], w[ 1], offset); + w[47] = hc_bytealign_S ( 0, w[ 0], offset); w[46] = 0; w[45] = 0; w[44] = 0; @@ -54350,22 +54350,22 @@ DECLSPEC void switch_buffer_by_offset_1x64_be_S (u32 *w, const u32 offset) break; case 48: - w[63] = amd_bytealign_S (w[14], w[15], offset); - w[62] = amd_bytealign_S (w[13], w[14], offset); - w[61] = amd_bytealign_S (w[12], w[13], offset); - w[60] = amd_bytealign_S (w[11], w[12], offset); - w[59] = amd_bytealign_S (w[10], w[11], offset); - w[58] = amd_bytealign_S (w[ 9], w[10], offset); - w[57] = amd_bytealign_S (w[ 8], w[ 9], offset); - w[56] = amd_bytealign_S (w[ 7], w[ 8], offset); - w[55] = amd_bytealign_S (w[ 6], w[ 7], offset); - w[54] = amd_bytealign_S (w[ 5], w[ 6], offset); - w[53] = amd_bytealign_S (w[ 4], w[ 5], offset); - w[52] = amd_bytealign_S (w[ 3], w[ 4], offset); - w[51] = amd_bytealign_S (w[ 2], w[ 3], offset); - w[50] = amd_bytealign_S (w[ 1], w[ 2], offset); - w[49] = amd_bytealign_S (w[ 0], w[ 1], offset); - w[48] = amd_bytealign_S ( 0, w[ 0], offset); + w[63] = hc_bytealign_S (w[14], w[15], offset); + w[62] = hc_bytealign_S (w[13], w[14], offset); + w[61] = hc_bytealign_S (w[12], w[13], offset); + w[60] = hc_bytealign_S (w[11], w[12], offset); + w[59] = hc_bytealign_S (w[10], w[11], offset); + w[58] = hc_bytealign_S (w[ 9], w[10], offset); + w[57] = hc_bytealign_S (w[ 8], w[ 9], offset); + w[56] = hc_bytealign_S (w[ 7], w[ 8], offset); + w[55] = hc_bytealign_S (w[ 6], w[ 7], offset); + w[54] = hc_bytealign_S (w[ 5], w[ 6], offset); + w[53] = hc_bytealign_S (w[ 4], w[ 5], offset); + w[52] = hc_bytealign_S (w[ 3], w[ 4], offset); + w[51] = hc_bytealign_S (w[ 2], w[ 3], offset); + w[50] = hc_bytealign_S (w[ 1], w[ 2], offset); + w[49] = hc_bytealign_S (w[ 0], w[ 1], offset); + w[48] = hc_bytealign_S ( 0, w[ 0], offset); w[47] = 0; w[46] = 0; w[45] = 0; @@ -54418,21 +54418,21 @@ DECLSPEC void switch_buffer_by_offset_1x64_be_S (u32 *w, const u32 offset) break; case 49: - w[63] = amd_bytealign_S (w[13], w[14], offset); - w[62] = amd_bytealign_S (w[12], w[13], offset); - w[61] = amd_bytealign_S (w[11], w[12], offset); - w[60] = amd_bytealign_S (w[10], w[11], offset); - w[59] = amd_bytealign_S (w[ 9], w[10], offset); - w[58] = amd_bytealign_S (w[ 8], w[ 9], offset); - w[57] = amd_bytealign_S (w[ 7], w[ 8], offset); - w[56] = amd_bytealign_S (w[ 6], w[ 7], offset); - w[55] = amd_bytealign_S (w[ 5], w[ 6], offset); - w[54] = amd_bytealign_S (w[ 4], w[ 5], offset); - w[53] = amd_bytealign_S (w[ 3], w[ 4], offset); - w[52] = amd_bytealign_S (w[ 2], w[ 3], offset); - w[51] = amd_bytealign_S (w[ 1], w[ 2], offset); - w[50] = amd_bytealign_S (w[ 0], w[ 1], offset); - w[49] = amd_bytealign_S ( 0, w[ 0], offset); + w[63] = hc_bytealign_S (w[13], w[14], offset); + w[62] = hc_bytealign_S (w[12], w[13], offset); + w[61] = hc_bytealign_S (w[11], w[12], offset); + w[60] = hc_bytealign_S (w[10], w[11], offset); + w[59] = hc_bytealign_S (w[ 9], w[10], offset); + w[58] = hc_bytealign_S (w[ 8], w[ 9], offset); + w[57] = hc_bytealign_S (w[ 7], w[ 8], offset); + w[56] = hc_bytealign_S (w[ 6], w[ 7], offset); + w[55] = hc_bytealign_S (w[ 5], w[ 6], offset); + w[54] = hc_bytealign_S (w[ 4], w[ 5], offset); + w[53] = hc_bytealign_S (w[ 3], w[ 4], offset); + w[52] = hc_bytealign_S (w[ 2], w[ 3], offset); + w[51] = hc_bytealign_S (w[ 1], w[ 2], offset); + w[50] = hc_bytealign_S (w[ 0], w[ 1], offset); + w[49] = hc_bytealign_S ( 0, w[ 0], offset); w[48] = 0; w[47] = 0; w[46] = 0; @@ -54486,20 +54486,20 @@ DECLSPEC void switch_buffer_by_offset_1x64_be_S (u32 *w, const u32 offset) break; case 50: - w[63] = amd_bytealign_S (w[12], w[13], offset); - w[62] = amd_bytealign_S (w[11], w[12], offset); - w[61] = amd_bytealign_S (w[10], w[11], offset); - w[60] = amd_bytealign_S (w[ 9], w[10], offset); - w[59] = amd_bytealign_S (w[ 8], w[ 9], offset); - w[58] = amd_bytealign_S (w[ 7], w[ 8], offset); - w[57] = amd_bytealign_S (w[ 6], w[ 7], offset); - w[56] = amd_bytealign_S (w[ 5], w[ 6], offset); - w[55] = amd_bytealign_S (w[ 4], w[ 5], offset); - w[54] = amd_bytealign_S (w[ 3], w[ 4], offset); - w[53] = amd_bytealign_S (w[ 2], w[ 3], offset); - w[52] = amd_bytealign_S (w[ 1], w[ 2], offset); - w[51] = amd_bytealign_S (w[ 0], w[ 1], offset); - w[50] = amd_bytealign_S ( 0, w[ 0], offset); + w[63] = hc_bytealign_S (w[12], w[13], offset); + w[62] = hc_bytealign_S (w[11], w[12], offset); + w[61] = hc_bytealign_S (w[10], w[11], offset); + w[60] = hc_bytealign_S (w[ 9], w[10], offset); + w[59] = hc_bytealign_S (w[ 8], w[ 9], offset); + w[58] = hc_bytealign_S (w[ 7], w[ 8], offset); + w[57] = hc_bytealign_S (w[ 6], w[ 7], offset); + w[56] = hc_bytealign_S (w[ 5], w[ 6], offset); + w[55] = hc_bytealign_S (w[ 4], w[ 5], offset); + w[54] = hc_bytealign_S (w[ 3], w[ 4], offset); + w[53] = hc_bytealign_S (w[ 2], w[ 3], offset); + w[52] = hc_bytealign_S (w[ 1], w[ 2], offset); + w[51] = hc_bytealign_S (w[ 0], w[ 1], offset); + w[50] = hc_bytealign_S ( 0, w[ 0], offset); w[49] = 0; w[48] = 0; w[47] = 0; @@ -54554,19 +54554,19 @@ DECLSPEC void switch_buffer_by_offset_1x64_be_S (u32 *w, const u32 offset) break; case 51: - w[63] = amd_bytealign_S (w[11], w[12], offset); - w[62] = amd_bytealign_S (w[10], w[11], offset); - w[61] = amd_bytealign_S (w[ 9], w[10], offset); - w[60] = amd_bytealign_S (w[ 8], w[ 9], offset); - w[59] = amd_bytealign_S (w[ 7], w[ 8], offset); - w[58] = amd_bytealign_S (w[ 6], w[ 7], offset); - w[57] = amd_bytealign_S (w[ 5], w[ 6], offset); - w[56] = amd_bytealign_S (w[ 4], w[ 5], offset); - w[55] = amd_bytealign_S (w[ 3], w[ 4], offset); - w[54] = amd_bytealign_S (w[ 2], w[ 3], offset); - w[53] = amd_bytealign_S (w[ 1], w[ 2], offset); - w[52] = amd_bytealign_S (w[ 0], w[ 1], offset); - w[51] = amd_bytealign_S ( 0, w[ 0], offset); + w[63] = hc_bytealign_S (w[11], w[12], offset); + w[62] = hc_bytealign_S (w[10], w[11], offset); + w[61] = hc_bytealign_S (w[ 9], w[10], offset); + w[60] = hc_bytealign_S (w[ 8], w[ 9], offset); + w[59] = hc_bytealign_S (w[ 7], w[ 8], offset); + w[58] = hc_bytealign_S (w[ 6], w[ 7], offset); + w[57] = hc_bytealign_S (w[ 5], w[ 6], offset); + w[56] = hc_bytealign_S (w[ 4], w[ 5], offset); + w[55] = hc_bytealign_S (w[ 3], w[ 4], offset); + w[54] = hc_bytealign_S (w[ 2], w[ 3], offset); + w[53] = hc_bytealign_S (w[ 1], w[ 2], offset); + w[52] = hc_bytealign_S (w[ 0], w[ 1], offset); + w[51] = hc_bytealign_S ( 0, w[ 0], offset); w[50] = 0; w[49] = 0; w[48] = 0; @@ -54622,18 +54622,18 @@ DECLSPEC void switch_buffer_by_offset_1x64_be_S (u32 *w, const u32 offset) break; case 52: - w[63] = amd_bytealign_S (w[10], w[11], offset); - w[62] = amd_bytealign_S (w[ 9], w[10], offset); - w[61] = amd_bytealign_S (w[ 8], w[ 9], offset); - w[60] = amd_bytealign_S (w[ 7], w[ 8], offset); - w[59] = amd_bytealign_S (w[ 6], w[ 7], offset); - w[58] = amd_bytealign_S (w[ 5], w[ 6], offset); - w[57] = amd_bytealign_S (w[ 4], w[ 5], offset); - w[56] = amd_bytealign_S (w[ 3], w[ 4], offset); - w[55] = amd_bytealign_S (w[ 2], w[ 3], offset); - w[54] = amd_bytealign_S (w[ 1], w[ 2], offset); - w[53] = amd_bytealign_S (w[ 0], w[ 1], offset); - w[52] = amd_bytealign_S ( 0, w[ 0], offset); + w[63] = hc_bytealign_S (w[10], w[11], offset); + w[62] = hc_bytealign_S (w[ 9], w[10], offset); + w[61] = hc_bytealign_S (w[ 8], w[ 9], offset); + w[60] = hc_bytealign_S (w[ 7], w[ 8], offset); + w[59] = hc_bytealign_S (w[ 6], w[ 7], offset); + w[58] = hc_bytealign_S (w[ 5], w[ 6], offset); + w[57] = hc_bytealign_S (w[ 4], w[ 5], offset); + w[56] = hc_bytealign_S (w[ 3], w[ 4], offset); + w[55] = hc_bytealign_S (w[ 2], w[ 3], offset); + w[54] = hc_bytealign_S (w[ 1], w[ 2], offset); + w[53] = hc_bytealign_S (w[ 0], w[ 1], offset); + w[52] = hc_bytealign_S ( 0, w[ 0], offset); w[51] = 0; w[50] = 0; w[49] = 0; @@ -54690,17 +54690,17 @@ DECLSPEC void switch_buffer_by_offset_1x64_be_S (u32 *w, const u32 offset) break; case 53: - w[63] = amd_bytealign_S (w[ 9], w[10], offset); - w[62] = amd_bytealign_S (w[ 8], w[ 9], offset); - w[61] = amd_bytealign_S (w[ 7], w[ 8], offset); - w[60] = amd_bytealign_S (w[ 6], w[ 7], offset); - w[59] = amd_bytealign_S (w[ 5], w[ 6], offset); - w[58] = amd_bytealign_S (w[ 4], w[ 5], offset); - w[57] = amd_bytealign_S (w[ 3], w[ 4], offset); - w[56] = amd_bytealign_S (w[ 2], w[ 3], offset); - w[55] = amd_bytealign_S (w[ 1], w[ 2], offset); - w[54] = amd_bytealign_S (w[ 0], w[ 1], offset); - w[53] = amd_bytealign_S ( 0, w[ 0], offset); + w[63] = hc_bytealign_S (w[ 9], w[10], offset); + w[62] = hc_bytealign_S (w[ 8], w[ 9], offset); + w[61] = hc_bytealign_S (w[ 7], w[ 8], offset); + w[60] = hc_bytealign_S (w[ 6], w[ 7], offset); + w[59] = hc_bytealign_S (w[ 5], w[ 6], offset); + w[58] = hc_bytealign_S (w[ 4], w[ 5], offset); + w[57] = hc_bytealign_S (w[ 3], w[ 4], offset); + w[56] = hc_bytealign_S (w[ 2], w[ 3], offset); + w[55] = hc_bytealign_S (w[ 1], w[ 2], offset); + w[54] = hc_bytealign_S (w[ 0], w[ 1], offset); + w[53] = hc_bytealign_S ( 0, w[ 0], offset); w[52] = 0; w[51] = 0; w[50] = 0; @@ -54758,16 +54758,16 @@ DECLSPEC void switch_buffer_by_offset_1x64_be_S (u32 *w, const u32 offset) break; case 54: - w[63] = amd_bytealign_S (w[ 8], w[ 9], offset); - w[62] = amd_bytealign_S (w[ 7], w[ 8], offset); - w[61] = amd_bytealign_S (w[ 6], w[ 7], offset); - w[60] = amd_bytealign_S (w[ 5], w[ 6], offset); - w[59] = amd_bytealign_S (w[ 4], w[ 5], offset); - w[58] = amd_bytealign_S (w[ 3], w[ 4], offset); - w[57] = amd_bytealign_S (w[ 2], w[ 3], offset); - w[56] = amd_bytealign_S (w[ 1], w[ 2], offset); - w[55] = amd_bytealign_S (w[ 0], w[ 1], offset); - w[54] = amd_bytealign_S ( 0, w[ 0], offset); + w[63] = hc_bytealign_S (w[ 8], w[ 9], offset); + w[62] = hc_bytealign_S (w[ 7], w[ 8], offset); + w[61] = hc_bytealign_S (w[ 6], w[ 7], offset); + w[60] = hc_bytealign_S (w[ 5], w[ 6], offset); + w[59] = hc_bytealign_S (w[ 4], w[ 5], offset); + w[58] = hc_bytealign_S (w[ 3], w[ 4], offset); + w[57] = hc_bytealign_S (w[ 2], w[ 3], offset); + w[56] = hc_bytealign_S (w[ 1], w[ 2], offset); + w[55] = hc_bytealign_S (w[ 0], w[ 1], offset); + w[54] = hc_bytealign_S ( 0, w[ 0], offset); w[53] = 0; w[52] = 0; w[51] = 0; @@ -54826,15 +54826,15 @@ DECLSPEC void switch_buffer_by_offset_1x64_be_S (u32 *w, const u32 offset) break; case 55: - w[63] = amd_bytealign_S (w[ 7], w[ 8], offset); - w[62] = amd_bytealign_S (w[ 6], w[ 7], offset); - w[61] = amd_bytealign_S (w[ 5], w[ 6], offset); - w[60] = amd_bytealign_S (w[ 4], w[ 5], offset); - w[59] = amd_bytealign_S (w[ 3], w[ 4], offset); - w[58] = amd_bytealign_S (w[ 2], w[ 3], offset); - w[57] = amd_bytealign_S (w[ 1], w[ 2], offset); - w[56] = amd_bytealign_S (w[ 0], w[ 1], offset); - w[55] = amd_bytealign_S ( 0, w[ 0], offset); + w[63] = hc_bytealign_S (w[ 7], w[ 8], offset); + w[62] = hc_bytealign_S (w[ 6], w[ 7], offset); + w[61] = hc_bytealign_S (w[ 5], w[ 6], offset); + w[60] = hc_bytealign_S (w[ 4], w[ 5], offset); + w[59] = hc_bytealign_S (w[ 3], w[ 4], offset); + w[58] = hc_bytealign_S (w[ 2], w[ 3], offset); + w[57] = hc_bytealign_S (w[ 1], w[ 2], offset); + w[56] = hc_bytealign_S (w[ 0], w[ 1], offset); + w[55] = hc_bytealign_S ( 0, w[ 0], offset); w[54] = 0; w[53] = 0; w[52] = 0; @@ -54894,14 +54894,14 @@ DECLSPEC void switch_buffer_by_offset_1x64_be_S (u32 *w, const u32 offset) break; case 56: - w[63] = amd_bytealign_S (w[ 6], w[ 7], offset); - w[62] = amd_bytealign_S (w[ 5], w[ 6], offset); - w[61] = amd_bytealign_S (w[ 4], w[ 5], offset); - w[60] = amd_bytealign_S (w[ 3], w[ 4], offset); - w[59] = amd_bytealign_S (w[ 2], w[ 3], offset); - w[58] = amd_bytealign_S (w[ 1], w[ 2], offset); - w[57] = amd_bytealign_S (w[ 0], w[ 1], offset); - w[56] = amd_bytealign_S ( 0, w[ 0], offset); + w[63] = hc_bytealign_S (w[ 6], w[ 7], offset); + w[62] = hc_bytealign_S (w[ 5], w[ 6], offset); + w[61] = hc_bytealign_S (w[ 4], w[ 5], offset); + w[60] = hc_bytealign_S (w[ 3], w[ 4], offset); + w[59] = hc_bytealign_S (w[ 2], w[ 3], offset); + w[58] = hc_bytealign_S (w[ 1], w[ 2], offset); + w[57] = hc_bytealign_S (w[ 0], w[ 1], offset); + w[56] = hc_bytealign_S ( 0, w[ 0], offset); w[55] = 0; w[54] = 0; w[53] = 0; @@ -54962,13 +54962,13 @@ DECLSPEC void switch_buffer_by_offset_1x64_be_S (u32 *w, const u32 offset) break; case 57: - w[63] = amd_bytealign_S (w[ 5], w[ 6], offset); - w[62] = amd_bytealign_S (w[ 4], w[ 5], offset); - w[61] = amd_bytealign_S (w[ 3], w[ 4], offset); - w[60] = amd_bytealign_S (w[ 2], w[ 3], offset); - w[59] = amd_bytealign_S (w[ 1], w[ 2], offset); - w[58] = amd_bytealign_S (w[ 0], w[ 1], offset); - w[57] = amd_bytealign_S ( 0, w[ 0], offset); + w[63] = hc_bytealign_S (w[ 5], w[ 6], offset); + w[62] = hc_bytealign_S (w[ 4], w[ 5], offset); + w[61] = hc_bytealign_S (w[ 3], w[ 4], offset); + w[60] = hc_bytealign_S (w[ 2], w[ 3], offset); + w[59] = hc_bytealign_S (w[ 1], w[ 2], offset); + w[58] = hc_bytealign_S (w[ 0], w[ 1], offset); + w[57] = hc_bytealign_S ( 0, w[ 0], offset); w[56] = 0; w[55] = 0; w[54] = 0; @@ -55030,12 +55030,12 @@ DECLSPEC void switch_buffer_by_offset_1x64_be_S (u32 *w, const u32 offset) break; case 58: - w[63] = amd_bytealign_S (w[ 4], w[ 5], offset); - w[62] = amd_bytealign_S (w[ 3], w[ 4], offset); - w[61] = amd_bytealign_S (w[ 2], w[ 3], offset); - w[60] = amd_bytealign_S (w[ 1], w[ 2], offset); - w[59] = amd_bytealign_S (w[ 0], w[ 1], offset); - w[58] = amd_bytealign_S ( 0, w[ 0], offset); + w[63] = hc_bytealign_S (w[ 4], w[ 5], offset); + w[62] = hc_bytealign_S (w[ 3], w[ 4], offset); + w[61] = hc_bytealign_S (w[ 2], w[ 3], offset); + w[60] = hc_bytealign_S (w[ 1], w[ 2], offset); + w[59] = hc_bytealign_S (w[ 0], w[ 1], offset); + w[58] = hc_bytealign_S ( 0, w[ 0], offset); w[57] = 0; w[56] = 0; w[55] = 0; @@ -55098,11 +55098,11 @@ DECLSPEC void switch_buffer_by_offset_1x64_be_S (u32 *w, const u32 offset) break; case 59: - w[63] = amd_bytealign_S (w[ 3], w[ 4], offset); - w[62] = amd_bytealign_S (w[ 2], w[ 3], offset); - w[61] = amd_bytealign_S (w[ 1], w[ 2], offset); - w[60] = amd_bytealign_S (w[ 0], w[ 1], offset); - w[59] = amd_bytealign_S ( 0, w[ 0], offset); + w[63] = hc_bytealign_S (w[ 3], w[ 4], offset); + w[62] = hc_bytealign_S (w[ 2], w[ 3], offset); + w[61] = hc_bytealign_S (w[ 1], w[ 2], offset); + w[60] = hc_bytealign_S (w[ 0], w[ 1], offset); + w[59] = hc_bytealign_S ( 0, w[ 0], offset); w[58] = 0; w[57] = 0; w[56] = 0; @@ -55166,10 +55166,10 @@ DECLSPEC void switch_buffer_by_offset_1x64_be_S (u32 *w, const u32 offset) break; case 60: - w[63] = amd_bytealign_S (w[ 2], w[ 3], offset); - w[62] = amd_bytealign_S (w[ 1], w[ 2], offset); - w[61] = amd_bytealign_S (w[ 0], w[ 1], offset); - w[60] = amd_bytealign_S ( 0, w[ 0], offset); + w[63] = hc_bytealign_S (w[ 2], w[ 3], offset); + w[62] = hc_bytealign_S (w[ 1], w[ 2], offset); + w[61] = hc_bytealign_S (w[ 0], w[ 1], offset); + w[60] = hc_bytealign_S ( 0, w[ 0], offset); w[59] = 0; w[58] = 0; w[57] = 0; @@ -55234,9 +55234,9 @@ DECLSPEC void switch_buffer_by_offset_1x64_be_S (u32 *w, const u32 offset) break; case 61: - w[63] = amd_bytealign_S (w[ 1], w[ 2], offset); - w[62] = amd_bytealign_S (w[ 0], w[ 1], offset); - w[61] = amd_bytealign_S ( 0, w[ 0], offset); + w[63] = hc_bytealign_S (w[ 1], w[ 2], offset); + w[62] = hc_bytealign_S (w[ 0], w[ 1], offset); + w[61] = hc_bytealign_S ( 0, w[ 0], offset); w[60] = 0; w[59] = 0; w[58] = 0; @@ -55302,8 +55302,8 @@ DECLSPEC void switch_buffer_by_offset_1x64_be_S (u32 *w, const u32 offset) break; case 62: - w[63] = amd_bytealign_S (w[ 0], w[ 1], offset); - w[62] = amd_bytealign_S ( 0, w[ 0], offset); + w[63] = hc_bytealign_S (w[ 0], w[ 1], offset); + w[62] = hc_bytealign_S ( 0, w[ 0], offset); w[61] = 0; w[60] = 0; w[59] = 0; @@ -55370,7 +55370,7 @@ DECLSPEC void switch_buffer_by_offset_1x64_be_S (u32 *w, const u32 offset) break; case 63: - w[63] = amd_bytealign_S ( 0, w[ 0], offset); + w[63] = hc_bytealign_S ( 0, w[ 0], offset); w[62] = 0; w[61] = 0; w[60] = 0; @@ -55452,271 +55452,271 @@ DECLSPEC void switch_buffer_by_offset_1x64_be_S (u32 *w, const u32 offset) switch (offset_switch) { case 0: - w[63] = __byte_perm_S (w[63], w[62], selector); - w[62] = __byte_perm_S (w[62], w[61], selector); - w[61] = __byte_perm_S (w[61], w[60], selector); - w[60] = __byte_perm_S (w[60], w[59], selector); - w[59] = __byte_perm_S (w[59], w[58], selector); - w[58] = __byte_perm_S (w[58], w[57], selector); - w[57] = __byte_perm_S (w[57], w[56], selector); - w[56] = __byte_perm_S (w[56], w[55], selector); - w[55] = __byte_perm_S (w[55], w[54], selector); - w[54] = __byte_perm_S (w[54], w[53], selector); - w[53] = __byte_perm_S (w[53], w[52], selector); - w[52] = __byte_perm_S (w[52], w[51], selector); - w[51] = __byte_perm_S (w[51], w[50], selector); - w[50] = __byte_perm_S (w[50], w[49], selector); - w[49] = __byte_perm_S (w[49], w[48], selector); - w[48] = __byte_perm_S (w[48], w[47], selector); - w[47] = __byte_perm_S (w[47], w[46], selector); - w[46] = __byte_perm_S (w[46], w[45], selector); - w[45] = __byte_perm_S (w[45], w[44], selector); - w[44] = __byte_perm_S (w[44], w[43], selector); - w[43] = __byte_perm_S (w[43], w[42], selector); - w[42] = __byte_perm_S (w[42], w[41], selector); - w[41] = __byte_perm_S (w[41], w[40], selector); - w[40] = __byte_perm_S (w[40], w[39], selector); - w[39] = __byte_perm_S (w[39], w[38], selector); - w[38] = __byte_perm_S (w[38], w[37], selector); - w[37] = __byte_perm_S (w[37], w[36], selector); - w[36] = __byte_perm_S (w[36], w[35], selector); - w[35] = __byte_perm_S (w[35], w[34], selector); - w[34] = __byte_perm_S (w[34], w[33], selector); - w[33] = __byte_perm_S (w[33], w[32], selector); - w[32] = __byte_perm_S (w[32], w[31], selector); - w[31] = __byte_perm_S (w[31], w[30], selector); - w[30] = __byte_perm_S (w[30], w[29], selector); - w[29] = __byte_perm_S (w[29], w[28], selector); - w[28] = __byte_perm_S (w[28], w[27], selector); - w[27] = __byte_perm_S (w[27], w[26], selector); - w[26] = __byte_perm_S (w[26], w[25], selector); - w[25] = __byte_perm_S (w[25], w[24], selector); - w[24] = __byte_perm_S (w[24], w[23], selector); - w[23] = __byte_perm_S (w[23], w[22], selector); - w[22] = __byte_perm_S (w[22], w[21], selector); - w[21] = __byte_perm_S (w[21], w[20], selector); - w[20] = __byte_perm_S (w[20], w[19], selector); - w[19] = __byte_perm_S (w[19], w[18], selector); - w[18] = __byte_perm_S (w[18], w[17], selector); - w[17] = __byte_perm_S (w[17], w[16], selector); - w[16] = __byte_perm_S (w[16], w[15], selector); - w[15] = __byte_perm_S (w[15], w[14], selector); - w[14] = __byte_perm_S (w[14], w[13], selector); - w[13] = __byte_perm_S (w[13], w[12], selector); - w[12] = __byte_perm_S (w[12], w[11], selector); - w[11] = __byte_perm_S (w[11], w[10], selector); - w[10] = __byte_perm_S (w[10], w[ 9], selector); - w[ 9] = __byte_perm_S (w[ 9], w[ 8], selector); - w[ 8] = __byte_perm_S (w[ 8], w[ 7], selector); - w[ 7] = __byte_perm_S (w[ 7], w[ 6], selector); - w[ 6] = __byte_perm_S (w[ 6], w[ 5], selector); - w[ 5] = __byte_perm_S (w[ 5], w[ 4], selector); - w[ 4] = __byte_perm_S (w[ 4], w[ 3], selector); - w[ 3] = __byte_perm_S (w[ 3], w[ 2], selector); - w[ 2] = __byte_perm_S (w[ 2], w[ 1], selector); - w[ 1] = __byte_perm_S (w[ 1], w[ 0], selector); - w[ 0] = __byte_perm_S (w[ 0], 0, selector); + w[63] = hc_byte_perm_S (w[63], w[62], selector); + w[62] = hc_byte_perm_S (w[62], w[61], selector); + w[61] = hc_byte_perm_S (w[61], w[60], selector); + w[60] = hc_byte_perm_S (w[60], w[59], selector); + w[59] = hc_byte_perm_S (w[59], w[58], selector); + w[58] = hc_byte_perm_S (w[58], w[57], selector); + w[57] = hc_byte_perm_S (w[57], w[56], selector); + w[56] = hc_byte_perm_S (w[56], w[55], selector); + w[55] = hc_byte_perm_S (w[55], w[54], selector); + w[54] = hc_byte_perm_S (w[54], w[53], selector); + w[53] = hc_byte_perm_S (w[53], w[52], selector); + w[52] = hc_byte_perm_S (w[52], w[51], selector); + w[51] = hc_byte_perm_S (w[51], w[50], selector); + w[50] = hc_byte_perm_S (w[50], w[49], selector); + w[49] = hc_byte_perm_S (w[49], w[48], selector); + w[48] = hc_byte_perm_S (w[48], w[47], selector); + w[47] = hc_byte_perm_S (w[47], w[46], selector); + w[46] = hc_byte_perm_S (w[46], w[45], selector); + w[45] = hc_byte_perm_S (w[45], w[44], selector); + w[44] = hc_byte_perm_S (w[44], w[43], selector); + w[43] = hc_byte_perm_S (w[43], w[42], selector); + w[42] = hc_byte_perm_S (w[42], w[41], selector); + w[41] = hc_byte_perm_S (w[41], w[40], selector); + w[40] = hc_byte_perm_S (w[40], w[39], selector); + w[39] = hc_byte_perm_S (w[39], w[38], selector); + w[38] = hc_byte_perm_S (w[38], w[37], selector); + w[37] = hc_byte_perm_S (w[37], w[36], selector); + w[36] = hc_byte_perm_S (w[36], w[35], selector); + w[35] = hc_byte_perm_S (w[35], w[34], selector); + w[34] = hc_byte_perm_S (w[34], w[33], selector); + w[33] = hc_byte_perm_S (w[33], w[32], selector); + w[32] = hc_byte_perm_S (w[32], w[31], selector); + w[31] = hc_byte_perm_S (w[31], w[30], selector); + w[30] = hc_byte_perm_S (w[30], w[29], selector); + w[29] = hc_byte_perm_S (w[29], w[28], selector); + w[28] = hc_byte_perm_S (w[28], w[27], selector); + w[27] = hc_byte_perm_S (w[27], w[26], selector); + w[26] = hc_byte_perm_S (w[26], w[25], selector); + w[25] = hc_byte_perm_S (w[25], w[24], selector); + w[24] = hc_byte_perm_S (w[24], w[23], selector); + w[23] = hc_byte_perm_S (w[23], w[22], selector); + w[22] = hc_byte_perm_S (w[22], w[21], selector); + w[21] = hc_byte_perm_S (w[21], w[20], selector); + w[20] = hc_byte_perm_S (w[20], w[19], selector); + w[19] = hc_byte_perm_S (w[19], w[18], selector); + w[18] = hc_byte_perm_S (w[18], w[17], selector); + w[17] = hc_byte_perm_S (w[17], w[16], selector); + w[16] = hc_byte_perm_S (w[16], w[15], selector); + w[15] = hc_byte_perm_S (w[15], w[14], selector); + w[14] = hc_byte_perm_S (w[14], w[13], selector); + w[13] = hc_byte_perm_S (w[13], w[12], selector); + w[12] = hc_byte_perm_S (w[12], w[11], selector); + w[11] = hc_byte_perm_S (w[11], w[10], selector); + w[10] = hc_byte_perm_S (w[10], w[ 9], selector); + w[ 9] = hc_byte_perm_S (w[ 9], w[ 8], selector); + w[ 8] = hc_byte_perm_S (w[ 8], w[ 7], selector); + w[ 7] = hc_byte_perm_S (w[ 7], w[ 6], selector); + w[ 6] = hc_byte_perm_S (w[ 6], w[ 5], selector); + w[ 5] = hc_byte_perm_S (w[ 5], w[ 4], selector); + w[ 4] = hc_byte_perm_S (w[ 4], w[ 3], selector); + w[ 3] = hc_byte_perm_S (w[ 3], w[ 2], selector); + w[ 2] = hc_byte_perm_S (w[ 2], w[ 1], selector); + w[ 1] = hc_byte_perm_S (w[ 1], w[ 0], selector); + w[ 0] = hc_byte_perm_S (w[ 0], 0, selector); break; case 1: - w[63] = __byte_perm_S (w[62], w[61], selector); - w[62] = __byte_perm_S (w[61], w[60], selector); - w[61] = __byte_perm_S (w[60], w[59], selector); - w[60] = __byte_perm_S (w[59], w[58], selector); - w[59] = __byte_perm_S (w[58], w[57], selector); - w[58] = __byte_perm_S (w[57], w[56], selector); - w[57] = __byte_perm_S (w[56], w[55], selector); - w[56] = __byte_perm_S (w[55], w[54], selector); - w[55] = __byte_perm_S (w[54], w[53], selector); - w[54] = __byte_perm_S (w[53], w[52], selector); - w[53] = __byte_perm_S (w[52], w[51], selector); - w[52] = __byte_perm_S (w[51], w[50], selector); - w[51] = __byte_perm_S (w[50], w[49], selector); - w[50] = __byte_perm_S (w[49], w[48], selector); - w[49] = __byte_perm_S (w[48], w[47], selector); - w[48] = __byte_perm_S (w[47], w[46], selector); - w[47] = __byte_perm_S (w[46], w[45], selector); - w[46] = __byte_perm_S (w[45], w[44], selector); - w[45] = __byte_perm_S (w[44], w[43], selector); - w[44] = __byte_perm_S (w[43], w[42], selector); - w[43] = __byte_perm_S (w[42], w[41], selector); - w[42] = __byte_perm_S (w[41], w[40], selector); - w[41] = __byte_perm_S (w[40], w[39], selector); - w[40] = __byte_perm_S (w[39], w[38], selector); - w[39] = __byte_perm_S (w[38], w[37], selector); - w[38] = __byte_perm_S (w[37], w[36], selector); - w[37] = __byte_perm_S (w[36], w[35], selector); - w[36] = __byte_perm_S (w[35], w[34], selector); - w[35] = __byte_perm_S (w[34], w[33], selector); - w[34] = __byte_perm_S (w[33], w[32], selector); - w[33] = __byte_perm_S (w[32], w[31], selector); - w[32] = __byte_perm_S (w[31], w[30], selector); - w[31] = __byte_perm_S (w[30], w[29], selector); - w[30] = __byte_perm_S (w[29], w[28], selector); - w[29] = __byte_perm_S (w[28], w[27], selector); - w[28] = __byte_perm_S (w[27], w[26], selector); - w[27] = __byte_perm_S (w[26], w[25], selector); - w[26] = __byte_perm_S (w[25], w[24], selector); - w[25] = __byte_perm_S (w[24], w[23], selector); - w[24] = __byte_perm_S (w[23], w[22], selector); - w[23] = __byte_perm_S (w[22], w[21], selector); - w[22] = __byte_perm_S (w[21], w[20], selector); - w[21] = __byte_perm_S (w[20], w[19], selector); - w[20] = __byte_perm_S (w[19], w[18], selector); - w[19] = __byte_perm_S (w[18], w[17], selector); - w[18] = __byte_perm_S (w[17], w[16], selector); - w[17] = __byte_perm_S (w[16], w[15], selector); - w[16] = __byte_perm_S (w[15], w[14], selector); - w[15] = __byte_perm_S (w[14], w[13], selector); - w[14] = __byte_perm_S (w[13], w[12], selector); - w[13] = __byte_perm_S (w[12], w[11], selector); - w[12] = __byte_perm_S (w[11], w[10], selector); - w[11] = __byte_perm_S (w[10], w[ 9], selector); - w[10] = __byte_perm_S (w[ 9], w[ 8], selector); - w[ 9] = __byte_perm_S (w[ 8], w[ 7], selector); - w[ 8] = __byte_perm_S (w[ 7], w[ 6], selector); - w[ 7] = __byte_perm_S (w[ 6], w[ 5], selector); - w[ 6] = __byte_perm_S (w[ 5], w[ 4], selector); - w[ 5] = __byte_perm_S (w[ 4], w[ 3], selector); - w[ 4] = __byte_perm_S (w[ 3], w[ 2], selector); - w[ 3] = __byte_perm_S (w[ 2], w[ 1], selector); - w[ 2] = __byte_perm_S (w[ 1], w[ 0], selector); - w[ 1] = __byte_perm_S (w[ 0], 0, selector); + w[63] = hc_byte_perm_S (w[62], w[61], selector); + w[62] = hc_byte_perm_S (w[61], w[60], selector); + w[61] = hc_byte_perm_S (w[60], w[59], selector); + w[60] = hc_byte_perm_S (w[59], w[58], selector); + w[59] = hc_byte_perm_S (w[58], w[57], selector); + w[58] = hc_byte_perm_S (w[57], w[56], selector); + w[57] = hc_byte_perm_S (w[56], w[55], selector); + w[56] = hc_byte_perm_S (w[55], w[54], selector); + w[55] = hc_byte_perm_S (w[54], w[53], selector); + w[54] = hc_byte_perm_S (w[53], w[52], selector); + w[53] = hc_byte_perm_S (w[52], w[51], selector); + w[52] = hc_byte_perm_S (w[51], w[50], selector); + w[51] = hc_byte_perm_S (w[50], w[49], selector); + w[50] = hc_byte_perm_S (w[49], w[48], selector); + w[49] = hc_byte_perm_S (w[48], w[47], selector); + w[48] = hc_byte_perm_S (w[47], w[46], selector); + w[47] = hc_byte_perm_S (w[46], w[45], selector); + w[46] = hc_byte_perm_S (w[45], w[44], selector); + w[45] = hc_byte_perm_S (w[44], w[43], selector); + w[44] = hc_byte_perm_S (w[43], w[42], selector); + w[43] = hc_byte_perm_S (w[42], w[41], selector); + w[42] = hc_byte_perm_S (w[41], w[40], selector); + w[41] = hc_byte_perm_S (w[40], w[39], selector); + w[40] = hc_byte_perm_S (w[39], w[38], selector); + w[39] = hc_byte_perm_S (w[38], w[37], selector); + w[38] = hc_byte_perm_S (w[37], w[36], selector); + w[37] = hc_byte_perm_S (w[36], w[35], selector); + w[36] = hc_byte_perm_S (w[35], w[34], selector); + w[35] = hc_byte_perm_S (w[34], w[33], selector); + w[34] = hc_byte_perm_S (w[33], w[32], selector); + w[33] = hc_byte_perm_S (w[32], w[31], selector); + w[32] = hc_byte_perm_S (w[31], w[30], selector); + w[31] = hc_byte_perm_S (w[30], w[29], selector); + w[30] = hc_byte_perm_S (w[29], w[28], selector); + w[29] = hc_byte_perm_S (w[28], w[27], selector); + w[28] = hc_byte_perm_S (w[27], w[26], selector); + w[27] = hc_byte_perm_S (w[26], w[25], selector); + w[26] = hc_byte_perm_S (w[25], w[24], selector); + w[25] = hc_byte_perm_S (w[24], w[23], selector); + w[24] = hc_byte_perm_S (w[23], w[22], selector); + w[23] = hc_byte_perm_S (w[22], w[21], selector); + w[22] = hc_byte_perm_S (w[21], w[20], selector); + w[21] = hc_byte_perm_S (w[20], w[19], selector); + w[20] = hc_byte_perm_S (w[19], w[18], selector); + w[19] = hc_byte_perm_S (w[18], w[17], selector); + w[18] = hc_byte_perm_S (w[17], w[16], selector); + w[17] = hc_byte_perm_S (w[16], w[15], selector); + w[16] = hc_byte_perm_S (w[15], w[14], selector); + w[15] = hc_byte_perm_S (w[14], w[13], selector); + w[14] = hc_byte_perm_S (w[13], w[12], selector); + w[13] = hc_byte_perm_S (w[12], w[11], selector); + w[12] = hc_byte_perm_S (w[11], w[10], selector); + w[11] = hc_byte_perm_S (w[10], w[ 9], selector); + w[10] = hc_byte_perm_S (w[ 9], w[ 8], selector); + w[ 9] = hc_byte_perm_S (w[ 8], w[ 7], selector); + w[ 8] = hc_byte_perm_S (w[ 7], w[ 6], selector); + w[ 7] = hc_byte_perm_S (w[ 6], w[ 5], selector); + w[ 6] = hc_byte_perm_S (w[ 5], w[ 4], selector); + w[ 5] = hc_byte_perm_S (w[ 4], w[ 3], selector); + w[ 4] = hc_byte_perm_S (w[ 3], w[ 2], selector); + w[ 3] = hc_byte_perm_S (w[ 2], w[ 1], selector); + w[ 2] = hc_byte_perm_S (w[ 1], w[ 0], selector); + w[ 1] = hc_byte_perm_S (w[ 0], 0, selector); w[ 0] = 0; break; case 2: - w[63] = __byte_perm_S (w[61], w[60], selector); - w[62] = __byte_perm_S (w[60], w[59], selector); - w[61] = __byte_perm_S (w[59], w[58], selector); - w[60] = __byte_perm_S (w[58], w[57], selector); - w[59] = __byte_perm_S (w[57], w[56], selector); - w[58] = __byte_perm_S (w[56], w[55], selector); - w[57] = __byte_perm_S (w[55], w[54], selector); - w[56] = __byte_perm_S (w[54], w[53], selector); - w[55] = __byte_perm_S (w[53], w[52], selector); - w[54] = __byte_perm_S (w[52], w[51], selector); - w[53] = __byte_perm_S (w[51], w[50], selector); - w[52] = __byte_perm_S (w[50], w[49], selector); - w[51] = __byte_perm_S (w[49], w[48], selector); - w[50] = __byte_perm_S (w[48], w[47], selector); - w[49] = __byte_perm_S (w[47], w[46], selector); - w[48] = __byte_perm_S (w[46], w[45], selector); - w[47] = __byte_perm_S (w[45], w[44], selector); - w[46] = __byte_perm_S (w[44], w[43], selector); - w[45] = __byte_perm_S (w[43], w[42], selector); - w[44] = __byte_perm_S (w[42], w[41], selector); - w[43] = __byte_perm_S (w[41], w[40], selector); - w[42] = __byte_perm_S (w[40], w[39], selector); - w[41] = __byte_perm_S (w[39], w[38], selector); - w[40] = __byte_perm_S (w[38], w[37], selector); - w[39] = __byte_perm_S (w[37], w[36], selector); - w[38] = __byte_perm_S (w[36], w[35], selector); - w[37] = __byte_perm_S (w[35], w[34], selector); - w[36] = __byte_perm_S (w[34], w[33], selector); - w[35] = __byte_perm_S (w[33], w[32], selector); - w[34] = __byte_perm_S (w[32], w[31], selector); - w[33] = __byte_perm_S (w[31], w[30], selector); - w[32] = __byte_perm_S (w[30], w[29], selector); - w[31] = __byte_perm_S (w[29], w[28], selector); - w[30] = __byte_perm_S (w[28], w[27], selector); - w[29] = __byte_perm_S (w[27], w[26], selector); - w[28] = __byte_perm_S (w[26], w[25], selector); - w[27] = __byte_perm_S (w[25], w[24], selector); - w[26] = __byte_perm_S (w[24], w[23], selector); - w[25] = __byte_perm_S (w[23], w[22], selector); - w[24] = __byte_perm_S (w[22], w[21], selector); - w[23] = __byte_perm_S (w[21], w[20], selector); - w[22] = __byte_perm_S (w[20], w[19], selector); - w[21] = __byte_perm_S (w[19], w[18], selector); - w[20] = __byte_perm_S (w[18], w[17], selector); - w[19] = __byte_perm_S (w[17], w[16], selector); - w[18] = __byte_perm_S (w[16], w[15], selector); - w[17] = __byte_perm_S (w[15], w[14], selector); - w[16] = __byte_perm_S (w[14], w[13], selector); - w[15] = __byte_perm_S (w[13], w[12], selector); - w[14] = __byte_perm_S (w[12], w[11], selector); - w[13] = __byte_perm_S (w[11], w[10], selector); - w[12] = __byte_perm_S (w[10], w[ 9], selector); - w[11] = __byte_perm_S (w[ 9], w[ 8], selector); - w[10] = __byte_perm_S (w[ 8], w[ 7], selector); - w[ 9] = __byte_perm_S (w[ 7], w[ 6], selector); - w[ 8] = __byte_perm_S (w[ 6], w[ 5], selector); - w[ 7] = __byte_perm_S (w[ 5], w[ 4], selector); - w[ 6] = __byte_perm_S (w[ 4], w[ 3], selector); - w[ 5] = __byte_perm_S (w[ 3], w[ 2], selector); - w[ 4] = __byte_perm_S (w[ 2], w[ 1], selector); - w[ 3] = __byte_perm_S (w[ 1], w[ 0], selector); - w[ 2] = __byte_perm_S (w[ 0], 0, selector); + w[63] = hc_byte_perm_S (w[61], w[60], selector); + w[62] = hc_byte_perm_S (w[60], w[59], selector); + w[61] = hc_byte_perm_S (w[59], w[58], selector); + w[60] = hc_byte_perm_S (w[58], w[57], selector); + w[59] = hc_byte_perm_S (w[57], w[56], selector); + w[58] = hc_byte_perm_S (w[56], w[55], selector); + w[57] = hc_byte_perm_S (w[55], w[54], selector); + w[56] = hc_byte_perm_S (w[54], w[53], selector); + w[55] = hc_byte_perm_S (w[53], w[52], selector); + w[54] = hc_byte_perm_S (w[52], w[51], selector); + w[53] = hc_byte_perm_S (w[51], w[50], selector); + w[52] = hc_byte_perm_S (w[50], w[49], selector); + w[51] = hc_byte_perm_S (w[49], w[48], selector); + w[50] = hc_byte_perm_S (w[48], w[47], selector); + w[49] = hc_byte_perm_S (w[47], w[46], selector); + w[48] = hc_byte_perm_S (w[46], w[45], selector); + w[47] = hc_byte_perm_S (w[45], w[44], selector); + w[46] = hc_byte_perm_S (w[44], w[43], selector); + w[45] = hc_byte_perm_S (w[43], w[42], selector); + w[44] = hc_byte_perm_S (w[42], w[41], selector); + w[43] = hc_byte_perm_S (w[41], w[40], selector); + w[42] = hc_byte_perm_S (w[40], w[39], selector); + w[41] = hc_byte_perm_S (w[39], w[38], selector); + w[40] = hc_byte_perm_S (w[38], w[37], selector); + w[39] = hc_byte_perm_S (w[37], w[36], selector); + w[38] = hc_byte_perm_S (w[36], w[35], selector); + w[37] = hc_byte_perm_S (w[35], w[34], selector); + w[36] = hc_byte_perm_S (w[34], w[33], selector); + w[35] = hc_byte_perm_S (w[33], w[32], selector); + w[34] = hc_byte_perm_S (w[32], w[31], selector); + w[33] = hc_byte_perm_S (w[31], w[30], selector); + w[32] = hc_byte_perm_S (w[30], w[29], selector); + w[31] = hc_byte_perm_S (w[29], w[28], selector); + w[30] = hc_byte_perm_S (w[28], w[27], selector); + w[29] = hc_byte_perm_S (w[27], w[26], selector); + w[28] = hc_byte_perm_S (w[26], w[25], selector); + w[27] = hc_byte_perm_S (w[25], w[24], selector); + w[26] = hc_byte_perm_S (w[24], w[23], selector); + w[25] = hc_byte_perm_S (w[23], w[22], selector); + w[24] = hc_byte_perm_S (w[22], w[21], selector); + w[23] = hc_byte_perm_S (w[21], w[20], selector); + w[22] = hc_byte_perm_S (w[20], w[19], selector); + w[21] = hc_byte_perm_S (w[19], w[18], selector); + w[20] = hc_byte_perm_S (w[18], w[17], selector); + w[19] = hc_byte_perm_S (w[17], w[16], selector); + w[18] = hc_byte_perm_S (w[16], w[15], selector); + w[17] = hc_byte_perm_S (w[15], w[14], selector); + w[16] = hc_byte_perm_S (w[14], w[13], selector); + w[15] = hc_byte_perm_S (w[13], w[12], selector); + w[14] = hc_byte_perm_S (w[12], w[11], selector); + w[13] = hc_byte_perm_S (w[11], w[10], selector); + w[12] = hc_byte_perm_S (w[10], w[ 9], selector); + w[11] = hc_byte_perm_S (w[ 9], w[ 8], selector); + w[10] = hc_byte_perm_S (w[ 8], w[ 7], selector); + w[ 9] = hc_byte_perm_S (w[ 7], w[ 6], selector); + w[ 8] = hc_byte_perm_S (w[ 6], w[ 5], selector); + w[ 7] = hc_byte_perm_S (w[ 5], w[ 4], selector); + w[ 6] = hc_byte_perm_S (w[ 4], w[ 3], selector); + w[ 5] = hc_byte_perm_S (w[ 3], w[ 2], selector); + w[ 4] = hc_byte_perm_S (w[ 2], w[ 1], selector); + w[ 3] = hc_byte_perm_S (w[ 1], w[ 0], selector); + w[ 2] = hc_byte_perm_S (w[ 0], 0, selector); w[ 1] = 0; w[ 0] = 0; break; case 3: - w[63] = __byte_perm_S (w[60], w[59], selector); - w[62] = __byte_perm_S (w[59], w[58], selector); - w[61] = __byte_perm_S (w[58], w[57], selector); - w[60] = __byte_perm_S (w[57], w[56], selector); - w[59] = __byte_perm_S (w[56], w[55], selector); - w[58] = __byte_perm_S (w[55], w[54], selector); - w[57] = __byte_perm_S (w[54], w[53], selector); - w[56] = __byte_perm_S (w[53], w[52], selector); - w[55] = __byte_perm_S (w[52], w[51], selector); - w[54] = __byte_perm_S (w[51], w[50], selector); - w[53] = __byte_perm_S (w[50], w[49], selector); - w[52] = __byte_perm_S (w[49], w[48], selector); - w[51] = __byte_perm_S (w[48], w[47], selector); - w[50] = __byte_perm_S (w[47], w[46], selector); - w[49] = __byte_perm_S (w[46], w[45], selector); - w[48] = __byte_perm_S (w[45], w[44], selector); - w[47] = __byte_perm_S (w[44], w[43], selector); - w[46] = __byte_perm_S (w[43], w[42], selector); - w[45] = __byte_perm_S (w[42], w[41], selector); - w[44] = __byte_perm_S (w[41], w[40], selector); - w[43] = __byte_perm_S (w[40], w[39], selector); - w[42] = __byte_perm_S (w[39], w[38], selector); - w[41] = __byte_perm_S (w[38], w[37], selector); - w[40] = __byte_perm_S (w[37], w[36], selector); - w[39] = __byte_perm_S (w[36], w[35], selector); - w[38] = __byte_perm_S (w[35], w[34], selector); - w[37] = __byte_perm_S (w[34], w[33], selector); - w[36] = __byte_perm_S (w[33], w[32], selector); - w[35] = __byte_perm_S (w[32], w[31], selector); - w[34] = __byte_perm_S (w[31], w[30], selector); - w[33] = __byte_perm_S (w[30], w[29], selector); - w[32] = __byte_perm_S (w[29], w[28], selector); - w[31] = __byte_perm_S (w[28], w[27], selector); - w[30] = __byte_perm_S (w[27], w[26], selector); - w[29] = __byte_perm_S (w[26], w[25], selector); - w[28] = __byte_perm_S (w[25], w[24], selector); - w[27] = __byte_perm_S (w[24], w[23], selector); - w[26] = __byte_perm_S (w[23], w[22], selector); - w[25] = __byte_perm_S (w[22], w[21], selector); - w[24] = __byte_perm_S (w[21], w[20], selector); - w[23] = __byte_perm_S (w[20], w[19], selector); - w[22] = __byte_perm_S (w[19], w[18], selector); - w[21] = __byte_perm_S (w[18], w[17], selector); - w[20] = __byte_perm_S (w[17], w[16], selector); - w[19] = __byte_perm_S (w[16], w[15], selector); - w[18] = __byte_perm_S (w[15], w[14], selector); - w[17] = __byte_perm_S (w[14], w[13], selector); - w[16] = __byte_perm_S (w[13], w[12], selector); - w[15] = __byte_perm_S (w[12], w[11], selector); - w[14] = __byte_perm_S (w[11], w[10], selector); - w[13] = __byte_perm_S (w[10], w[ 9], selector); - w[12] = __byte_perm_S (w[ 9], w[ 8], selector); - w[11] = __byte_perm_S (w[ 8], w[ 7], selector); - w[10] = __byte_perm_S (w[ 7], w[ 6], selector); - w[ 9] = __byte_perm_S (w[ 6], w[ 5], selector); - w[ 8] = __byte_perm_S (w[ 5], w[ 4], selector); - w[ 7] = __byte_perm_S (w[ 4], w[ 3], selector); - w[ 6] = __byte_perm_S (w[ 3], w[ 2], selector); - w[ 5] = __byte_perm_S (w[ 2], w[ 1], selector); - w[ 4] = __byte_perm_S (w[ 1], w[ 0], selector); - w[ 3] = __byte_perm_S (w[ 0], 0, selector); + w[63] = hc_byte_perm_S (w[60], w[59], selector); + w[62] = hc_byte_perm_S (w[59], w[58], selector); + w[61] = hc_byte_perm_S (w[58], w[57], selector); + w[60] = hc_byte_perm_S (w[57], w[56], selector); + w[59] = hc_byte_perm_S (w[56], w[55], selector); + w[58] = hc_byte_perm_S (w[55], w[54], selector); + w[57] = hc_byte_perm_S (w[54], w[53], selector); + w[56] = hc_byte_perm_S (w[53], w[52], selector); + w[55] = hc_byte_perm_S (w[52], w[51], selector); + w[54] = hc_byte_perm_S (w[51], w[50], selector); + w[53] = hc_byte_perm_S (w[50], w[49], selector); + w[52] = hc_byte_perm_S (w[49], w[48], selector); + w[51] = hc_byte_perm_S (w[48], w[47], selector); + w[50] = hc_byte_perm_S (w[47], w[46], selector); + w[49] = hc_byte_perm_S (w[46], w[45], selector); + w[48] = hc_byte_perm_S (w[45], w[44], selector); + w[47] = hc_byte_perm_S (w[44], w[43], selector); + w[46] = hc_byte_perm_S (w[43], w[42], selector); + w[45] = hc_byte_perm_S (w[42], w[41], selector); + w[44] = hc_byte_perm_S (w[41], w[40], selector); + w[43] = hc_byte_perm_S (w[40], w[39], selector); + w[42] = hc_byte_perm_S (w[39], w[38], selector); + w[41] = hc_byte_perm_S (w[38], w[37], selector); + w[40] = hc_byte_perm_S (w[37], w[36], selector); + w[39] = hc_byte_perm_S (w[36], w[35], selector); + w[38] = hc_byte_perm_S (w[35], w[34], selector); + w[37] = hc_byte_perm_S (w[34], w[33], selector); + w[36] = hc_byte_perm_S (w[33], w[32], selector); + w[35] = hc_byte_perm_S (w[32], w[31], selector); + w[34] = hc_byte_perm_S (w[31], w[30], selector); + w[33] = hc_byte_perm_S (w[30], w[29], selector); + w[32] = hc_byte_perm_S (w[29], w[28], selector); + w[31] = hc_byte_perm_S (w[28], w[27], selector); + w[30] = hc_byte_perm_S (w[27], w[26], selector); + w[29] = hc_byte_perm_S (w[26], w[25], selector); + w[28] = hc_byte_perm_S (w[25], w[24], selector); + w[27] = hc_byte_perm_S (w[24], w[23], selector); + w[26] = hc_byte_perm_S (w[23], w[22], selector); + w[25] = hc_byte_perm_S (w[22], w[21], selector); + w[24] = hc_byte_perm_S (w[21], w[20], selector); + w[23] = hc_byte_perm_S (w[20], w[19], selector); + w[22] = hc_byte_perm_S (w[19], w[18], selector); + w[21] = hc_byte_perm_S (w[18], w[17], selector); + w[20] = hc_byte_perm_S (w[17], w[16], selector); + w[19] = hc_byte_perm_S (w[16], w[15], selector); + w[18] = hc_byte_perm_S (w[15], w[14], selector); + w[17] = hc_byte_perm_S (w[14], w[13], selector); + w[16] = hc_byte_perm_S (w[13], w[12], selector); + w[15] = hc_byte_perm_S (w[12], w[11], selector); + w[14] = hc_byte_perm_S (w[11], w[10], selector); + w[13] = hc_byte_perm_S (w[10], w[ 9], selector); + w[12] = hc_byte_perm_S (w[ 9], w[ 8], selector); + w[11] = hc_byte_perm_S (w[ 8], w[ 7], selector); + w[10] = hc_byte_perm_S (w[ 7], w[ 6], selector); + w[ 9] = hc_byte_perm_S (w[ 6], w[ 5], selector); + w[ 8] = hc_byte_perm_S (w[ 5], w[ 4], selector); + w[ 7] = hc_byte_perm_S (w[ 4], w[ 3], selector); + w[ 6] = hc_byte_perm_S (w[ 3], w[ 2], selector); + w[ 5] = hc_byte_perm_S (w[ 2], w[ 1], selector); + w[ 4] = hc_byte_perm_S (w[ 1], w[ 0], selector); + w[ 3] = hc_byte_perm_S (w[ 0], 0, selector); w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; @@ -55724,66 +55724,66 @@ DECLSPEC void switch_buffer_by_offset_1x64_be_S (u32 *w, const u32 offset) break; case 4: - w[63] = __byte_perm_S (w[59], w[58], selector); - w[62] = __byte_perm_S (w[58], w[57], selector); - w[61] = __byte_perm_S (w[57], w[56], selector); - w[60] = __byte_perm_S (w[56], w[55], selector); - w[59] = __byte_perm_S (w[55], w[54], selector); - w[58] = __byte_perm_S (w[54], w[53], selector); - w[57] = __byte_perm_S (w[53], w[52], selector); - w[56] = __byte_perm_S (w[52], w[51], selector); - w[55] = __byte_perm_S (w[51], w[50], selector); - w[54] = __byte_perm_S (w[50], w[49], selector); - w[53] = __byte_perm_S (w[49], w[48], selector); - w[52] = __byte_perm_S (w[48], w[47], selector); - w[51] = __byte_perm_S (w[47], w[46], selector); - w[50] = __byte_perm_S (w[46], w[45], selector); - w[49] = __byte_perm_S (w[45], w[44], selector); - w[48] = __byte_perm_S (w[44], w[43], selector); - w[47] = __byte_perm_S (w[43], w[42], selector); - w[46] = __byte_perm_S (w[42], w[41], selector); - w[45] = __byte_perm_S (w[41], w[40], selector); - w[44] = __byte_perm_S (w[40], w[39], selector); - w[43] = __byte_perm_S (w[39], w[38], selector); - w[42] = __byte_perm_S (w[38], w[37], selector); - w[41] = __byte_perm_S (w[37], w[36], selector); - w[40] = __byte_perm_S (w[36], w[35], selector); - w[39] = __byte_perm_S (w[35], w[34], selector); - w[38] = __byte_perm_S (w[34], w[33], selector); - w[37] = __byte_perm_S (w[33], w[32], selector); - w[36] = __byte_perm_S (w[32], w[31], selector); - w[35] = __byte_perm_S (w[31], w[30], selector); - w[34] = __byte_perm_S (w[30], w[29], selector); - w[33] = __byte_perm_S (w[29], w[28], selector); - w[32] = __byte_perm_S (w[28], w[27], selector); - w[31] = __byte_perm_S (w[27], w[26], selector); - w[30] = __byte_perm_S (w[26], w[25], selector); - w[29] = __byte_perm_S (w[25], w[24], selector); - w[28] = __byte_perm_S (w[24], w[23], selector); - w[27] = __byte_perm_S (w[23], w[22], selector); - w[26] = __byte_perm_S (w[22], w[21], selector); - w[25] = __byte_perm_S (w[21], w[20], selector); - w[24] = __byte_perm_S (w[20], w[19], selector); - w[23] = __byte_perm_S (w[19], w[18], selector); - w[22] = __byte_perm_S (w[18], w[17], selector); - w[21] = __byte_perm_S (w[17], w[16], selector); - w[20] = __byte_perm_S (w[16], w[15], selector); - w[19] = __byte_perm_S (w[15], w[14], selector); - w[18] = __byte_perm_S (w[14], w[13], selector); - w[17] = __byte_perm_S (w[13], w[12], selector); - w[16] = __byte_perm_S (w[12], w[11], selector); - w[15] = __byte_perm_S (w[11], w[10], selector); - w[14] = __byte_perm_S (w[10], w[ 9], selector); - w[13] = __byte_perm_S (w[ 9], w[ 8], selector); - w[12] = __byte_perm_S (w[ 8], w[ 7], selector); - w[11] = __byte_perm_S (w[ 7], w[ 6], selector); - w[10] = __byte_perm_S (w[ 6], w[ 5], selector); - w[ 9] = __byte_perm_S (w[ 5], w[ 4], selector); - w[ 8] = __byte_perm_S (w[ 4], w[ 3], selector); - w[ 7] = __byte_perm_S (w[ 3], w[ 2], selector); - w[ 6] = __byte_perm_S (w[ 2], w[ 1], selector); - w[ 5] = __byte_perm_S (w[ 1], w[ 0], selector); - w[ 4] = __byte_perm_S (w[ 0], 0, selector); + w[63] = hc_byte_perm_S (w[59], w[58], selector); + w[62] = hc_byte_perm_S (w[58], w[57], selector); + w[61] = hc_byte_perm_S (w[57], w[56], selector); + w[60] = hc_byte_perm_S (w[56], w[55], selector); + w[59] = hc_byte_perm_S (w[55], w[54], selector); + w[58] = hc_byte_perm_S (w[54], w[53], selector); + w[57] = hc_byte_perm_S (w[53], w[52], selector); + w[56] = hc_byte_perm_S (w[52], w[51], selector); + w[55] = hc_byte_perm_S (w[51], w[50], selector); + w[54] = hc_byte_perm_S (w[50], w[49], selector); + w[53] = hc_byte_perm_S (w[49], w[48], selector); + w[52] = hc_byte_perm_S (w[48], w[47], selector); + w[51] = hc_byte_perm_S (w[47], w[46], selector); + w[50] = hc_byte_perm_S (w[46], w[45], selector); + w[49] = hc_byte_perm_S (w[45], w[44], selector); + w[48] = hc_byte_perm_S (w[44], w[43], selector); + w[47] = hc_byte_perm_S (w[43], w[42], selector); + w[46] = hc_byte_perm_S (w[42], w[41], selector); + w[45] = hc_byte_perm_S (w[41], w[40], selector); + w[44] = hc_byte_perm_S (w[40], w[39], selector); + w[43] = hc_byte_perm_S (w[39], w[38], selector); + w[42] = hc_byte_perm_S (w[38], w[37], selector); + w[41] = hc_byte_perm_S (w[37], w[36], selector); + w[40] = hc_byte_perm_S (w[36], w[35], selector); + w[39] = hc_byte_perm_S (w[35], w[34], selector); + w[38] = hc_byte_perm_S (w[34], w[33], selector); + w[37] = hc_byte_perm_S (w[33], w[32], selector); + w[36] = hc_byte_perm_S (w[32], w[31], selector); + w[35] = hc_byte_perm_S (w[31], w[30], selector); + w[34] = hc_byte_perm_S (w[30], w[29], selector); + w[33] = hc_byte_perm_S (w[29], w[28], selector); + w[32] = hc_byte_perm_S (w[28], w[27], selector); + w[31] = hc_byte_perm_S (w[27], w[26], selector); + w[30] = hc_byte_perm_S (w[26], w[25], selector); + w[29] = hc_byte_perm_S (w[25], w[24], selector); + w[28] = hc_byte_perm_S (w[24], w[23], selector); + w[27] = hc_byte_perm_S (w[23], w[22], selector); + w[26] = hc_byte_perm_S (w[22], w[21], selector); + w[25] = hc_byte_perm_S (w[21], w[20], selector); + w[24] = hc_byte_perm_S (w[20], w[19], selector); + w[23] = hc_byte_perm_S (w[19], w[18], selector); + w[22] = hc_byte_perm_S (w[18], w[17], selector); + w[21] = hc_byte_perm_S (w[17], w[16], selector); + w[20] = hc_byte_perm_S (w[16], w[15], selector); + w[19] = hc_byte_perm_S (w[15], w[14], selector); + w[18] = hc_byte_perm_S (w[14], w[13], selector); + w[17] = hc_byte_perm_S (w[13], w[12], selector); + w[16] = hc_byte_perm_S (w[12], w[11], selector); + w[15] = hc_byte_perm_S (w[11], w[10], selector); + w[14] = hc_byte_perm_S (w[10], w[ 9], selector); + w[13] = hc_byte_perm_S (w[ 9], w[ 8], selector); + w[12] = hc_byte_perm_S (w[ 8], w[ 7], selector); + w[11] = hc_byte_perm_S (w[ 7], w[ 6], selector); + w[10] = hc_byte_perm_S (w[ 6], w[ 5], selector); + w[ 9] = hc_byte_perm_S (w[ 5], w[ 4], selector); + w[ 8] = hc_byte_perm_S (w[ 4], w[ 3], selector); + w[ 7] = hc_byte_perm_S (w[ 3], w[ 2], selector); + w[ 6] = hc_byte_perm_S (w[ 2], w[ 1], selector); + w[ 5] = hc_byte_perm_S (w[ 1], w[ 0], selector); + w[ 4] = hc_byte_perm_S (w[ 0], 0, selector); w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; @@ -55792,65 +55792,65 @@ DECLSPEC void switch_buffer_by_offset_1x64_be_S (u32 *w, const u32 offset) break; case 5: - w[63] = __byte_perm_S (w[58], w[57], selector); - w[62] = __byte_perm_S (w[57], w[56], selector); - w[61] = __byte_perm_S (w[56], w[55], selector); - w[60] = __byte_perm_S (w[55], w[54], selector); - w[59] = __byte_perm_S (w[54], w[53], selector); - w[58] = __byte_perm_S (w[53], w[52], selector); - w[57] = __byte_perm_S (w[52], w[51], selector); - w[56] = __byte_perm_S (w[51], w[50], selector); - w[55] = __byte_perm_S (w[50], w[49], selector); - w[54] = __byte_perm_S (w[49], w[48], selector); - w[53] = __byte_perm_S (w[48], w[47], selector); - w[52] = __byte_perm_S (w[47], w[46], selector); - w[51] = __byte_perm_S (w[46], w[45], selector); - w[50] = __byte_perm_S (w[45], w[44], selector); - w[49] = __byte_perm_S (w[44], w[43], selector); - w[48] = __byte_perm_S (w[43], w[42], selector); - w[47] = __byte_perm_S (w[42], w[41], selector); - w[46] = __byte_perm_S (w[41], w[40], selector); - w[45] = __byte_perm_S (w[40], w[39], selector); - w[44] = __byte_perm_S (w[39], w[38], selector); - w[43] = __byte_perm_S (w[38], w[37], selector); - w[42] = __byte_perm_S (w[37], w[36], selector); - w[41] = __byte_perm_S (w[36], w[35], selector); - w[40] = __byte_perm_S (w[35], w[34], selector); - w[39] = __byte_perm_S (w[34], w[33], selector); - w[38] = __byte_perm_S (w[33], w[32], selector); - w[37] = __byte_perm_S (w[32], w[31], selector); - w[36] = __byte_perm_S (w[31], w[30], selector); - w[35] = __byte_perm_S (w[30], w[29], selector); - w[34] = __byte_perm_S (w[29], w[28], selector); - w[33] = __byte_perm_S (w[28], w[27], selector); - w[32] = __byte_perm_S (w[27], w[26], selector); - w[31] = __byte_perm_S (w[26], w[25], selector); - w[30] = __byte_perm_S (w[25], w[24], selector); - w[29] = __byte_perm_S (w[24], w[23], selector); - w[28] = __byte_perm_S (w[23], w[22], selector); - w[27] = __byte_perm_S (w[22], w[21], selector); - w[26] = __byte_perm_S (w[21], w[20], selector); - w[25] = __byte_perm_S (w[20], w[19], selector); - w[24] = __byte_perm_S (w[19], w[18], selector); - w[23] = __byte_perm_S (w[18], w[17], selector); - w[22] = __byte_perm_S (w[17], w[16], selector); - w[21] = __byte_perm_S (w[16], w[15], selector); - w[20] = __byte_perm_S (w[15], w[14], selector); - w[19] = __byte_perm_S (w[14], w[13], selector); - w[18] = __byte_perm_S (w[13], w[12], selector); - w[17] = __byte_perm_S (w[12], w[11], selector); - w[16] = __byte_perm_S (w[11], w[10], selector); - w[15] = __byte_perm_S (w[10], w[ 9], selector); - w[14] = __byte_perm_S (w[ 9], w[ 8], selector); - w[13] = __byte_perm_S (w[ 8], w[ 7], selector); - w[12] = __byte_perm_S (w[ 7], w[ 6], selector); - w[11] = __byte_perm_S (w[ 6], w[ 5], selector); - w[10] = __byte_perm_S (w[ 5], w[ 4], selector); - w[ 9] = __byte_perm_S (w[ 4], w[ 3], selector); - w[ 8] = __byte_perm_S (w[ 3], w[ 2], selector); - w[ 7] = __byte_perm_S (w[ 2], w[ 1], selector); - w[ 6] = __byte_perm_S (w[ 1], w[ 0], selector); - w[ 5] = __byte_perm_S (w[ 0], 0, selector); + w[63] = hc_byte_perm_S (w[58], w[57], selector); + w[62] = hc_byte_perm_S (w[57], w[56], selector); + w[61] = hc_byte_perm_S (w[56], w[55], selector); + w[60] = hc_byte_perm_S (w[55], w[54], selector); + w[59] = hc_byte_perm_S (w[54], w[53], selector); + w[58] = hc_byte_perm_S (w[53], w[52], selector); + w[57] = hc_byte_perm_S (w[52], w[51], selector); + w[56] = hc_byte_perm_S (w[51], w[50], selector); + w[55] = hc_byte_perm_S (w[50], w[49], selector); + w[54] = hc_byte_perm_S (w[49], w[48], selector); + w[53] = hc_byte_perm_S (w[48], w[47], selector); + w[52] = hc_byte_perm_S (w[47], w[46], selector); + w[51] = hc_byte_perm_S (w[46], w[45], selector); + w[50] = hc_byte_perm_S (w[45], w[44], selector); + w[49] = hc_byte_perm_S (w[44], w[43], selector); + w[48] = hc_byte_perm_S (w[43], w[42], selector); + w[47] = hc_byte_perm_S (w[42], w[41], selector); + w[46] = hc_byte_perm_S (w[41], w[40], selector); + w[45] = hc_byte_perm_S (w[40], w[39], selector); + w[44] = hc_byte_perm_S (w[39], w[38], selector); + w[43] = hc_byte_perm_S (w[38], w[37], selector); + w[42] = hc_byte_perm_S (w[37], w[36], selector); + w[41] = hc_byte_perm_S (w[36], w[35], selector); + w[40] = hc_byte_perm_S (w[35], w[34], selector); + w[39] = hc_byte_perm_S (w[34], w[33], selector); + w[38] = hc_byte_perm_S (w[33], w[32], selector); + w[37] = hc_byte_perm_S (w[32], w[31], selector); + w[36] = hc_byte_perm_S (w[31], w[30], selector); + w[35] = hc_byte_perm_S (w[30], w[29], selector); + w[34] = hc_byte_perm_S (w[29], w[28], selector); + w[33] = hc_byte_perm_S (w[28], w[27], selector); + w[32] = hc_byte_perm_S (w[27], w[26], selector); + w[31] = hc_byte_perm_S (w[26], w[25], selector); + w[30] = hc_byte_perm_S (w[25], w[24], selector); + w[29] = hc_byte_perm_S (w[24], w[23], selector); + w[28] = hc_byte_perm_S (w[23], w[22], selector); + w[27] = hc_byte_perm_S (w[22], w[21], selector); + w[26] = hc_byte_perm_S (w[21], w[20], selector); + w[25] = hc_byte_perm_S (w[20], w[19], selector); + w[24] = hc_byte_perm_S (w[19], w[18], selector); + w[23] = hc_byte_perm_S (w[18], w[17], selector); + w[22] = hc_byte_perm_S (w[17], w[16], selector); + w[21] = hc_byte_perm_S (w[16], w[15], selector); + w[20] = hc_byte_perm_S (w[15], w[14], selector); + w[19] = hc_byte_perm_S (w[14], w[13], selector); + w[18] = hc_byte_perm_S (w[13], w[12], selector); + w[17] = hc_byte_perm_S (w[12], w[11], selector); + w[16] = hc_byte_perm_S (w[11], w[10], selector); + w[15] = hc_byte_perm_S (w[10], w[ 9], selector); + w[14] = hc_byte_perm_S (w[ 9], w[ 8], selector); + w[13] = hc_byte_perm_S (w[ 8], w[ 7], selector); + w[12] = hc_byte_perm_S (w[ 7], w[ 6], selector); + w[11] = hc_byte_perm_S (w[ 6], w[ 5], selector); + w[10] = hc_byte_perm_S (w[ 5], w[ 4], selector); + w[ 9] = hc_byte_perm_S (w[ 4], w[ 3], selector); + w[ 8] = hc_byte_perm_S (w[ 3], w[ 2], selector); + w[ 7] = hc_byte_perm_S (w[ 2], w[ 1], selector); + w[ 6] = hc_byte_perm_S (w[ 1], w[ 0], selector); + w[ 5] = hc_byte_perm_S (w[ 0], 0, selector); w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; @@ -55860,64 +55860,64 @@ DECLSPEC void switch_buffer_by_offset_1x64_be_S (u32 *w, const u32 offset) break; case 6: - w[63] = __byte_perm_S (w[57], w[56], selector); - w[62] = __byte_perm_S (w[56], w[55], selector); - w[61] = __byte_perm_S (w[55], w[54], selector); - w[60] = __byte_perm_S (w[54], w[53], selector); - w[59] = __byte_perm_S (w[53], w[52], selector); - w[58] = __byte_perm_S (w[52], w[51], selector); - w[57] = __byte_perm_S (w[51], w[50], selector); - w[56] = __byte_perm_S (w[50], w[49], selector); - w[55] = __byte_perm_S (w[49], w[48], selector); - w[54] = __byte_perm_S (w[48], w[47], selector); - w[53] = __byte_perm_S (w[47], w[46], selector); - w[52] = __byte_perm_S (w[46], w[45], selector); - w[51] = __byte_perm_S (w[45], w[44], selector); - w[50] = __byte_perm_S (w[44], w[43], selector); - w[49] = __byte_perm_S (w[43], w[42], selector); - w[48] = __byte_perm_S (w[42], w[41], selector); - w[47] = __byte_perm_S (w[41], w[40], selector); - w[46] = __byte_perm_S (w[40], w[39], selector); - w[45] = __byte_perm_S (w[39], w[38], selector); - w[44] = __byte_perm_S (w[38], w[37], selector); - w[43] = __byte_perm_S (w[37], w[36], selector); - w[42] = __byte_perm_S (w[36], w[35], selector); - w[41] = __byte_perm_S (w[35], w[34], selector); - w[40] = __byte_perm_S (w[34], w[33], selector); - w[39] = __byte_perm_S (w[33], w[32], selector); - w[38] = __byte_perm_S (w[32], w[31], selector); - w[37] = __byte_perm_S (w[31], w[30], selector); - w[36] = __byte_perm_S (w[30], w[29], selector); - w[35] = __byte_perm_S (w[29], w[28], selector); - w[34] = __byte_perm_S (w[28], w[27], selector); - w[33] = __byte_perm_S (w[27], w[26], selector); - w[32] = __byte_perm_S (w[26], w[25], selector); - w[31] = __byte_perm_S (w[25], w[24], selector); - w[30] = __byte_perm_S (w[24], w[23], selector); - w[29] = __byte_perm_S (w[23], w[22], selector); - w[28] = __byte_perm_S (w[22], w[21], selector); - w[27] = __byte_perm_S (w[21], w[20], selector); - w[26] = __byte_perm_S (w[20], w[19], selector); - w[25] = __byte_perm_S (w[19], w[18], selector); - w[24] = __byte_perm_S (w[18], w[17], selector); - w[23] = __byte_perm_S (w[17], w[16], selector); - w[22] = __byte_perm_S (w[16], w[15], selector); - w[21] = __byte_perm_S (w[15], w[14], selector); - w[20] = __byte_perm_S (w[14], w[13], selector); - w[19] = __byte_perm_S (w[13], w[12], selector); - w[18] = __byte_perm_S (w[12], w[11], selector); - w[17] = __byte_perm_S (w[11], w[10], selector); - w[16] = __byte_perm_S (w[10], w[ 9], selector); - w[15] = __byte_perm_S (w[ 9], w[ 8], selector); - w[14] = __byte_perm_S (w[ 8], w[ 7], selector); - w[13] = __byte_perm_S (w[ 7], w[ 6], selector); - w[12] = __byte_perm_S (w[ 6], w[ 5], selector); - w[11] = __byte_perm_S (w[ 5], w[ 4], selector); - w[10] = __byte_perm_S (w[ 4], w[ 3], selector); - w[ 9] = __byte_perm_S (w[ 3], w[ 2], selector); - w[ 8] = __byte_perm_S (w[ 2], w[ 1], selector); - w[ 7] = __byte_perm_S (w[ 1], w[ 0], selector); - w[ 6] = __byte_perm_S (w[ 0], 0, selector); + w[63] = hc_byte_perm_S (w[57], w[56], selector); + w[62] = hc_byte_perm_S (w[56], w[55], selector); + w[61] = hc_byte_perm_S (w[55], w[54], selector); + w[60] = hc_byte_perm_S (w[54], w[53], selector); + w[59] = hc_byte_perm_S (w[53], w[52], selector); + w[58] = hc_byte_perm_S (w[52], w[51], selector); + w[57] = hc_byte_perm_S (w[51], w[50], selector); + w[56] = hc_byte_perm_S (w[50], w[49], selector); + w[55] = hc_byte_perm_S (w[49], w[48], selector); + w[54] = hc_byte_perm_S (w[48], w[47], selector); + w[53] = hc_byte_perm_S (w[47], w[46], selector); + w[52] = hc_byte_perm_S (w[46], w[45], selector); + w[51] = hc_byte_perm_S (w[45], w[44], selector); + w[50] = hc_byte_perm_S (w[44], w[43], selector); + w[49] = hc_byte_perm_S (w[43], w[42], selector); + w[48] = hc_byte_perm_S (w[42], w[41], selector); + w[47] = hc_byte_perm_S (w[41], w[40], selector); + w[46] = hc_byte_perm_S (w[40], w[39], selector); + w[45] = hc_byte_perm_S (w[39], w[38], selector); + w[44] = hc_byte_perm_S (w[38], w[37], selector); + w[43] = hc_byte_perm_S (w[37], w[36], selector); + w[42] = hc_byte_perm_S (w[36], w[35], selector); + w[41] = hc_byte_perm_S (w[35], w[34], selector); + w[40] = hc_byte_perm_S (w[34], w[33], selector); + w[39] = hc_byte_perm_S (w[33], w[32], selector); + w[38] = hc_byte_perm_S (w[32], w[31], selector); + w[37] = hc_byte_perm_S (w[31], w[30], selector); + w[36] = hc_byte_perm_S (w[30], w[29], selector); + w[35] = hc_byte_perm_S (w[29], w[28], selector); + w[34] = hc_byte_perm_S (w[28], w[27], selector); + w[33] = hc_byte_perm_S (w[27], w[26], selector); + w[32] = hc_byte_perm_S (w[26], w[25], selector); + w[31] = hc_byte_perm_S (w[25], w[24], selector); + w[30] = hc_byte_perm_S (w[24], w[23], selector); + w[29] = hc_byte_perm_S (w[23], w[22], selector); + w[28] = hc_byte_perm_S (w[22], w[21], selector); + w[27] = hc_byte_perm_S (w[21], w[20], selector); + w[26] = hc_byte_perm_S (w[20], w[19], selector); + w[25] = hc_byte_perm_S (w[19], w[18], selector); + w[24] = hc_byte_perm_S (w[18], w[17], selector); + w[23] = hc_byte_perm_S (w[17], w[16], selector); + w[22] = hc_byte_perm_S (w[16], w[15], selector); + w[21] = hc_byte_perm_S (w[15], w[14], selector); + w[20] = hc_byte_perm_S (w[14], w[13], selector); + w[19] = hc_byte_perm_S (w[13], w[12], selector); + w[18] = hc_byte_perm_S (w[12], w[11], selector); + w[17] = hc_byte_perm_S (w[11], w[10], selector); + w[16] = hc_byte_perm_S (w[10], w[ 9], selector); + w[15] = hc_byte_perm_S (w[ 9], w[ 8], selector); + w[14] = hc_byte_perm_S (w[ 8], w[ 7], selector); + w[13] = hc_byte_perm_S (w[ 7], w[ 6], selector); + w[12] = hc_byte_perm_S (w[ 6], w[ 5], selector); + w[11] = hc_byte_perm_S (w[ 5], w[ 4], selector); + w[10] = hc_byte_perm_S (w[ 4], w[ 3], selector); + w[ 9] = hc_byte_perm_S (w[ 3], w[ 2], selector); + w[ 8] = hc_byte_perm_S (w[ 2], w[ 1], selector); + w[ 7] = hc_byte_perm_S (w[ 1], w[ 0], selector); + w[ 6] = hc_byte_perm_S (w[ 0], 0, selector); w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; @@ -55928,63 +55928,63 @@ DECLSPEC void switch_buffer_by_offset_1x64_be_S (u32 *w, const u32 offset) break; case 7: - w[63] = __byte_perm_S (w[56], w[55], selector); - w[62] = __byte_perm_S (w[55], w[54], selector); - w[61] = __byte_perm_S (w[54], w[53], selector); - w[60] = __byte_perm_S (w[53], w[52], selector); - w[59] = __byte_perm_S (w[52], w[51], selector); - w[58] = __byte_perm_S (w[51], w[50], selector); - w[57] = __byte_perm_S (w[50], w[49], selector); - w[56] = __byte_perm_S (w[49], w[48], selector); - w[55] = __byte_perm_S (w[48], w[47], selector); - w[54] = __byte_perm_S (w[47], w[46], selector); - w[53] = __byte_perm_S (w[46], w[45], selector); - w[52] = __byte_perm_S (w[45], w[44], selector); - w[51] = __byte_perm_S (w[44], w[43], selector); - w[50] = __byte_perm_S (w[43], w[42], selector); - w[49] = __byte_perm_S (w[42], w[41], selector); - w[48] = __byte_perm_S (w[41], w[40], selector); - w[47] = __byte_perm_S (w[40], w[39], selector); - w[46] = __byte_perm_S (w[39], w[38], selector); - w[45] = __byte_perm_S (w[38], w[37], selector); - w[44] = __byte_perm_S (w[37], w[36], selector); - w[43] = __byte_perm_S (w[36], w[35], selector); - w[42] = __byte_perm_S (w[35], w[34], selector); - w[41] = __byte_perm_S (w[34], w[33], selector); - w[40] = __byte_perm_S (w[33], w[32], selector); - w[39] = __byte_perm_S (w[32], w[31], selector); - w[38] = __byte_perm_S (w[31], w[30], selector); - w[37] = __byte_perm_S (w[30], w[29], selector); - w[36] = __byte_perm_S (w[29], w[28], selector); - w[35] = __byte_perm_S (w[28], w[27], selector); - w[34] = __byte_perm_S (w[27], w[26], selector); - w[33] = __byte_perm_S (w[26], w[25], selector); - w[32] = __byte_perm_S (w[25], w[24], selector); - w[31] = __byte_perm_S (w[24], w[23], selector); - w[30] = __byte_perm_S (w[23], w[22], selector); - w[29] = __byte_perm_S (w[22], w[21], selector); - w[28] = __byte_perm_S (w[21], w[20], selector); - w[27] = __byte_perm_S (w[20], w[19], selector); - w[26] = __byte_perm_S (w[19], w[18], selector); - w[25] = __byte_perm_S (w[18], w[17], selector); - w[24] = __byte_perm_S (w[17], w[16], selector); - w[23] = __byte_perm_S (w[16], w[15], selector); - w[22] = __byte_perm_S (w[15], w[14], selector); - w[21] = __byte_perm_S (w[14], w[13], selector); - w[20] = __byte_perm_S (w[13], w[12], selector); - w[19] = __byte_perm_S (w[12], w[11], selector); - w[18] = __byte_perm_S (w[11], w[10], selector); - w[17] = __byte_perm_S (w[10], w[ 9], selector); - w[16] = __byte_perm_S (w[ 9], w[ 8], selector); - w[15] = __byte_perm_S (w[ 8], w[ 7], selector); - w[14] = __byte_perm_S (w[ 7], w[ 6], selector); - w[13] = __byte_perm_S (w[ 6], w[ 5], selector); - w[12] = __byte_perm_S (w[ 5], w[ 4], selector); - w[11] = __byte_perm_S (w[ 4], w[ 3], selector); - w[10] = __byte_perm_S (w[ 3], w[ 2], selector); - w[ 9] = __byte_perm_S (w[ 2], w[ 1], selector); - w[ 8] = __byte_perm_S (w[ 1], w[ 0], selector); - w[ 7] = __byte_perm_S (w[ 0], 0, selector); + w[63] = hc_byte_perm_S (w[56], w[55], selector); + w[62] = hc_byte_perm_S (w[55], w[54], selector); + w[61] = hc_byte_perm_S (w[54], w[53], selector); + w[60] = hc_byte_perm_S (w[53], w[52], selector); + w[59] = hc_byte_perm_S (w[52], w[51], selector); + w[58] = hc_byte_perm_S (w[51], w[50], selector); + w[57] = hc_byte_perm_S (w[50], w[49], selector); + w[56] = hc_byte_perm_S (w[49], w[48], selector); + w[55] = hc_byte_perm_S (w[48], w[47], selector); + w[54] = hc_byte_perm_S (w[47], w[46], selector); + w[53] = hc_byte_perm_S (w[46], w[45], selector); + w[52] = hc_byte_perm_S (w[45], w[44], selector); + w[51] = hc_byte_perm_S (w[44], w[43], selector); + w[50] = hc_byte_perm_S (w[43], w[42], selector); + w[49] = hc_byte_perm_S (w[42], w[41], selector); + w[48] = hc_byte_perm_S (w[41], w[40], selector); + w[47] = hc_byte_perm_S (w[40], w[39], selector); + w[46] = hc_byte_perm_S (w[39], w[38], selector); + w[45] = hc_byte_perm_S (w[38], w[37], selector); + w[44] = hc_byte_perm_S (w[37], w[36], selector); + w[43] = hc_byte_perm_S (w[36], w[35], selector); + w[42] = hc_byte_perm_S (w[35], w[34], selector); + w[41] = hc_byte_perm_S (w[34], w[33], selector); + w[40] = hc_byte_perm_S (w[33], w[32], selector); + w[39] = hc_byte_perm_S (w[32], w[31], selector); + w[38] = hc_byte_perm_S (w[31], w[30], selector); + w[37] = hc_byte_perm_S (w[30], w[29], selector); + w[36] = hc_byte_perm_S (w[29], w[28], selector); + w[35] = hc_byte_perm_S (w[28], w[27], selector); + w[34] = hc_byte_perm_S (w[27], w[26], selector); + w[33] = hc_byte_perm_S (w[26], w[25], selector); + w[32] = hc_byte_perm_S (w[25], w[24], selector); + w[31] = hc_byte_perm_S (w[24], w[23], selector); + w[30] = hc_byte_perm_S (w[23], w[22], selector); + w[29] = hc_byte_perm_S (w[22], w[21], selector); + w[28] = hc_byte_perm_S (w[21], w[20], selector); + w[27] = hc_byte_perm_S (w[20], w[19], selector); + w[26] = hc_byte_perm_S (w[19], w[18], selector); + w[25] = hc_byte_perm_S (w[18], w[17], selector); + w[24] = hc_byte_perm_S (w[17], w[16], selector); + w[23] = hc_byte_perm_S (w[16], w[15], selector); + w[22] = hc_byte_perm_S (w[15], w[14], selector); + w[21] = hc_byte_perm_S (w[14], w[13], selector); + w[20] = hc_byte_perm_S (w[13], w[12], selector); + w[19] = hc_byte_perm_S (w[12], w[11], selector); + w[18] = hc_byte_perm_S (w[11], w[10], selector); + w[17] = hc_byte_perm_S (w[10], w[ 9], selector); + w[16] = hc_byte_perm_S (w[ 9], w[ 8], selector); + w[15] = hc_byte_perm_S (w[ 8], w[ 7], selector); + w[14] = hc_byte_perm_S (w[ 7], w[ 6], selector); + w[13] = hc_byte_perm_S (w[ 6], w[ 5], selector); + w[12] = hc_byte_perm_S (w[ 5], w[ 4], selector); + w[11] = hc_byte_perm_S (w[ 4], w[ 3], selector); + w[10] = hc_byte_perm_S (w[ 3], w[ 2], selector); + w[ 9] = hc_byte_perm_S (w[ 2], w[ 1], selector); + w[ 8] = hc_byte_perm_S (w[ 1], w[ 0], selector); + w[ 7] = hc_byte_perm_S (w[ 0], 0, selector); w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; @@ -55996,62 +55996,62 @@ DECLSPEC void switch_buffer_by_offset_1x64_be_S (u32 *w, const u32 offset) break; case 8: - w[63] = __byte_perm_S (w[55], w[54], selector); - w[62] = __byte_perm_S (w[54], w[53], selector); - w[61] = __byte_perm_S (w[53], w[52], selector); - w[60] = __byte_perm_S (w[52], w[51], selector); - w[59] = __byte_perm_S (w[51], w[50], selector); - w[58] = __byte_perm_S (w[50], w[49], selector); - w[57] = __byte_perm_S (w[49], w[48], selector); - w[56] = __byte_perm_S (w[48], w[47], selector); - w[55] = __byte_perm_S (w[47], w[46], selector); - w[54] = __byte_perm_S (w[46], w[45], selector); - w[53] = __byte_perm_S (w[45], w[44], selector); - w[52] = __byte_perm_S (w[44], w[43], selector); - w[51] = __byte_perm_S (w[43], w[42], selector); - w[50] = __byte_perm_S (w[42], w[41], selector); - w[49] = __byte_perm_S (w[41], w[40], selector); - w[48] = __byte_perm_S (w[40], w[39], selector); - w[47] = __byte_perm_S (w[39], w[38], selector); - w[46] = __byte_perm_S (w[38], w[37], selector); - w[45] = __byte_perm_S (w[37], w[36], selector); - w[44] = __byte_perm_S (w[36], w[35], selector); - w[43] = __byte_perm_S (w[35], w[34], selector); - w[42] = __byte_perm_S (w[34], w[33], selector); - w[41] = __byte_perm_S (w[33], w[32], selector); - w[40] = __byte_perm_S (w[32], w[31], selector); - w[39] = __byte_perm_S (w[31], w[30], selector); - w[38] = __byte_perm_S (w[30], w[29], selector); - w[37] = __byte_perm_S (w[29], w[28], selector); - w[36] = __byte_perm_S (w[28], w[27], selector); - w[35] = __byte_perm_S (w[27], w[26], selector); - w[34] = __byte_perm_S (w[26], w[25], selector); - w[33] = __byte_perm_S (w[25], w[24], selector); - w[32] = __byte_perm_S (w[24], w[23], selector); - w[31] = __byte_perm_S (w[23], w[22], selector); - w[30] = __byte_perm_S (w[22], w[21], selector); - w[29] = __byte_perm_S (w[21], w[20], selector); - w[28] = __byte_perm_S (w[20], w[19], selector); - w[27] = __byte_perm_S (w[19], w[18], selector); - w[26] = __byte_perm_S (w[18], w[17], selector); - w[25] = __byte_perm_S (w[17], w[16], selector); - w[24] = __byte_perm_S (w[16], w[15], selector); - w[23] = __byte_perm_S (w[15], w[14], selector); - w[22] = __byte_perm_S (w[14], w[13], selector); - w[21] = __byte_perm_S (w[13], w[12], selector); - w[20] = __byte_perm_S (w[12], w[11], selector); - w[19] = __byte_perm_S (w[11], w[10], selector); - w[18] = __byte_perm_S (w[10], w[ 9], selector); - w[17] = __byte_perm_S (w[ 9], w[ 8], selector); - w[16] = __byte_perm_S (w[ 8], w[ 7], selector); - w[15] = __byte_perm_S (w[ 7], w[ 6], selector); - w[14] = __byte_perm_S (w[ 6], w[ 5], selector); - w[13] = __byte_perm_S (w[ 5], w[ 4], selector); - w[12] = __byte_perm_S (w[ 4], w[ 3], selector); - w[11] = __byte_perm_S (w[ 3], w[ 2], selector); - w[10] = __byte_perm_S (w[ 2], w[ 1], selector); - w[ 9] = __byte_perm_S (w[ 1], w[ 0], selector); - w[ 8] = __byte_perm_S (w[ 0], 0, selector); + w[63] = hc_byte_perm_S (w[55], w[54], selector); + w[62] = hc_byte_perm_S (w[54], w[53], selector); + w[61] = hc_byte_perm_S (w[53], w[52], selector); + w[60] = hc_byte_perm_S (w[52], w[51], selector); + w[59] = hc_byte_perm_S (w[51], w[50], selector); + w[58] = hc_byte_perm_S (w[50], w[49], selector); + w[57] = hc_byte_perm_S (w[49], w[48], selector); + w[56] = hc_byte_perm_S (w[48], w[47], selector); + w[55] = hc_byte_perm_S (w[47], w[46], selector); + w[54] = hc_byte_perm_S (w[46], w[45], selector); + w[53] = hc_byte_perm_S (w[45], w[44], selector); + w[52] = hc_byte_perm_S (w[44], w[43], selector); + w[51] = hc_byte_perm_S (w[43], w[42], selector); + w[50] = hc_byte_perm_S (w[42], w[41], selector); + w[49] = hc_byte_perm_S (w[41], w[40], selector); + w[48] = hc_byte_perm_S (w[40], w[39], selector); + w[47] = hc_byte_perm_S (w[39], w[38], selector); + w[46] = hc_byte_perm_S (w[38], w[37], selector); + w[45] = hc_byte_perm_S (w[37], w[36], selector); + w[44] = hc_byte_perm_S (w[36], w[35], selector); + w[43] = hc_byte_perm_S (w[35], w[34], selector); + w[42] = hc_byte_perm_S (w[34], w[33], selector); + w[41] = hc_byte_perm_S (w[33], w[32], selector); + w[40] = hc_byte_perm_S (w[32], w[31], selector); + w[39] = hc_byte_perm_S (w[31], w[30], selector); + w[38] = hc_byte_perm_S (w[30], w[29], selector); + w[37] = hc_byte_perm_S (w[29], w[28], selector); + w[36] = hc_byte_perm_S (w[28], w[27], selector); + w[35] = hc_byte_perm_S (w[27], w[26], selector); + w[34] = hc_byte_perm_S (w[26], w[25], selector); + w[33] = hc_byte_perm_S (w[25], w[24], selector); + w[32] = hc_byte_perm_S (w[24], w[23], selector); + w[31] = hc_byte_perm_S (w[23], w[22], selector); + w[30] = hc_byte_perm_S (w[22], w[21], selector); + w[29] = hc_byte_perm_S (w[21], w[20], selector); + w[28] = hc_byte_perm_S (w[20], w[19], selector); + w[27] = hc_byte_perm_S (w[19], w[18], selector); + w[26] = hc_byte_perm_S (w[18], w[17], selector); + w[25] = hc_byte_perm_S (w[17], w[16], selector); + w[24] = hc_byte_perm_S (w[16], w[15], selector); + w[23] = hc_byte_perm_S (w[15], w[14], selector); + w[22] = hc_byte_perm_S (w[14], w[13], selector); + w[21] = hc_byte_perm_S (w[13], w[12], selector); + w[20] = hc_byte_perm_S (w[12], w[11], selector); + w[19] = hc_byte_perm_S (w[11], w[10], selector); + w[18] = hc_byte_perm_S (w[10], w[ 9], selector); + w[17] = hc_byte_perm_S (w[ 9], w[ 8], selector); + w[16] = hc_byte_perm_S (w[ 8], w[ 7], selector); + w[15] = hc_byte_perm_S (w[ 7], w[ 6], selector); + w[14] = hc_byte_perm_S (w[ 6], w[ 5], selector); + w[13] = hc_byte_perm_S (w[ 5], w[ 4], selector); + w[12] = hc_byte_perm_S (w[ 4], w[ 3], selector); + w[11] = hc_byte_perm_S (w[ 3], w[ 2], selector); + w[10] = hc_byte_perm_S (w[ 2], w[ 1], selector); + w[ 9] = hc_byte_perm_S (w[ 1], w[ 0], selector); + w[ 8] = hc_byte_perm_S (w[ 0], 0, selector); w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; @@ -56064,61 +56064,61 @@ DECLSPEC void switch_buffer_by_offset_1x64_be_S (u32 *w, const u32 offset) break; case 9: - w[63] = __byte_perm_S (w[54], w[53], selector); - w[62] = __byte_perm_S (w[53], w[52], selector); - w[61] = __byte_perm_S (w[52], w[51], selector); - w[60] = __byte_perm_S (w[51], w[50], selector); - w[59] = __byte_perm_S (w[50], w[49], selector); - w[58] = __byte_perm_S (w[49], w[48], selector); - w[57] = __byte_perm_S (w[48], w[47], selector); - w[56] = __byte_perm_S (w[47], w[46], selector); - w[55] = __byte_perm_S (w[46], w[45], selector); - w[54] = __byte_perm_S (w[45], w[44], selector); - w[53] = __byte_perm_S (w[44], w[43], selector); - w[52] = __byte_perm_S (w[43], w[42], selector); - w[51] = __byte_perm_S (w[42], w[41], selector); - w[50] = __byte_perm_S (w[41], w[40], selector); - w[49] = __byte_perm_S (w[40], w[39], selector); - w[48] = __byte_perm_S (w[39], w[38], selector); - w[47] = __byte_perm_S (w[38], w[37], selector); - w[46] = __byte_perm_S (w[37], w[36], selector); - w[45] = __byte_perm_S (w[36], w[35], selector); - w[44] = __byte_perm_S (w[35], w[34], selector); - w[43] = __byte_perm_S (w[34], w[33], selector); - w[42] = __byte_perm_S (w[33], w[32], selector); - w[41] = __byte_perm_S (w[32], w[31], selector); - w[40] = __byte_perm_S (w[31], w[30], selector); - w[39] = __byte_perm_S (w[30], w[29], selector); - w[38] = __byte_perm_S (w[29], w[28], selector); - w[37] = __byte_perm_S (w[28], w[27], selector); - w[36] = __byte_perm_S (w[27], w[26], selector); - w[35] = __byte_perm_S (w[26], w[25], selector); - w[34] = __byte_perm_S (w[25], w[24], selector); - w[33] = __byte_perm_S (w[24], w[23], selector); - w[32] = __byte_perm_S (w[23], w[22], selector); - w[31] = __byte_perm_S (w[22], w[21], selector); - w[30] = __byte_perm_S (w[21], w[20], selector); - w[29] = __byte_perm_S (w[20], w[19], selector); - w[28] = __byte_perm_S (w[19], w[18], selector); - w[27] = __byte_perm_S (w[18], w[17], selector); - w[26] = __byte_perm_S (w[17], w[16], selector); - w[25] = __byte_perm_S (w[16], w[15], selector); - w[24] = __byte_perm_S (w[15], w[14], selector); - w[23] = __byte_perm_S (w[14], w[13], selector); - w[22] = __byte_perm_S (w[13], w[12], selector); - w[21] = __byte_perm_S (w[12], w[11], selector); - w[20] = __byte_perm_S (w[11], w[10], selector); - w[19] = __byte_perm_S (w[10], w[ 9], selector); - w[18] = __byte_perm_S (w[ 9], w[ 8], selector); - w[17] = __byte_perm_S (w[ 8], w[ 7], selector); - w[16] = __byte_perm_S (w[ 7], w[ 6], selector); - w[15] = __byte_perm_S (w[ 6], w[ 5], selector); - w[14] = __byte_perm_S (w[ 5], w[ 4], selector); - w[13] = __byte_perm_S (w[ 4], w[ 3], selector); - w[12] = __byte_perm_S (w[ 3], w[ 2], selector); - w[11] = __byte_perm_S (w[ 2], w[ 1], selector); - w[10] = __byte_perm_S (w[ 1], w[ 0], selector); - w[ 9] = __byte_perm_S (w[ 0], 0, selector); + w[63] = hc_byte_perm_S (w[54], w[53], selector); + w[62] = hc_byte_perm_S (w[53], w[52], selector); + w[61] = hc_byte_perm_S (w[52], w[51], selector); + w[60] = hc_byte_perm_S (w[51], w[50], selector); + w[59] = hc_byte_perm_S (w[50], w[49], selector); + w[58] = hc_byte_perm_S (w[49], w[48], selector); + w[57] = hc_byte_perm_S (w[48], w[47], selector); + w[56] = hc_byte_perm_S (w[47], w[46], selector); + w[55] = hc_byte_perm_S (w[46], w[45], selector); + w[54] = hc_byte_perm_S (w[45], w[44], selector); + w[53] = hc_byte_perm_S (w[44], w[43], selector); + w[52] = hc_byte_perm_S (w[43], w[42], selector); + w[51] = hc_byte_perm_S (w[42], w[41], selector); + w[50] = hc_byte_perm_S (w[41], w[40], selector); + w[49] = hc_byte_perm_S (w[40], w[39], selector); + w[48] = hc_byte_perm_S (w[39], w[38], selector); + w[47] = hc_byte_perm_S (w[38], w[37], selector); + w[46] = hc_byte_perm_S (w[37], w[36], selector); + w[45] = hc_byte_perm_S (w[36], w[35], selector); + w[44] = hc_byte_perm_S (w[35], w[34], selector); + w[43] = hc_byte_perm_S (w[34], w[33], selector); + w[42] = hc_byte_perm_S (w[33], w[32], selector); + w[41] = hc_byte_perm_S (w[32], w[31], selector); + w[40] = hc_byte_perm_S (w[31], w[30], selector); + w[39] = hc_byte_perm_S (w[30], w[29], selector); + w[38] = hc_byte_perm_S (w[29], w[28], selector); + w[37] = hc_byte_perm_S (w[28], w[27], selector); + w[36] = hc_byte_perm_S (w[27], w[26], selector); + w[35] = hc_byte_perm_S (w[26], w[25], selector); + w[34] = hc_byte_perm_S (w[25], w[24], selector); + w[33] = hc_byte_perm_S (w[24], w[23], selector); + w[32] = hc_byte_perm_S (w[23], w[22], selector); + w[31] = hc_byte_perm_S (w[22], w[21], selector); + w[30] = hc_byte_perm_S (w[21], w[20], selector); + w[29] = hc_byte_perm_S (w[20], w[19], selector); + w[28] = hc_byte_perm_S (w[19], w[18], selector); + w[27] = hc_byte_perm_S (w[18], w[17], selector); + w[26] = hc_byte_perm_S (w[17], w[16], selector); + w[25] = hc_byte_perm_S (w[16], w[15], selector); + w[24] = hc_byte_perm_S (w[15], w[14], selector); + w[23] = hc_byte_perm_S (w[14], w[13], selector); + w[22] = hc_byte_perm_S (w[13], w[12], selector); + w[21] = hc_byte_perm_S (w[12], w[11], selector); + w[20] = hc_byte_perm_S (w[11], w[10], selector); + w[19] = hc_byte_perm_S (w[10], w[ 9], selector); + w[18] = hc_byte_perm_S (w[ 9], w[ 8], selector); + w[17] = hc_byte_perm_S (w[ 8], w[ 7], selector); + w[16] = hc_byte_perm_S (w[ 7], w[ 6], selector); + w[15] = hc_byte_perm_S (w[ 6], w[ 5], selector); + w[14] = hc_byte_perm_S (w[ 5], w[ 4], selector); + w[13] = hc_byte_perm_S (w[ 4], w[ 3], selector); + w[12] = hc_byte_perm_S (w[ 3], w[ 2], selector); + w[11] = hc_byte_perm_S (w[ 2], w[ 1], selector); + w[10] = hc_byte_perm_S (w[ 1], w[ 0], selector); + w[ 9] = hc_byte_perm_S (w[ 0], 0, selector); w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; @@ -56132,60 +56132,60 @@ DECLSPEC void switch_buffer_by_offset_1x64_be_S (u32 *w, const u32 offset) break; case 10: - w[63] = __byte_perm_S (w[53], w[52], selector); - w[62] = __byte_perm_S (w[52], w[51], selector); - w[61] = __byte_perm_S (w[51], w[50], selector); - w[60] = __byte_perm_S (w[50], w[49], selector); - w[59] = __byte_perm_S (w[49], w[48], selector); - w[58] = __byte_perm_S (w[48], w[47], selector); - w[57] = __byte_perm_S (w[47], w[46], selector); - w[56] = __byte_perm_S (w[46], w[45], selector); - w[55] = __byte_perm_S (w[45], w[44], selector); - w[54] = __byte_perm_S (w[44], w[43], selector); - w[53] = __byte_perm_S (w[43], w[42], selector); - w[52] = __byte_perm_S (w[42], w[41], selector); - w[51] = __byte_perm_S (w[41], w[40], selector); - w[50] = __byte_perm_S (w[40], w[39], selector); - w[49] = __byte_perm_S (w[39], w[38], selector); - w[48] = __byte_perm_S (w[38], w[37], selector); - w[47] = __byte_perm_S (w[37], w[36], selector); - w[46] = __byte_perm_S (w[36], w[35], selector); - w[45] = __byte_perm_S (w[35], w[34], selector); - w[44] = __byte_perm_S (w[34], w[33], selector); - w[43] = __byte_perm_S (w[33], w[32], selector); - w[42] = __byte_perm_S (w[32], w[31], selector); - w[41] = __byte_perm_S (w[31], w[30], selector); - w[40] = __byte_perm_S (w[30], w[29], selector); - w[39] = __byte_perm_S (w[29], w[28], selector); - w[38] = __byte_perm_S (w[28], w[27], selector); - w[37] = __byte_perm_S (w[27], w[26], selector); - w[36] = __byte_perm_S (w[26], w[25], selector); - w[35] = __byte_perm_S (w[25], w[24], selector); - w[34] = __byte_perm_S (w[24], w[23], selector); - w[33] = __byte_perm_S (w[23], w[22], selector); - w[32] = __byte_perm_S (w[22], w[21], selector); - w[31] = __byte_perm_S (w[21], w[20], selector); - w[30] = __byte_perm_S (w[20], w[19], selector); - w[29] = __byte_perm_S (w[19], w[18], selector); - w[28] = __byte_perm_S (w[18], w[17], selector); - w[27] = __byte_perm_S (w[17], w[16], selector); - w[26] = __byte_perm_S (w[16], w[15], selector); - w[25] = __byte_perm_S (w[15], w[14], selector); - w[24] = __byte_perm_S (w[14], w[13], selector); - w[23] = __byte_perm_S (w[13], w[12], selector); - w[22] = __byte_perm_S (w[12], w[11], selector); - w[21] = __byte_perm_S (w[11], w[10], selector); - w[20] = __byte_perm_S (w[10], w[ 9], selector); - w[19] = __byte_perm_S (w[ 9], w[ 8], selector); - w[18] = __byte_perm_S (w[ 8], w[ 7], selector); - w[17] = __byte_perm_S (w[ 7], w[ 6], selector); - w[16] = __byte_perm_S (w[ 6], w[ 5], selector); - w[15] = __byte_perm_S (w[ 5], w[ 4], selector); - w[14] = __byte_perm_S (w[ 4], w[ 3], selector); - w[13] = __byte_perm_S (w[ 3], w[ 2], selector); - w[12] = __byte_perm_S (w[ 2], w[ 1], selector); - w[11] = __byte_perm_S (w[ 1], w[ 0], selector); - w[10] = __byte_perm_S (w[ 0], 0, selector); + w[63] = hc_byte_perm_S (w[53], w[52], selector); + w[62] = hc_byte_perm_S (w[52], w[51], selector); + w[61] = hc_byte_perm_S (w[51], w[50], selector); + w[60] = hc_byte_perm_S (w[50], w[49], selector); + w[59] = hc_byte_perm_S (w[49], w[48], selector); + w[58] = hc_byte_perm_S (w[48], w[47], selector); + w[57] = hc_byte_perm_S (w[47], w[46], selector); + w[56] = hc_byte_perm_S (w[46], w[45], selector); + w[55] = hc_byte_perm_S (w[45], w[44], selector); + w[54] = hc_byte_perm_S (w[44], w[43], selector); + w[53] = hc_byte_perm_S (w[43], w[42], selector); + w[52] = hc_byte_perm_S (w[42], w[41], selector); + w[51] = hc_byte_perm_S (w[41], w[40], selector); + w[50] = hc_byte_perm_S (w[40], w[39], selector); + w[49] = hc_byte_perm_S (w[39], w[38], selector); + w[48] = hc_byte_perm_S (w[38], w[37], selector); + w[47] = hc_byte_perm_S (w[37], w[36], selector); + w[46] = hc_byte_perm_S (w[36], w[35], selector); + w[45] = hc_byte_perm_S (w[35], w[34], selector); + w[44] = hc_byte_perm_S (w[34], w[33], selector); + w[43] = hc_byte_perm_S (w[33], w[32], selector); + w[42] = hc_byte_perm_S (w[32], w[31], selector); + w[41] = hc_byte_perm_S (w[31], w[30], selector); + w[40] = hc_byte_perm_S (w[30], w[29], selector); + w[39] = hc_byte_perm_S (w[29], w[28], selector); + w[38] = hc_byte_perm_S (w[28], w[27], selector); + w[37] = hc_byte_perm_S (w[27], w[26], selector); + w[36] = hc_byte_perm_S (w[26], w[25], selector); + w[35] = hc_byte_perm_S (w[25], w[24], selector); + w[34] = hc_byte_perm_S (w[24], w[23], selector); + w[33] = hc_byte_perm_S (w[23], w[22], selector); + w[32] = hc_byte_perm_S (w[22], w[21], selector); + w[31] = hc_byte_perm_S (w[21], w[20], selector); + w[30] = hc_byte_perm_S (w[20], w[19], selector); + w[29] = hc_byte_perm_S (w[19], w[18], selector); + w[28] = hc_byte_perm_S (w[18], w[17], selector); + w[27] = hc_byte_perm_S (w[17], w[16], selector); + w[26] = hc_byte_perm_S (w[16], w[15], selector); + w[25] = hc_byte_perm_S (w[15], w[14], selector); + w[24] = hc_byte_perm_S (w[14], w[13], selector); + w[23] = hc_byte_perm_S (w[13], w[12], selector); + w[22] = hc_byte_perm_S (w[12], w[11], selector); + w[21] = hc_byte_perm_S (w[11], w[10], selector); + w[20] = hc_byte_perm_S (w[10], w[ 9], selector); + w[19] = hc_byte_perm_S (w[ 9], w[ 8], selector); + w[18] = hc_byte_perm_S (w[ 8], w[ 7], selector); + w[17] = hc_byte_perm_S (w[ 7], w[ 6], selector); + w[16] = hc_byte_perm_S (w[ 6], w[ 5], selector); + w[15] = hc_byte_perm_S (w[ 5], w[ 4], selector); + w[14] = hc_byte_perm_S (w[ 4], w[ 3], selector); + w[13] = hc_byte_perm_S (w[ 3], w[ 2], selector); + w[12] = hc_byte_perm_S (w[ 2], w[ 1], selector); + w[11] = hc_byte_perm_S (w[ 1], w[ 0], selector); + w[10] = hc_byte_perm_S (w[ 0], 0, selector); w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; @@ -56200,59 +56200,59 @@ DECLSPEC void switch_buffer_by_offset_1x64_be_S (u32 *w, const u32 offset) break; case 11: - w[63] = __byte_perm_S (w[52], w[51], selector); - w[62] = __byte_perm_S (w[51], w[50], selector); - w[61] = __byte_perm_S (w[50], w[49], selector); - w[60] = __byte_perm_S (w[49], w[48], selector); - w[59] = __byte_perm_S (w[48], w[47], selector); - w[58] = __byte_perm_S (w[47], w[46], selector); - w[57] = __byte_perm_S (w[46], w[45], selector); - w[56] = __byte_perm_S (w[45], w[44], selector); - w[55] = __byte_perm_S (w[44], w[43], selector); - w[54] = __byte_perm_S (w[43], w[42], selector); - w[53] = __byte_perm_S (w[42], w[41], selector); - w[52] = __byte_perm_S (w[41], w[40], selector); - w[51] = __byte_perm_S (w[40], w[39], selector); - w[50] = __byte_perm_S (w[39], w[38], selector); - w[49] = __byte_perm_S (w[38], w[37], selector); - w[48] = __byte_perm_S (w[37], w[36], selector); - w[47] = __byte_perm_S (w[36], w[35], selector); - w[46] = __byte_perm_S (w[35], w[34], selector); - w[45] = __byte_perm_S (w[34], w[33], selector); - w[44] = __byte_perm_S (w[33], w[32], selector); - w[43] = __byte_perm_S (w[32], w[31], selector); - w[42] = __byte_perm_S (w[31], w[30], selector); - w[41] = __byte_perm_S (w[30], w[29], selector); - w[40] = __byte_perm_S (w[29], w[28], selector); - w[39] = __byte_perm_S (w[28], w[27], selector); - w[38] = __byte_perm_S (w[27], w[26], selector); - w[37] = __byte_perm_S (w[26], w[25], selector); - w[36] = __byte_perm_S (w[25], w[24], selector); - w[35] = __byte_perm_S (w[24], w[23], selector); - w[34] = __byte_perm_S (w[23], w[22], selector); - w[33] = __byte_perm_S (w[22], w[21], selector); - w[32] = __byte_perm_S (w[21], w[20], selector); - w[31] = __byte_perm_S (w[20], w[19], selector); - w[30] = __byte_perm_S (w[19], w[18], selector); - w[29] = __byte_perm_S (w[18], w[17], selector); - w[28] = __byte_perm_S (w[17], w[16], selector); - w[27] = __byte_perm_S (w[16], w[15], selector); - w[26] = __byte_perm_S (w[15], w[14], selector); - w[25] = __byte_perm_S (w[14], w[13], selector); - w[24] = __byte_perm_S (w[13], w[12], selector); - w[23] = __byte_perm_S (w[12], w[11], selector); - w[22] = __byte_perm_S (w[11], w[10], selector); - w[21] = __byte_perm_S (w[10], w[ 9], selector); - w[20] = __byte_perm_S (w[ 9], w[ 8], selector); - w[19] = __byte_perm_S (w[ 8], w[ 7], selector); - w[18] = __byte_perm_S (w[ 7], w[ 6], selector); - w[17] = __byte_perm_S (w[ 6], w[ 5], selector); - w[16] = __byte_perm_S (w[ 5], w[ 4], selector); - w[15] = __byte_perm_S (w[ 4], w[ 3], selector); - w[14] = __byte_perm_S (w[ 3], w[ 2], selector); - w[13] = __byte_perm_S (w[ 2], w[ 1], selector); - w[12] = __byte_perm_S (w[ 1], w[ 0], selector); - w[11] = __byte_perm_S (w[ 0], 0, selector); + w[63] = hc_byte_perm_S (w[52], w[51], selector); + w[62] = hc_byte_perm_S (w[51], w[50], selector); + w[61] = hc_byte_perm_S (w[50], w[49], selector); + w[60] = hc_byte_perm_S (w[49], w[48], selector); + w[59] = hc_byte_perm_S (w[48], w[47], selector); + w[58] = hc_byte_perm_S (w[47], w[46], selector); + w[57] = hc_byte_perm_S (w[46], w[45], selector); + w[56] = hc_byte_perm_S (w[45], w[44], selector); + w[55] = hc_byte_perm_S (w[44], w[43], selector); + w[54] = hc_byte_perm_S (w[43], w[42], selector); + w[53] = hc_byte_perm_S (w[42], w[41], selector); + w[52] = hc_byte_perm_S (w[41], w[40], selector); + w[51] = hc_byte_perm_S (w[40], w[39], selector); + w[50] = hc_byte_perm_S (w[39], w[38], selector); + w[49] = hc_byte_perm_S (w[38], w[37], selector); + w[48] = hc_byte_perm_S (w[37], w[36], selector); + w[47] = hc_byte_perm_S (w[36], w[35], selector); + w[46] = hc_byte_perm_S (w[35], w[34], selector); + w[45] = hc_byte_perm_S (w[34], w[33], selector); + w[44] = hc_byte_perm_S (w[33], w[32], selector); + w[43] = hc_byte_perm_S (w[32], w[31], selector); + w[42] = hc_byte_perm_S (w[31], w[30], selector); + w[41] = hc_byte_perm_S (w[30], w[29], selector); + w[40] = hc_byte_perm_S (w[29], w[28], selector); + w[39] = hc_byte_perm_S (w[28], w[27], selector); + w[38] = hc_byte_perm_S (w[27], w[26], selector); + w[37] = hc_byte_perm_S (w[26], w[25], selector); + w[36] = hc_byte_perm_S (w[25], w[24], selector); + w[35] = hc_byte_perm_S (w[24], w[23], selector); + w[34] = hc_byte_perm_S (w[23], w[22], selector); + w[33] = hc_byte_perm_S (w[22], w[21], selector); + w[32] = hc_byte_perm_S (w[21], w[20], selector); + w[31] = hc_byte_perm_S (w[20], w[19], selector); + w[30] = hc_byte_perm_S (w[19], w[18], selector); + w[29] = hc_byte_perm_S (w[18], w[17], selector); + w[28] = hc_byte_perm_S (w[17], w[16], selector); + w[27] = hc_byte_perm_S (w[16], w[15], selector); + w[26] = hc_byte_perm_S (w[15], w[14], selector); + w[25] = hc_byte_perm_S (w[14], w[13], selector); + w[24] = hc_byte_perm_S (w[13], w[12], selector); + w[23] = hc_byte_perm_S (w[12], w[11], selector); + w[22] = hc_byte_perm_S (w[11], w[10], selector); + w[21] = hc_byte_perm_S (w[10], w[ 9], selector); + w[20] = hc_byte_perm_S (w[ 9], w[ 8], selector); + w[19] = hc_byte_perm_S (w[ 8], w[ 7], selector); + w[18] = hc_byte_perm_S (w[ 7], w[ 6], selector); + w[17] = hc_byte_perm_S (w[ 6], w[ 5], selector); + w[16] = hc_byte_perm_S (w[ 5], w[ 4], selector); + w[15] = hc_byte_perm_S (w[ 4], w[ 3], selector); + w[14] = hc_byte_perm_S (w[ 3], w[ 2], selector); + w[13] = hc_byte_perm_S (w[ 2], w[ 1], selector); + w[12] = hc_byte_perm_S (w[ 1], w[ 0], selector); + w[11] = hc_byte_perm_S (w[ 0], 0, selector); w[10] = 0; w[ 9] = 0; w[ 8] = 0; @@ -56268,58 +56268,58 @@ DECLSPEC void switch_buffer_by_offset_1x64_be_S (u32 *w, const u32 offset) break; case 12: - w[63] = __byte_perm_S (w[51], w[50], selector); - w[62] = __byte_perm_S (w[50], w[49], selector); - w[61] = __byte_perm_S (w[49], w[48], selector); - w[60] = __byte_perm_S (w[48], w[47], selector); - w[59] = __byte_perm_S (w[47], w[46], selector); - w[58] = __byte_perm_S (w[46], w[45], selector); - w[57] = __byte_perm_S (w[45], w[44], selector); - w[56] = __byte_perm_S (w[44], w[43], selector); - w[55] = __byte_perm_S (w[43], w[42], selector); - w[54] = __byte_perm_S (w[42], w[41], selector); - w[53] = __byte_perm_S (w[41], w[40], selector); - w[52] = __byte_perm_S (w[40], w[39], selector); - w[51] = __byte_perm_S (w[39], w[38], selector); - w[50] = __byte_perm_S (w[38], w[37], selector); - w[49] = __byte_perm_S (w[37], w[36], selector); - w[48] = __byte_perm_S (w[36], w[35], selector); - w[47] = __byte_perm_S (w[35], w[34], selector); - w[46] = __byte_perm_S (w[34], w[33], selector); - w[45] = __byte_perm_S (w[33], w[32], selector); - w[44] = __byte_perm_S (w[32], w[31], selector); - w[43] = __byte_perm_S (w[31], w[30], selector); - w[42] = __byte_perm_S (w[30], w[29], selector); - w[41] = __byte_perm_S (w[29], w[28], selector); - w[40] = __byte_perm_S (w[28], w[27], selector); - w[39] = __byte_perm_S (w[27], w[26], selector); - w[38] = __byte_perm_S (w[26], w[25], selector); - w[37] = __byte_perm_S (w[25], w[24], selector); - w[36] = __byte_perm_S (w[24], w[23], selector); - w[35] = __byte_perm_S (w[23], w[22], selector); - w[34] = __byte_perm_S (w[22], w[21], selector); - w[33] = __byte_perm_S (w[21], w[20], selector); - w[32] = __byte_perm_S (w[20], w[19], selector); - w[31] = __byte_perm_S (w[19], w[18], selector); - w[30] = __byte_perm_S (w[18], w[17], selector); - w[29] = __byte_perm_S (w[17], w[16], selector); - w[28] = __byte_perm_S (w[16], w[15], selector); - w[27] = __byte_perm_S (w[15], w[14], selector); - w[26] = __byte_perm_S (w[14], w[13], selector); - w[25] = __byte_perm_S (w[13], w[12], selector); - w[24] = __byte_perm_S (w[12], w[11], selector); - w[23] = __byte_perm_S (w[11], w[10], selector); - w[22] = __byte_perm_S (w[10], w[ 9], selector); - w[21] = __byte_perm_S (w[ 9], w[ 8], selector); - w[20] = __byte_perm_S (w[ 8], w[ 7], selector); - w[19] = __byte_perm_S (w[ 7], w[ 6], selector); - w[18] = __byte_perm_S (w[ 6], w[ 5], selector); - w[17] = __byte_perm_S (w[ 5], w[ 4], selector); - w[16] = __byte_perm_S (w[ 4], w[ 3], selector); - w[15] = __byte_perm_S (w[ 3], w[ 2], selector); - w[14] = __byte_perm_S (w[ 2], w[ 1], selector); - w[13] = __byte_perm_S (w[ 1], w[ 0], selector); - w[12] = __byte_perm_S (w[ 0], 0, selector); + w[63] = hc_byte_perm_S (w[51], w[50], selector); + w[62] = hc_byte_perm_S (w[50], w[49], selector); + w[61] = hc_byte_perm_S (w[49], w[48], selector); + w[60] = hc_byte_perm_S (w[48], w[47], selector); + w[59] = hc_byte_perm_S (w[47], w[46], selector); + w[58] = hc_byte_perm_S (w[46], w[45], selector); + w[57] = hc_byte_perm_S (w[45], w[44], selector); + w[56] = hc_byte_perm_S (w[44], w[43], selector); + w[55] = hc_byte_perm_S (w[43], w[42], selector); + w[54] = hc_byte_perm_S (w[42], w[41], selector); + w[53] = hc_byte_perm_S (w[41], w[40], selector); + w[52] = hc_byte_perm_S (w[40], w[39], selector); + w[51] = hc_byte_perm_S (w[39], w[38], selector); + w[50] = hc_byte_perm_S (w[38], w[37], selector); + w[49] = hc_byte_perm_S (w[37], w[36], selector); + w[48] = hc_byte_perm_S (w[36], w[35], selector); + w[47] = hc_byte_perm_S (w[35], w[34], selector); + w[46] = hc_byte_perm_S (w[34], w[33], selector); + w[45] = hc_byte_perm_S (w[33], w[32], selector); + w[44] = hc_byte_perm_S (w[32], w[31], selector); + w[43] = hc_byte_perm_S (w[31], w[30], selector); + w[42] = hc_byte_perm_S (w[30], w[29], selector); + w[41] = hc_byte_perm_S (w[29], w[28], selector); + w[40] = hc_byte_perm_S (w[28], w[27], selector); + w[39] = hc_byte_perm_S (w[27], w[26], selector); + w[38] = hc_byte_perm_S (w[26], w[25], selector); + w[37] = hc_byte_perm_S (w[25], w[24], selector); + w[36] = hc_byte_perm_S (w[24], w[23], selector); + w[35] = hc_byte_perm_S (w[23], w[22], selector); + w[34] = hc_byte_perm_S (w[22], w[21], selector); + w[33] = hc_byte_perm_S (w[21], w[20], selector); + w[32] = hc_byte_perm_S (w[20], w[19], selector); + w[31] = hc_byte_perm_S (w[19], w[18], selector); + w[30] = hc_byte_perm_S (w[18], w[17], selector); + w[29] = hc_byte_perm_S (w[17], w[16], selector); + w[28] = hc_byte_perm_S (w[16], w[15], selector); + w[27] = hc_byte_perm_S (w[15], w[14], selector); + w[26] = hc_byte_perm_S (w[14], w[13], selector); + w[25] = hc_byte_perm_S (w[13], w[12], selector); + w[24] = hc_byte_perm_S (w[12], w[11], selector); + w[23] = hc_byte_perm_S (w[11], w[10], selector); + w[22] = hc_byte_perm_S (w[10], w[ 9], selector); + w[21] = hc_byte_perm_S (w[ 9], w[ 8], selector); + w[20] = hc_byte_perm_S (w[ 8], w[ 7], selector); + w[19] = hc_byte_perm_S (w[ 7], w[ 6], selector); + w[18] = hc_byte_perm_S (w[ 6], w[ 5], selector); + w[17] = hc_byte_perm_S (w[ 5], w[ 4], selector); + w[16] = hc_byte_perm_S (w[ 4], w[ 3], selector); + w[15] = hc_byte_perm_S (w[ 3], w[ 2], selector); + w[14] = hc_byte_perm_S (w[ 2], w[ 1], selector); + w[13] = hc_byte_perm_S (w[ 1], w[ 0], selector); + w[12] = hc_byte_perm_S (w[ 0], 0, selector); w[11] = 0; w[10] = 0; w[ 9] = 0; @@ -56336,57 +56336,57 @@ DECLSPEC void switch_buffer_by_offset_1x64_be_S (u32 *w, const u32 offset) break; case 13: - w[63] = __byte_perm_S (w[50], w[49], selector); - w[62] = __byte_perm_S (w[49], w[48], selector); - w[61] = __byte_perm_S (w[48], w[47], selector); - w[60] = __byte_perm_S (w[47], w[46], selector); - w[59] = __byte_perm_S (w[46], w[45], selector); - w[58] = __byte_perm_S (w[45], w[44], selector); - w[57] = __byte_perm_S (w[44], w[43], selector); - w[56] = __byte_perm_S (w[43], w[42], selector); - w[55] = __byte_perm_S (w[42], w[41], selector); - w[54] = __byte_perm_S (w[41], w[40], selector); - w[53] = __byte_perm_S (w[40], w[39], selector); - w[52] = __byte_perm_S (w[39], w[38], selector); - w[51] = __byte_perm_S (w[38], w[37], selector); - w[50] = __byte_perm_S (w[37], w[36], selector); - w[49] = __byte_perm_S (w[36], w[35], selector); - w[48] = __byte_perm_S (w[35], w[34], selector); - w[47] = __byte_perm_S (w[34], w[33], selector); - w[46] = __byte_perm_S (w[33], w[32], selector); - w[45] = __byte_perm_S (w[32], w[31], selector); - w[44] = __byte_perm_S (w[31], w[30], selector); - w[43] = __byte_perm_S (w[30], w[29], selector); - w[42] = __byte_perm_S (w[29], w[28], selector); - w[41] = __byte_perm_S (w[28], w[27], selector); - w[40] = __byte_perm_S (w[27], w[26], selector); - w[39] = __byte_perm_S (w[26], w[25], selector); - w[38] = __byte_perm_S (w[25], w[24], selector); - w[37] = __byte_perm_S (w[24], w[23], selector); - w[36] = __byte_perm_S (w[23], w[22], selector); - w[35] = __byte_perm_S (w[22], w[21], selector); - w[34] = __byte_perm_S (w[21], w[20], selector); - w[33] = __byte_perm_S (w[20], w[19], selector); - w[32] = __byte_perm_S (w[19], w[18], selector); - w[31] = __byte_perm_S (w[18], w[17], selector); - w[30] = __byte_perm_S (w[17], w[16], selector); - w[29] = __byte_perm_S (w[16], w[15], selector); - w[28] = __byte_perm_S (w[15], w[14], selector); - w[27] = __byte_perm_S (w[14], w[13], selector); - w[26] = __byte_perm_S (w[13], w[12], selector); - w[25] = __byte_perm_S (w[12], w[11], selector); - w[24] = __byte_perm_S (w[11], w[10], selector); - w[23] = __byte_perm_S (w[10], w[ 9], selector); - w[22] = __byte_perm_S (w[ 9], w[ 8], selector); - w[21] = __byte_perm_S (w[ 8], w[ 7], selector); - w[20] = __byte_perm_S (w[ 7], w[ 6], selector); - w[19] = __byte_perm_S (w[ 6], w[ 5], selector); - w[18] = __byte_perm_S (w[ 5], w[ 4], selector); - w[17] = __byte_perm_S (w[ 4], w[ 3], selector); - w[16] = __byte_perm_S (w[ 3], w[ 2], selector); - w[15] = __byte_perm_S (w[ 2], w[ 1], selector); - w[14] = __byte_perm_S (w[ 1], w[ 0], selector); - w[13] = __byte_perm_S (w[ 0], 0, selector); + w[63] = hc_byte_perm_S (w[50], w[49], selector); + w[62] = hc_byte_perm_S (w[49], w[48], selector); + w[61] = hc_byte_perm_S (w[48], w[47], selector); + w[60] = hc_byte_perm_S (w[47], w[46], selector); + w[59] = hc_byte_perm_S (w[46], w[45], selector); + w[58] = hc_byte_perm_S (w[45], w[44], selector); + w[57] = hc_byte_perm_S (w[44], w[43], selector); + w[56] = hc_byte_perm_S (w[43], w[42], selector); + w[55] = hc_byte_perm_S (w[42], w[41], selector); + w[54] = hc_byte_perm_S (w[41], w[40], selector); + w[53] = hc_byte_perm_S (w[40], w[39], selector); + w[52] = hc_byte_perm_S (w[39], w[38], selector); + w[51] = hc_byte_perm_S (w[38], w[37], selector); + w[50] = hc_byte_perm_S (w[37], w[36], selector); + w[49] = hc_byte_perm_S (w[36], w[35], selector); + w[48] = hc_byte_perm_S (w[35], w[34], selector); + w[47] = hc_byte_perm_S (w[34], w[33], selector); + w[46] = hc_byte_perm_S (w[33], w[32], selector); + w[45] = hc_byte_perm_S (w[32], w[31], selector); + w[44] = hc_byte_perm_S (w[31], w[30], selector); + w[43] = hc_byte_perm_S (w[30], w[29], selector); + w[42] = hc_byte_perm_S (w[29], w[28], selector); + w[41] = hc_byte_perm_S (w[28], w[27], selector); + w[40] = hc_byte_perm_S (w[27], w[26], selector); + w[39] = hc_byte_perm_S (w[26], w[25], selector); + w[38] = hc_byte_perm_S (w[25], w[24], selector); + w[37] = hc_byte_perm_S (w[24], w[23], selector); + w[36] = hc_byte_perm_S (w[23], w[22], selector); + w[35] = hc_byte_perm_S (w[22], w[21], selector); + w[34] = hc_byte_perm_S (w[21], w[20], selector); + w[33] = hc_byte_perm_S (w[20], w[19], selector); + w[32] = hc_byte_perm_S (w[19], w[18], selector); + w[31] = hc_byte_perm_S (w[18], w[17], selector); + w[30] = hc_byte_perm_S (w[17], w[16], selector); + w[29] = hc_byte_perm_S (w[16], w[15], selector); + w[28] = hc_byte_perm_S (w[15], w[14], selector); + w[27] = hc_byte_perm_S (w[14], w[13], selector); + w[26] = hc_byte_perm_S (w[13], w[12], selector); + w[25] = hc_byte_perm_S (w[12], w[11], selector); + w[24] = hc_byte_perm_S (w[11], w[10], selector); + w[23] = hc_byte_perm_S (w[10], w[ 9], selector); + w[22] = hc_byte_perm_S (w[ 9], w[ 8], selector); + w[21] = hc_byte_perm_S (w[ 8], w[ 7], selector); + w[20] = hc_byte_perm_S (w[ 7], w[ 6], selector); + w[19] = hc_byte_perm_S (w[ 6], w[ 5], selector); + w[18] = hc_byte_perm_S (w[ 5], w[ 4], selector); + w[17] = hc_byte_perm_S (w[ 4], w[ 3], selector); + w[16] = hc_byte_perm_S (w[ 3], w[ 2], selector); + w[15] = hc_byte_perm_S (w[ 2], w[ 1], selector); + w[14] = hc_byte_perm_S (w[ 1], w[ 0], selector); + w[13] = hc_byte_perm_S (w[ 0], 0, selector); w[12] = 0; w[11] = 0; w[10] = 0; @@ -56404,56 +56404,56 @@ DECLSPEC void switch_buffer_by_offset_1x64_be_S (u32 *w, const u32 offset) break; case 14: - w[63] = __byte_perm_S (w[49], w[48], selector); - w[62] = __byte_perm_S (w[48], w[47], selector); - w[61] = __byte_perm_S (w[47], w[46], selector); - w[60] = __byte_perm_S (w[46], w[45], selector); - w[59] = __byte_perm_S (w[45], w[44], selector); - w[58] = __byte_perm_S (w[44], w[43], selector); - w[57] = __byte_perm_S (w[43], w[42], selector); - w[56] = __byte_perm_S (w[42], w[41], selector); - w[55] = __byte_perm_S (w[41], w[40], selector); - w[54] = __byte_perm_S (w[40], w[39], selector); - w[53] = __byte_perm_S (w[39], w[38], selector); - w[52] = __byte_perm_S (w[38], w[37], selector); - w[51] = __byte_perm_S (w[37], w[36], selector); - w[50] = __byte_perm_S (w[36], w[35], selector); - w[49] = __byte_perm_S (w[35], w[34], selector); - w[48] = __byte_perm_S (w[34], w[33], selector); - w[47] = __byte_perm_S (w[33], w[32], selector); - w[46] = __byte_perm_S (w[32], w[31], selector); - w[45] = __byte_perm_S (w[31], w[30], selector); - w[44] = __byte_perm_S (w[30], w[29], selector); - w[43] = __byte_perm_S (w[29], w[28], selector); - w[42] = __byte_perm_S (w[28], w[27], selector); - w[41] = __byte_perm_S (w[27], w[26], selector); - w[40] = __byte_perm_S (w[26], w[25], selector); - w[39] = __byte_perm_S (w[25], w[24], selector); - w[38] = __byte_perm_S (w[24], w[23], selector); - w[37] = __byte_perm_S (w[23], w[22], selector); - w[36] = __byte_perm_S (w[22], w[21], selector); - w[35] = __byte_perm_S (w[21], w[20], selector); - w[34] = __byte_perm_S (w[20], w[19], selector); - w[33] = __byte_perm_S (w[19], w[18], selector); - w[32] = __byte_perm_S (w[18], w[17], selector); - w[31] = __byte_perm_S (w[17], w[16], selector); - w[30] = __byte_perm_S (w[16], w[15], selector); - w[29] = __byte_perm_S (w[15], w[14], selector); - w[28] = __byte_perm_S (w[14], w[13], selector); - w[27] = __byte_perm_S (w[13], w[12], selector); - w[26] = __byte_perm_S (w[12], w[11], selector); - w[25] = __byte_perm_S (w[11], w[10], selector); - w[24] = __byte_perm_S (w[10], w[ 9], selector); - w[23] = __byte_perm_S (w[ 9], w[ 8], selector); - w[22] = __byte_perm_S (w[ 8], w[ 7], selector); - w[21] = __byte_perm_S (w[ 7], w[ 6], selector); - w[20] = __byte_perm_S (w[ 6], w[ 5], selector); - w[19] = __byte_perm_S (w[ 5], w[ 4], selector); - w[18] = __byte_perm_S (w[ 4], w[ 3], selector); - w[17] = __byte_perm_S (w[ 3], w[ 2], selector); - w[16] = __byte_perm_S (w[ 2], w[ 1], selector); - w[15] = __byte_perm_S (w[ 1], w[ 0], selector); - w[14] = __byte_perm_S (w[ 0], 0, selector); + w[63] = hc_byte_perm_S (w[49], w[48], selector); + w[62] = hc_byte_perm_S (w[48], w[47], selector); + w[61] = hc_byte_perm_S (w[47], w[46], selector); + w[60] = hc_byte_perm_S (w[46], w[45], selector); + w[59] = hc_byte_perm_S (w[45], w[44], selector); + w[58] = hc_byte_perm_S (w[44], w[43], selector); + w[57] = hc_byte_perm_S (w[43], w[42], selector); + w[56] = hc_byte_perm_S (w[42], w[41], selector); + w[55] = hc_byte_perm_S (w[41], w[40], selector); + w[54] = hc_byte_perm_S (w[40], w[39], selector); + w[53] = hc_byte_perm_S (w[39], w[38], selector); + w[52] = hc_byte_perm_S (w[38], w[37], selector); + w[51] = hc_byte_perm_S (w[37], w[36], selector); + w[50] = hc_byte_perm_S (w[36], w[35], selector); + w[49] = hc_byte_perm_S (w[35], w[34], selector); + w[48] = hc_byte_perm_S (w[34], w[33], selector); + w[47] = hc_byte_perm_S (w[33], w[32], selector); + w[46] = hc_byte_perm_S (w[32], w[31], selector); + w[45] = hc_byte_perm_S (w[31], w[30], selector); + w[44] = hc_byte_perm_S (w[30], w[29], selector); + w[43] = hc_byte_perm_S (w[29], w[28], selector); + w[42] = hc_byte_perm_S (w[28], w[27], selector); + w[41] = hc_byte_perm_S (w[27], w[26], selector); + w[40] = hc_byte_perm_S (w[26], w[25], selector); + w[39] = hc_byte_perm_S (w[25], w[24], selector); + w[38] = hc_byte_perm_S (w[24], w[23], selector); + w[37] = hc_byte_perm_S (w[23], w[22], selector); + w[36] = hc_byte_perm_S (w[22], w[21], selector); + w[35] = hc_byte_perm_S (w[21], w[20], selector); + w[34] = hc_byte_perm_S (w[20], w[19], selector); + w[33] = hc_byte_perm_S (w[19], w[18], selector); + w[32] = hc_byte_perm_S (w[18], w[17], selector); + w[31] = hc_byte_perm_S (w[17], w[16], selector); + w[30] = hc_byte_perm_S (w[16], w[15], selector); + w[29] = hc_byte_perm_S (w[15], w[14], selector); + w[28] = hc_byte_perm_S (w[14], w[13], selector); + w[27] = hc_byte_perm_S (w[13], w[12], selector); + w[26] = hc_byte_perm_S (w[12], w[11], selector); + w[25] = hc_byte_perm_S (w[11], w[10], selector); + w[24] = hc_byte_perm_S (w[10], w[ 9], selector); + w[23] = hc_byte_perm_S (w[ 9], w[ 8], selector); + w[22] = hc_byte_perm_S (w[ 8], w[ 7], selector); + w[21] = hc_byte_perm_S (w[ 7], w[ 6], selector); + w[20] = hc_byte_perm_S (w[ 6], w[ 5], selector); + w[19] = hc_byte_perm_S (w[ 5], w[ 4], selector); + w[18] = hc_byte_perm_S (w[ 4], w[ 3], selector); + w[17] = hc_byte_perm_S (w[ 3], w[ 2], selector); + w[16] = hc_byte_perm_S (w[ 2], w[ 1], selector); + w[15] = hc_byte_perm_S (w[ 1], w[ 0], selector); + w[14] = hc_byte_perm_S (w[ 0], 0, selector); w[13] = 0; w[12] = 0; w[11] = 0; @@ -56472,55 +56472,55 @@ DECLSPEC void switch_buffer_by_offset_1x64_be_S (u32 *w, const u32 offset) break; case 15: - w[63] = __byte_perm_S (w[48], w[47], selector); - w[62] = __byte_perm_S (w[47], w[46], selector); - w[61] = __byte_perm_S (w[46], w[45], selector); - w[60] = __byte_perm_S (w[45], w[44], selector); - w[59] = __byte_perm_S (w[44], w[43], selector); - w[58] = __byte_perm_S (w[43], w[42], selector); - w[57] = __byte_perm_S (w[42], w[41], selector); - w[56] = __byte_perm_S (w[41], w[40], selector); - w[55] = __byte_perm_S (w[40], w[39], selector); - w[54] = __byte_perm_S (w[39], w[38], selector); - w[53] = __byte_perm_S (w[38], w[37], selector); - w[52] = __byte_perm_S (w[37], w[36], selector); - w[51] = __byte_perm_S (w[36], w[35], selector); - w[50] = __byte_perm_S (w[35], w[34], selector); - w[49] = __byte_perm_S (w[34], w[33], selector); - w[48] = __byte_perm_S (w[33], w[32], selector); - w[47] = __byte_perm_S (w[32], w[31], selector); - w[46] = __byte_perm_S (w[31], w[30], selector); - w[45] = __byte_perm_S (w[30], w[29], selector); - w[44] = __byte_perm_S (w[29], w[28], selector); - w[43] = __byte_perm_S (w[28], w[27], selector); - w[42] = __byte_perm_S (w[27], w[26], selector); - w[41] = __byte_perm_S (w[26], w[25], selector); - w[40] = __byte_perm_S (w[25], w[24], selector); - w[39] = __byte_perm_S (w[24], w[23], selector); - w[38] = __byte_perm_S (w[23], w[22], selector); - w[37] = __byte_perm_S (w[22], w[21], selector); - w[36] = __byte_perm_S (w[21], w[20], selector); - w[35] = __byte_perm_S (w[20], w[19], selector); - w[34] = __byte_perm_S (w[19], w[18], selector); - w[33] = __byte_perm_S (w[18], w[17], selector); - w[32] = __byte_perm_S (w[17], w[16], selector); - w[31] = __byte_perm_S (w[16], w[15], selector); - w[30] = __byte_perm_S (w[15], w[14], selector); - w[29] = __byte_perm_S (w[14], w[13], selector); - w[28] = __byte_perm_S (w[13], w[12], selector); - w[27] = __byte_perm_S (w[12], w[11], selector); - w[26] = __byte_perm_S (w[11], w[10], selector); - w[25] = __byte_perm_S (w[10], w[ 9], selector); - w[24] = __byte_perm_S (w[ 9], w[ 8], selector); - w[23] = __byte_perm_S (w[ 8], w[ 7], selector); - w[22] = __byte_perm_S (w[ 7], w[ 6], selector); - w[21] = __byte_perm_S (w[ 6], w[ 5], selector); - w[20] = __byte_perm_S (w[ 5], w[ 4], selector); - w[19] = __byte_perm_S (w[ 4], w[ 3], selector); - w[18] = __byte_perm_S (w[ 3], w[ 2], selector); - w[17] = __byte_perm_S (w[ 2], w[ 1], selector); - w[16] = __byte_perm_S (w[ 1], w[ 0], selector); - w[15] = __byte_perm_S (w[ 0], 0, selector); + w[63] = hc_byte_perm_S (w[48], w[47], selector); + w[62] = hc_byte_perm_S (w[47], w[46], selector); + w[61] = hc_byte_perm_S (w[46], w[45], selector); + w[60] = hc_byte_perm_S (w[45], w[44], selector); + w[59] = hc_byte_perm_S (w[44], w[43], selector); + w[58] = hc_byte_perm_S (w[43], w[42], selector); + w[57] = hc_byte_perm_S (w[42], w[41], selector); + w[56] = hc_byte_perm_S (w[41], w[40], selector); + w[55] = hc_byte_perm_S (w[40], w[39], selector); + w[54] = hc_byte_perm_S (w[39], w[38], selector); + w[53] = hc_byte_perm_S (w[38], w[37], selector); + w[52] = hc_byte_perm_S (w[37], w[36], selector); + w[51] = hc_byte_perm_S (w[36], w[35], selector); + w[50] = hc_byte_perm_S (w[35], w[34], selector); + w[49] = hc_byte_perm_S (w[34], w[33], selector); + w[48] = hc_byte_perm_S (w[33], w[32], selector); + w[47] = hc_byte_perm_S (w[32], w[31], selector); + w[46] = hc_byte_perm_S (w[31], w[30], selector); + w[45] = hc_byte_perm_S (w[30], w[29], selector); + w[44] = hc_byte_perm_S (w[29], w[28], selector); + w[43] = hc_byte_perm_S (w[28], w[27], selector); + w[42] = hc_byte_perm_S (w[27], w[26], selector); + w[41] = hc_byte_perm_S (w[26], w[25], selector); + w[40] = hc_byte_perm_S (w[25], w[24], selector); + w[39] = hc_byte_perm_S (w[24], w[23], selector); + w[38] = hc_byte_perm_S (w[23], w[22], selector); + w[37] = hc_byte_perm_S (w[22], w[21], selector); + w[36] = hc_byte_perm_S (w[21], w[20], selector); + w[35] = hc_byte_perm_S (w[20], w[19], selector); + w[34] = hc_byte_perm_S (w[19], w[18], selector); + w[33] = hc_byte_perm_S (w[18], w[17], selector); + w[32] = hc_byte_perm_S (w[17], w[16], selector); + w[31] = hc_byte_perm_S (w[16], w[15], selector); + w[30] = hc_byte_perm_S (w[15], w[14], selector); + w[29] = hc_byte_perm_S (w[14], w[13], selector); + w[28] = hc_byte_perm_S (w[13], w[12], selector); + w[27] = hc_byte_perm_S (w[12], w[11], selector); + w[26] = hc_byte_perm_S (w[11], w[10], selector); + w[25] = hc_byte_perm_S (w[10], w[ 9], selector); + w[24] = hc_byte_perm_S (w[ 9], w[ 8], selector); + w[23] = hc_byte_perm_S (w[ 8], w[ 7], selector); + w[22] = hc_byte_perm_S (w[ 7], w[ 6], selector); + w[21] = hc_byte_perm_S (w[ 6], w[ 5], selector); + w[20] = hc_byte_perm_S (w[ 5], w[ 4], selector); + w[19] = hc_byte_perm_S (w[ 4], w[ 3], selector); + w[18] = hc_byte_perm_S (w[ 3], w[ 2], selector); + w[17] = hc_byte_perm_S (w[ 2], w[ 1], selector); + w[16] = hc_byte_perm_S (w[ 1], w[ 0], selector); + w[15] = hc_byte_perm_S (w[ 0], 0, selector); w[14] = 0; w[13] = 0; w[12] = 0; @@ -56540,54 +56540,54 @@ DECLSPEC void switch_buffer_by_offset_1x64_be_S (u32 *w, const u32 offset) break; case 16: - w[63] = __byte_perm_S (w[47], w[46], selector); - w[62] = __byte_perm_S (w[46], w[45], selector); - w[61] = __byte_perm_S (w[45], w[44], selector); - w[60] = __byte_perm_S (w[44], w[43], selector); - w[59] = __byte_perm_S (w[43], w[42], selector); - w[58] = __byte_perm_S (w[42], w[41], selector); - w[57] = __byte_perm_S (w[41], w[40], selector); - w[56] = __byte_perm_S (w[40], w[39], selector); - w[55] = __byte_perm_S (w[39], w[38], selector); - w[54] = __byte_perm_S (w[38], w[37], selector); - w[53] = __byte_perm_S (w[37], w[36], selector); - w[52] = __byte_perm_S (w[36], w[35], selector); - w[51] = __byte_perm_S (w[35], w[34], selector); - w[50] = __byte_perm_S (w[34], w[33], selector); - w[49] = __byte_perm_S (w[33], w[32], selector); - w[48] = __byte_perm_S (w[32], w[31], selector); - w[47] = __byte_perm_S (w[31], w[30], selector); - w[46] = __byte_perm_S (w[30], w[29], selector); - w[45] = __byte_perm_S (w[29], w[28], selector); - w[44] = __byte_perm_S (w[28], w[27], selector); - w[43] = __byte_perm_S (w[27], w[26], selector); - w[42] = __byte_perm_S (w[26], w[25], selector); - w[41] = __byte_perm_S (w[25], w[24], selector); - w[40] = __byte_perm_S (w[24], w[23], selector); - w[39] = __byte_perm_S (w[23], w[22], selector); - w[38] = __byte_perm_S (w[22], w[21], selector); - w[37] = __byte_perm_S (w[21], w[20], selector); - w[36] = __byte_perm_S (w[20], w[19], selector); - w[35] = __byte_perm_S (w[19], w[18], selector); - w[34] = __byte_perm_S (w[18], w[17], selector); - w[33] = __byte_perm_S (w[17], w[16], selector); - w[32] = __byte_perm_S (w[16], w[15], selector); - w[31] = __byte_perm_S (w[15], w[14], selector); - w[30] = __byte_perm_S (w[14], w[13], selector); - w[29] = __byte_perm_S (w[13], w[12], selector); - w[28] = __byte_perm_S (w[12], w[11], selector); - w[27] = __byte_perm_S (w[11], w[10], selector); - w[26] = __byte_perm_S (w[10], w[ 9], selector); - w[25] = __byte_perm_S (w[ 9], w[ 8], selector); - w[24] = __byte_perm_S (w[ 8], w[ 7], selector); - w[23] = __byte_perm_S (w[ 7], w[ 6], selector); - w[22] = __byte_perm_S (w[ 6], w[ 5], selector); - w[21] = __byte_perm_S (w[ 5], w[ 4], selector); - w[20] = __byte_perm_S (w[ 4], w[ 3], selector); - w[19] = __byte_perm_S (w[ 3], w[ 2], selector); - w[18] = __byte_perm_S (w[ 2], w[ 1], selector); - w[17] = __byte_perm_S (w[ 1], w[ 0], selector); - w[16] = __byte_perm_S (w[ 0], 0, selector); + w[63] = hc_byte_perm_S (w[47], w[46], selector); + w[62] = hc_byte_perm_S (w[46], w[45], selector); + w[61] = hc_byte_perm_S (w[45], w[44], selector); + w[60] = hc_byte_perm_S (w[44], w[43], selector); + w[59] = hc_byte_perm_S (w[43], w[42], selector); + w[58] = hc_byte_perm_S (w[42], w[41], selector); + w[57] = hc_byte_perm_S (w[41], w[40], selector); + w[56] = hc_byte_perm_S (w[40], w[39], selector); + w[55] = hc_byte_perm_S (w[39], w[38], selector); + w[54] = hc_byte_perm_S (w[38], w[37], selector); + w[53] = hc_byte_perm_S (w[37], w[36], selector); + w[52] = hc_byte_perm_S (w[36], w[35], selector); + w[51] = hc_byte_perm_S (w[35], w[34], selector); + w[50] = hc_byte_perm_S (w[34], w[33], selector); + w[49] = hc_byte_perm_S (w[33], w[32], selector); + w[48] = hc_byte_perm_S (w[32], w[31], selector); + w[47] = hc_byte_perm_S (w[31], w[30], selector); + w[46] = hc_byte_perm_S (w[30], w[29], selector); + w[45] = hc_byte_perm_S (w[29], w[28], selector); + w[44] = hc_byte_perm_S (w[28], w[27], selector); + w[43] = hc_byte_perm_S (w[27], w[26], selector); + w[42] = hc_byte_perm_S (w[26], w[25], selector); + w[41] = hc_byte_perm_S (w[25], w[24], selector); + w[40] = hc_byte_perm_S (w[24], w[23], selector); + w[39] = hc_byte_perm_S (w[23], w[22], selector); + w[38] = hc_byte_perm_S (w[22], w[21], selector); + w[37] = hc_byte_perm_S (w[21], w[20], selector); + w[36] = hc_byte_perm_S (w[20], w[19], selector); + w[35] = hc_byte_perm_S (w[19], w[18], selector); + w[34] = hc_byte_perm_S (w[18], w[17], selector); + w[33] = hc_byte_perm_S (w[17], w[16], selector); + w[32] = hc_byte_perm_S (w[16], w[15], selector); + w[31] = hc_byte_perm_S (w[15], w[14], selector); + w[30] = hc_byte_perm_S (w[14], w[13], selector); + w[29] = hc_byte_perm_S (w[13], w[12], selector); + w[28] = hc_byte_perm_S (w[12], w[11], selector); + w[27] = hc_byte_perm_S (w[11], w[10], selector); + w[26] = hc_byte_perm_S (w[10], w[ 9], selector); + w[25] = hc_byte_perm_S (w[ 9], w[ 8], selector); + w[24] = hc_byte_perm_S (w[ 8], w[ 7], selector); + w[23] = hc_byte_perm_S (w[ 7], w[ 6], selector); + w[22] = hc_byte_perm_S (w[ 6], w[ 5], selector); + w[21] = hc_byte_perm_S (w[ 5], w[ 4], selector); + w[20] = hc_byte_perm_S (w[ 4], w[ 3], selector); + w[19] = hc_byte_perm_S (w[ 3], w[ 2], selector); + w[18] = hc_byte_perm_S (w[ 2], w[ 1], selector); + w[17] = hc_byte_perm_S (w[ 1], w[ 0], selector); + w[16] = hc_byte_perm_S (w[ 0], 0, selector); w[15] = 0; w[14] = 0; w[13] = 0; @@ -56608,53 +56608,53 @@ DECLSPEC void switch_buffer_by_offset_1x64_be_S (u32 *w, const u32 offset) break; case 17: - w[63] = __byte_perm_S (w[46], w[45], selector); - w[62] = __byte_perm_S (w[45], w[44], selector); - w[61] = __byte_perm_S (w[44], w[43], selector); - w[60] = __byte_perm_S (w[43], w[42], selector); - w[59] = __byte_perm_S (w[42], w[41], selector); - w[58] = __byte_perm_S (w[41], w[40], selector); - w[57] = __byte_perm_S (w[40], w[39], selector); - w[56] = __byte_perm_S (w[39], w[38], selector); - w[55] = __byte_perm_S (w[38], w[37], selector); - w[54] = __byte_perm_S (w[37], w[36], selector); - w[53] = __byte_perm_S (w[36], w[35], selector); - w[52] = __byte_perm_S (w[35], w[34], selector); - w[51] = __byte_perm_S (w[34], w[33], selector); - w[50] = __byte_perm_S (w[33], w[32], selector); - w[49] = __byte_perm_S (w[32], w[31], selector); - w[48] = __byte_perm_S (w[31], w[30], selector); - w[47] = __byte_perm_S (w[30], w[29], selector); - w[46] = __byte_perm_S (w[29], w[28], selector); - w[45] = __byte_perm_S (w[28], w[27], selector); - w[44] = __byte_perm_S (w[27], w[26], selector); - w[43] = __byte_perm_S (w[26], w[25], selector); - w[42] = __byte_perm_S (w[25], w[24], selector); - w[41] = __byte_perm_S (w[24], w[23], selector); - w[40] = __byte_perm_S (w[23], w[22], selector); - w[39] = __byte_perm_S (w[22], w[21], selector); - w[38] = __byte_perm_S (w[21], w[20], selector); - w[37] = __byte_perm_S (w[20], w[19], selector); - w[36] = __byte_perm_S (w[19], w[18], selector); - w[35] = __byte_perm_S (w[18], w[17], selector); - w[34] = __byte_perm_S (w[17], w[16], selector); - w[33] = __byte_perm_S (w[16], w[15], selector); - w[32] = __byte_perm_S (w[15], w[14], selector); - w[31] = __byte_perm_S (w[14], w[13], selector); - w[30] = __byte_perm_S (w[13], w[12], selector); - w[29] = __byte_perm_S (w[12], w[11], selector); - w[28] = __byte_perm_S (w[11], w[10], selector); - w[27] = __byte_perm_S (w[10], w[ 9], selector); - w[26] = __byte_perm_S (w[ 9], w[ 8], selector); - w[25] = __byte_perm_S (w[ 8], w[ 7], selector); - w[24] = __byte_perm_S (w[ 7], w[ 6], selector); - w[23] = __byte_perm_S (w[ 6], w[ 5], selector); - w[22] = __byte_perm_S (w[ 5], w[ 4], selector); - w[21] = __byte_perm_S (w[ 4], w[ 3], selector); - w[20] = __byte_perm_S (w[ 3], w[ 2], selector); - w[19] = __byte_perm_S (w[ 2], w[ 1], selector); - w[18] = __byte_perm_S (w[ 1], w[ 0], selector); - w[17] = __byte_perm_S (w[ 0], 0, selector); + w[63] = hc_byte_perm_S (w[46], w[45], selector); + w[62] = hc_byte_perm_S (w[45], w[44], selector); + w[61] = hc_byte_perm_S (w[44], w[43], selector); + w[60] = hc_byte_perm_S (w[43], w[42], selector); + w[59] = hc_byte_perm_S (w[42], w[41], selector); + w[58] = hc_byte_perm_S (w[41], w[40], selector); + w[57] = hc_byte_perm_S (w[40], w[39], selector); + w[56] = hc_byte_perm_S (w[39], w[38], selector); + w[55] = hc_byte_perm_S (w[38], w[37], selector); + w[54] = hc_byte_perm_S (w[37], w[36], selector); + w[53] = hc_byte_perm_S (w[36], w[35], selector); + w[52] = hc_byte_perm_S (w[35], w[34], selector); + w[51] = hc_byte_perm_S (w[34], w[33], selector); + w[50] = hc_byte_perm_S (w[33], w[32], selector); + w[49] = hc_byte_perm_S (w[32], w[31], selector); + w[48] = hc_byte_perm_S (w[31], w[30], selector); + w[47] = hc_byte_perm_S (w[30], w[29], selector); + w[46] = hc_byte_perm_S (w[29], w[28], selector); + w[45] = hc_byte_perm_S (w[28], w[27], selector); + w[44] = hc_byte_perm_S (w[27], w[26], selector); + w[43] = hc_byte_perm_S (w[26], w[25], selector); + w[42] = hc_byte_perm_S (w[25], w[24], selector); + w[41] = hc_byte_perm_S (w[24], w[23], selector); + w[40] = hc_byte_perm_S (w[23], w[22], selector); + w[39] = hc_byte_perm_S (w[22], w[21], selector); + w[38] = hc_byte_perm_S (w[21], w[20], selector); + w[37] = hc_byte_perm_S (w[20], w[19], selector); + w[36] = hc_byte_perm_S (w[19], w[18], selector); + w[35] = hc_byte_perm_S (w[18], w[17], selector); + w[34] = hc_byte_perm_S (w[17], w[16], selector); + w[33] = hc_byte_perm_S (w[16], w[15], selector); + w[32] = hc_byte_perm_S (w[15], w[14], selector); + w[31] = hc_byte_perm_S (w[14], w[13], selector); + w[30] = hc_byte_perm_S (w[13], w[12], selector); + w[29] = hc_byte_perm_S (w[12], w[11], selector); + w[28] = hc_byte_perm_S (w[11], w[10], selector); + w[27] = hc_byte_perm_S (w[10], w[ 9], selector); + w[26] = hc_byte_perm_S (w[ 9], w[ 8], selector); + w[25] = hc_byte_perm_S (w[ 8], w[ 7], selector); + w[24] = hc_byte_perm_S (w[ 7], w[ 6], selector); + w[23] = hc_byte_perm_S (w[ 6], w[ 5], selector); + w[22] = hc_byte_perm_S (w[ 5], w[ 4], selector); + w[21] = hc_byte_perm_S (w[ 4], w[ 3], selector); + w[20] = hc_byte_perm_S (w[ 3], w[ 2], selector); + w[19] = hc_byte_perm_S (w[ 2], w[ 1], selector); + w[18] = hc_byte_perm_S (w[ 1], w[ 0], selector); + w[17] = hc_byte_perm_S (w[ 0], 0, selector); w[16] = 0; w[15] = 0; w[14] = 0; @@ -56676,52 +56676,52 @@ DECLSPEC void switch_buffer_by_offset_1x64_be_S (u32 *w, const u32 offset) break; case 18: - w[63] = __byte_perm_S (w[45], w[44], selector); - w[62] = __byte_perm_S (w[44], w[43], selector); - w[61] = __byte_perm_S (w[43], w[42], selector); - w[60] = __byte_perm_S (w[42], w[41], selector); - w[59] = __byte_perm_S (w[41], w[40], selector); - w[58] = __byte_perm_S (w[40], w[39], selector); - w[57] = __byte_perm_S (w[39], w[38], selector); - w[56] = __byte_perm_S (w[38], w[37], selector); - w[55] = __byte_perm_S (w[37], w[36], selector); - w[54] = __byte_perm_S (w[36], w[35], selector); - w[53] = __byte_perm_S (w[35], w[34], selector); - w[52] = __byte_perm_S (w[34], w[33], selector); - w[51] = __byte_perm_S (w[33], w[32], selector); - w[50] = __byte_perm_S (w[32], w[31], selector); - w[49] = __byte_perm_S (w[31], w[30], selector); - w[48] = __byte_perm_S (w[30], w[29], selector); - w[47] = __byte_perm_S (w[29], w[28], selector); - w[46] = __byte_perm_S (w[28], w[27], selector); - w[45] = __byte_perm_S (w[27], w[26], selector); - w[44] = __byte_perm_S (w[26], w[25], selector); - w[43] = __byte_perm_S (w[25], w[24], selector); - w[42] = __byte_perm_S (w[24], w[23], selector); - w[41] = __byte_perm_S (w[23], w[22], selector); - w[40] = __byte_perm_S (w[22], w[21], selector); - w[39] = __byte_perm_S (w[21], w[20], selector); - w[38] = __byte_perm_S (w[20], w[19], selector); - w[37] = __byte_perm_S (w[19], w[18], selector); - w[36] = __byte_perm_S (w[18], w[17], selector); - w[35] = __byte_perm_S (w[17], w[16], selector); - w[34] = __byte_perm_S (w[16], w[15], selector); - w[33] = __byte_perm_S (w[15], w[14], selector); - w[32] = __byte_perm_S (w[14], w[13], selector); - w[31] = __byte_perm_S (w[13], w[12], selector); - w[30] = __byte_perm_S (w[12], w[11], selector); - w[29] = __byte_perm_S (w[11], w[10], selector); - w[28] = __byte_perm_S (w[10], w[ 9], selector); - w[27] = __byte_perm_S (w[ 9], w[ 8], selector); - w[26] = __byte_perm_S (w[ 8], w[ 7], selector); - w[25] = __byte_perm_S (w[ 7], w[ 6], selector); - w[24] = __byte_perm_S (w[ 6], w[ 5], selector); - w[23] = __byte_perm_S (w[ 5], w[ 4], selector); - w[22] = __byte_perm_S (w[ 4], w[ 3], selector); - w[21] = __byte_perm_S (w[ 3], w[ 2], selector); - w[20] = __byte_perm_S (w[ 2], w[ 1], selector); - w[19] = __byte_perm_S (w[ 1], w[ 0], selector); - w[18] = __byte_perm_S (w[ 0], 0, selector); + w[63] = hc_byte_perm_S (w[45], w[44], selector); + w[62] = hc_byte_perm_S (w[44], w[43], selector); + w[61] = hc_byte_perm_S (w[43], w[42], selector); + w[60] = hc_byte_perm_S (w[42], w[41], selector); + w[59] = hc_byte_perm_S (w[41], w[40], selector); + w[58] = hc_byte_perm_S (w[40], w[39], selector); + w[57] = hc_byte_perm_S (w[39], w[38], selector); + w[56] = hc_byte_perm_S (w[38], w[37], selector); + w[55] = hc_byte_perm_S (w[37], w[36], selector); + w[54] = hc_byte_perm_S (w[36], w[35], selector); + w[53] = hc_byte_perm_S (w[35], w[34], selector); + w[52] = hc_byte_perm_S (w[34], w[33], selector); + w[51] = hc_byte_perm_S (w[33], w[32], selector); + w[50] = hc_byte_perm_S (w[32], w[31], selector); + w[49] = hc_byte_perm_S (w[31], w[30], selector); + w[48] = hc_byte_perm_S (w[30], w[29], selector); + w[47] = hc_byte_perm_S (w[29], w[28], selector); + w[46] = hc_byte_perm_S (w[28], w[27], selector); + w[45] = hc_byte_perm_S (w[27], w[26], selector); + w[44] = hc_byte_perm_S (w[26], w[25], selector); + w[43] = hc_byte_perm_S (w[25], w[24], selector); + w[42] = hc_byte_perm_S (w[24], w[23], selector); + w[41] = hc_byte_perm_S (w[23], w[22], selector); + w[40] = hc_byte_perm_S (w[22], w[21], selector); + w[39] = hc_byte_perm_S (w[21], w[20], selector); + w[38] = hc_byte_perm_S (w[20], w[19], selector); + w[37] = hc_byte_perm_S (w[19], w[18], selector); + w[36] = hc_byte_perm_S (w[18], w[17], selector); + w[35] = hc_byte_perm_S (w[17], w[16], selector); + w[34] = hc_byte_perm_S (w[16], w[15], selector); + w[33] = hc_byte_perm_S (w[15], w[14], selector); + w[32] = hc_byte_perm_S (w[14], w[13], selector); + w[31] = hc_byte_perm_S (w[13], w[12], selector); + w[30] = hc_byte_perm_S (w[12], w[11], selector); + w[29] = hc_byte_perm_S (w[11], w[10], selector); + w[28] = hc_byte_perm_S (w[10], w[ 9], selector); + w[27] = hc_byte_perm_S (w[ 9], w[ 8], selector); + w[26] = hc_byte_perm_S (w[ 8], w[ 7], selector); + w[25] = hc_byte_perm_S (w[ 7], w[ 6], selector); + w[24] = hc_byte_perm_S (w[ 6], w[ 5], selector); + w[23] = hc_byte_perm_S (w[ 5], w[ 4], selector); + w[22] = hc_byte_perm_S (w[ 4], w[ 3], selector); + w[21] = hc_byte_perm_S (w[ 3], w[ 2], selector); + w[20] = hc_byte_perm_S (w[ 2], w[ 1], selector); + w[19] = hc_byte_perm_S (w[ 1], w[ 0], selector); + w[18] = hc_byte_perm_S (w[ 0], 0, selector); w[17] = 0; w[16] = 0; w[15] = 0; @@ -56744,51 +56744,51 @@ DECLSPEC void switch_buffer_by_offset_1x64_be_S (u32 *w, const u32 offset) break; case 19: - w[63] = __byte_perm_S (w[44], w[43], selector); - w[62] = __byte_perm_S (w[43], w[42], selector); - w[61] = __byte_perm_S (w[42], w[41], selector); - w[60] = __byte_perm_S (w[41], w[40], selector); - w[59] = __byte_perm_S (w[40], w[39], selector); - w[58] = __byte_perm_S (w[39], w[38], selector); - w[57] = __byte_perm_S (w[38], w[37], selector); - w[56] = __byte_perm_S (w[37], w[36], selector); - w[55] = __byte_perm_S (w[36], w[35], selector); - w[54] = __byte_perm_S (w[35], w[34], selector); - w[53] = __byte_perm_S (w[34], w[33], selector); - w[52] = __byte_perm_S (w[33], w[32], selector); - w[51] = __byte_perm_S (w[32], w[31], selector); - w[50] = __byte_perm_S (w[31], w[30], selector); - w[49] = __byte_perm_S (w[30], w[29], selector); - w[48] = __byte_perm_S (w[29], w[28], selector); - w[47] = __byte_perm_S (w[28], w[27], selector); - w[46] = __byte_perm_S (w[27], w[26], selector); - w[45] = __byte_perm_S (w[26], w[25], selector); - w[44] = __byte_perm_S (w[25], w[24], selector); - w[43] = __byte_perm_S (w[24], w[23], selector); - w[42] = __byte_perm_S (w[23], w[22], selector); - w[41] = __byte_perm_S (w[22], w[21], selector); - w[40] = __byte_perm_S (w[21], w[20], selector); - w[39] = __byte_perm_S (w[20], w[19], selector); - w[38] = __byte_perm_S (w[19], w[18], selector); - w[37] = __byte_perm_S (w[18], w[17], selector); - w[36] = __byte_perm_S (w[17], w[16], selector); - w[35] = __byte_perm_S (w[16], w[15], selector); - w[34] = __byte_perm_S (w[15], w[14], selector); - w[33] = __byte_perm_S (w[14], w[13], selector); - w[32] = __byte_perm_S (w[13], w[12], selector); - w[31] = __byte_perm_S (w[12], w[11], selector); - w[30] = __byte_perm_S (w[11], w[10], selector); - w[29] = __byte_perm_S (w[10], w[ 9], selector); - w[28] = __byte_perm_S (w[ 9], w[ 8], selector); - w[27] = __byte_perm_S (w[ 8], w[ 7], selector); - w[26] = __byte_perm_S (w[ 7], w[ 6], selector); - w[25] = __byte_perm_S (w[ 6], w[ 5], selector); - w[24] = __byte_perm_S (w[ 5], w[ 4], selector); - w[23] = __byte_perm_S (w[ 4], w[ 3], selector); - w[22] = __byte_perm_S (w[ 3], w[ 2], selector); - w[21] = __byte_perm_S (w[ 2], w[ 1], selector); - w[20] = __byte_perm_S (w[ 1], w[ 0], selector); - w[19] = __byte_perm_S (w[ 0], 0, selector); + w[63] = hc_byte_perm_S (w[44], w[43], selector); + w[62] = hc_byte_perm_S (w[43], w[42], selector); + w[61] = hc_byte_perm_S (w[42], w[41], selector); + w[60] = hc_byte_perm_S (w[41], w[40], selector); + w[59] = hc_byte_perm_S (w[40], w[39], selector); + w[58] = hc_byte_perm_S (w[39], w[38], selector); + w[57] = hc_byte_perm_S (w[38], w[37], selector); + w[56] = hc_byte_perm_S (w[37], w[36], selector); + w[55] = hc_byte_perm_S (w[36], w[35], selector); + w[54] = hc_byte_perm_S (w[35], w[34], selector); + w[53] = hc_byte_perm_S (w[34], w[33], selector); + w[52] = hc_byte_perm_S (w[33], w[32], selector); + w[51] = hc_byte_perm_S (w[32], w[31], selector); + w[50] = hc_byte_perm_S (w[31], w[30], selector); + w[49] = hc_byte_perm_S (w[30], w[29], selector); + w[48] = hc_byte_perm_S (w[29], w[28], selector); + w[47] = hc_byte_perm_S (w[28], w[27], selector); + w[46] = hc_byte_perm_S (w[27], w[26], selector); + w[45] = hc_byte_perm_S (w[26], w[25], selector); + w[44] = hc_byte_perm_S (w[25], w[24], selector); + w[43] = hc_byte_perm_S (w[24], w[23], selector); + w[42] = hc_byte_perm_S (w[23], w[22], selector); + w[41] = hc_byte_perm_S (w[22], w[21], selector); + w[40] = hc_byte_perm_S (w[21], w[20], selector); + w[39] = hc_byte_perm_S (w[20], w[19], selector); + w[38] = hc_byte_perm_S (w[19], w[18], selector); + w[37] = hc_byte_perm_S (w[18], w[17], selector); + w[36] = hc_byte_perm_S (w[17], w[16], selector); + w[35] = hc_byte_perm_S (w[16], w[15], selector); + w[34] = hc_byte_perm_S (w[15], w[14], selector); + w[33] = hc_byte_perm_S (w[14], w[13], selector); + w[32] = hc_byte_perm_S (w[13], w[12], selector); + w[31] = hc_byte_perm_S (w[12], w[11], selector); + w[30] = hc_byte_perm_S (w[11], w[10], selector); + w[29] = hc_byte_perm_S (w[10], w[ 9], selector); + w[28] = hc_byte_perm_S (w[ 9], w[ 8], selector); + w[27] = hc_byte_perm_S (w[ 8], w[ 7], selector); + w[26] = hc_byte_perm_S (w[ 7], w[ 6], selector); + w[25] = hc_byte_perm_S (w[ 6], w[ 5], selector); + w[24] = hc_byte_perm_S (w[ 5], w[ 4], selector); + w[23] = hc_byte_perm_S (w[ 4], w[ 3], selector); + w[22] = hc_byte_perm_S (w[ 3], w[ 2], selector); + w[21] = hc_byte_perm_S (w[ 2], w[ 1], selector); + w[20] = hc_byte_perm_S (w[ 1], w[ 0], selector); + w[19] = hc_byte_perm_S (w[ 0], 0, selector); w[18] = 0; w[17] = 0; w[16] = 0; @@ -56812,50 +56812,50 @@ DECLSPEC void switch_buffer_by_offset_1x64_be_S (u32 *w, const u32 offset) break; case 20: - w[63] = __byte_perm_S (w[43], w[42], selector); - w[62] = __byte_perm_S (w[42], w[41], selector); - w[61] = __byte_perm_S (w[41], w[40], selector); - w[60] = __byte_perm_S (w[40], w[39], selector); - w[59] = __byte_perm_S (w[39], w[38], selector); - w[58] = __byte_perm_S (w[38], w[37], selector); - w[57] = __byte_perm_S (w[37], w[36], selector); - w[56] = __byte_perm_S (w[36], w[35], selector); - w[55] = __byte_perm_S (w[35], w[34], selector); - w[54] = __byte_perm_S (w[34], w[33], selector); - w[53] = __byte_perm_S (w[33], w[32], selector); - w[52] = __byte_perm_S (w[32], w[31], selector); - w[51] = __byte_perm_S (w[31], w[30], selector); - w[50] = __byte_perm_S (w[30], w[29], selector); - w[49] = __byte_perm_S (w[29], w[28], selector); - w[48] = __byte_perm_S (w[28], w[27], selector); - w[47] = __byte_perm_S (w[27], w[26], selector); - w[46] = __byte_perm_S (w[26], w[25], selector); - w[45] = __byte_perm_S (w[25], w[24], selector); - w[44] = __byte_perm_S (w[24], w[23], selector); - w[43] = __byte_perm_S (w[23], w[22], selector); - w[42] = __byte_perm_S (w[22], w[21], selector); - w[41] = __byte_perm_S (w[21], w[20], selector); - w[40] = __byte_perm_S (w[20], w[19], selector); - w[39] = __byte_perm_S (w[19], w[18], selector); - w[38] = __byte_perm_S (w[18], w[17], selector); - w[37] = __byte_perm_S (w[17], w[16], selector); - w[36] = __byte_perm_S (w[16], w[15], selector); - w[35] = __byte_perm_S (w[15], w[14], selector); - w[34] = __byte_perm_S (w[14], w[13], selector); - w[33] = __byte_perm_S (w[13], w[12], selector); - w[32] = __byte_perm_S (w[12], w[11], selector); - w[31] = __byte_perm_S (w[11], w[10], selector); - w[30] = __byte_perm_S (w[10], w[ 9], selector); - w[29] = __byte_perm_S (w[ 9], w[ 8], selector); - w[28] = __byte_perm_S (w[ 8], w[ 7], selector); - w[27] = __byte_perm_S (w[ 7], w[ 6], selector); - w[26] = __byte_perm_S (w[ 6], w[ 5], selector); - w[25] = __byte_perm_S (w[ 5], w[ 4], selector); - w[24] = __byte_perm_S (w[ 4], w[ 3], selector); - w[23] = __byte_perm_S (w[ 3], w[ 2], selector); - w[22] = __byte_perm_S (w[ 2], w[ 1], selector); - w[21] = __byte_perm_S (w[ 1], w[ 0], selector); - w[20] = __byte_perm_S (w[ 0], 0, selector); + w[63] = hc_byte_perm_S (w[43], w[42], selector); + w[62] = hc_byte_perm_S (w[42], w[41], selector); + w[61] = hc_byte_perm_S (w[41], w[40], selector); + w[60] = hc_byte_perm_S (w[40], w[39], selector); + w[59] = hc_byte_perm_S (w[39], w[38], selector); + w[58] = hc_byte_perm_S (w[38], w[37], selector); + w[57] = hc_byte_perm_S (w[37], w[36], selector); + w[56] = hc_byte_perm_S (w[36], w[35], selector); + w[55] = hc_byte_perm_S (w[35], w[34], selector); + w[54] = hc_byte_perm_S (w[34], w[33], selector); + w[53] = hc_byte_perm_S (w[33], w[32], selector); + w[52] = hc_byte_perm_S (w[32], w[31], selector); + w[51] = hc_byte_perm_S (w[31], w[30], selector); + w[50] = hc_byte_perm_S (w[30], w[29], selector); + w[49] = hc_byte_perm_S (w[29], w[28], selector); + w[48] = hc_byte_perm_S (w[28], w[27], selector); + w[47] = hc_byte_perm_S (w[27], w[26], selector); + w[46] = hc_byte_perm_S (w[26], w[25], selector); + w[45] = hc_byte_perm_S (w[25], w[24], selector); + w[44] = hc_byte_perm_S (w[24], w[23], selector); + w[43] = hc_byte_perm_S (w[23], w[22], selector); + w[42] = hc_byte_perm_S (w[22], w[21], selector); + w[41] = hc_byte_perm_S (w[21], w[20], selector); + w[40] = hc_byte_perm_S (w[20], w[19], selector); + w[39] = hc_byte_perm_S (w[19], w[18], selector); + w[38] = hc_byte_perm_S (w[18], w[17], selector); + w[37] = hc_byte_perm_S (w[17], w[16], selector); + w[36] = hc_byte_perm_S (w[16], w[15], selector); + w[35] = hc_byte_perm_S (w[15], w[14], selector); + w[34] = hc_byte_perm_S (w[14], w[13], selector); + w[33] = hc_byte_perm_S (w[13], w[12], selector); + w[32] = hc_byte_perm_S (w[12], w[11], selector); + w[31] = hc_byte_perm_S (w[11], w[10], selector); + w[30] = hc_byte_perm_S (w[10], w[ 9], selector); + w[29] = hc_byte_perm_S (w[ 9], w[ 8], selector); + w[28] = hc_byte_perm_S (w[ 8], w[ 7], selector); + w[27] = hc_byte_perm_S (w[ 7], w[ 6], selector); + w[26] = hc_byte_perm_S (w[ 6], w[ 5], selector); + w[25] = hc_byte_perm_S (w[ 5], w[ 4], selector); + w[24] = hc_byte_perm_S (w[ 4], w[ 3], selector); + w[23] = hc_byte_perm_S (w[ 3], w[ 2], selector); + w[22] = hc_byte_perm_S (w[ 2], w[ 1], selector); + w[21] = hc_byte_perm_S (w[ 1], w[ 0], selector); + w[20] = hc_byte_perm_S (w[ 0], 0, selector); w[19] = 0; w[18] = 0; w[17] = 0; @@ -56880,49 +56880,49 @@ DECLSPEC void switch_buffer_by_offset_1x64_be_S (u32 *w, const u32 offset) break; case 21: - w[63] = __byte_perm_S (w[42], w[41], selector); - w[62] = __byte_perm_S (w[41], w[40], selector); - w[61] = __byte_perm_S (w[40], w[39], selector); - w[60] = __byte_perm_S (w[39], w[38], selector); - w[59] = __byte_perm_S (w[38], w[37], selector); - w[58] = __byte_perm_S (w[37], w[36], selector); - w[57] = __byte_perm_S (w[36], w[35], selector); - w[56] = __byte_perm_S (w[35], w[34], selector); - w[55] = __byte_perm_S (w[34], w[33], selector); - w[54] = __byte_perm_S (w[33], w[32], selector); - w[53] = __byte_perm_S (w[32], w[31], selector); - w[52] = __byte_perm_S (w[31], w[30], selector); - w[51] = __byte_perm_S (w[30], w[29], selector); - w[50] = __byte_perm_S (w[29], w[28], selector); - w[49] = __byte_perm_S (w[28], w[27], selector); - w[48] = __byte_perm_S (w[27], w[26], selector); - w[47] = __byte_perm_S (w[26], w[25], selector); - w[46] = __byte_perm_S (w[25], w[24], selector); - w[45] = __byte_perm_S (w[24], w[23], selector); - w[44] = __byte_perm_S (w[23], w[22], selector); - w[43] = __byte_perm_S (w[22], w[21], selector); - w[42] = __byte_perm_S (w[21], w[20], selector); - w[41] = __byte_perm_S (w[20], w[19], selector); - w[40] = __byte_perm_S (w[19], w[18], selector); - w[39] = __byte_perm_S (w[18], w[17], selector); - w[38] = __byte_perm_S (w[17], w[16], selector); - w[37] = __byte_perm_S (w[16], w[15], selector); - w[36] = __byte_perm_S (w[15], w[14], selector); - w[35] = __byte_perm_S (w[14], w[13], selector); - w[34] = __byte_perm_S (w[13], w[12], selector); - w[33] = __byte_perm_S (w[12], w[11], selector); - w[32] = __byte_perm_S (w[11], w[10], selector); - w[31] = __byte_perm_S (w[10], w[ 9], selector); - w[30] = __byte_perm_S (w[ 9], w[ 8], selector); - w[29] = __byte_perm_S (w[ 8], w[ 7], selector); - w[28] = __byte_perm_S (w[ 7], w[ 6], selector); - w[27] = __byte_perm_S (w[ 6], w[ 5], selector); - w[26] = __byte_perm_S (w[ 5], w[ 4], selector); - w[25] = __byte_perm_S (w[ 4], w[ 3], selector); - w[24] = __byte_perm_S (w[ 3], w[ 2], selector); - w[23] = __byte_perm_S (w[ 2], w[ 1], selector); - w[22] = __byte_perm_S (w[ 1], w[ 0], selector); - w[21] = __byte_perm_S (w[ 0], 0, selector); + w[63] = hc_byte_perm_S (w[42], w[41], selector); + w[62] = hc_byte_perm_S (w[41], w[40], selector); + w[61] = hc_byte_perm_S (w[40], w[39], selector); + w[60] = hc_byte_perm_S (w[39], w[38], selector); + w[59] = hc_byte_perm_S (w[38], w[37], selector); + w[58] = hc_byte_perm_S (w[37], w[36], selector); + w[57] = hc_byte_perm_S (w[36], w[35], selector); + w[56] = hc_byte_perm_S (w[35], w[34], selector); + w[55] = hc_byte_perm_S (w[34], w[33], selector); + w[54] = hc_byte_perm_S (w[33], w[32], selector); + w[53] = hc_byte_perm_S (w[32], w[31], selector); + w[52] = hc_byte_perm_S (w[31], w[30], selector); + w[51] = hc_byte_perm_S (w[30], w[29], selector); + w[50] = hc_byte_perm_S (w[29], w[28], selector); + w[49] = hc_byte_perm_S (w[28], w[27], selector); + w[48] = hc_byte_perm_S (w[27], w[26], selector); + w[47] = hc_byte_perm_S (w[26], w[25], selector); + w[46] = hc_byte_perm_S (w[25], w[24], selector); + w[45] = hc_byte_perm_S (w[24], w[23], selector); + w[44] = hc_byte_perm_S (w[23], w[22], selector); + w[43] = hc_byte_perm_S (w[22], w[21], selector); + w[42] = hc_byte_perm_S (w[21], w[20], selector); + w[41] = hc_byte_perm_S (w[20], w[19], selector); + w[40] = hc_byte_perm_S (w[19], w[18], selector); + w[39] = hc_byte_perm_S (w[18], w[17], selector); + w[38] = hc_byte_perm_S (w[17], w[16], selector); + w[37] = hc_byte_perm_S (w[16], w[15], selector); + w[36] = hc_byte_perm_S (w[15], w[14], selector); + w[35] = hc_byte_perm_S (w[14], w[13], selector); + w[34] = hc_byte_perm_S (w[13], w[12], selector); + w[33] = hc_byte_perm_S (w[12], w[11], selector); + w[32] = hc_byte_perm_S (w[11], w[10], selector); + w[31] = hc_byte_perm_S (w[10], w[ 9], selector); + w[30] = hc_byte_perm_S (w[ 9], w[ 8], selector); + w[29] = hc_byte_perm_S (w[ 8], w[ 7], selector); + w[28] = hc_byte_perm_S (w[ 7], w[ 6], selector); + w[27] = hc_byte_perm_S (w[ 6], w[ 5], selector); + w[26] = hc_byte_perm_S (w[ 5], w[ 4], selector); + w[25] = hc_byte_perm_S (w[ 4], w[ 3], selector); + w[24] = hc_byte_perm_S (w[ 3], w[ 2], selector); + w[23] = hc_byte_perm_S (w[ 2], w[ 1], selector); + w[22] = hc_byte_perm_S (w[ 1], w[ 0], selector); + w[21] = hc_byte_perm_S (w[ 0], 0, selector); w[20] = 0; w[19] = 0; w[18] = 0; @@ -56948,48 +56948,48 @@ DECLSPEC void switch_buffer_by_offset_1x64_be_S (u32 *w, const u32 offset) break; case 22: - w[63] = __byte_perm_S (w[41], w[40], selector); - w[62] = __byte_perm_S (w[40], w[39], selector); - w[61] = __byte_perm_S (w[39], w[38], selector); - w[60] = __byte_perm_S (w[38], w[37], selector); - w[59] = __byte_perm_S (w[37], w[36], selector); - w[58] = __byte_perm_S (w[36], w[35], selector); - w[57] = __byte_perm_S (w[35], w[34], selector); - w[56] = __byte_perm_S (w[34], w[33], selector); - w[55] = __byte_perm_S (w[33], w[32], selector); - w[54] = __byte_perm_S (w[32], w[31], selector); - w[53] = __byte_perm_S (w[31], w[30], selector); - w[52] = __byte_perm_S (w[30], w[29], selector); - w[51] = __byte_perm_S (w[29], w[28], selector); - w[50] = __byte_perm_S (w[28], w[27], selector); - w[49] = __byte_perm_S (w[27], w[26], selector); - w[48] = __byte_perm_S (w[26], w[25], selector); - w[47] = __byte_perm_S (w[25], w[24], selector); - w[46] = __byte_perm_S (w[24], w[23], selector); - w[45] = __byte_perm_S (w[23], w[22], selector); - w[44] = __byte_perm_S (w[22], w[21], selector); - w[43] = __byte_perm_S (w[21], w[20], selector); - w[42] = __byte_perm_S (w[20], w[19], selector); - w[41] = __byte_perm_S (w[19], w[18], selector); - w[40] = __byte_perm_S (w[18], w[17], selector); - w[39] = __byte_perm_S (w[17], w[16], selector); - w[38] = __byte_perm_S (w[16], w[15], selector); - w[37] = __byte_perm_S (w[15], w[14], selector); - w[36] = __byte_perm_S (w[14], w[13], selector); - w[35] = __byte_perm_S (w[13], w[12], selector); - w[34] = __byte_perm_S (w[12], w[11], selector); - w[33] = __byte_perm_S (w[11], w[10], selector); - w[32] = __byte_perm_S (w[10], w[ 9], selector); - w[31] = __byte_perm_S (w[ 9], w[ 8], selector); - w[30] = __byte_perm_S (w[ 8], w[ 7], selector); - w[29] = __byte_perm_S (w[ 7], w[ 6], selector); - w[28] = __byte_perm_S (w[ 6], w[ 5], selector); - w[27] = __byte_perm_S (w[ 5], w[ 4], selector); - w[26] = __byte_perm_S (w[ 4], w[ 3], selector); - w[25] = __byte_perm_S (w[ 3], w[ 2], selector); - w[24] = __byte_perm_S (w[ 2], w[ 1], selector); - w[23] = __byte_perm_S (w[ 1], w[ 0], selector); - w[22] = __byte_perm_S (w[ 0], 0, selector); + w[63] = hc_byte_perm_S (w[41], w[40], selector); + w[62] = hc_byte_perm_S (w[40], w[39], selector); + w[61] = hc_byte_perm_S (w[39], w[38], selector); + w[60] = hc_byte_perm_S (w[38], w[37], selector); + w[59] = hc_byte_perm_S (w[37], w[36], selector); + w[58] = hc_byte_perm_S (w[36], w[35], selector); + w[57] = hc_byte_perm_S (w[35], w[34], selector); + w[56] = hc_byte_perm_S (w[34], w[33], selector); + w[55] = hc_byte_perm_S (w[33], w[32], selector); + w[54] = hc_byte_perm_S (w[32], w[31], selector); + w[53] = hc_byte_perm_S (w[31], w[30], selector); + w[52] = hc_byte_perm_S (w[30], w[29], selector); + w[51] = hc_byte_perm_S (w[29], w[28], selector); + w[50] = hc_byte_perm_S (w[28], w[27], selector); + w[49] = hc_byte_perm_S (w[27], w[26], selector); + w[48] = hc_byte_perm_S (w[26], w[25], selector); + w[47] = hc_byte_perm_S (w[25], w[24], selector); + w[46] = hc_byte_perm_S (w[24], w[23], selector); + w[45] = hc_byte_perm_S (w[23], w[22], selector); + w[44] = hc_byte_perm_S (w[22], w[21], selector); + w[43] = hc_byte_perm_S (w[21], w[20], selector); + w[42] = hc_byte_perm_S (w[20], w[19], selector); + w[41] = hc_byte_perm_S (w[19], w[18], selector); + w[40] = hc_byte_perm_S (w[18], w[17], selector); + w[39] = hc_byte_perm_S (w[17], w[16], selector); + w[38] = hc_byte_perm_S (w[16], w[15], selector); + w[37] = hc_byte_perm_S (w[15], w[14], selector); + w[36] = hc_byte_perm_S (w[14], w[13], selector); + w[35] = hc_byte_perm_S (w[13], w[12], selector); + w[34] = hc_byte_perm_S (w[12], w[11], selector); + w[33] = hc_byte_perm_S (w[11], w[10], selector); + w[32] = hc_byte_perm_S (w[10], w[ 9], selector); + w[31] = hc_byte_perm_S (w[ 9], w[ 8], selector); + w[30] = hc_byte_perm_S (w[ 8], w[ 7], selector); + w[29] = hc_byte_perm_S (w[ 7], w[ 6], selector); + w[28] = hc_byte_perm_S (w[ 6], w[ 5], selector); + w[27] = hc_byte_perm_S (w[ 5], w[ 4], selector); + w[26] = hc_byte_perm_S (w[ 4], w[ 3], selector); + w[25] = hc_byte_perm_S (w[ 3], w[ 2], selector); + w[24] = hc_byte_perm_S (w[ 2], w[ 1], selector); + w[23] = hc_byte_perm_S (w[ 1], w[ 0], selector); + w[22] = hc_byte_perm_S (w[ 0], 0, selector); w[21] = 0; w[20] = 0; w[19] = 0; @@ -57016,47 +57016,47 @@ DECLSPEC void switch_buffer_by_offset_1x64_be_S (u32 *w, const u32 offset) break; case 23: - w[63] = __byte_perm_S (w[40], w[39], selector); - w[62] = __byte_perm_S (w[39], w[38], selector); - w[61] = __byte_perm_S (w[38], w[37], selector); - w[60] = __byte_perm_S (w[37], w[36], selector); - w[59] = __byte_perm_S (w[36], w[35], selector); - w[58] = __byte_perm_S (w[35], w[34], selector); - w[57] = __byte_perm_S (w[34], w[33], selector); - w[56] = __byte_perm_S (w[33], w[32], selector); - w[55] = __byte_perm_S (w[32], w[31], selector); - w[54] = __byte_perm_S (w[31], w[30], selector); - w[53] = __byte_perm_S (w[30], w[29], selector); - w[52] = __byte_perm_S (w[29], w[28], selector); - w[51] = __byte_perm_S (w[28], w[27], selector); - w[50] = __byte_perm_S (w[27], w[26], selector); - w[49] = __byte_perm_S (w[26], w[25], selector); - w[48] = __byte_perm_S (w[25], w[24], selector); - w[47] = __byte_perm_S (w[24], w[23], selector); - w[46] = __byte_perm_S (w[23], w[22], selector); - w[45] = __byte_perm_S (w[22], w[21], selector); - w[44] = __byte_perm_S (w[21], w[20], selector); - w[43] = __byte_perm_S (w[20], w[19], selector); - w[42] = __byte_perm_S (w[19], w[18], selector); - w[41] = __byte_perm_S (w[18], w[17], selector); - w[40] = __byte_perm_S (w[17], w[16], selector); - w[39] = __byte_perm_S (w[16], w[15], selector); - w[38] = __byte_perm_S (w[15], w[14], selector); - w[37] = __byte_perm_S (w[14], w[13], selector); - w[36] = __byte_perm_S (w[13], w[12], selector); - w[35] = __byte_perm_S (w[12], w[11], selector); - w[34] = __byte_perm_S (w[11], w[10], selector); - w[33] = __byte_perm_S (w[10], w[ 9], selector); - w[32] = __byte_perm_S (w[ 9], w[ 8], selector); - w[31] = __byte_perm_S (w[ 8], w[ 7], selector); - w[30] = __byte_perm_S (w[ 7], w[ 6], selector); - w[29] = __byte_perm_S (w[ 6], w[ 5], selector); - w[28] = __byte_perm_S (w[ 5], w[ 4], selector); - w[27] = __byte_perm_S (w[ 4], w[ 3], selector); - w[26] = __byte_perm_S (w[ 3], w[ 2], selector); - w[25] = __byte_perm_S (w[ 2], w[ 1], selector); - w[24] = __byte_perm_S (w[ 1], w[ 0], selector); - w[23] = __byte_perm_S (w[ 0], 0, selector); + w[63] = hc_byte_perm_S (w[40], w[39], selector); + w[62] = hc_byte_perm_S (w[39], w[38], selector); + w[61] = hc_byte_perm_S (w[38], w[37], selector); + w[60] = hc_byte_perm_S (w[37], w[36], selector); + w[59] = hc_byte_perm_S (w[36], w[35], selector); + w[58] = hc_byte_perm_S (w[35], w[34], selector); + w[57] = hc_byte_perm_S (w[34], w[33], selector); + w[56] = hc_byte_perm_S (w[33], w[32], selector); + w[55] = hc_byte_perm_S (w[32], w[31], selector); + w[54] = hc_byte_perm_S (w[31], w[30], selector); + w[53] = hc_byte_perm_S (w[30], w[29], selector); + w[52] = hc_byte_perm_S (w[29], w[28], selector); + w[51] = hc_byte_perm_S (w[28], w[27], selector); + w[50] = hc_byte_perm_S (w[27], w[26], selector); + w[49] = hc_byte_perm_S (w[26], w[25], selector); + w[48] = hc_byte_perm_S (w[25], w[24], selector); + w[47] = hc_byte_perm_S (w[24], w[23], selector); + w[46] = hc_byte_perm_S (w[23], w[22], selector); + w[45] = hc_byte_perm_S (w[22], w[21], selector); + w[44] = hc_byte_perm_S (w[21], w[20], selector); + w[43] = hc_byte_perm_S (w[20], w[19], selector); + w[42] = hc_byte_perm_S (w[19], w[18], selector); + w[41] = hc_byte_perm_S (w[18], w[17], selector); + w[40] = hc_byte_perm_S (w[17], w[16], selector); + w[39] = hc_byte_perm_S (w[16], w[15], selector); + w[38] = hc_byte_perm_S (w[15], w[14], selector); + w[37] = hc_byte_perm_S (w[14], w[13], selector); + w[36] = hc_byte_perm_S (w[13], w[12], selector); + w[35] = hc_byte_perm_S (w[12], w[11], selector); + w[34] = hc_byte_perm_S (w[11], w[10], selector); + w[33] = hc_byte_perm_S (w[10], w[ 9], selector); + w[32] = hc_byte_perm_S (w[ 9], w[ 8], selector); + w[31] = hc_byte_perm_S (w[ 8], w[ 7], selector); + w[30] = hc_byte_perm_S (w[ 7], w[ 6], selector); + w[29] = hc_byte_perm_S (w[ 6], w[ 5], selector); + w[28] = hc_byte_perm_S (w[ 5], w[ 4], selector); + w[27] = hc_byte_perm_S (w[ 4], w[ 3], selector); + w[26] = hc_byte_perm_S (w[ 3], w[ 2], selector); + w[25] = hc_byte_perm_S (w[ 2], w[ 1], selector); + w[24] = hc_byte_perm_S (w[ 1], w[ 0], selector); + w[23] = hc_byte_perm_S (w[ 0], 0, selector); w[22] = 0; w[21] = 0; w[20] = 0; @@ -57084,46 +57084,46 @@ DECLSPEC void switch_buffer_by_offset_1x64_be_S (u32 *w, const u32 offset) break; case 24: - w[63] = __byte_perm_S (w[39], w[38], selector); - w[62] = __byte_perm_S (w[38], w[37], selector); - w[61] = __byte_perm_S (w[37], w[36], selector); - w[60] = __byte_perm_S (w[36], w[35], selector); - w[59] = __byte_perm_S (w[35], w[34], selector); - w[58] = __byte_perm_S (w[34], w[33], selector); - w[57] = __byte_perm_S (w[33], w[32], selector); - w[56] = __byte_perm_S (w[32], w[31], selector); - w[55] = __byte_perm_S (w[31], w[30], selector); - w[54] = __byte_perm_S (w[30], w[29], selector); - w[53] = __byte_perm_S (w[29], w[28], selector); - w[52] = __byte_perm_S (w[28], w[27], selector); - w[51] = __byte_perm_S (w[27], w[26], selector); - w[50] = __byte_perm_S (w[26], w[25], selector); - w[49] = __byte_perm_S (w[25], w[24], selector); - w[48] = __byte_perm_S (w[24], w[23], selector); - w[47] = __byte_perm_S (w[23], w[22], selector); - w[46] = __byte_perm_S (w[22], w[21], selector); - w[45] = __byte_perm_S (w[21], w[20], selector); - w[44] = __byte_perm_S (w[20], w[19], selector); - w[43] = __byte_perm_S (w[19], w[18], selector); - w[42] = __byte_perm_S (w[18], w[17], selector); - w[41] = __byte_perm_S (w[17], w[16], selector); - w[40] = __byte_perm_S (w[16], w[15], selector); - w[39] = __byte_perm_S (w[15], w[14], selector); - w[38] = __byte_perm_S (w[14], w[13], selector); - w[37] = __byte_perm_S (w[13], w[12], selector); - w[36] = __byte_perm_S (w[12], w[11], selector); - w[35] = __byte_perm_S (w[11], w[10], selector); - w[34] = __byte_perm_S (w[10], w[ 9], selector); - w[33] = __byte_perm_S (w[ 9], w[ 8], selector); - w[32] = __byte_perm_S (w[ 8], w[ 7], selector); - w[31] = __byte_perm_S (w[ 7], w[ 6], selector); - w[30] = __byte_perm_S (w[ 6], w[ 5], selector); - w[29] = __byte_perm_S (w[ 5], w[ 4], selector); - w[28] = __byte_perm_S (w[ 4], w[ 3], selector); - w[27] = __byte_perm_S (w[ 3], w[ 2], selector); - w[26] = __byte_perm_S (w[ 2], w[ 1], selector); - w[25] = __byte_perm_S (w[ 1], w[ 0], selector); - w[24] = __byte_perm_S (w[ 0], 0, selector); + w[63] = hc_byte_perm_S (w[39], w[38], selector); + w[62] = hc_byte_perm_S (w[38], w[37], selector); + w[61] = hc_byte_perm_S (w[37], w[36], selector); + w[60] = hc_byte_perm_S (w[36], w[35], selector); + w[59] = hc_byte_perm_S (w[35], w[34], selector); + w[58] = hc_byte_perm_S (w[34], w[33], selector); + w[57] = hc_byte_perm_S (w[33], w[32], selector); + w[56] = hc_byte_perm_S (w[32], w[31], selector); + w[55] = hc_byte_perm_S (w[31], w[30], selector); + w[54] = hc_byte_perm_S (w[30], w[29], selector); + w[53] = hc_byte_perm_S (w[29], w[28], selector); + w[52] = hc_byte_perm_S (w[28], w[27], selector); + w[51] = hc_byte_perm_S (w[27], w[26], selector); + w[50] = hc_byte_perm_S (w[26], w[25], selector); + w[49] = hc_byte_perm_S (w[25], w[24], selector); + w[48] = hc_byte_perm_S (w[24], w[23], selector); + w[47] = hc_byte_perm_S (w[23], w[22], selector); + w[46] = hc_byte_perm_S (w[22], w[21], selector); + w[45] = hc_byte_perm_S (w[21], w[20], selector); + w[44] = hc_byte_perm_S (w[20], w[19], selector); + w[43] = hc_byte_perm_S (w[19], w[18], selector); + w[42] = hc_byte_perm_S (w[18], w[17], selector); + w[41] = hc_byte_perm_S (w[17], w[16], selector); + w[40] = hc_byte_perm_S (w[16], w[15], selector); + w[39] = hc_byte_perm_S (w[15], w[14], selector); + w[38] = hc_byte_perm_S (w[14], w[13], selector); + w[37] = hc_byte_perm_S (w[13], w[12], selector); + w[36] = hc_byte_perm_S (w[12], w[11], selector); + w[35] = hc_byte_perm_S (w[11], w[10], selector); + w[34] = hc_byte_perm_S (w[10], w[ 9], selector); + w[33] = hc_byte_perm_S (w[ 9], w[ 8], selector); + w[32] = hc_byte_perm_S (w[ 8], w[ 7], selector); + w[31] = hc_byte_perm_S (w[ 7], w[ 6], selector); + w[30] = hc_byte_perm_S (w[ 6], w[ 5], selector); + w[29] = hc_byte_perm_S (w[ 5], w[ 4], selector); + w[28] = hc_byte_perm_S (w[ 4], w[ 3], selector); + w[27] = hc_byte_perm_S (w[ 3], w[ 2], selector); + w[26] = hc_byte_perm_S (w[ 2], w[ 1], selector); + w[25] = hc_byte_perm_S (w[ 1], w[ 0], selector); + w[24] = hc_byte_perm_S (w[ 0], 0, selector); w[23] = 0; w[22] = 0; w[21] = 0; @@ -57152,45 +57152,45 @@ DECLSPEC void switch_buffer_by_offset_1x64_be_S (u32 *w, const u32 offset) break; case 25: - w[63] = __byte_perm_S (w[38], w[37], selector); - w[62] = __byte_perm_S (w[37], w[36], selector); - w[61] = __byte_perm_S (w[36], w[35], selector); - w[60] = __byte_perm_S (w[35], w[34], selector); - w[59] = __byte_perm_S (w[34], w[33], selector); - w[58] = __byte_perm_S (w[33], w[32], selector); - w[57] = __byte_perm_S (w[32], w[31], selector); - w[56] = __byte_perm_S (w[31], w[30], selector); - w[55] = __byte_perm_S (w[30], w[29], selector); - w[54] = __byte_perm_S (w[29], w[28], selector); - w[53] = __byte_perm_S (w[28], w[27], selector); - w[52] = __byte_perm_S (w[27], w[26], selector); - w[51] = __byte_perm_S (w[26], w[25], selector); - w[50] = __byte_perm_S (w[25], w[24], selector); - w[49] = __byte_perm_S (w[24], w[23], selector); - w[48] = __byte_perm_S (w[23], w[22], selector); - w[47] = __byte_perm_S (w[22], w[21], selector); - w[46] = __byte_perm_S (w[21], w[20], selector); - w[45] = __byte_perm_S (w[20], w[19], selector); - w[44] = __byte_perm_S (w[19], w[18], selector); - w[43] = __byte_perm_S (w[18], w[17], selector); - w[42] = __byte_perm_S (w[17], w[16], selector); - w[41] = __byte_perm_S (w[16], w[15], selector); - w[40] = __byte_perm_S (w[15], w[14], selector); - w[39] = __byte_perm_S (w[14], w[13], selector); - w[38] = __byte_perm_S (w[13], w[12], selector); - w[37] = __byte_perm_S (w[12], w[11], selector); - w[36] = __byte_perm_S (w[11], w[10], selector); - w[35] = __byte_perm_S (w[10], w[ 9], selector); - w[34] = __byte_perm_S (w[ 9], w[ 8], selector); - w[33] = __byte_perm_S (w[ 8], w[ 7], selector); - w[32] = __byte_perm_S (w[ 7], w[ 6], selector); - w[31] = __byte_perm_S (w[ 6], w[ 5], selector); - w[30] = __byte_perm_S (w[ 5], w[ 4], selector); - w[29] = __byte_perm_S (w[ 4], w[ 3], selector); - w[28] = __byte_perm_S (w[ 3], w[ 2], selector); - w[27] = __byte_perm_S (w[ 2], w[ 1], selector); - w[26] = __byte_perm_S (w[ 1], w[ 0], selector); - w[25] = __byte_perm_S (w[ 0], 0, selector); + w[63] = hc_byte_perm_S (w[38], w[37], selector); + w[62] = hc_byte_perm_S (w[37], w[36], selector); + w[61] = hc_byte_perm_S (w[36], w[35], selector); + w[60] = hc_byte_perm_S (w[35], w[34], selector); + w[59] = hc_byte_perm_S (w[34], w[33], selector); + w[58] = hc_byte_perm_S (w[33], w[32], selector); + w[57] = hc_byte_perm_S (w[32], w[31], selector); + w[56] = hc_byte_perm_S (w[31], w[30], selector); + w[55] = hc_byte_perm_S (w[30], w[29], selector); + w[54] = hc_byte_perm_S (w[29], w[28], selector); + w[53] = hc_byte_perm_S (w[28], w[27], selector); + w[52] = hc_byte_perm_S (w[27], w[26], selector); + w[51] = hc_byte_perm_S (w[26], w[25], selector); + w[50] = hc_byte_perm_S (w[25], w[24], selector); + w[49] = hc_byte_perm_S (w[24], w[23], selector); + w[48] = hc_byte_perm_S (w[23], w[22], selector); + w[47] = hc_byte_perm_S (w[22], w[21], selector); + w[46] = hc_byte_perm_S (w[21], w[20], selector); + w[45] = hc_byte_perm_S (w[20], w[19], selector); + w[44] = hc_byte_perm_S (w[19], w[18], selector); + w[43] = hc_byte_perm_S (w[18], w[17], selector); + w[42] = hc_byte_perm_S (w[17], w[16], selector); + w[41] = hc_byte_perm_S (w[16], w[15], selector); + w[40] = hc_byte_perm_S (w[15], w[14], selector); + w[39] = hc_byte_perm_S (w[14], w[13], selector); + w[38] = hc_byte_perm_S (w[13], w[12], selector); + w[37] = hc_byte_perm_S (w[12], w[11], selector); + w[36] = hc_byte_perm_S (w[11], w[10], selector); + w[35] = hc_byte_perm_S (w[10], w[ 9], selector); + w[34] = hc_byte_perm_S (w[ 9], w[ 8], selector); + w[33] = hc_byte_perm_S (w[ 8], w[ 7], selector); + w[32] = hc_byte_perm_S (w[ 7], w[ 6], selector); + w[31] = hc_byte_perm_S (w[ 6], w[ 5], selector); + w[30] = hc_byte_perm_S (w[ 5], w[ 4], selector); + w[29] = hc_byte_perm_S (w[ 4], w[ 3], selector); + w[28] = hc_byte_perm_S (w[ 3], w[ 2], selector); + w[27] = hc_byte_perm_S (w[ 2], w[ 1], selector); + w[26] = hc_byte_perm_S (w[ 1], w[ 0], selector); + w[25] = hc_byte_perm_S (w[ 0], 0, selector); w[24] = 0; w[23] = 0; w[22] = 0; @@ -57220,44 +57220,44 @@ DECLSPEC void switch_buffer_by_offset_1x64_be_S (u32 *w, const u32 offset) break; case 26: - w[63] = __byte_perm_S (w[37], w[36], selector); - w[62] = __byte_perm_S (w[36], w[35], selector); - w[61] = __byte_perm_S (w[35], w[34], selector); - w[60] = __byte_perm_S (w[34], w[33], selector); - w[59] = __byte_perm_S (w[33], w[32], selector); - w[58] = __byte_perm_S (w[32], w[31], selector); - w[57] = __byte_perm_S (w[31], w[30], selector); - w[56] = __byte_perm_S (w[30], w[29], selector); - w[55] = __byte_perm_S (w[29], w[28], selector); - w[54] = __byte_perm_S (w[28], w[27], selector); - w[53] = __byte_perm_S (w[27], w[26], selector); - w[52] = __byte_perm_S (w[26], w[25], selector); - w[51] = __byte_perm_S (w[25], w[24], selector); - w[50] = __byte_perm_S (w[24], w[23], selector); - w[49] = __byte_perm_S (w[23], w[22], selector); - w[48] = __byte_perm_S (w[22], w[21], selector); - w[47] = __byte_perm_S (w[21], w[20], selector); - w[46] = __byte_perm_S (w[20], w[19], selector); - w[45] = __byte_perm_S (w[19], w[18], selector); - w[44] = __byte_perm_S (w[18], w[17], selector); - w[43] = __byte_perm_S (w[17], w[16], selector); - w[42] = __byte_perm_S (w[16], w[15], selector); - w[41] = __byte_perm_S (w[15], w[14], selector); - w[40] = __byte_perm_S (w[14], w[13], selector); - w[39] = __byte_perm_S (w[13], w[12], selector); - w[38] = __byte_perm_S (w[12], w[11], selector); - w[37] = __byte_perm_S (w[11], w[10], selector); - w[36] = __byte_perm_S (w[10], w[ 9], selector); - w[35] = __byte_perm_S (w[ 9], w[ 8], selector); - w[34] = __byte_perm_S (w[ 8], w[ 7], selector); - w[33] = __byte_perm_S (w[ 7], w[ 6], selector); - w[32] = __byte_perm_S (w[ 6], w[ 5], selector); - w[31] = __byte_perm_S (w[ 5], w[ 4], selector); - w[30] = __byte_perm_S (w[ 4], w[ 3], selector); - w[29] = __byte_perm_S (w[ 3], w[ 2], selector); - w[28] = __byte_perm_S (w[ 2], w[ 1], selector); - w[27] = __byte_perm_S (w[ 1], w[ 0], selector); - w[26] = __byte_perm_S (w[ 0], 0, selector); + w[63] = hc_byte_perm_S (w[37], w[36], selector); + w[62] = hc_byte_perm_S (w[36], w[35], selector); + w[61] = hc_byte_perm_S (w[35], w[34], selector); + w[60] = hc_byte_perm_S (w[34], w[33], selector); + w[59] = hc_byte_perm_S (w[33], w[32], selector); + w[58] = hc_byte_perm_S (w[32], w[31], selector); + w[57] = hc_byte_perm_S (w[31], w[30], selector); + w[56] = hc_byte_perm_S (w[30], w[29], selector); + w[55] = hc_byte_perm_S (w[29], w[28], selector); + w[54] = hc_byte_perm_S (w[28], w[27], selector); + w[53] = hc_byte_perm_S (w[27], w[26], selector); + w[52] = hc_byte_perm_S (w[26], w[25], selector); + w[51] = hc_byte_perm_S (w[25], w[24], selector); + w[50] = hc_byte_perm_S (w[24], w[23], selector); + w[49] = hc_byte_perm_S (w[23], w[22], selector); + w[48] = hc_byte_perm_S (w[22], w[21], selector); + w[47] = hc_byte_perm_S (w[21], w[20], selector); + w[46] = hc_byte_perm_S (w[20], w[19], selector); + w[45] = hc_byte_perm_S (w[19], w[18], selector); + w[44] = hc_byte_perm_S (w[18], w[17], selector); + w[43] = hc_byte_perm_S (w[17], w[16], selector); + w[42] = hc_byte_perm_S (w[16], w[15], selector); + w[41] = hc_byte_perm_S (w[15], w[14], selector); + w[40] = hc_byte_perm_S (w[14], w[13], selector); + w[39] = hc_byte_perm_S (w[13], w[12], selector); + w[38] = hc_byte_perm_S (w[12], w[11], selector); + w[37] = hc_byte_perm_S (w[11], w[10], selector); + w[36] = hc_byte_perm_S (w[10], w[ 9], selector); + w[35] = hc_byte_perm_S (w[ 9], w[ 8], selector); + w[34] = hc_byte_perm_S (w[ 8], w[ 7], selector); + w[33] = hc_byte_perm_S (w[ 7], w[ 6], selector); + w[32] = hc_byte_perm_S (w[ 6], w[ 5], selector); + w[31] = hc_byte_perm_S (w[ 5], w[ 4], selector); + w[30] = hc_byte_perm_S (w[ 4], w[ 3], selector); + w[29] = hc_byte_perm_S (w[ 3], w[ 2], selector); + w[28] = hc_byte_perm_S (w[ 2], w[ 1], selector); + w[27] = hc_byte_perm_S (w[ 1], w[ 0], selector); + w[26] = hc_byte_perm_S (w[ 0], 0, selector); w[25] = 0; w[24] = 0; w[23] = 0; @@ -57288,43 +57288,43 @@ DECLSPEC void switch_buffer_by_offset_1x64_be_S (u32 *w, const u32 offset) break; case 27: - w[63] = __byte_perm_S (w[36], w[35], selector); - w[62] = __byte_perm_S (w[35], w[34], selector); - w[61] = __byte_perm_S (w[34], w[33], selector); - w[60] = __byte_perm_S (w[33], w[32], selector); - w[59] = __byte_perm_S (w[32], w[31], selector); - w[58] = __byte_perm_S (w[31], w[30], selector); - w[57] = __byte_perm_S (w[30], w[29], selector); - w[56] = __byte_perm_S (w[29], w[28], selector); - w[55] = __byte_perm_S (w[28], w[27], selector); - w[54] = __byte_perm_S (w[27], w[26], selector); - w[53] = __byte_perm_S (w[26], w[25], selector); - w[52] = __byte_perm_S (w[25], w[24], selector); - w[51] = __byte_perm_S (w[24], w[23], selector); - w[50] = __byte_perm_S (w[23], w[22], selector); - w[49] = __byte_perm_S (w[22], w[21], selector); - w[48] = __byte_perm_S (w[21], w[20], selector); - w[47] = __byte_perm_S (w[20], w[19], selector); - w[46] = __byte_perm_S (w[19], w[18], selector); - w[45] = __byte_perm_S (w[18], w[17], selector); - w[44] = __byte_perm_S (w[17], w[16], selector); - w[43] = __byte_perm_S (w[16], w[15], selector); - w[42] = __byte_perm_S (w[15], w[14], selector); - w[41] = __byte_perm_S (w[14], w[13], selector); - w[40] = __byte_perm_S (w[13], w[12], selector); - w[39] = __byte_perm_S (w[12], w[11], selector); - w[38] = __byte_perm_S (w[11], w[10], selector); - w[37] = __byte_perm_S (w[10], w[ 9], selector); - w[36] = __byte_perm_S (w[ 9], w[ 8], selector); - w[35] = __byte_perm_S (w[ 8], w[ 7], selector); - w[34] = __byte_perm_S (w[ 7], w[ 6], selector); - w[33] = __byte_perm_S (w[ 6], w[ 5], selector); - w[32] = __byte_perm_S (w[ 5], w[ 4], selector); - w[31] = __byte_perm_S (w[ 4], w[ 3], selector); - w[30] = __byte_perm_S (w[ 3], w[ 2], selector); - w[29] = __byte_perm_S (w[ 2], w[ 1], selector); - w[28] = __byte_perm_S (w[ 1], w[ 0], selector); - w[27] = __byte_perm_S (w[ 0], 0, selector); + w[63] = hc_byte_perm_S (w[36], w[35], selector); + w[62] = hc_byte_perm_S (w[35], w[34], selector); + w[61] = hc_byte_perm_S (w[34], w[33], selector); + w[60] = hc_byte_perm_S (w[33], w[32], selector); + w[59] = hc_byte_perm_S (w[32], w[31], selector); + w[58] = hc_byte_perm_S (w[31], w[30], selector); + w[57] = hc_byte_perm_S (w[30], w[29], selector); + w[56] = hc_byte_perm_S (w[29], w[28], selector); + w[55] = hc_byte_perm_S (w[28], w[27], selector); + w[54] = hc_byte_perm_S (w[27], w[26], selector); + w[53] = hc_byte_perm_S (w[26], w[25], selector); + w[52] = hc_byte_perm_S (w[25], w[24], selector); + w[51] = hc_byte_perm_S (w[24], w[23], selector); + w[50] = hc_byte_perm_S (w[23], w[22], selector); + w[49] = hc_byte_perm_S (w[22], w[21], selector); + w[48] = hc_byte_perm_S (w[21], w[20], selector); + w[47] = hc_byte_perm_S (w[20], w[19], selector); + w[46] = hc_byte_perm_S (w[19], w[18], selector); + w[45] = hc_byte_perm_S (w[18], w[17], selector); + w[44] = hc_byte_perm_S (w[17], w[16], selector); + w[43] = hc_byte_perm_S (w[16], w[15], selector); + w[42] = hc_byte_perm_S (w[15], w[14], selector); + w[41] = hc_byte_perm_S (w[14], w[13], selector); + w[40] = hc_byte_perm_S (w[13], w[12], selector); + w[39] = hc_byte_perm_S (w[12], w[11], selector); + w[38] = hc_byte_perm_S (w[11], w[10], selector); + w[37] = hc_byte_perm_S (w[10], w[ 9], selector); + w[36] = hc_byte_perm_S (w[ 9], w[ 8], selector); + w[35] = hc_byte_perm_S (w[ 8], w[ 7], selector); + w[34] = hc_byte_perm_S (w[ 7], w[ 6], selector); + w[33] = hc_byte_perm_S (w[ 6], w[ 5], selector); + w[32] = hc_byte_perm_S (w[ 5], w[ 4], selector); + w[31] = hc_byte_perm_S (w[ 4], w[ 3], selector); + w[30] = hc_byte_perm_S (w[ 3], w[ 2], selector); + w[29] = hc_byte_perm_S (w[ 2], w[ 1], selector); + w[28] = hc_byte_perm_S (w[ 1], w[ 0], selector); + w[27] = hc_byte_perm_S (w[ 0], 0, selector); w[26] = 0; w[25] = 0; w[24] = 0; @@ -57356,42 +57356,42 @@ DECLSPEC void switch_buffer_by_offset_1x64_be_S (u32 *w, const u32 offset) break; case 28: - w[63] = __byte_perm_S (w[35], w[34], selector); - w[62] = __byte_perm_S (w[34], w[33], selector); - w[61] = __byte_perm_S (w[33], w[32], selector); - w[60] = __byte_perm_S (w[32], w[31], selector); - w[59] = __byte_perm_S (w[31], w[30], selector); - w[58] = __byte_perm_S (w[30], w[29], selector); - w[57] = __byte_perm_S (w[29], w[28], selector); - w[56] = __byte_perm_S (w[28], w[27], selector); - w[55] = __byte_perm_S (w[27], w[26], selector); - w[54] = __byte_perm_S (w[26], w[25], selector); - w[53] = __byte_perm_S (w[25], w[24], selector); - w[52] = __byte_perm_S (w[24], w[23], selector); - w[51] = __byte_perm_S (w[23], w[22], selector); - w[50] = __byte_perm_S (w[22], w[21], selector); - w[49] = __byte_perm_S (w[21], w[20], selector); - w[48] = __byte_perm_S (w[20], w[19], selector); - w[47] = __byte_perm_S (w[19], w[18], selector); - w[46] = __byte_perm_S (w[18], w[17], selector); - w[45] = __byte_perm_S (w[17], w[16], selector); - w[44] = __byte_perm_S (w[16], w[15], selector); - w[43] = __byte_perm_S (w[15], w[14], selector); - w[42] = __byte_perm_S (w[14], w[13], selector); - w[41] = __byte_perm_S (w[13], w[12], selector); - w[40] = __byte_perm_S (w[12], w[11], selector); - w[39] = __byte_perm_S (w[11], w[10], selector); - w[38] = __byte_perm_S (w[10], w[ 9], selector); - w[37] = __byte_perm_S (w[ 9], w[ 8], selector); - w[36] = __byte_perm_S (w[ 8], w[ 7], selector); - w[35] = __byte_perm_S (w[ 7], w[ 6], selector); - w[34] = __byte_perm_S (w[ 6], w[ 5], selector); - w[33] = __byte_perm_S (w[ 5], w[ 4], selector); - w[32] = __byte_perm_S (w[ 4], w[ 3], selector); - w[31] = __byte_perm_S (w[ 3], w[ 2], selector); - w[30] = __byte_perm_S (w[ 2], w[ 1], selector); - w[29] = __byte_perm_S (w[ 1], w[ 0], selector); - w[28] = __byte_perm_S (w[ 0], 0, selector); + w[63] = hc_byte_perm_S (w[35], w[34], selector); + w[62] = hc_byte_perm_S (w[34], w[33], selector); + w[61] = hc_byte_perm_S (w[33], w[32], selector); + w[60] = hc_byte_perm_S (w[32], w[31], selector); + w[59] = hc_byte_perm_S (w[31], w[30], selector); + w[58] = hc_byte_perm_S (w[30], w[29], selector); + w[57] = hc_byte_perm_S (w[29], w[28], selector); + w[56] = hc_byte_perm_S (w[28], w[27], selector); + w[55] = hc_byte_perm_S (w[27], w[26], selector); + w[54] = hc_byte_perm_S (w[26], w[25], selector); + w[53] = hc_byte_perm_S (w[25], w[24], selector); + w[52] = hc_byte_perm_S (w[24], w[23], selector); + w[51] = hc_byte_perm_S (w[23], w[22], selector); + w[50] = hc_byte_perm_S (w[22], w[21], selector); + w[49] = hc_byte_perm_S (w[21], w[20], selector); + w[48] = hc_byte_perm_S (w[20], w[19], selector); + w[47] = hc_byte_perm_S (w[19], w[18], selector); + w[46] = hc_byte_perm_S (w[18], w[17], selector); + w[45] = hc_byte_perm_S (w[17], w[16], selector); + w[44] = hc_byte_perm_S (w[16], w[15], selector); + w[43] = hc_byte_perm_S (w[15], w[14], selector); + w[42] = hc_byte_perm_S (w[14], w[13], selector); + w[41] = hc_byte_perm_S (w[13], w[12], selector); + w[40] = hc_byte_perm_S (w[12], w[11], selector); + w[39] = hc_byte_perm_S (w[11], w[10], selector); + w[38] = hc_byte_perm_S (w[10], w[ 9], selector); + w[37] = hc_byte_perm_S (w[ 9], w[ 8], selector); + w[36] = hc_byte_perm_S (w[ 8], w[ 7], selector); + w[35] = hc_byte_perm_S (w[ 7], w[ 6], selector); + w[34] = hc_byte_perm_S (w[ 6], w[ 5], selector); + w[33] = hc_byte_perm_S (w[ 5], w[ 4], selector); + w[32] = hc_byte_perm_S (w[ 4], w[ 3], selector); + w[31] = hc_byte_perm_S (w[ 3], w[ 2], selector); + w[30] = hc_byte_perm_S (w[ 2], w[ 1], selector); + w[29] = hc_byte_perm_S (w[ 1], w[ 0], selector); + w[28] = hc_byte_perm_S (w[ 0], 0, selector); w[27] = 0; w[26] = 0; w[25] = 0; @@ -57424,41 +57424,41 @@ DECLSPEC void switch_buffer_by_offset_1x64_be_S (u32 *w, const u32 offset) break; case 29: - w[63] = __byte_perm_S (w[34], w[33], selector); - w[62] = __byte_perm_S (w[33], w[32], selector); - w[61] = __byte_perm_S (w[32], w[31], selector); - w[60] = __byte_perm_S (w[31], w[30], selector); - w[59] = __byte_perm_S (w[30], w[29], selector); - w[58] = __byte_perm_S (w[29], w[28], selector); - w[57] = __byte_perm_S (w[28], w[27], selector); - w[56] = __byte_perm_S (w[27], w[26], selector); - w[55] = __byte_perm_S (w[26], w[25], selector); - w[54] = __byte_perm_S (w[25], w[24], selector); - w[53] = __byte_perm_S (w[24], w[23], selector); - w[52] = __byte_perm_S (w[23], w[22], selector); - w[51] = __byte_perm_S (w[22], w[21], selector); - w[50] = __byte_perm_S (w[21], w[20], selector); - w[49] = __byte_perm_S (w[20], w[19], selector); - w[48] = __byte_perm_S (w[19], w[18], selector); - w[47] = __byte_perm_S (w[18], w[17], selector); - w[46] = __byte_perm_S (w[17], w[16], selector); - w[45] = __byte_perm_S (w[16], w[15], selector); - w[44] = __byte_perm_S (w[15], w[14], selector); - w[43] = __byte_perm_S (w[14], w[13], selector); - w[42] = __byte_perm_S (w[13], w[12], selector); - w[41] = __byte_perm_S (w[12], w[11], selector); - w[40] = __byte_perm_S (w[11], w[10], selector); - w[39] = __byte_perm_S (w[10], w[ 9], selector); - w[38] = __byte_perm_S (w[ 9], w[ 8], selector); - w[37] = __byte_perm_S (w[ 8], w[ 7], selector); - w[36] = __byte_perm_S (w[ 7], w[ 6], selector); - w[35] = __byte_perm_S (w[ 6], w[ 5], selector); - w[34] = __byte_perm_S (w[ 5], w[ 4], selector); - w[33] = __byte_perm_S (w[ 4], w[ 3], selector); - w[32] = __byte_perm_S (w[ 3], w[ 2], selector); - w[31] = __byte_perm_S (w[ 2], w[ 1], selector); - w[30] = __byte_perm_S (w[ 1], w[ 0], selector); - w[29] = __byte_perm_S (w[ 0], 0, selector); + w[63] = hc_byte_perm_S (w[34], w[33], selector); + w[62] = hc_byte_perm_S (w[33], w[32], selector); + w[61] = hc_byte_perm_S (w[32], w[31], selector); + w[60] = hc_byte_perm_S (w[31], w[30], selector); + w[59] = hc_byte_perm_S (w[30], w[29], selector); + w[58] = hc_byte_perm_S (w[29], w[28], selector); + w[57] = hc_byte_perm_S (w[28], w[27], selector); + w[56] = hc_byte_perm_S (w[27], w[26], selector); + w[55] = hc_byte_perm_S (w[26], w[25], selector); + w[54] = hc_byte_perm_S (w[25], w[24], selector); + w[53] = hc_byte_perm_S (w[24], w[23], selector); + w[52] = hc_byte_perm_S (w[23], w[22], selector); + w[51] = hc_byte_perm_S (w[22], w[21], selector); + w[50] = hc_byte_perm_S (w[21], w[20], selector); + w[49] = hc_byte_perm_S (w[20], w[19], selector); + w[48] = hc_byte_perm_S (w[19], w[18], selector); + w[47] = hc_byte_perm_S (w[18], w[17], selector); + w[46] = hc_byte_perm_S (w[17], w[16], selector); + w[45] = hc_byte_perm_S (w[16], w[15], selector); + w[44] = hc_byte_perm_S (w[15], w[14], selector); + w[43] = hc_byte_perm_S (w[14], w[13], selector); + w[42] = hc_byte_perm_S (w[13], w[12], selector); + w[41] = hc_byte_perm_S (w[12], w[11], selector); + w[40] = hc_byte_perm_S (w[11], w[10], selector); + w[39] = hc_byte_perm_S (w[10], w[ 9], selector); + w[38] = hc_byte_perm_S (w[ 9], w[ 8], selector); + w[37] = hc_byte_perm_S (w[ 8], w[ 7], selector); + w[36] = hc_byte_perm_S (w[ 7], w[ 6], selector); + w[35] = hc_byte_perm_S (w[ 6], w[ 5], selector); + w[34] = hc_byte_perm_S (w[ 5], w[ 4], selector); + w[33] = hc_byte_perm_S (w[ 4], w[ 3], selector); + w[32] = hc_byte_perm_S (w[ 3], w[ 2], selector); + w[31] = hc_byte_perm_S (w[ 2], w[ 1], selector); + w[30] = hc_byte_perm_S (w[ 1], w[ 0], selector); + w[29] = hc_byte_perm_S (w[ 0], 0, selector); w[28] = 0; w[27] = 0; w[26] = 0; @@ -57492,40 +57492,40 @@ DECLSPEC void switch_buffer_by_offset_1x64_be_S (u32 *w, const u32 offset) break; case 30: - w[63] = __byte_perm_S (w[33], w[32], selector); - w[62] = __byte_perm_S (w[32], w[31], selector); - w[61] = __byte_perm_S (w[31], w[30], selector); - w[60] = __byte_perm_S (w[30], w[29], selector); - w[59] = __byte_perm_S (w[29], w[28], selector); - w[58] = __byte_perm_S (w[28], w[27], selector); - w[57] = __byte_perm_S (w[27], w[26], selector); - w[56] = __byte_perm_S (w[26], w[25], selector); - w[55] = __byte_perm_S (w[25], w[24], selector); - w[54] = __byte_perm_S (w[24], w[23], selector); - w[53] = __byte_perm_S (w[23], w[22], selector); - w[52] = __byte_perm_S (w[22], w[21], selector); - w[51] = __byte_perm_S (w[21], w[20], selector); - w[50] = __byte_perm_S (w[20], w[19], selector); - w[49] = __byte_perm_S (w[19], w[18], selector); - w[48] = __byte_perm_S (w[18], w[17], selector); - w[47] = __byte_perm_S (w[17], w[16], selector); - w[46] = __byte_perm_S (w[16], w[15], selector); - w[45] = __byte_perm_S (w[15], w[14], selector); - w[44] = __byte_perm_S (w[14], w[13], selector); - w[43] = __byte_perm_S (w[13], w[12], selector); - w[42] = __byte_perm_S (w[12], w[11], selector); - w[41] = __byte_perm_S (w[11], w[10], selector); - w[40] = __byte_perm_S (w[10], w[ 9], selector); - w[39] = __byte_perm_S (w[ 9], w[ 8], selector); - w[38] = __byte_perm_S (w[ 8], w[ 7], selector); - w[37] = __byte_perm_S (w[ 7], w[ 6], selector); - w[36] = __byte_perm_S (w[ 6], w[ 5], selector); - w[35] = __byte_perm_S (w[ 5], w[ 4], selector); - w[34] = __byte_perm_S (w[ 4], w[ 3], selector); - w[33] = __byte_perm_S (w[ 3], w[ 2], selector); - w[32] = __byte_perm_S (w[ 2], w[ 1], selector); - w[31] = __byte_perm_S (w[ 1], w[ 0], selector); - w[30] = __byte_perm_S (w[ 0], 0, selector); + w[63] = hc_byte_perm_S (w[33], w[32], selector); + w[62] = hc_byte_perm_S (w[32], w[31], selector); + w[61] = hc_byte_perm_S (w[31], w[30], selector); + w[60] = hc_byte_perm_S (w[30], w[29], selector); + w[59] = hc_byte_perm_S (w[29], w[28], selector); + w[58] = hc_byte_perm_S (w[28], w[27], selector); + w[57] = hc_byte_perm_S (w[27], w[26], selector); + w[56] = hc_byte_perm_S (w[26], w[25], selector); + w[55] = hc_byte_perm_S (w[25], w[24], selector); + w[54] = hc_byte_perm_S (w[24], w[23], selector); + w[53] = hc_byte_perm_S (w[23], w[22], selector); + w[52] = hc_byte_perm_S (w[22], w[21], selector); + w[51] = hc_byte_perm_S (w[21], w[20], selector); + w[50] = hc_byte_perm_S (w[20], w[19], selector); + w[49] = hc_byte_perm_S (w[19], w[18], selector); + w[48] = hc_byte_perm_S (w[18], w[17], selector); + w[47] = hc_byte_perm_S (w[17], w[16], selector); + w[46] = hc_byte_perm_S (w[16], w[15], selector); + w[45] = hc_byte_perm_S (w[15], w[14], selector); + w[44] = hc_byte_perm_S (w[14], w[13], selector); + w[43] = hc_byte_perm_S (w[13], w[12], selector); + w[42] = hc_byte_perm_S (w[12], w[11], selector); + w[41] = hc_byte_perm_S (w[11], w[10], selector); + w[40] = hc_byte_perm_S (w[10], w[ 9], selector); + w[39] = hc_byte_perm_S (w[ 9], w[ 8], selector); + w[38] = hc_byte_perm_S (w[ 8], w[ 7], selector); + w[37] = hc_byte_perm_S (w[ 7], w[ 6], selector); + w[36] = hc_byte_perm_S (w[ 6], w[ 5], selector); + w[35] = hc_byte_perm_S (w[ 5], w[ 4], selector); + w[34] = hc_byte_perm_S (w[ 4], w[ 3], selector); + w[33] = hc_byte_perm_S (w[ 3], w[ 2], selector); + w[32] = hc_byte_perm_S (w[ 2], w[ 1], selector); + w[31] = hc_byte_perm_S (w[ 1], w[ 0], selector); + w[30] = hc_byte_perm_S (w[ 0], 0, selector); w[29] = 0; w[28] = 0; w[27] = 0; @@ -57560,39 +57560,39 @@ DECLSPEC void switch_buffer_by_offset_1x64_be_S (u32 *w, const u32 offset) break; case 31: - w[63] = __byte_perm_S (w[32], w[31], selector); - w[62] = __byte_perm_S (w[31], w[30], selector); - w[61] = __byte_perm_S (w[30], w[29], selector); - w[60] = __byte_perm_S (w[29], w[28], selector); - w[59] = __byte_perm_S (w[28], w[27], selector); - w[58] = __byte_perm_S (w[27], w[26], selector); - w[57] = __byte_perm_S (w[26], w[25], selector); - w[56] = __byte_perm_S (w[25], w[24], selector); - w[55] = __byte_perm_S (w[24], w[23], selector); - w[54] = __byte_perm_S (w[23], w[22], selector); - w[53] = __byte_perm_S (w[22], w[21], selector); - w[52] = __byte_perm_S (w[21], w[20], selector); - w[51] = __byte_perm_S (w[20], w[19], selector); - w[50] = __byte_perm_S (w[19], w[18], selector); - w[49] = __byte_perm_S (w[18], w[17], selector); - w[48] = __byte_perm_S (w[17], w[16], selector); - w[47] = __byte_perm_S (w[16], w[15], selector); - w[46] = __byte_perm_S (w[15], w[14], selector); - w[45] = __byte_perm_S (w[14], w[13], selector); - w[44] = __byte_perm_S (w[13], w[12], selector); - w[43] = __byte_perm_S (w[12], w[11], selector); - w[42] = __byte_perm_S (w[11], w[10], selector); - w[41] = __byte_perm_S (w[10], w[ 9], selector); - w[40] = __byte_perm_S (w[ 9], w[ 8], selector); - w[39] = __byte_perm_S (w[ 8], w[ 7], selector); - w[38] = __byte_perm_S (w[ 7], w[ 6], selector); - w[37] = __byte_perm_S (w[ 6], w[ 5], selector); - w[36] = __byte_perm_S (w[ 5], w[ 4], selector); - w[35] = __byte_perm_S (w[ 4], w[ 3], selector); - w[34] = __byte_perm_S (w[ 3], w[ 2], selector); - w[33] = __byte_perm_S (w[ 2], w[ 1], selector); - w[32] = __byte_perm_S (w[ 1], w[ 0], selector); - w[31] = __byte_perm_S (w[ 0], 0, selector); + w[63] = hc_byte_perm_S (w[32], w[31], selector); + w[62] = hc_byte_perm_S (w[31], w[30], selector); + w[61] = hc_byte_perm_S (w[30], w[29], selector); + w[60] = hc_byte_perm_S (w[29], w[28], selector); + w[59] = hc_byte_perm_S (w[28], w[27], selector); + w[58] = hc_byte_perm_S (w[27], w[26], selector); + w[57] = hc_byte_perm_S (w[26], w[25], selector); + w[56] = hc_byte_perm_S (w[25], w[24], selector); + w[55] = hc_byte_perm_S (w[24], w[23], selector); + w[54] = hc_byte_perm_S (w[23], w[22], selector); + w[53] = hc_byte_perm_S (w[22], w[21], selector); + w[52] = hc_byte_perm_S (w[21], w[20], selector); + w[51] = hc_byte_perm_S (w[20], w[19], selector); + w[50] = hc_byte_perm_S (w[19], w[18], selector); + w[49] = hc_byte_perm_S (w[18], w[17], selector); + w[48] = hc_byte_perm_S (w[17], w[16], selector); + w[47] = hc_byte_perm_S (w[16], w[15], selector); + w[46] = hc_byte_perm_S (w[15], w[14], selector); + w[45] = hc_byte_perm_S (w[14], w[13], selector); + w[44] = hc_byte_perm_S (w[13], w[12], selector); + w[43] = hc_byte_perm_S (w[12], w[11], selector); + w[42] = hc_byte_perm_S (w[11], w[10], selector); + w[41] = hc_byte_perm_S (w[10], w[ 9], selector); + w[40] = hc_byte_perm_S (w[ 9], w[ 8], selector); + w[39] = hc_byte_perm_S (w[ 8], w[ 7], selector); + w[38] = hc_byte_perm_S (w[ 7], w[ 6], selector); + w[37] = hc_byte_perm_S (w[ 6], w[ 5], selector); + w[36] = hc_byte_perm_S (w[ 5], w[ 4], selector); + w[35] = hc_byte_perm_S (w[ 4], w[ 3], selector); + w[34] = hc_byte_perm_S (w[ 3], w[ 2], selector); + w[33] = hc_byte_perm_S (w[ 2], w[ 1], selector); + w[32] = hc_byte_perm_S (w[ 1], w[ 0], selector); + w[31] = hc_byte_perm_S (w[ 0], 0, selector); w[30] = 0; w[29] = 0; w[28] = 0; @@ -57628,38 +57628,38 @@ DECLSPEC void switch_buffer_by_offset_1x64_be_S (u32 *w, const u32 offset) break; case 32: - w[63] = __byte_perm_S (w[31], w[30], selector); - w[62] = __byte_perm_S (w[30], w[29], selector); - w[61] = __byte_perm_S (w[29], w[28], selector); - w[60] = __byte_perm_S (w[28], w[27], selector); - w[59] = __byte_perm_S (w[27], w[26], selector); - w[58] = __byte_perm_S (w[26], w[25], selector); - w[57] = __byte_perm_S (w[25], w[24], selector); - w[56] = __byte_perm_S (w[24], w[23], selector); - w[55] = __byte_perm_S (w[23], w[22], selector); - w[54] = __byte_perm_S (w[22], w[21], selector); - w[53] = __byte_perm_S (w[21], w[20], selector); - w[52] = __byte_perm_S (w[20], w[19], selector); - w[51] = __byte_perm_S (w[19], w[18], selector); - w[50] = __byte_perm_S (w[18], w[17], selector); - w[49] = __byte_perm_S (w[17], w[16], selector); - w[48] = __byte_perm_S (w[16], w[15], selector); - w[47] = __byte_perm_S (w[15], w[14], selector); - w[46] = __byte_perm_S (w[14], w[13], selector); - w[45] = __byte_perm_S (w[13], w[12], selector); - w[44] = __byte_perm_S (w[12], w[11], selector); - w[43] = __byte_perm_S (w[11], w[10], selector); - w[42] = __byte_perm_S (w[10], w[ 9], selector); - w[41] = __byte_perm_S (w[ 9], w[ 8], selector); - w[40] = __byte_perm_S (w[ 8], w[ 7], selector); - w[39] = __byte_perm_S (w[ 7], w[ 6], selector); - w[38] = __byte_perm_S (w[ 6], w[ 5], selector); - w[37] = __byte_perm_S (w[ 5], w[ 4], selector); - w[36] = __byte_perm_S (w[ 4], w[ 3], selector); - w[35] = __byte_perm_S (w[ 3], w[ 2], selector); - w[34] = __byte_perm_S (w[ 2], w[ 1], selector); - w[33] = __byte_perm_S (w[ 1], w[ 0], selector); - w[32] = __byte_perm_S (w[ 0], 0, selector); + w[63] = hc_byte_perm_S (w[31], w[30], selector); + w[62] = hc_byte_perm_S (w[30], w[29], selector); + w[61] = hc_byte_perm_S (w[29], w[28], selector); + w[60] = hc_byte_perm_S (w[28], w[27], selector); + w[59] = hc_byte_perm_S (w[27], w[26], selector); + w[58] = hc_byte_perm_S (w[26], w[25], selector); + w[57] = hc_byte_perm_S (w[25], w[24], selector); + w[56] = hc_byte_perm_S (w[24], w[23], selector); + w[55] = hc_byte_perm_S (w[23], w[22], selector); + w[54] = hc_byte_perm_S (w[22], w[21], selector); + w[53] = hc_byte_perm_S (w[21], w[20], selector); + w[52] = hc_byte_perm_S (w[20], w[19], selector); + w[51] = hc_byte_perm_S (w[19], w[18], selector); + w[50] = hc_byte_perm_S (w[18], w[17], selector); + w[49] = hc_byte_perm_S (w[17], w[16], selector); + w[48] = hc_byte_perm_S (w[16], w[15], selector); + w[47] = hc_byte_perm_S (w[15], w[14], selector); + w[46] = hc_byte_perm_S (w[14], w[13], selector); + w[45] = hc_byte_perm_S (w[13], w[12], selector); + w[44] = hc_byte_perm_S (w[12], w[11], selector); + w[43] = hc_byte_perm_S (w[11], w[10], selector); + w[42] = hc_byte_perm_S (w[10], w[ 9], selector); + w[41] = hc_byte_perm_S (w[ 9], w[ 8], selector); + w[40] = hc_byte_perm_S (w[ 8], w[ 7], selector); + w[39] = hc_byte_perm_S (w[ 7], w[ 6], selector); + w[38] = hc_byte_perm_S (w[ 6], w[ 5], selector); + w[37] = hc_byte_perm_S (w[ 5], w[ 4], selector); + w[36] = hc_byte_perm_S (w[ 4], w[ 3], selector); + w[35] = hc_byte_perm_S (w[ 3], w[ 2], selector); + w[34] = hc_byte_perm_S (w[ 2], w[ 1], selector); + w[33] = hc_byte_perm_S (w[ 1], w[ 0], selector); + w[32] = hc_byte_perm_S (w[ 0], 0, selector); w[31] = 0; w[30] = 0; w[29] = 0; @@ -57696,37 +57696,37 @@ DECLSPEC void switch_buffer_by_offset_1x64_be_S (u32 *w, const u32 offset) break; case 33: - w[63] = __byte_perm_S (w[30], w[29], selector); - w[62] = __byte_perm_S (w[29], w[28], selector); - w[61] = __byte_perm_S (w[28], w[27], selector); - w[60] = __byte_perm_S (w[27], w[26], selector); - w[59] = __byte_perm_S (w[26], w[25], selector); - w[58] = __byte_perm_S (w[25], w[24], selector); - w[57] = __byte_perm_S (w[24], w[23], selector); - w[56] = __byte_perm_S (w[23], w[22], selector); - w[55] = __byte_perm_S (w[22], w[21], selector); - w[54] = __byte_perm_S (w[21], w[20], selector); - w[53] = __byte_perm_S (w[20], w[19], selector); - w[52] = __byte_perm_S (w[19], w[18], selector); - w[51] = __byte_perm_S (w[18], w[17], selector); - w[50] = __byte_perm_S (w[17], w[16], selector); - w[49] = __byte_perm_S (w[16], w[15], selector); - w[48] = __byte_perm_S (w[15], w[14], selector); - w[47] = __byte_perm_S (w[14], w[13], selector); - w[46] = __byte_perm_S (w[13], w[12], selector); - w[45] = __byte_perm_S (w[12], w[11], selector); - w[44] = __byte_perm_S (w[11], w[10], selector); - w[43] = __byte_perm_S (w[10], w[ 9], selector); - w[42] = __byte_perm_S (w[ 9], w[ 8], selector); - w[41] = __byte_perm_S (w[ 8], w[ 7], selector); - w[40] = __byte_perm_S (w[ 7], w[ 6], selector); - w[39] = __byte_perm_S (w[ 6], w[ 5], selector); - w[38] = __byte_perm_S (w[ 5], w[ 4], selector); - w[37] = __byte_perm_S (w[ 4], w[ 3], selector); - w[36] = __byte_perm_S (w[ 3], w[ 2], selector); - w[35] = __byte_perm_S (w[ 2], w[ 1], selector); - w[34] = __byte_perm_S (w[ 1], w[ 0], selector); - w[33] = __byte_perm_S (w[ 0], 0, selector); + w[63] = hc_byte_perm_S (w[30], w[29], selector); + w[62] = hc_byte_perm_S (w[29], w[28], selector); + w[61] = hc_byte_perm_S (w[28], w[27], selector); + w[60] = hc_byte_perm_S (w[27], w[26], selector); + w[59] = hc_byte_perm_S (w[26], w[25], selector); + w[58] = hc_byte_perm_S (w[25], w[24], selector); + w[57] = hc_byte_perm_S (w[24], w[23], selector); + w[56] = hc_byte_perm_S (w[23], w[22], selector); + w[55] = hc_byte_perm_S (w[22], w[21], selector); + w[54] = hc_byte_perm_S (w[21], w[20], selector); + w[53] = hc_byte_perm_S (w[20], w[19], selector); + w[52] = hc_byte_perm_S (w[19], w[18], selector); + w[51] = hc_byte_perm_S (w[18], w[17], selector); + w[50] = hc_byte_perm_S (w[17], w[16], selector); + w[49] = hc_byte_perm_S (w[16], w[15], selector); + w[48] = hc_byte_perm_S (w[15], w[14], selector); + w[47] = hc_byte_perm_S (w[14], w[13], selector); + w[46] = hc_byte_perm_S (w[13], w[12], selector); + w[45] = hc_byte_perm_S (w[12], w[11], selector); + w[44] = hc_byte_perm_S (w[11], w[10], selector); + w[43] = hc_byte_perm_S (w[10], w[ 9], selector); + w[42] = hc_byte_perm_S (w[ 9], w[ 8], selector); + w[41] = hc_byte_perm_S (w[ 8], w[ 7], selector); + w[40] = hc_byte_perm_S (w[ 7], w[ 6], selector); + w[39] = hc_byte_perm_S (w[ 6], w[ 5], selector); + w[38] = hc_byte_perm_S (w[ 5], w[ 4], selector); + w[37] = hc_byte_perm_S (w[ 4], w[ 3], selector); + w[36] = hc_byte_perm_S (w[ 3], w[ 2], selector); + w[35] = hc_byte_perm_S (w[ 2], w[ 1], selector); + w[34] = hc_byte_perm_S (w[ 1], w[ 0], selector); + w[33] = hc_byte_perm_S (w[ 0], 0, selector); w[32] = 0; w[31] = 0; w[30] = 0; @@ -57764,36 +57764,36 @@ DECLSPEC void switch_buffer_by_offset_1x64_be_S (u32 *w, const u32 offset) break; case 34: - w[63] = __byte_perm_S (w[29], w[28], selector); - w[62] = __byte_perm_S (w[28], w[27], selector); - w[61] = __byte_perm_S (w[27], w[26], selector); - w[60] = __byte_perm_S (w[26], w[25], selector); - w[59] = __byte_perm_S (w[25], w[24], selector); - w[58] = __byte_perm_S (w[24], w[23], selector); - w[57] = __byte_perm_S (w[23], w[22], selector); - w[56] = __byte_perm_S (w[22], w[21], selector); - w[55] = __byte_perm_S (w[21], w[20], selector); - w[54] = __byte_perm_S (w[20], w[19], selector); - w[53] = __byte_perm_S (w[19], w[18], selector); - w[52] = __byte_perm_S (w[18], w[17], selector); - w[51] = __byte_perm_S (w[17], w[16], selector); - w[50] = __byte_perm_S (w[16], w[15], selector); - w[49] = __byte_perm_S (w[15], w[14], selector); - w[48] = __byte_perm_S (w[14], w[13], selector); - w[47] = __byte_perm_S (w[13], w[12], selector); - w[46] = __byte_perm_S (w[12], w[11], selector); - w[45] = __byte_perm_S (w[11], w[10], selector); - w[44] = __byte_perm_S (w[10], w[ 9], selector); - w[43] = __byte_perm_S (w[ 9], w[ 8], selector); - w[42] = __byte_perm_S (w[ 8], w[ 7], selector); - w[41] = __byte_perm_S (w[ 7], w[ 6], selector); - w[40] = __byte_perm_S (w[ 6], w[ 5], selector); - w[39] = __byte_perm_S (w[ 5], w[ 4], selector); - w[38] = __byte_perm_S (w[ 4], w[ 3], selector); - w[37] = __byte_perm_S (w[ 3], w[ 2], selector); - w[36] = __byte_perm_S (w[ 2], w[ 1], selector); - w[35] = __byte_perm_S (w[ 1], w[ 0], selector); - w[34] = __byte_perm_S (w[ 0], 0, selector); + w[63] = hc_byte_perm_S (w[29], w[28], selector); + w[62] = hc_byte_perm_S (w[28], w[27], selector); + w[61] = hc_byte_perm_S (w[27], w[26], selector); + w[60] = hc_byte_perm_S (w[26], w[25], selector); + w[59] = hc_byte_perm_S (w[25], w[24], selector); + w[58] = hc_byte_perm_S (w[24], w[23], selector); + w[57] = hc_byte_perm_S (w[23], w[22], selector); + w[56] = hc_byte_perm_S (w[22], w[21], selector); + w[55] = hc_byte_perm_S (w[21], w[20], selector); + w[54] = hc_byte_perm_S (w[20], w[19], selector); + w[53] = hc_byte_perm_S (w[19], w[18], selector); + w[52] = hc_byte_perm_S (w[18], w[17], selector); + w[51] = hc_byte_perm_S (w[17], w[16], selector); + w[50] = hc_byte_perm_S (w[16], w[15], selector); + w[49] = hc_byte_perm_S (w[15], w[14], selector); + w[48] = hc_byte_perm_S (w[14], w[13], selector); + w[47] = hc_byte_perm_S (w[13], w[12], selector); + w[46] = hc_byte_perm_S (w[12], w[11], selector); + w[45] = hc_byte_perm_S (w[11], w[10], selector); + w[44] = hc_byte_perm_S (w[10], w[ 9], selector); + w[43] = hc_byte_perm_S (w[ 9], w[ 8], selector); + w[42] = hc_byte_perm_S (w[ 8], w[ 7], selector); + w[41] = hc_byte_perm_S (w[ 7], w[ 6], selector); + w[40] = hc_byte_perm_S (w[ 6], w[ 5], selector); + w[39] = hc_byte_perm_S (w[ 5], w[ 4], selector); + w[38] = hc_byte_perm_S (w[ 4], w[ 3], selector); + w[37] = hc_byte_perm_S (w[ 3], w[ 2], selector); + w[36] = hc_byte_perm_S (w[ 2], w[ 1], selector); + w[35] = hc_byte_perm_S (w[ 1], w[ 0], selector); + w[34] = hc_byte_perm_S (w[ 0], 0, selector); w[33] = 0; w[32] = 0; w[31] = 0; @@ -57832,35 +57832,35 @@ DECLSPEC void switch_buffer_by_offset_1x64_be_S (u32 *w, const u32 offset) break; case 35: - w[63] = __byte_perm_S (w[28], w[27], selector); - w[62] = __byte_perm_S (w[27], w[26], selector); - w[61] = __byte_perm_S (w[26], w[25], selector); - w[60] = __byte_perm_S (w[25], w[24], selector); - w[59] = __byte_perm_S (w[24], w[23], selector); - w[58] = __byte_perm_S (w[23], w[22], selector); - w[57] = __byte_perm_S (w[22], w[21], selector); - w[56] = __byte_perm_S (w[21], w[20], selector); - w[55] = __byte_perm_S (w[20], w[19], selector); - w[54] = __byte_perm_S (w[19], w[18], selector); - w[53] = __byte_perm_S (w[18], w[17], selector); - w[52] = __byte_perm_S (w[17], w[16], selector); - w[51] = __byte_perm_S (w[16], w[15], selector); - w[50] = __byte_perm_S (w[15], w[14], selector); - w[49] = __byte_perm_S (w[14], w[13], selector); - w[48] = __byte_perm_S (w[13], w[12], selector); - w[47] = __byte_perm_S (w[12], w[11], selector); - w[46] = __byte_perm_S (w[11], w[10], selector); - w[45] = __byte_perm_S (w[10], w[ 9], selector); - w[44] = __byte_perm_S (w[ 9], w[ 8], selector); - w[43] = __byte_perm_S (w[ 8], w[ 7], selector); - w[42] = __byte_perm_S (w[ 7], w[ 6], selector); - w[41] = __byte_perm_S (w[ 6], w[ 5], selector); - w[40] = __byte_perm_S (w[ 5], w[ 4], selector); - w[39] = __byte_perm_S (w[ 4], w[ 3], selector); - w[38] = __byte_perm_S (w[ 3], w[ 2], selector); - w[37] = __byte_perm_S (w[ 2], w[ 1], selector); - w[36] = __byte_perm_S (w[ 1], w[ 0], selector); - w[35] = __byte_perm_S (w[ 0], 0, selector); + w[63] = hc_byte_perm_S (w[28], w[27], selector); + w[62] = hc_byte_perm_S (w[27], w[26], selector); + w[61] = hc_byte_perm_S (w[26], w[25], selector); + w[60] = hc_byte_perm_S (w[25], w[24], selector); + w[59] = hc_byte_perm_S (w[24], w[23], selector); + w[58] = hc_byte_perm_S (w[23], w[22], selector); + w[57] = hc_byte_perm_S (w[22], w[21], selector); + w[56] = hc_byte_perm_S (w[21], w[20], selector); + w[55] = hc_byte_perm_S (w[20], w[19], selector); + w[54] = hc_byte_perm_S (w[19], w[18], selector); + w[53] = hc_byte_perm_S (w[18], w[17], selector); + w[52] = hc_byte_perm_S (w[17], w[16], selector); + w[51] = hc_byte_perm_S (w[16], w[15], selector); + w[50] = hc_byte_perm_S (w[15], w[14], selector); + w[49] = hc_byte_perm_S (w[14], w[13], selector); + w[48] = hc_byte_perm_S (w[13], w[12], selector); + w[47] = hc_byte_perm_S (w[12], w[11], selector); + w[46] = hc_byte_perm_S (w[11], w[10], selector); + w[45] = hc_byte_perm_S (w[10], w[ 9], selector); + w[44] = hc_byte_perm_S (w[ 9], w[ 8], selector); + w[43] = hc_byte_perm_S (w[ 8], w[ 7], selector); + w[42] = hc_byte_perm_S (w[ 7], w[ 6], selector); + w[41] = hc_byte_perm_S (w[ 6], w[ 5], selector); + w[40] = hc_byte_perm_S (w[ 5], w[ 4], selector); + w[39] = hc_byte_perm_S (w[ 4], w[ 3], selector); + w[38] = hc_byte_perm_S (w[ 3], w[ 2], selector); + w[37] = hc_byte_perm_S (w[ 2], w[ 1], selector); + w[36] = hc_byte_perm_S (w[ 1], w[ 0], selector); + w[35] = hc_byte_perm_S (w[ 0], 0, selector); w[34] = 0; w[33] = 0; w[32] = 0; @@ -57900,34 +57900,34 @@ DECLSPEC void switch_buffer_by_offset_1x64_be_S (u32 *w, const u32 offset) break; case 36: - w[63] = __byte_perm_S (w[27], w[26], selector); - w[62] = __byte_perm_S (w[26], w[25], selector); - w[61] = __byte_perm_S (w[25], w[24], selector); - w[60] = __byte_perm_S (w[24], w[23], selector); - w[59] = __byte_perm_S (w[23], w[22], selector); - w[58] = __byte_perm_S (w[22], w[21], selector); - w[57] = __byte_perm_S (w[21], w[20], selector); - w[56] = __byte_perm_S (w[20], w[19], selector); - w[55] = __byte_perm_S (w[19], w[18], selector); - w[54] = __byte_perm_S (w[18], w[17], selector); - w[53] = __byte_perm_S (w[17], w[16], selector); - w[52] = __byte_perm_S (w[16], w[15], selector); - w[51] = __byte_perm_S (w[15], w[14], selector); - w[50] = __byte_perm_S (w[14], w[13], selector); - w[49] = __byte_perm_S (w[13], w[12], selector); - w[48] = __byte_perm_S (w[12], w[11], selector); - w[47] = __byte_perm_S (w[11], w[10], selector); - w[46] = __byte_perm_S (w[10], w[ 9], selector); - w[45] = __byte_perm_S (w[ 9], w[ 8], selector); - w[44] = __byte_perm_S (w[ 8], w[ 7], selector); - w[43] = __byte_perm_S (w[ 7], w[ 6], selector); - w[42] = __byte_perm_S (w[ 6], w[ 5], selector); - w[41] = __byte_perm_S (w[ 5], w[ 4], selector); - w[40] = __byte_perm_S (w[ 4], w[ 3], selector); - w[39] = __byte_perm_S (w[ 3], w[ 2], selector); - w[38] = __byte_perm_S (w[ 2], w[ 1], selector); - w[37] = __byte_perm_S (w[ 1], w[ 0], selector); - w[36] = __byte_perm_S (w[ 0], 0, selector); + w[63] = hc_byte_perm_S (w[27], w[26], selector); + w[62] = hc_byte_perm_S (w[26], w[25], selector); + w[61] = hc_byte_perm_S (w[25], w[24], selector); + w[60] = hc_byte_perm_S (w[24], w[23], selector); + w[59] = hc_byte_perm_S (w[23], w[22], selector); + w[58] = hc_byte_perm_S (w[22], w[21], selector); + w[57] = hc_byte_perm_S (w[21], w[20], selector); + w[56] = hc_byte_perm_S (w[20], w[19], selector); + w[55] = hc_byte_perm_S (w[19], w[18], selector); + w[54] = hc_byte_perm_S (w[18], w[17], selector); + w[53] = hc_byte_perm_S (w[17], w[16], selector); + w[52] = hc_byte_perm_S (w[16], w[15], selector); + w[51] = hc_byte_perm_S (w[15], w[14], selector); + w[50] = hc_byte_perm_S (w[14], w[13], selector); + w[49] = hc_byte_perm_S (w[13], w[12], selector); + w[48] = hc_byte_perm_S (w[12], w[11], selector); + w[47] = hc_byte_perm_S (w[11], w[10], selector); + w[46] = hc_byte_perm_S (w[10], w[ 9], selector); + w[45] = hc_byte_perm_S (w[ 9], w[ 8], selector); + w[44] = hc_byte_perm_S (w[ 8], w[ 7], selector); + w[43] = hc_byte_perm_S (w[ 7], w[ 6], selector); + w[42] = hc_byte_perm_S (w[ 6], w[ 5], selector); + w[41] = hc_byte_perm_S (w[ 5], w[ 4], selector); + w[40] = hc_byte_perm_S (w[ 4], w[ 3], selector); + w[39] = hc_byte_perm_S (w[ 3], w[ 2], selector); + w[38] = hc_byte_perm_S (w[ 2], w[ 1], selector); + w[37] = hc_byte_perm_S (w[ 1], w[ 0], selector); + w[36] = hc_byte_perm_S (w[ 0], 0, selector); w[35] = 0; w[34] = 0; w[33] = 0; @@ -57968,33 +57968,33 @@ DECLSPEC void switch_buffer_by_offset_1x64_be_S (u32 *w, const u32 offset) break; case 37: - w[63] = __byte_perm_S (w[26], w[25], selector); - w[62] = __byte_perm_S (w[25], w[24], selector); - w[61] = __byte_perm_S (w[24], w[23], selector); - w[60] = __byte_perm_S (w[23], w[22], selector); - w[59] = __byte_perm_S (w[22], w[21], selector); - w[58] = __byte_perm_S (w[21], w[20], selector); - w[57] = __byte_perm_S (w[20], w[19], selector); - w[56] = __byte_perm_S (w[19], w[18], selector); - w[55] = __byte_perm_S (w[18], w[17], selector); - w[54] = __byte_perm_S (w[17], w[16], selector); - w[53] = __byte_perm_S (w[16], w[15], selector); - w[52] = __byte_perm_S (w[15], w[14], selector); - w[51] = __byte_perm_S (w[14], w[13], selector); - w[50] = __byte_perm_S (w[13], w[12], selector); - w[49] = __byte_perm_S (w[12], w[11], selector); - w[48] = __byte_perm_S (w[11], w[10], selector); - w[47] = __byte_perm_S (w[10], w[ 9], selector); - w[46] = __byte_perm_S (w[ 9], w[ 8], selector); - w[45] = __byte_perm_S (w[ 8], w[ 7], selector); - w[44] = __byte_perm_S (w[ 7], w[ 6], selector); - w[43] = __byte_perm_S (w[ 6], w[ 5], selector); - w[42] = __byte_perm_S (w[ 5], w[ 4], selector); - w[41] = __byte_perm_S (w[ 4], w[ 3], selector); - w[40] = __byte_perm_S (w[ 3], w[ 2], selector); - w[39] = __byte_perm_S (w[ 2], w[ 1], selector); - w[38] = __byte_perm_S (w[ 1], w[ 0], selector); - w[37] = __byte_perm_S (w[ 0], 0, selector); + w[63] = hc_byte_perm_S (w[26], w[25], selector); + w[62] = hc_byte_perm_S (w[25], w[24], selector); + w[61] = hc_byte_perm_S (w[24], w[23], selector); + w[60] = hc_byte_perm_S (w[23], w[22], selector); + w[59] = hc_byte_perm_S (w[22], w[21], selector); + w[58] = hc_byte_perm_S (w[21], w[20], selector); + w[57] = hc_byte_perm_S (w[20], w[19], selector); + w[56] = hc_byte_perm_S (w[19], w[18], selector); + w[55] = hc_byte_perm_S (w[18], w[17], selector); + w[54] = hc_byte_perm_S (w[17], w[16], selector); + w[53] = hc_byte_perm_S (w[16], w[15], selector); + w[52] = hc_byte_perm_S (w[15], w[14], selector); + w[51] = hc_byte_perm_S (w[14], w[13], selector); + w[50] = hc_byte_perm_S (w[13], w[12], selector); + w[49] = hc_byte_perm_S (w[12], w[11], selector); + w[48] = hc_byte_perm_S (w[11], w[10], selector); + w[47] = hc_byte_perm_S (w[10], w[ 9], selector); + w[46] = hc_byte_perm_S (w[ 9], w[ 8], selector); + w[45] = hc_byte_perm_S (w[ 8], w[ 7], selector); + w[44] = hc_byte_perm_S (w[ 7], w[ 6], selector); + w[43] = hc_byte_perm_S (w[ 6], w[ 5], selector); + w[42] = hc_byte_perm_S (w[ 5], w[ 4], selector); + w[41] = hc_byte_perm_S (w[ 4], w[ 3], selector); + w[40] = hc_byte_perm_S (w[ 3], w[ 2], selector); + w[39] = hc_byte_perm_S (w[ 2], w[ 1], selector); + w[38] = hc_byte_perm_S (w[ 1], w[ 0], selector); + w[37] = hc_byte_perm_S (w[ 0], 0, selector); w[36] = 0; w[35] = 0; w[34] = 0; @@ -58036,32 +58036,32 @@ DECLSPEC void switch_buffer_by_offset_1x64_be_S (u32 *w, const u32 offset) break; case 38: - w[63] = __byte_perm_S (w[25], w[24], selector); - w[62] = __byte_perm_S (w[24], w[23], selector); - w[61] = __byte_perm_S (w[23], w[22], selector); - w[60] = __byte_perm_S (w[22], w[21], selector); - w[59] = __byte_perm_S (w[21], w[20], selector); - w[58] = __byte_perm_S (w[20], w[19], selector); - w[57] = __byte_perm_S (w[19], w[18], selector); - w[56] = __byte_perm_S (w[18], w[17], selector); - w[55] = __byte_perm_S (w[17], w[16], selector); - w[54] = __byte_perm_S (w[16], w[15], selector); - w[53] = __byte_perm_S (w[15], w[14], selector); - w[52] = __byte_perm_S (w[14], w[13], selector); - w[51] = __byte_perm_S (w[13], w[12], selector); - w[50] = __byte_perm_S (w[12], w[11], selector); - w[49] = __byte_perm_S (w[11], w[10], selector); - w[48] = __byte_perm_S (w[10], w[ 9], selector); - w[47] = __byte_perm_S (w[ 9], w[ 8], selector); - w[46] = __byte_perm_S (w[ 8], w[ 7], selector); - w[45] = __byte_perm_S (w[ 7], w[ 6], selector); - w[44] = __byte_perm_S (w[ 6], w[ 5], selector); - w[43] = __byte_perm_S (w[ 5], w[ 4], selector); - w[42] = __byte_perm_S (w[ 4], w[ 3], selector); - w[41] = __byte_perm_S (w[ 3], w[ 2], selector); - w[40] = __byte_perm_S (w[ 2], w[ 1], selector); - w[39] = __byte_perm_S (w[ 1], w[ 0], selector); - w[38] = __byte_perm_S (w[ 0], 0, selector); + w[63] = hc_byte_perm_S (w[25], w[24], selector); + w[62] = hc_byte_perm_S (w[24], w[23], selector); + w[61] = hc_byte_perm_S (w[23], w[22], selector); + w[60] = hc_byte_perm_S (w[22], w[21], selector); + w[59] = hc_byte_perm_S (w[21], w[20], selector); + w[58] = hc_byte_perm_S (w[20], w[19], selector); + w[57] = hc_byte_perm_S (w[19], w[18], selector); + w[56] = hc_byte_perm_S (w[18], w[17], selector); + w[55] = hc_byte_perm_S (w[17], w[16], selector); + w[54] = hc_byte_perm_S (w[16], w[15], selector); + w[53] = hc_byte_perm_S (w[15], w[14], selector); + w[52] = hc_byte_perm_S (w[14], w[13], selector); + w[51] = hc_byte_perm_S (w[13], w[12], selector); + w[50] = hc_byte_perm_S (w[12], w[11], selector); + w[49] = hc_byte_perm_S (w[11], w[10], selector); + w[48] = hc_byte_perm_S (w[10], w[ 9], selector); + w[47] = hc_byte_perm_S (w[ 9], w[ 8], selector); + w[46] = hc_byte_perm_S (w[ 8], w[ 7], selector); + w[45] = hc_byte_perm_S (w[ 7], w[ 6], selector); + w[44] = hc_byte_perm_S (w[ 6], w[ 5], selector); + w[43] = hc_byte_perm_S (w[ 5], w[ 4], selector); + w[42] = hc_byte_perm_S (w[ 4], w[ 3], selector); + w[41] = hc_byte_perm_S (w[ 3], w[ 2], selector); + w[40] = hc_byte_perm_S (w[ 2], w[ 1], selector); + w[39] = hc_byte_perm_S (w[ 1], w[ 0], selector); + w[38] = hc_byte_perm_S (w[ 0], 0, selector); w[37] = 0; w[36] = 0; w[35] = 0; @@ -58104,31 +58104,31 @@ DECLSPEC void switch_buffer_by_offset_1x64_be_S (u32 *w, const u32 offset) break; case 39: - w[63] = __byte_perm_S (w[24], w[23], selector); - w[62] = __byte_perm_S (w[23], w[22], selector); - w[61] = __byte_perm_S (w[22], w[21], selector); - w[60] = __byte_perm_S (w[21], w[20], selector); - w[59] = __byte_perm_S (w[20], w[19], selector); - w[58] = __byte_perm_S (w[19], w[18], selector); - w[57] = __byte_perm_S (w[18], w[17], selector); - w[56] = __byte_perm_S (w[17], w[16], selector); - w[55] = __byte_perm_S (w[16], w[15], selector); - w[54] = __byte_perm_S (w[15], w[14], selector); - w[53] = __byte_perm_S (w[14], w[13], selector); - w[52] = __byte_perm_S (w[13], w[12], selector); - w[51] = __byte_perm_S (w[12], w[11], selector); - w[50] = __byte_perm_S (w[11], w[10], selector); - w[49] = __byte_perm_S (w[10], w[ 9], selector); - w[48] = __byte_perm_S (w[ 9], w[ 8], selector); - w[47] = __byte_perm_S (w[ 8], w[ 7], selector); - w[46] = __byte_perm_S (w[ 7], w[ 6], selector); - w[45] = __byte_perm_S (w[ 6], w[ 5], selector); - w[44] = __byte_perm_S (w[ 5], w[ 4], selector); - w[43] = __byte_perm_S (w[ 4], w[ 3], selector); - w[42] = __byte_perm_S (w[ 3], w[ 2], selector); - w[41] = __byte_perm_S (w[ 2], w[ 1], selector); - w[40] = __byte_perm_S (w[ 1], w[ 0], selector); - w[39] = __byte_perm_S (w[ 0], 0, selector); + w[63] = hc_byte_perm_S (w[24], w[23], selector); + w[62] = hc_byte_perm_S (w[23], w[22], selector); + w[61] = hc_byte_perm_S (w[22], w[21], selector); + w[60] = hc_byte_perm_S (w[21], w[20], selector); + w[59] = hc_byte_perm_S (w[20], w[19], selector); + w[58] = hc_byte_perm_S (w[19], w[18], selector); + w[57] = hc_byte_perm_S (w[18], w[17], selector); + w[56] = hc_byte_perm_S (w[17], w[16], selector); + w[55] = hc_byte_perm_S (w[16], w[15], selector); + w[54] = hc_byte_perm_S (w[15], w[14], selector); + w[53] = hc_byte_perm_S (w[14], w[13], selector); + w[52] = hc_byte_perm_S (w[13], w[12], selector); + w[51] = hc_byte_perm_S (w[12], w[11], selector); + w[50] = hc_byte_perm_S (w[11], w[10], selector); + w[49] = hc_byte_perm_S (w[10], w[ 9], selector); + w[48] = hc_byte_perm_S (w[ 9], w[ 8], selector); + w[47] = hc_byte_perm_S (w[ 8], w[ 7], selector); + w[46] = hc_byte_perm_S (w[ 7], w[ 6], selector); + w[45] = hc_byte_perm_S (w[ 6], w[ 5], selector); + w[44] = hc_byte_perm_S (w[ 5], w[ 4], selector); + w[43] = hc_byte_perm_S (w[ 4], w[ 3], selector); + w[42] = hc_byte_perm_S (w[ 3], w[ 2], selector); + w[41] = hc_byte_perm_S (w[ 2], w[ 1], selector); + w[40] = hc_byte_perm_S (w[ 1], w[ 0], selector); + w[39] = hc_byte_perm_S (w[ 0], 0, selector); w[38] = 0; w[37] = 0; w[36] = 0; @@ -58172,30 +58172,30 @@ DECLSPEC void switch_buffer_by_offset_1x64_be_S (u32 *w, const u32 offset) break; case 40: - w[63] = __byte_perm_S (w[23], w[22], selector); - w[62] = __byte_perm_S (w[22], w[21], selector); - w[61] = __byte_perm_S (w[21], w[20], selector); - w[60] = __byte_perm_S (w[20], w[19], selector); - w[59] = __byte_perm_S (w[19], w[18], selector); - w[58] = __byte_perm_S (w[18], w[17], selector); - w[57] = __byte_perm_S (w[17], w[16], selector); - w[56] = __byte_perm_S (w[16], w[15], selector); - w[55] = __byte_perm_S (w[15], w[14], selector); - w[54] = __byte_perm_S (w[14], w[13], selector); - w[53] = __byte_perm_S (w[13], w[12], selector); - w[52] = __byte_perm_S (w[12], w[11], selector); - w[51] = __byte_perm_S (w[11], w[10], selector); - w[50] = __byte_perm_S (w[10], w[ 9], selector); - w[49] = __byte_perm_S (w[ 9], w[ 8], selector); - w[48] = __byte_perm_S (w[ 8], w[ 7], selector); - w[47] = __byte_perm_S (w[ 7], w[ 6], selector); - w[46] = __byte_perm_S (w[ 6], w[ 5], selector); - w[45] = __byte_perm_S (w[ 5], w[ 4], selector); - w[44] = __byte_perm_S (w[ 4], w[ 3], selector); - w[43] = __byte_perm_S (w[ 3], w[ 2], selector); - w[42] = __byte_perm_S (w[ 2], w[ 1], selector); - w[41] = __byte_perm_S (w[ 1], w[ 0], selector); - w[40] = __byte_perm_S (w[ 0], 0, selector); + w[63] = hc_byte_perm_S (w[23], w[22], selector); + w[62] = hc_byte_perm_S (w[22], w[21], selector); + w[61] = hc_byte_perm_S (w[21], w[20], selector); + w[60] = hc_byte_perm_S (w[20], w[19], selector); + w[59] = hc_byte_perm_S (w[19], w[18], selector); + w[58] = hc_byte_perm_S (w[18], w[17], selector); + w[57] = hc_byte_perm_S (w[17], w[16], selector); + w[56] = hc_byte_perm_S (w[16], w[15], selector); + w[55] = hc_byte_perm_S (w[15], w[14], selector); + w[54] = hc_byte_perm_S (w[14], w[13], selector); + w[53] = hc_byte_perm_S (w[13], w[12], selector); + w[52] = hc_byte_perm_S (w[12], w[11], selector); + w[51] = hc_byte_perm_S (w[11], w[10], selector); + w[50] = hc_byte_perm_S (w[10], w[ 9], selector); + w[49] = hc_byte_perm_S (w[ 9], w[ 8], selector); + w[48] = hc_byte_perm_S (w[ 8], w[ 7], selector); + w[47] = hc_byte_perm_S (w[ 7], w[ 6], selector); + w[46] = hc_byte_perm_S (w[ 6], w[ 5], selector); + w[45] = hc_byte_perm_S (w[ 5], w[ 4], selector); + w[44] = hc_byte_perm_S (w[ 4], w[ 3], selector); + w[43] = hc_byte_perm_S (w[ 3], w[ 2], selector); + w[42] = hc_byte_perm_S (w[ 2], w[ 1], selector); + w[41] = hc_byte_perm_S (w[ 1], w[ 0], selector); + w[40] = hc_byte_perm_S (w[ 0], 0, selector); w[39] = 0; w[38] = 0; w[37] = 0; @@ -58240,29 +58240,29 @@ DECLSPEC void switch_buffer_by_offset_1x64_be_S (u32 *w, const u32 offset) break; case 41: - w[63] = __byte_perm_S (w[22], w[21], selector); - w[62] = __byte_perm_S (w[21], w[20], selector); - w[61] = __byte_perm_S (w[20], w[19], selector); - w[60] = __byte_perm_S (w[19], w[18], selector); - w[59] = __byte_perm_S (w[18], w[17], selector); - w[58] = __byte_perm_S (w[17], w[16], selector); - w[57] = __byte_perm_S (w[16], w[15], selector); - w[56] = __byte_perm_S (w[15], w[14], selector); - w[55] = __byte_perm_S (w[14], w[13], selector); - w[54] = __byte_perm_S (w[13], w[12], selector); - w[53] = __byte_perm_S (w[12], w[11], selector); - w[52] = __byte_perm_S (w[11], w[10], selector); - w[51] = __byte_perm_S (w[10], w[ 9], selector); - w[50] = __byte_perm_S (w[ 9], w[ 8], selector); - w[49] = __byte_perm_S (w[ 8], w[ 7], selector); - w[48] = __byte_perm_S (w[ 7], w[ 6], selector); - w[47] = __byte_perm_S (w[ 6], w[ 5], selector); - w[46] = __byte_perm_S (w[ 5], w[ 4], selector); - w[45] = __byte_perm_S (w[ 4], w[ 3], selector); - w[44] = __byte_perm_S (w[ 3], w[ 2], selector); - w[43] = __byte_perm_S (w[ 2], w[ 1], selector); - w[42] = __byte_perm_S (w[ 1], w[ 0], selector); - w[41] = __byte_perm_S (w[ 0], 0, selector); + w[63] = hc_byte_perm_S (w[22], w[21], selector); + w[62] = hc_byte_perm_S (w[21], w[20], selector); + w[61] = hc_byte_perm_S (w[20], w[19], selector); + w[60] = hc_byte_perm_S (w[19], w[18], selector); + w[59] = hc_byte_perm_S (w[18], w[17], selector); + w[58] = hc_byte_perm_S (w[17], w[16], selector); + w[57] = hc_byte_perm_S (w[16], w[15], selector); + w[56] = hc_byte_perm_S (w[15], w[14], selector); + w[55] = hc_byte_perm_S (w[14], w[13], selector); + w[54] = hc_byte_perm_S (w[13], w[12], selector); + w[53] = hc_byte_perm_S (w[12], w[11], selector); + w[52] = hc_byte_perm_S (w[11], w[10], selector); + w[51] = hc_byte_perm_S (w[10], w[ 9], selector); + w[50] = hc_byte_perm_S (w[ 9], w[ 8], selector); + w[49] = hc_byte_perm_S (w[ 8], w[ 7], selector); + w[48] = hc_byte_perm_S (w[ 7], w[ 6], selector); + w[47] = hc_byte_perm_S (w[ 6], w[ 5], selector); + w[46] = hc_byte_perm_S (w[ 5], w[ 4], selector); + w[45] = hc_byte_perm_S (w[ 4], w[ 3], selector); + w[44] = hc_byte_perm_S (w[ 3], w[ 2], selector); + w[43] = hc_byte_perm_S (w[ 2], w[ 1], selector); + w[42] = hc_byte_perm_S (w[ 1], w[ 0], selector); + w[41] = hc_byte_perm_S (w[ 0], 0, selector); w[40] = 0; w[39] = 0; w[38] = 0; @@ -58308,28 +58308,28 @@ DECLSPEC void switch_buffer_by_offset_1x64_be_S (u32 *w, const u32 offset) break; case 42: - w[63] = __byte_perm_S (w[21], w[20], selector); - w[62] = __byte_perm_S (w[20], w[19], selector); - w[61] = __byte_perm_S (w[19], w[18], selector); - w[60] = __byte_perm_S (w[18], w[17], selector); - w[59] = __byte_perm_S (w[17], w[16], selector); - w[58] = __byte_perm_S (w[16], w[15], selector); - w[57] = __byte_perm_S (w[15], w[14], selector); - w[56] = __byte_perm_S (w[14], w[13], selector); - w[55] = __byte_perm_S (w[13], w[12], selector); - w[54] = __byte_perm_S (w[12], w[11], selector); - w[53] = __byte_perm_S (w[11], w[10], selector); - w[52] = __byte_perm_S (w[10], w[ 9], selector); - w[51] = __byte_perm_S (w[ 9], w[ 8], selector); - w[50] = __byte_perm_S (w[ 8], w[ 7], selector); - w[49] = __byte_perm_S (w[ 7], w[ 6], selector); - w[48] = __byte_perm_S (w[ 6], w[ 5], selector); - w[47] = __byte_perm_S (w[ 5], w[ 4], selector); - w[46] = __byte_perm_S (w[ 4], w[ 3], selector); - w[45] = __byte_perm_S (w[ 3], w[ 2], selector); - w[44] = __byte_perm_S (w[ 2], w[ 1], selector); - w[43] = __byte_perm_S (w[ 1], w[ 0], selector); - w[42] = __byte_perm_S (w[ 0], 0, selector); + w[63] = hc_byte_perm_S (w[21], w[20], selector); + w[62] = hc_byte_perm_S (w[20], w[19], selector); + w[61] = hc_byte_perm_S (w[19], w[18], selector); + w[60] = hc_byte_perm_S (w[18], w[17], selector); + w[59] = hc_byte_perm_S (w[17], w[16], selector); + w[58] = hc_byte_perm_S (w[16], w[15], selector); + w[57] = hc_byte_perm_S (w[15], w[14], selector); + w[56] = hc_byte_perm_S (w[14], w[13], selector); + w[55] = hc_byte_perm_S (w[13], w[12], selector); + w[54] = hc_byte_perm_S (w[12], w[11], selector); + w[53] = hc_byte_perm_S (w[11], w[10], selector); + w[52] = hc_byte_perm_S (w[10], w[ 9], selector); + w[51] = hc_byte_perm_S (w[ 9], w[ 8], selector); + w[50] = hc_byte_perm_S (w[ 8], w[ 7], selector); + w[49] = hc_byte_perm_S (w[ 7], w[ 6], selector); + w[48] = hc_byte_perm_S (w[ 6], w[ 5], selector); + w[47] = hc_byte_perm_S (w[ 5], w[ 4], selector); + w[46] = hc_byte_perm_S (w[ 4], w[ 3], selector); + w[45] = hc_byte_perm_S (w[ 3], w[ 2], selector); + w[44] = hc_byte_perm_S (w[ 2], w[ 1], selector); + w[43] = hc_byte_perm_S (w[ 1], w[ 0], selector); + w[42] = hc_byte_perm_S (w[ 0], 0, selector); w[41] = 0; w[40] = 0; w[39] = 0; @@ -58376,27 +58376,27 @@ DECLSPEC void switch_buffer_by_offset_1x64_be_S (u32 *w, const u32 offset) break; case 43: - w[63] = __byte_perm_S (w[20], w[19], selector); - w[62] = __byte_perm_S (w[19], w[18], selector); - w[61] = __byte_perm_S (w[18], w[17], selector); - w[60] = __byte_perm_S (w[17], w[16], selector); - w[59] = __byte_perm_S (w[16], w[15], selector); - w[58] = __byte_perm_S (w[15], w[14], selector); - w[57] = __byte_perm_S (w[14], w[13], selector); - w[56] = __byte_perm_S (w[13], w[12], selector); - w[55] = __byte_perm_S (w[12], w[11], selector); - w[54] = __byte_perm_S (w[11], w[10], selector); - w[53] = __byte_perm_S (w[10], w[ 9], selector); - w[52] = __byte_perm_S (w[ 9], w[ 8], selector); - w[51] = __byte_perm_S (w[ 8], w[ 7], selector); - w[50] = __byte_perm_S (w[ 7], w[ 6], selector); - w[49] = __byte_perm_S (w[ 6], w[ 5], selector); - w[48] = __byte_perm_S (w[ 5], w[ 4], selector); - w[47] = __byte_perm_S (w[ 4], w[ 3], selector); - w[46] = __byte_perm_S (w[ 3], w[ 2], selector); - w[45] = __byte_perm_S (w[ 2], w[ 1], selector); - w[44] = __byte_perm_S (w[ 1], w[ 0], selector); - w[43] = __byte_perm_S (w[ 0], 0, selector); + w[63] = hc_byte_perm_S (w[20], w[19], selector); + w[62] = hc_byte_perm_S (w[19], w[18], selector); + w[61] = hc_byte_perm_S (w[18], w[17], selector); + w[60] = hc_byte_perm_S (w[17], w[16], selector); + w[59] = hc_byte_perm_S (w[16], w[15], selector); + w[58] = hc_byte_perm_S (w[15], w[14], selector); + w[57] = hc_byte_perm_S (w[14], w[13], selector); + w[56] = hc_byte_perm_S (w[13], w[12], selector); + w[55] = hc_byte_perm_S (w[12], w[11], selector); + w[54] = hc_byte_perm_S (w[11], w[10], selector); + w[53] = hc_byte_perm_S (w[10], w[ 9], selector); + w[52] = hc_byte_perm_S (w[ 9], w[ 8], selector); + w[51] = hc_byte_perm_S (w[ 8], w[ 7], selector); + w[50] = hc_byte_perm_S (w[ 7], w[ 6], selector); + w[49] = hc_byte_perm_S (w[ 6], w[ 5], selector); + w[48] = hc_byte_perm_S (w[ 5], w[ 4], selector); + w[47] = hc_byte_perm_S (w[ 4], w[ 3], selector); + w[46] = hc_byte_perm_S (w[ 3], w[ 2], selector); + w[45] = hc_byte_perm_S (w[ 2], w[ 1], selector); + w[44] = hc_byte_perm_S (w[ 1], w[ 0], selector); + w[43] = hc_byte_perm_S (w[ 0], 0, selector); w[42] = 0; w[41] = 0; w[40] = 0; @@ -58444,26 +58444,26 @@ DECLSPEC void switch_buffer_by_offset_1x64_be_S (u32 *w, const u32 offset) break; case 44: - w[63] = __byte_perm_S (w[19], w[18], selector); - w[62] = __byte_perm_S (w[18], w[17], selector); - w[61] = __byte_perm_S (w[17], w[16], selector); - w[60] = __byte_perm_S (w[16], w[15], selector); - w[59] = __byte_perm_S (w[15], w[14], selector); - w[58] = __byte_perm_S (w[14], w[13], selector); - w[57] = __byte_perm_S (w[13], w[12], selector); - w[56] = __byte_perm_S (w[12], w[11], selector); - w[55] = __byte_perm_S (w[11], w[10], selector); - w[54] = __byte_perm_S (w[10], w[ 9], selector); - w[53] = __byte_perm_S (w[ 9], w[ 8], selector); - w[52] = __byte_perm_S (w[ 8], w[ 7], selector); - w[51] = __byte_perm_S (w[ 7], w[ 6], selector); - w[50] = __byte_perm_S (w[ 6], w[ 5], selector); - w[49] = __byte_perm_S (w[ 5], w[ 4], selector); - w[48] = __byte_perm_S (w[ 4], w[ 3], selector); - w[47] = __byte_perm_S (w[ 3], w[ 2], selector); - w[46] = __byte_perm_S (w[ 2], w[ 1], selector); - w[45] = __byte_perm_S (w[ 1], w[ 0], selector); - w[44] = __byte_perm_S (w[ 0], 0, selector); + w[63] = hc_byte_perm_S (w[19], w[18], selector); + w[62] = hc_byte_perm_S (w[18], w[17], selector); + w[61] = hc_byte_perm_S (w[17], w[16], selector); + w[60] = hc_byte_perm_S (w[16], w[15], selector); + w[59] = hc_byte_perm_S (w[15], w[14], selector); + w[58] = hc_byte_perm_S (w[14], w[13], selector); + w[57] = hc_byte_perm_S (w[13], w[12], selector); + w[56] = hc_byte_perm_S (w[12], w[11], selector); + w[55] = hc_byte_perm_S (w[11], w[10], selector); + w[54] = hc_byte_perm_S (w[10], w[ 9], selector); + w[53] = hc_byte_perm_S (w[ 9], w[ 8], selector); + w[52] = hc_byte_perm_S (w[ 8], w[ 7], selector); + w[51] = hc_byte_perm_S (w[ 7], w[ 6], selector); + w[50] = hc_byte_perm_S (w[ 6], w[ 5], selector); + w[49] = hc_byte_perm_S (w[ 5], w[ 4], selector); + w[48] = hc_byte_perm_S (w[ 4], w[ 3], selector); + w[47] = hc_byte_perm_S (w[ 3], w[ 2], selector); + w[46] = hc_byte_perm_S (w[ 2], w[ 1], selector); + w[45] = hc_byte_perm_S (w[ 1], w[ 0], selector); + w[44] = hc_byte_perm_S (w[ 0], 0, selector); w[43] = 0; w[42] = 0; w[41] = 0; @@ -58512,25 +58512,25 @@ DECLSPEC void switch_buffer_by_offset_1x64_be_S (u32 *w, const u32 offset) break; case 45: - w[63] = __byte_perm_S (w[18], w[17], selector); - w[62] = __byte_perm_S (w[17], w[16], selector); - w[61] = __byte_perm_S (w[16], w[15], selector); - w[60] = __byte_perm_S (w[15], w[14], selector); - w[59] = __byte_perm_S (w[14], w[13], selector); - w[58] = __byte_perm_S (w[13], w[12], selector); - w[57] = __byte_perm_S (w[12], w[11], selector); - w[56] = __byte_perm_S (w[11], w[10], selector); - w[55] = __byte_perm_S (w[10], w[ 9], selector); - w[54] = __byte_perm_S (w[ 9], w[ 8], selector); - w[53] = __byte_perm_S (w[ 8], w[ 7], selector); - w[52] = __byte_perm_S (w[ 7], w[ 6], selector); - w[51] = __byte_perm_S (w[ 6], w[ 5], selector); - w[50] = __byte_perm_S (w[ 5], w[ 4], selector); - w[49] = __byte_perm_S (w[ 4], w[ 3], selector); - w[48] = __byte_perm_S (w[ 3], w[ 2], selector); - w[47] = __byte_perm_S (w[ 2], w[ 1], selector); - w[46] = __byte_perm_S (w[ 1], w[ 0], selector); - w[45] = __byte_perm_S (w[ 0], 0, selector); + w[63] = hc_byte_perm_S (w[18], w[17], selector); + w[62] = hc_byte_perm_S (w[17], w[16], selector); + w[61] = hc_byte_perm_S (w[16], w[15], selector); + w[60] = hc_byte_perm_S (w[15], w[14], selector); + w[59] = hc_byte_perm_S (w[14], w[13], selector); + w[58] = hc_byte_perm_S (w[13], w[12], selector); + w[57] = hc_byte_perm_S (w[12], w[11], selector); + w[56] = hc_byte_perm_S (w[11], w[10], selector); + w[55] = hc_byte_perm_S (w[10], w[ 9], selector); + w[54] = hc_byte_perm_S (w[ 9], w[ 8], selector); + w[53] = hc_byte_perm_S (w[ 8], w[ 7], selector); + w[52] = hc_byte_perm_S (w[ 7], w[ 6], selector); + w[51] = hc_byte_perm_S (w[ 6], w[ 5], selector); + w[50] = hc_byte_perm_S (w[ 5], w[ 4], selector); + w[49] = hc_byte_perm_S (w[ 4], w[ 3], selector); + w[48] = hc_byte_perm_S (w[ 3], w[ 2], selector); + w[47] = hc_byte_perm_S (w[ 2], w[ 1], selector); + w[46] = hc_byte_perm_S (w[ 1], w[ 0], selector); + w[45] = hc_byte_perm_S (w[ 0], 0, selector); w[44] = 0; w[43] = 0; w[42] = 0; @@ -58580,24 +58580,24 @@ DECLSPEC void switch_buffer_by_offset_1x64_be_S (u32 *w, const u32 offset) break; case 46: - w[63] = __byte_perm_S (w[17], w[16], selector); - w[62] = __byte_perm_S (w[16], w[15], selector); - w[61] = __byte_perm_S (w[15], w[14], selector); - w[60] = __byte_perm_S (w[14], w[13], selector); - w[59] = __byte_perm_S (w[13], w[12], selector); - w[58] = __byte_perm_S (w[12], w[11], selector); - w[57] = __byte_perm_S (w[11], w[10], selector); - w[56] = __byte_perm_S (w[10], w[ 9], selector); - w[55] = __byte_perm_S (w[ 9], w[ 8], selector); - w[54] = __byte_perm_S (w[ 8], w[ 7], selector); - w[53] = __byte_perm_S (w[ 7], w[ 6], selector); - w[52] = __byte_perm_S (w[ 6], w[ 5], selector); - w[51] = __byte_perm_S (w[ 5], w[ 4], selector); - w[50] = __byte_perm_S (w[ 4], w[ 3], selector); - w[49] = __byte_perm_S (w[ 3], w[ 2], selector); - w[48] = __byte_perm_S (w[ 2], w[ 1], selector); - w[47] = __byte_perm_S (w[ 1], w[ 0], selector); - w[46] = __byte_perm_S (w[ 0], 0, selector); + w[63] = hc_byte_perm_S (w[17], w[16], selector); + w[62] = hc_byte_perm_S (w[16], w[15], selector); + w[61] = hc_byte_perm_S (w[15], w[14], selector); + w[60] = hc_byte_perm_S (w[14], w[13], selector); + w[59] = hc_byte_perm_S (w[13], w[12], selector); + w[58] = hc_byte_perm_S (w[12], w[11], selector); + w[57] = hc_byte_perm_S (w[11], w[10], selector); + w[56] = hc_byte_perm_S (w[10], w[ 9], selector); + w[55] = hc_byte_perm_S (w[ 9], w[ 8], selector); + w[54] = hc_byte_perm_S (w[ 8], w[ 7], selector); + w[53] = hc_byte_perm_S (w[ 7], w[ 6], selector); + w[52] = hc_byte_perm_S (w[ 6], w[ 5], selector); + w[51] = hc_byte_perm_S (w[ 5], w[ 4], selector); + w[50] = hc_byte_perm_S (w[ 4], w[ 3], selector); + w[49] = hc_byte_perm_S (w[ 3], w[ 2], selector); + w[48] = hc_byte_perm_S (w[ 2], w[ 1], selector); + w[47] = hc_byte_perm_S (w[ 1], w[ 0], selector); + w[46] = hc_byte_perm_S (w[ 0], 0, selector); w[45] = 0; w[44] = 0; w[43] = 0; @@ -58648,23 +58648,23 @@ DECLSPEC void switch_buffer_by_offset_1x64_be_S (u32 *w, const u32 offset) break; case 47: - w[63] = __byte_perm_S (w[16], w[15], selector); - w[62] = __byte_perm_S (w[15], w[14], selector); - w[61] = __byte_perm_S (w[14], w[13], selector); - w[60] = __byte_perm_S (w[13], w[12], selector); - w[59] = __byte_perm_S (w[12], w[11], selector); - w[58] = __byte_perm_S (w[11], w[10], selector); - w[57] = __byte_perm_S (w[10], w[ 9], selector); - w[56] = __byte_perm_S (w[ 9], w[ 8], selector); - w[55] = __byte_perm_S (w[ 8], w[ 7], selector); - w[54] = __byte_perm_S (w[ 7], w[ 6], selector); - w[53] = __byte_perm_S (w[ 6], w[ 5], selector); - w[52] = __byte_perm_S (w[ 5], w[ 4], selector); - w[51] = __byte_perm_S (w[ 4], w[ 3], selector); - w[50] = __byte_perm_S (w[ 3], w[ 2], selector); - w[49] = __byte_perm_S (w[ 2], w[ 1], selector); - w[48] = __byte_perm_S (w[ 1], w[ 0], selector); - w[47] = __byte_perm_S (w[ 0], 0, selector); + w[63] = hc_byte_perm_S (w[16], w[15], selector); + w[62] = hc_byte_perm_S (w[15], w[14], selector); + w[61] = hc_byte_perm_S (w[14], w[13], selector); + w[60] = hc_byte_perm_S (w[13], w[12], selector); + w[59] = hc_byte_perm_S (w[12], w[11], selector); + w[58] = hc_byte_perm_S (w[11], w[10], selector); + w[57] = hc_byte_perm_S (w[10], w[ 9], selector); + w[56] = hc_byte_perm_S (w[ 9], w[ 8], selector); + w[55] = hc_byte_perm_S (w[ 8], w[ 7], selector); + w[54] = hc_byte_perm_S (w[ 7], w[ 6], selector); + w[53] = hc_byte_perm_S (w[ 6], w[ 5], selector); + w[52] = hc_byte_perm_S (w[ 5], w[ 4], selector); + w[51] = hc_byte_perm_S (w[ 4], w[ 3], selector); + w[50] = hc_byte_perm_S (w[ 3], w[ 2], selector); + w[49] = hc_byte_perm_S (w[ 2], w[ 1], selector); + w[48] = hc_byte_perm_S (w[ 1], w[ 0], selector); + w[47] = hc_byte_perm_S (w[ 0], 0, selector); w[46] = 0; w[45] = 0; w[44] = 0; @@ -58716,22 +58716,22 @@ DECLSPEC void switch_buffer_by_offset_1x64_be_S (u32 *w, const u32 offset) break; case 48: - w[63] = __byte_perm_S (w[15], w[14], selector); - w[62] = __byte_perm_S (w[14], w[13], selector); - w[61] = __byte_perm_S (w[13], w[12], selector); - w[60] = __byte_perm_S (w[12], w[11], selector); - w[59] = __byte_perm_S (w[11], w[10], selector); - w[58] = __byte_perm_S (w[10], w[ 9], selector); - w[57] = __byte_perm_S (w[ 9], w[ 8], selector); - w[56] = __byte_perm_S (w[ 8], w[ 7], selector); - w[55] = __byte_perm_S (w[ 7], w[ 6], selector); - w[54] = __byte_perm_S (w[ 6], w[ 5], selector); - w[53] = __byte_perm_S (w[ 5], w[ 4], selector); - w[52] = __byte_perm_S (w[ 4], w[ 3], selector); - w[51] = __byte_perm_S (w[ 3], w[ 2], selector); - w[50] = __byte_perm_S (w[ 2], w[ 1], selector); - w[49] = __byte_perm_S (w[ 1], w[ 0], selector); - w[48] = __byte_perm_S (w[ 0], 0, selector); + w[63] = hc_byte_perm_S (w[15], w[14], selector); + w[62] = hc_byte_perm_S (w[14], w[13], selector); + w[61] = hc_byte_perm_S (w[13], w[12], selector); + w[60] = hc_byte_perm_S (w[12], w[11], selector); + w[59] = hc_byte_perm_S (w[11], w[10], selector); + w[58] = hc_byte_perm_S (w[10], w[ 9], selector); + w[57] = hc_byte_perm_S (w[ 9], w[ 8], selector); + w[56] = hc_byte_perm_S (w[ 8], w[ 7], selector); + w[55] = hc_byte_perm_S (w[ 7], w[ 6], selector); + w[54] = hc_byte_perm_S (w[ 6], w[ 5], selector); + w[53] = hc_byte_perm_S (w[ 5], w[ 4], selector); + w[52] = hc_byte_perm_S (w[ 4], w[ 3], selector); + w[51] = hc_byte_perm_S (w[ 3], w[ 2], selector); + w[50] = hc_byte_perm_S (w[ 2], w[ 1], selector); + w[49] = hc_byte_perm_S (w[ 1], w[ 0], selector); + w[48] = hc_byte_perm_S (w[ 0], 0, selector); w[47] = 0; w[46] = 0; w[45] = 0; @@ -58784,21 +58784,21 @@ DECLSPEC void switch_buffer_by_offset_1x64_be_S (u32 *w, const u32 offset) break; case 49: - w[63] = __byte_perm_S (w[14], w[13], selector); - w[62] = __byte_perm_S (w[13], w[12], selector); - w[61] = __byte_perm_S (w[12], w[11], selector); - w[60] = __byte_perm_S (w[11], w[10], selector); - w[59] = __byte_perm_S (w[10], w[ 9], selector); - w[58] = __byte_perm_S (w[ 9], w[ 8], selector); - w[57] = __byte_perm_S (w[ 8], w[ 7], selector); - w[56] = __byte_perm_S (w[ 7], w[ 6], selector); - w[55] = __byte_perm_S (w[ 6], w[ 5], selector); - w[54] = __byte_perm_S (w[ 5], w[ 4], selector); - w[53] = __byte_perm_S (w[ 4], w[ 3], selector); - w[52] = __byte_perm_S (w[ 3], w[ 2], selector); - w[51] = __byte_perm_S (w[ 2], w[ 1], selector); - w[50] = __byte_perm_S (w[ 1], w[ 0], selector); - w[49] = __byte_perm_S (w[ 0], 0, selector); + w[63] = hc_byte_perm_S (w[14], w[13], selector); + w[62] = hc_byte_perm_S (w[13], w[12], selector); + w[61] = hc_byte_perm_S (w[12], w[11], selector); + w[60] = hc_byte_perm_S (w[11], w[10], selector); + w[59] = hc_byte_perm_S (w[10], w[ 9], selector); + w[58] = hc_byte_perm_S (w[ 9], w[ 8], selector); + w[57] = hc_byte_perm_S (w[ 8], w[ 7], selector); + w[56] = hc_byte_perm_S (w[ 7], w[ 6], selector); + w[55] = hc_byte_perm_S (w[ 6], w[ 5], selector); + w[54] = hc_byte_perm_S (w[ 5], w[ 4], selector); + w[53] = hc_byte_perm_S (w[ 4], w[ 3], selector); + w[52] = hc_byte_perm_S (w[ 3], w[ 2], selector); + w[51] = hc_byte_perm_S (w[ 2], w[ 1], selector); + w[50] = hc_byte_perm_S (w[ 1], w[ 0], selector); + w[49] = hc_byte_perm_S (w[ 0], 0, selector); w[48] = 0; w[47] = 0; w[46] = 0; @@ -58852,20 +58852,20 @@ DECLSPEC void switch_buffer_by_offset_1x64_be_S (u32 *w, const u32 offset) break; case 50: - w[63] = __byte_perm_S (w[13], w[12], selector); - w[62] = __byte_perm_S (w[12], w[11], selector); - w[61] = __byte_perm_S (w[11], w[10], selector); - w[60] = __byte_perm_S (w[10], w[ 9], selector); - w[59] = __byte_perm_S (w[ 9], w[ 8], selector); - w[58] = __byte_perm_S (w[ 8], w[ 7], selector); - w[57] = __byte_perm_S (w[ 7], w[ 6], selector); - w[56] = __byte_perm_S (w[ 6], w[ 5], selector); - w[55] = __byte_perm_S (w[ 5], w[ 4], selector); - w[54] = __byte_perm_S (w[ 4], w[ 3], selector); - w[53] = __byte_perm_S (w[ 3], w[ 2], selector); - w[52] = __byte_perm_S (w[ 2], w[ 1], selector); - w[51] = __byte_perm_S (w[ 1], w[ 0], selector); - w[50] = __byte_perm_S (w[ 0], 0, selector); + w[63] = hc_byte_perm_S (w[13], w[12], selector); + w[62] = hc_byte_perm_S (w[12], w[11], selector); + w[61] = hc_byte_perm_S (w[11], w[10], selector); + w[60] = hc_byte_perm_S (w[10], w[ 9], selector); + w[59] = hc_byte_perm_S (w[ 9], w[ 8], selector); + w[58] = hc_byte_perm_S (w[ 8], w[ 7], selector); + w[57] = hc_byte_perm_S (w[ 7], w[ 6], selector); + w[56] = hc_byte_perm_S (w[ 6], w[ 5], selector); + w[55] = hc_byte_perm_S (w[ 5], w[ 4], selector); + w[54] = hc_byte_perm_S (w[ 4], w[ 3], selector); + w[53] = hc_byte_perm_S (w[ 3], w[ 2], selector); + w[52] = hc_byte_perm_S (w[ 2], w[ 1], selector); + w[51] = hc_byte_perm_S (w[ 1], w[ 0], selector); + w[50] = hc_byte_perm_S (w[ 0], 0, selector); w[49] = 0; w[48] = 0; w[47] = 0; @@ -58920,19 +58920,19 @@ DECLSPEC void switch_buffer_by_offset_1x64_be_S (u32 *w, const u32 offset) break; case 51: - w[63] = __byte_perm_S (w[12], w[11], selector); - w[62] = __byte_perm_S (w[11], w[10], selector); - w[61] = __byte_perm_S (w[10], w[ 9], selector); - w[60] = __byte_perm_S (w[ 9], w[ 8], selector); - w[59] = __byte_perm_S (w[ 8], w[ 7], selector); - w[58] = __byte_perm_S (w[ 7], w[ 6], selector); - w[57] = __byte_perm_S (w[ 6], w[ 5], selector); - w[56] = __byte_perm_S (w[ 5], w[ 4], selector); - w[55] = __byte_perm_S (w[ 4], w[ 3], selector); - w[54] = __byte_perm_S (w[ 3], w[ 2], selector); - w[53] = __byte_perm_S (w[ 2], w[ 1], selector); - w[52] = __byte_perm_S (w[ 1], w[ 0], selector); - w[51] = __byte_perm_S (w[ 0], 0, selector); + w[63] = hc_byte_perm_S (w[12], w[11], selector); + w[62] = hc_byte_perm_S (w[11], w[10], selector); + w[61] = hc_byte_perm_S (w[10], w[ 9], selector); + w[60] = hc_byte_perm_S (w[ 9], w[ 8], selector); + w[59] = hc_byte_perm_S (w[ 8], w[ 7], selector); + w[58] = hc_byte_perm_S (w[ 7], w[ 6], selector); + w[57] = hc_byte_perm_S (w[ 6], w[ 5], selector); + w[56] = hc_byte_perm_S (w[ 5], w[ 4], selector); + w[55] = hc_byte_perm_S (w[ 4], w[ 3], selector); + w[54] = hc_byte_perm_S (w[ 3], w[ 2], selector); + w[53] = hc_byte_perm_S (w[ 2], w[ 1], selector); + w[52] = hc_byte_perm_S (w[ 1], w[ 0], selector); + w[51] = hc_byte_perm_S (w[ 0], 0, selector); w[50] = 0; w[49] = 0; w[48] = 0; @@ -58988,18 +58988,18 @@ DECLSPEC void switch_buffer_by_offset_1x64_be_S (u32 *w, const u32 offset) break; case 52: - w[63] = __byte_perm_S (w[11], w[10], selector); - w[62] = __byte_perm_S (w[10], w[ 9], selector); - w[61] = __byte_perm_S (w[ 9], w[ 8], selector); - w[60] = __byte_perm_S (w[ 8], w[ 7], selector); - w[59] = __byte_perm_S (w[ 7], w[ 6], selector); - w[58] = __byte_perm_S (w[ 6], w[ 5], selector); - w[57] = __byte_perm_S (w[ 5], w[ 4], selector); - w[56] = __byte_perm_S (w[ 4], w[ 3], selector); - w[55] = __byte_perm_S (w[ 3], w[ 2], selector); - w[54] = __byte_perm_S (w[ 2], w[ 1], selector); - w[53] = __byte_perm_S (w[ 1], w[ 0], selector); - w[52] = __byte_perm_S (w[ 0], 0, selector); + w[63] = hc_byte_perm_S (w[11], w[10], selector); + w[62] = hc_byte_perm_S (w[10], w[ 9], selector); + w[61] = hc_byte_perm_S (w[ 9], w[ 8], selector); + w[60] = hc_byte_perm_S (w[ 8], w[ 7], selector); + w[59] = hc_byte_perm_S (w[ 7], w[ 6], selector); + w[58] = hc_byte_perm_S (w[ 6], w[ 5], selector); + w[57] = hc_byte_perm_S (w[ 5], w[ 4], selector); + w[56] = hc_byte_perm_S (w[ 4], w[ 3], selector); + w[55] = hc_byte_perm_S (w[ 3], w[ 2], selector); + w[54] = hc_byte_perm_S (w[ 2], w[ 1], selector); + w[53] = hc_byte_perm_S (w[ 1], w[ 0], selector); + w[52] = hc_byte_perm_S (w[ 0], 0, selector); w[51] = 0; w[50] = 0; w[49] = 0; @@ -59056,17 +59056,17 @@ DECLSPEC void switch_buffer_by_offset_1x64_be_S (u32 *w, const u32 offset) break; case 53: - w[63] = __byte_perm_S (w[10], w[ 9], selector); - w[62] = __byte_perm_S (w[ 9], w[ 8], selector); - w[61] = __byte_perm_S (w[ 8], w[ 7], selector); - w[60] = __byte_perm_S (w[ 7], w[ 6], selector); - w[59] = __byte_perm_S (w[ 6], w[ 5], selector); - w[58] = __byte_perm_S (w[ 5], w[ 4], selector); - w[57] = __byte_perm_S (w[ 4], w[ 3], selector); - w[56] = __byte_perm_S (w[ 3], w[ 2], selector); - w[55] = __byte_perm_S (w[ 2], w[ 1], selector); - w[54] = __byte_perm_S (w[ 1], w[ 0], selector); - w[53] = __byte_perm_S (w[ 0], 0, selector); + w[63] = hc_byte_perm_S (w[10], w[ 9], selector); + w[62] = hc_byte_perm_S (w[ 9], w[ 8], selector); + w[61] = hc_byte_perm_S (w[ 8], w[ 7], selector); + w[60] = hc_byte_perm_S (w[ 7], w[ 6], selector); + w[59] = hc_byte_perm_S (w[ 6], w[ 5], selector); + w[58] = hc_byte_perm_S (w[ 5], w[ 4], selector); + w[57] = hc_byte_perm_S (w[ 4], w[ 3], selector); + w[56] = hc_byte_perm_S (w[ 3], w[ 2], selector); + w[55] = hc_byte_perm_S (w[ 2], w[ 1], selector); + w[54] = hc_byte_perm_S (w[ 1], w[ 0], selector); + w[53] = hc_byte_perm_S (w[ 0], 0, selector); w[52] = 0; w[51] = 0; w[50] = 0; @@ -59124,16 +59124,16 @@ DECLSPEC void switch_buffer_by_offset_1x64_be_S (u32 *w, const u32 offset) break; case 54: - w[63] = __byte_perm_S (w[ 9], w[ 8], selector); - w[62] = __byte_perm_S (w[ 8], w[ 7], selector); - w[61] = __byte_perm_S (w[ 7], w[ 6], selector); - w[60] = __byte_perm_S (w[ 6], w[ 5], selector); - w[59] = __byte_perm_S (w[ 5], w[ 4], selector); - w[58] = __byte_perm_S (w[ 4], w[ 3], selector); - w[57] = __byte_perm_S (w[ 3], w[ 2], selector); - w[56] = __byte_perm_S (w[ 2], w[ 1], selector); - w[55] = __byte_perm_S (w[ 1], w[ 0], selector); - w[54] = __byte_perm_S (w[ 0], 0, selector); + w[63] = hc_byte_perm_S (w[ 9], w[ 8], selector); + w[62] = hc_byte_perm_S (w[ 8], w[ 7], selector); + w[61] = hc_byte_perm_S (w[ 7], w[ 6], selector); + w[60] = hc_byte_perm_S (w[ 6], w[ 5], selector); + w[59] = hc_byte_perm_S (w[ 5], w[ 4], selector); + w[58] = hc_byte_perm_S (w[ 4], w[ 3], selector); + w[57] = hc_byte_perm_S (w[ 3], w[ 2], selector); + w[56] = hc_byte_perm_S (w[ 2], w[ 1], selector); + w[55] = hc_byte_perm_S (w[ 1], w[ 0], selector); + w[54] = hc_byte_perm_S (w[ 0], 0, selector); w[53] = 0; w[52] = 0; w[51] = 0; @@ -59192,15 +59192,15 @@ DECLSPEC void switch_buffer_by_offset_1x64_be_S (u32 *w, const u32 offset) break; case 55: - w[63] = __byte_perm_S (w[ 8], w[ 7], selector); - w[62] = __byte_perm_S (w[ 7], w[ 6], selector); - w[61] = __byte_perm_S (w[ 6], w[ 5], selector); - w[60] = __byte_perm_S (w[ 5], w[ 4], selector); - w[59] = __byte_perm_S (w[ 4], w[ 3], selector); - w[58] = __byte_perm_S (w[ 3], w[ 2], selector); - w[57] = __byte_perm_S (w[ 2], w[ 1], selector); - w[56] = __byte_perm_S (w[ 1], w[ 0], selector); - w[55] = __byte_perm_S (w[ 0], 0, selector); + w[63] = hc_byte_perm_S (w[ 8], w[ 7], selector); + w[62] = hc_byte_perm_S (w[ 7], w[ 6], selector); + w[61] = hc_byte_perm_S (w[ 6], w[ 5], selector); + w[60] = hc_byte_perm_S (w[ 5], w[ 4], selector); + w[59] = hc_byte_perm_S (w[ 4], w[ 3], selector); + w[58] = hc_byte_perm_S (w[ 3], w[ 2], selector); + w[57] = hc_byte_perm_S (w[ 2], w[ 1], selector); + w[56] = hc_byte_perm_S (w[ 1], w[ 0], selector); + w[55] = hc_byte_perm_S (w[ 0], 0, selector); w[54] = 0; w[53] = 0; w[52] = 0; @@ -59260,14 +59260,14 @@ DECLSPEC void switch_buffer_by_offset_1x64_be_S (u32 *w, const u32 offset) break; case 56: - w[63] = __byte_perm_S (w[ 7], w[ 6], selector); - w[62] = __byte_perm_S (w[ 6], w[ 5], selector); - w[61] = __byte_perm_S (w[ 5], w[ 4], selector); - w[60] = __byte_perm_S (w[ 4], w[ 3], selector); - w[59] = __byte_perm_S (w[ 3], w[ 2], selector); - w[58] = __byte_perm_S (w[ 2], w[ 1], selector); - w[57] = __byte_perm_S (w[ 1], w[ 0], selector); - w[56] = __byte_perm_S (w[ 0], 0, selector); + w[63] = hc_byte_perm_S (w[ 7], w[ 6], selector); + w[62] = hc_byte_perm_S (w[ 6], w[ 5], selector); + w[61] = hc_byte_perm_S (w[ 5], w[ 4], selector); + w[60] = hc_byte_perm_S (w[ 4], w[ 3], selector); + w[59] = hc_byte_perm_S (w[ 3], w[ 2], selector); + w[58] = hc_byte_perm_S (w[ 2], w[ 1], selector); + w[57] = hc_byte_perm_S (w[ 1], w[ 0], selector); + w[56] = hc_byte_perm_S (w[ 0], 0, selector); w[55] = 0; w[54] = 0; w[53] = 0; @@ -59328,13 +59328,13 @@ DECLSPEC void switch_buffer_by_offset_1x64_be_S (u32 *w, const u32 offset) break; case 57: - w[63] = __byte_perm_S (w[ 6], w[ 5], selector); - w[62] = __byte_perm_S (w[ 5], w[ 4], selector); - w[61] = __byte_perm_S (w[ 4], w[ 3], selector); - w[60] = __byte_perm_S (w[ 3], w[ 2], selector); - w[59] = __byte_perm_S (w[ 2], w[ 1], selector); - w[58] = __byte_perm_S (w[ 1], w[ 0], selector); - w[57] = __byte_perm_S (w[ 0], 0, selector); + w[63] = hc_byte_perm_S (w[ 6], w[ 5], selector); + w[62] = hc_byte_perm_S (w[ 5], w[ 4], selector); + w[61] = hc_byte_perm_S (w[ 4], w[ 3], selector); + w[60] = hc_byte_perm_S (w[ 3], w[ 2], selector); + w[59] = hc_byte_perm_S (w[ 2], w[ 1], selector); + w[58] = hc_byte_perm_S (w[ 1], w[ 0], selector); + w[57] = hc_byte_perm_S (w[ 0], 0, selector); w[56] = 0; w[55] = 0; w[54] = 0; @@ -59396,12 +59396,12 @@ DECLSPEC void switch_buffer_by_offset_1x64_be_S (u32 *w, const u32 offset) break; case 58: - w[63] = __byte_perm_S (w[ 5], w[ 4], selector); - w[62] = __byte_perm_S (w[ 4], w[ 3], selector); - w[61] = __byte_perm_S (w[ 3], w[ 2], selector); - w[60] = __byte_perm_S (w[ 2], w[ 1], selector); - w[59] = __byte_perm_S (w[ 1], w[ 0], selector); - w[58] = __byte_perm_S (w[ 0], 0, selector); + w[63] = hc_byte_perm_S (w[ 5], w[ 4], selector); + w[62] = hc_byte_perm_S (w[ 4], w[ 3], selector); + w[61] = hc_byte_perm_S (w[ 3], w[ 2], selector); + w[60] = hc_byte_perm_S (w[ 2], w[ 1], selector); + w[59] = hc_byte_perm_S (w[ 1], w[ 0], selector); + w[58] = hc_byte_perm_S (w[ 0], 0, selector); w[57] = 0; w[56] = 0; w[55] = 0; @@ -59464,11 +59464,11 @@ DECLSPEC void switch_buffer_by_offset_1x64_be_S (u32 *w, const u32 offset) break; case 59: - w[63] = __byte_perm_S (w[ 4], w[ 3], selector); - w[62] = __byte_perm_S (w[ 3], w[ 2], selector); - w[61] = __byte_perm_S (w[ 2], w[ 1], selector); - w[60] = __byte_perm_S (w[ 1], w[ 0], selector); - w[59] = __byte_perm_S (w[ 0], 0, selector); + w[63] = hc_byte_perm_S (w[ 4], w[ 3], selector); + w[62] = hc_byte_perm_S (w[ 3], w[ 2], selector); + w[61] = hc_byte_perm_S (w[ 2], w[ 1], selector); + w[60] = hc_byte_perm_S (w[ 1], w[ 0], selector); + w[59] = hc_byte_perm_S (w[ 0], 0, selector); w[58] = 0; w[57] = 0; w[56] = 0; @@ -59532,10 +59532,10 @@ DECLSPEC void switch_buffer_by_offset_1x64_be_S (u32 *w, const u32 offset) break; case 60: - w[63] = __byte_perm_S (w[ 3], w[ 2], selector); - w[62] = __byte_perm_S (w[ 2], w[ 1], selector); - w[61] = __byte_perm_S (w[ 1], w[ 0], selector); - w[60] = __byte_perm_S (w[ 0], 0, selector); + w[63] = hc_byte_perm_S (w[ 3], w[ 2], selector); + w[62] = hc_byte_perm_S (w[ 2], w[ 1], selector); + w[61] = hc_byte_perm_S (w[ 1], w[ 0], selector); + w[60] = hc_byte_perm_S (w[ 0], 0, selector); w[59] = 0; w[58] = 0; w[57] = 0; @@ -59600,9 +59600,9 @@ DECLSPEC void switch_buffer_by_offset_1x64_be_S (u32 *w, const u32 offset) break; case 61: - w[63] = __byte_perm_S (w[ 2], w[ 1], selector); - w[62] = __byte_perm_S (w[ 1], w[ 0], selector); - w[61] = __byte_perm_S (w[ 0], 0, selector); + w[63] = hc_byte_perm_S (w[ 2], w[ 1], selector); + w[62] = hc_byte_perm_S (w[ 1], w[ 0], selector); + w[61] = hc_byte_perm_S (w[ 0], 0, selector); w[60] = 0; w[59] = 0; w[58] = 0; @@ -59668,8 +59668,8 @@ DECLSPEC void switch_buffer_by_offset_1x64_be_S (u32 *w, const u32 offset) break; case 62: - w[63] = __byte_perm_S (w[ 1], w[ 0], selector); - w[62] = __byte_perm_S (w[ 0], 0, selector); + w[63] = hc_byte_perm_S (w[ 1], w[ 0], selector); + w[62] = hc_byte_perm_S (w[ 0], 0, selector); w[61] = 0; w[60] = 0; w[59] = 0; @@ -59736,7 +59736,7 @@ DECLSPEC void switch_buffer_by_offset_1x64_be_S (u32 *w, const u32 offset) break; case 63: - w[63] = __byte_perm_S (w[ 0], 0, selector); + w[63] = hc_byte_perm_S (w[ 0], 0, selector); w[62] = 0; w[61] = 0; w[60] = 0; diff --git a/OpenCL/inc_hash_functions.cl b/OpenCL/inc_hash_functions.cl index b895af3f1..adb4d3dba 100644 --- a/OpenCL/inc_hash_functions.cl +++ b/OpenCL/inc_hash_functions.cl @@ -34,20 +34,20 @@ #define MD4_STEP_S(f,a,b,c,d,x,K,s) \ { \ a += K; \ - a = __add3_S (a, x, f (b, c, d)); \ + a = hc_add3_S (a, x, f (b, c, d)); \ a = rotl32_S (a, s); \ } #define MD4_STEP(f,a,b,c,d,x,K,s) \ { \ a += K; \ - a = __add3 (a, x, f (b, c, d)); \ + a = hc_add3 (a, x, f (b, c, d)); \ a = rotl32 (a, s); \ } #define MD4_STEP0(f,a,b,c,d,K,s) \ { \ - a = __add3 (a, K, f (b, c, d)); \ + a = hc_add3 (a, K, f (b, c, d)); \ a = rotl32 (a, s); \ } @@ -92,7 +92,7 @@ #define MD5_STEP_S(f,a,b,c,d,x,K,s) \ { \ a += K; \ - a = __add3_S (a, x, f (b, c, d)); \ + a = hc_add3_S (a, x, f (b, c, d)); \ a = rotl32_S (a, s); \ a += b; \ } @@ -100,14 +100,14 @@ #define MD5_STEP(f,a,b,c,d,x,K,s) \ { \ a += K; \ - a = __add3 (a, x, f (b, c, d)); \ + a = hc_add3 (a, x, f (b, c, d)); \ a = rotl32 (a, s); \ a += b; \ } #define MD5_STEP0(f,a,b,c,d,K,s) \ { \ - a = __add3 (a, K, f (b, c, d)); \ + a = hc_add3 (a, K, f (b, c, d)); \ a = rotl32 (a, s); \ a += b; \ } @@ -139,7 +139,7 @@ #define SHA1_STEP_S(f,a,b,c,d,e,x) \ { \ e += K; \ - e = __add3_S (e, x, f (b, c, d)); \ + e = hc_add3_S (e, x, f (b, c, d)); \ e += rotl32_S (a, 5u); \ b = rotl32_S (b, 30u); \ } @@ -147,7 +147,7 @@ #define SHA1_STEP(f,a,b,c,d,e,x) \ { \ e += K; \ - e = __add3 (e, x, f (b, c, d)); \ + e = hc_add3 (e, x, f (b, c, d)); \ e += rotl32 (a, 5u); \ b = rotl32 (b, 30u); \ } @@ -155,7 +155,7 @@ /* #define SHA1_STEP0(f,a,b,c,d,e,x) \ { \ - e = __add3 (e, K, f (b, c, d)); \ + e = hc_add3 (e, K, f (b, c, d)); \ e += rotl32 (a, 5u); \ b = rotl32 (b, 30u); \ } @@ -163,7 +163,7 @@ #define SHA1_STEPX(f,a,b,c,d,e,x) \ { \ - e = __add3 (e, x, f (b, c, d)); \ + e = hc_add3 (e, x, f (b, c, d)); \ e += rotl32 (a, 5u); \ b = rotl32 (b, 30u); \ } @@ -203,20 +203,20 @@ #define SHA256_STEP_S(F0,F1,a,b,c,d,e,f,g,h,x,K) \ { \ - h = __add3_S (h, K, x); \ - h = __add3_S (h, SHA256_S3_S (e), F1 (e,f,g)); \ + h = hc_add3_S (h, K, x); \ + h = hc_add3_S (h, SHA256_S3_S (e), F1 (e,f,g)); \ d += h; \ - h = __add3_S (h, SHA256_S2_S (a), F0 (a,b,c)); \ + h = hc_add3_S (h, SHA256_S2_S (a), F0 (a,b,c)); \ } #define SHA256_EXPAND_S(x,y,z,w) (SHA256_S1_S (x) + y + SHA256_S0_S (z) + w) #define SHA256_STEP(F0,F1,a,b,c,d,e,f,g,h,x,K) \ { \ - h = __add3 (h, K, x); \ - h = __add3 (h, SHA256_S3 (e), F1 (e,f,g)); \ + h = hc_add3 (h, K, x); \ + h = hc_add3 (h, SHA256_S3 (e), F1 (e,f,g)); \ d += h; \ - h = __add3 (h, SHA256_S2 (a), F0 (a,b,c)); \ + h = hc_add3 (h, SHA256_S2 (a), F0 (a,b,c)); \ } #define SHA256_EXPAND(x,y,z,w) (SHA256_S1 (x) + y + SHA256_S0 (z) + w) diff --git a/OpenCL/inc_rp_optimized.cl b/OpenCL/inc_rp_optimized.cl index 81863b06e..d274ffb98 100644 --- a/OpenCL/inc_rp_optimized.cl +++ b/OpenCL/inc_rp_optimized.cl @@ -136,26 +136,26 @@ void truncate_left (u32 *buf0, u32 *buf1, const u32 offset) void lshift_block (const u32 *in0, const u32 *in1, u32 *out0, u32 *out1) { - out0[0] = amd_bytealign_S (in0[1], in0[0], 1); - out0[1] = amd_bytealign_S (in0[2], in0[1], 1); - out0[2] = amd_bytealign_S (in0[3], in0[2], 1); - out0[3] = amd_bytealign_S (in1[0], in0[3], 1); - out1[0] = amd_bytealign_S (in1[1], in1[0], 1); - out1[1] = amd_bytealign_S (in1[2], in1[1], 1); - out1[2] = amd_bytealign_S (in1[3], in1[2], 1); - out1[3] = amd_bytealign_S ( 0, in1[3], 1); + out0[0] = hc_bytealign_S (in0[1], in0[0], 1); + out0[1] = hc_bytealign_S (in0[2], in0[1], 1); + out0[2] = hc_bytealign_S (in0[3], in0[2], 1); + out0[3] = hc_bytealign_S (in1[0], in0[3], 1); + out1[0] = hc_bytealign_S (in1[1], in1[0], 1); + out1[1] = hc_bytealign_S (in1[2], in1[1], 1); + out1[2] = hc_bytealign_S (in1[3], in1[2], 1); + out1[3] = hc_bytealign_S ( 0, in1[3], 1); } void rshift_block (const u32 *in0, const u32 *in1, u32 *out0, u32 *out1) { - out1[3] = amd_bytealign_S (in1[3], in1[2], 3); - out1[2] = amd_bytealign_S (in1[2], in1[1], 3); - out1[1] = amd_bytealign_S (in1[1], in1[0], 3); - out1[0] = amd_bytealign_S (in1[0], in0[3], 3); - out0[3] = amd_bytealign_S (in0[3], in0[2], 3); - out0[2] = amd_bytealign_S (in0[2], in0[1], 3); - out0[1] = amd_bytealign_S (in0[1], in0[0], 3); - out0[0] = amd_bytealign_S (in0[0], 0, 3); + out1[3] = hc_bytealign_S (in1[3], in1[2], 3); + out1[2] = hc_bytealign_S (in1[2], in1[1], 3); + out1[1] = hc_bytealign_S (in1[1], in1[0], 3); + out1[0] = hc_bytealign_S (in1[0], in0[3], 3); + out0[3] = hc_bytealign_S (in0[3], in0[2], 3); + out0[2] = hc_bytealign_S (in0[2], in0[1], 3); + out0[1] = hc_bytealign_S (in0[1], in0[0], 3); + out0[0] = hc_bytealign_S (in0[0], 0, 3); } void lshift_block_N (const u32 *in0, const u32 *in1, u32 *out0, u32 *out1, const u32 num) @@ -171,32 +171,32 @@ void lshift_block_N (const u32 *in0, const u32 *in1, u32 *out0, u32 *out1, const out1[2] = in1[2]; out1[3] = in1[3]; break; - case 1: out0[0] = amd_bytealign_S (in0[1], in0[0], 1); - out0[1] = amd_bytealign_S (in0[2], in0[1], 1); - out0[2] = amd_bytealign_S (in0[3], in0[2], 1); - out0[3] = amd_bytealign_S (in1[0], in0[3], 1); - out1[0] = amd_bytealign_S (in1[1], in1[0], 1); - out1[1] = amd_bytealign_S (in1[2], in1[1], 1); - out1[2] = amd_bytealign_S (in1[3], in1[2], 1); - out1[3] = amd_bytealign_S ( 0, in1[3], 1); + case 1: out0[0] = hc_bytealign_S (in0[1], in0[0], 1); + out0[1] = hc_bytealign_S (in0[2], in0[1], 1); + out0[2] = hc_bytealign_S (in0[3], in0[2], 1); + out0[3] = hc_bytealign_S (in1[0], in0[3], 1); + out1[0] = hc_bytealign_S (in1[1], in1[0], 1); + out1[1] = hc_bytealign_S (in1[2], in1[1], 1); + out1[2] = hc_bytealign_S (in1[3], in1[2], 1); + out1[3] = hc_bytealign_S ( 0, in1[3], 1); break; - case 2: out0[0] = amd_bytealign_S (in0[1], in0[0], 2); - out0[1] = amd_bytealign_S (in0[2], in0[1], 2); - out0[2] = amd_bytealign_S (in0[3], in0[2], 2); - out0[3] = amd_bytealign_S (in1[0], in0[3], 2); - out1[0] = amd_bytealign_S (in1[1], in1[0], 2); - out1[1] = amd_bytealign_S (in1[2], in1[1], 2); - out1[2] = amd_bytealign_S (in1[3], in1[2], 2); - out1[3] = amd_bytealign_S ( 0, in1[3], 2); + case 2: out0[0] = hc_bytealign_S (in0[1], in0[0], 2); + out0[1] = hc_bytealign_S (in0[2], in0[1], 2); + out0[2] = hc_bytealign_S (in0[3], in0[2], 2); + out0[3] = hc_bytealign_S (in1[0], in0[3], 2); + out1[0] = hc_bytealign_S (in1[1], in1[0], 2); + out1[1] = hc_bytealign_S (in1[2], in1[1], 2); + out1[2] = hc_bytealign_S (in1[3], in1[2], 2); + out1[3] = hc_bytealign_S ( 0, in1[3], 2); break; - case 3: out0[0] = amd_bytealign_S (in0[1], in0[0], 3); - out0[1] = amd_bytealign_S (in0[2], in0[1], 3); - out0[2] = amd_bytealign_S (in0[3], in0[2], 3); - out0[3] = amd_bytealign_S (in1[0], in0[3], 3); - out1[0] = amd_bytealign_S (in1[1], in1[0], 3); - out1[1] = amd_bytealign_S (in1[2], in1[1], 3); - out1[2] = amd_bytealign_S (in1[3], in1[2], 3); - out1[3] = amd_bytealign_S ( 0, in1[3], 3); + case 3: out0[0] = hc_bytealign_S (in0[1], in0[0], 3); + out0[1] = hc_bytealign_S (in0[2], in0[1], 3); + out0[2] = hc_bytealign_S (in0[3], in0[2], 3); + out0[3] = hc_bytealign_S (in1[0], in0[3], 3); + out1[0] = hc_bytealign_S (in1[1], in1[0], 3); + out1[1] = hc_bytealign_S (in1[2], in1[1], 3); + out1[2] = hc_bytealign_S (in1[3], in1[2], 3); + out1[3] = hc_bytealign_S ( 0, in1[3], 3); break; case 4: out0[0] = in0[1]; out0[1] = in0[2]; @@ -207,31 +207,31 @@ void lshift_block_N (const u32 *in0, const u32 *in1, u32 *out0, u32 *out1, const out1[2] = in1[3]; out1[3] = 0; break; - case 5: out0[0] = amd_bytealign_S (in0[2], in0[1], 1); - out0[1] = amd_bytealign_S (in0[3], in0[2], 1); - out0[2] = amd_bytealign_S (in1[0], in0[3], 1); - out0[3] = amd_bytealign_S (in1[1], in1[0], 1); - out1[0] = amd_bytealign_S (in1[2], in1[1], 1); - out1[1] = amd_bytealign_S (in1[3], in1[2], 1); - out1[2] = amd_bytealign_S ( 0, in1[3], 1); + case 5: out0[0] = hc_bytealign_S (in0[2], in0[1], 1); + out0[1] = hc_bytealign_S (in0[3], in0[2], 1); + out0[2] = hc_bytealign_S (in1[0], in0[3], 1); + out0[3] = hc_bytealign_S (in1[1], in1[0], 1); + out1[0] = hc_bytealign_S (in1[2], in1[1], 1); + out1[1] = hc_bytealign_S (in1[3], in1[2], 1); + out1[2] = hc_bytealign_S ( 0, in1[3], 1); out1[3] = 0; break; - case 6: out0[0] = amd_bytealign_S (in0[2], in0[1], 2); - out0[1] = amd_bytealign_S (in0[3], in0[2], 2); - out0[2] = amd_bytealign_S (in1[0], in0[3], 2); - out0[3] = amd_bytealign_S (in1[1], in1[0], 2); - out1[0] = amd_bytealign_S (in1[2], in1[1], 2); - out1[1] = amd_bytealign_S (in1[3], in1[2], 2); - out1[2] = amd_bytealign_S ( 0, in1[3], 2); + case 6: out0[0] = hc_bytealign_S (in0[2], in0[1], 2); + out0[1] = hc_bytealign_S (in0[3], in0[2], 2); + out0[2] = hc_bytealign_S (in1[0], in0[3], 2); + out0[3] = hc_bytealign_S (in1[1], in1[0], 2); + out1[0] = hc_bytealign_S (in1[2], in1[1], 2); + out1[1] = hc_bytealign_S (in1[3], in1[2], 2); + out1[2] = hc_bytealign_S ( 0, in1[3], 2); out1[3] = 0; break; - case 7: out0[0] = amd_bytealign_S (in0[2], in0[1], 3); - out0[1] = amd_bytealign_S (in0[3], in0[2], 3); - out0[2] = amd_bytealign_S (in1[0], in0[3], 3); - out0[3] = amd_bytealign_S (in1[1], in1[0], 3); - out1[0] = amd_bytealign_S (in1[2], in1[1], 3); - out1[1] = amd_bytealign_S (in1[3], in1[2], 3); - out1[2] = amd_bytealign_S ( 0, in1[3], 3); + case 7: out0[0] = hc_bytealign_S (in0[2], in0[1], 3); + out0[1] = hc_bytealign_S (in0[3], in0[2], 3); + out0[2] = hc_bytealign_S (in1[0], in0[3], 3); + out0[3] = hc_bytealign_S (in1[1], in1[0], 3); + out1[0] = hc_bytealign_S (in1[2], in1[1], 3); + out1[1] = hc_bytealign_S (in1[3], in1[2], 3); + out1[2] = hc_bytealign_S ( 0, in1[3], 3); out1[3] = 0; break; case 8: out0[0] = in0[2]; @@ -243,30 +243,30 @@ void lshift_block_N (const u32 *in0, const u32 *in1, u32 *out0, u32 *out1, const out1[2] = 0; out1[3] = 0; break; - case 9: out0[0] = amd_bytealign_S (in0[3], in0[2], 1); - out0[1] = amd_bytealign_S (in1[0], in0[3], 1); - out0[2] = amd_bytealign_S (in1[1], in1[0], 1); - out0[3] = amd_bytealign_S (in1[2], in1[1], 1); - out1[0] = amd_bytealign_S (in1[3], in1[2], 1); - out1[1] = amd_bytealign_S ( 0, in1[3], 1); + case 9: out0[0] = hc_bytealign_S (in0[3], in0[2], 1); + out0[1] = hc_bytealign_S (in1[0], in0[3], 1); + out0[2] = hc_bytealign_S (in1[1], in1[0], 1); + out0[3] = hc_bytealign_S (in1[2], in1[1], 1); + out1[0] = hc_bytealign_S (in1[3], in1[2], 1); + out1[1] = hc_bytealign_S ( 0, in1[3], 1); out1[2] = 0; out1[3] = 0; break; - case 10: out0[0] = amd_bytealign_S (in0[3], in0[2], 2); - out0[1] = amd_bytealign_S (in1[0], in0[3], 2); - out0[2] = amd_bytealign_S (in1[1], in1[0], 2); - out0[3] = amd_bytealign_S (in1[2], in1[1], 2); - out1[0] = amd_bytealign_S (in1[3], in1[2], 2); - out1[1] = amd_bytealign_S ( 0, in1[3], 2); + case 10: out0[0] = hc_bytealign_S (in0[3], in0[2], 2); + out0[1] = hc_bytealign_S (in1[0], in0[3], 2); + out0[2] = hc_bytealign_S (in1[1], in1[0], 2); + out0[3] = hc_bytealign_S (in1[2], in1[1], 2); + out1[0] = hc_bytealign_S (in1[3], in1[2], 2); + out1[1] = hc_bytealign_S ( 0, in1[3], 2); out1[2] = 0; out1[3] = 0; break; - case 11: out0[0] = amd_bytealign_S (in0[3], in0[2], 3); - out0[1] = amd_bytealign_S (in1[0], in0[3], 3); - out0[2] = amd_bytealign_S (in1[1], in1[0], 3); - out0[3] = amd_bytealign_S (in1[2], in1[1], 3); - out1[0] = amd_bytealign_S (in1[3], in1[2], 3); - out1[1] = amd_bytealign_S ( 0, in1[3], 3); + case 11: out0[0] = hc_bytealign_S (in0[3], in0[2], 3); + out0[1] = hc_bytealign_S (in1[0], in0[3], 3); + out0[2] = hc_bytealign_S (in1[1], in1[0], 3); + out0[3] = hc_bytealign_S (in1[2], in1[1], 3); + out1[0] = hc_bytealign_S (in1[3], in1[2], 3); + out1[1] = hc_bytealign_S ( 0, in1[3], 3); out1[2] = 0; out1[3] = 0; break; @@ -279,29 +279,29 @@ void lshift_block_N (const u32 *in0, const u32 *in1, u32 *out0, u32 *out1, const out1[2] = 0; out1[3] = 0; break; - case 13: out0[0] = amd_bytealign_S (in1[0], in0[3], 1); - out0[1] = amd_bytealign_S (in1[1], in1[0], 1); - out0[2] = amd_bytealign_S (in1[2], in1[1], 1); - out0[3] = amd_bytealign_S (in1[3], in1[2], 1); - out1[0] = amd_bytealign_S ( 0, in1[3], 1); + case 13: out0[0] = hc_bytealign_S (in1[0], in0[3], 1); + out0[1] = hc_bytealign_S (in1[1], in1[0], 1); + out0[2] = hc_bytealign_S (in1[2], in1[1], 1); + out0[3] = hc_bytealign_S (in1[3], in1[2], 1); + out1[0] = hc_bytealign_S ( 0, in1[3], 1); out1[1] = 0; out1[2] = 0; out1[3] = 0; break; - case 14: out0[0] = amd_bytealign_S (in1[0], in0[3], 2); - out0[1] = amd_bytealign_S (in1[1], in1[0], 2); - out0[2] = amd_bytealign_S (in1[2], in1[1], 2); - out0[3] = amd_bytealign_S (in1[3], in1[2], 2); - out1[0] = amd_bytealign_S ( 0, in1[3], 2); + case 14: out0[0] = hc_bytealign_S (in1[0], in0[3], 2); + out0[1] = hc_bytealign_S (in1[1], in1[0], 2); + out0[2] = hc_bytealign_S (in1[2], in1[1], 2); + out0[3] = hc_bytealign_S (in1[3], in1[2], 2); + out1[0] = hc_bytealign_S ( 0, in1[3], 2); out1[1] = 0; out1[2] = 0; out1[3] = 0; break; - case 15: out0[0] = amd_bytealign_S (in1[0], in0[3], 3); - out0[1] = amd_bytealign_S (in1[1], in1[0], 3); - out0[2] = amd_bytealign_S (in1[2], in1[1], 3); - out0[3] = amd_bytealign_S (in1[3], in1[2], 3); - out1[0] = amd_bytealign_S ( 0, in1[3], 3); + case 15: out0[0] = hc_bytealign_S (in1[0], in0[3], 3); + out0[1] = hc_bytealign_S (in1[1], in1[0], 3); + out0[2] = hc_bytealign_S (in1[2], in1[1], 3); + out0[3] = hc_bytealign_S (in1[3], in1[2], 3); + out1[0] = hc_bytealign_S ( 0, in1[3], 3); out1[1] = 0; out1[2] = 0; out1[3] = 0; @@ -315,28 +315,28 @@ void lshift_block_N (const u32 *in0, const u32 *in1, u32 *out0, u32 *out1, const out1[2] = 0; out1[3] = 0; break; - case 17: out0[0] = amd_bytealign_S (in1[1], in1[0], 1); - out0[1] = amd_bytealign_S (in1[2], in1[1], 1); - out0[2] = amd_bytealign_S (in1[3], in1[2], 1); - out0[3] = amd_bytealign_S ( 0, in1[3], 1); + case 17: out0[0] = hc_bytealign_S (in1[1], in1[0], 1); + out0[1] = hc_bytealign_S (in1[2], in1[1], 1); + out0[2] = hc_bytealign_S (in1[3], in1[2], 1); + out0[3] = hc_bytealign_S ( 0, in1[3], 1); out1[0] = 0; out1[1] = 0; out1[2] = 0; out1[3] = 0; break; - case 18: out0[0] = amd_bytealign_S (in1[1], in1[0], 2); - out0[1] = amd_bytealign_S (in1[2], in1[1], 2); - out0[2] = amd_bytealign_S (in1[3], in1[2], 2); - out0[3] = amd_bytealign_S ( 0, in1[3], 2); + case 18: out0[0] = hc_bytealign_S (in1[1], in1[0], 2); + out0[1] = hc_bytealign_S (in1[2], in1[1], 2); + out0[2] = hc_bytealign_S (in1[3], in1[2], 2); + out0[3] = hc_bytealign_S ( 0, in1[3], 2); out1[0] = 0; out1[1] = 0; out1[2] = 0; out1[3] = 0; break; - case 19: out0[0] = amd_bytealign_S (in1[1], in1[0], 3); - out0[1] = amd_bytealign_S (in1[2], in1[1], 3); - out0[2] = amd_bytealign_S (in1[3], in1[2], 3); - out0[3] = amd_bytealign_S ( 0, in1[3], 3); + case 19: out0[0] = hc_bytealign_S (in1[1], in1[0], 3); + out0[1] = hc_bytealign_S (in1[2], in1[1], 3); + out0[2] = hc_bytealign_S (in1[3], in1[2], 3); + out0[3] = hc_bytealign_S ( 0, in1[3], 3); out1[0] = 0; out1[1] = 0; out1[2] = 0; @@ -351,27 +351,27 @@ void lshift_block_N (const u32 *in0, const u32 *in1, u32 *out0, u32 *out1, const out1[2] = 0; out1[3] = 0; break; - case 21: out0[0] = amd_bytealign_S (in1[2], in1[1], 1); - out0[1] = amd_bytealign_S (in1[3], in1[2], 1); - out0[2] = amd_bytealign_S ( 0, in1[3], 1); + case 21: out0[0] = hc_bytealign_S (in1[2], in1[1], 1); + out0[1] = hc_bytealign_S (in1[3], in1[2], 1); + out0[2] = hc_bytealign_S ( 0, in1[3], 1); out0[3] = 0; out1[0] = 0; out1[1] = 0; out1[2] = 0; out1[3] = 0; break; - case 22: out0[0] = amd_bytealign_S (in1[2], in1[1], 2); - out0[1] = amd_bytealign_S (in1[3], in1[2], 2); - out0[2] = amd_bytealign_S ( 0, in1[3], 2); + case 22: out0[0] = hc_bytealign_S (in1[2], in1[1], 2); + out0[1] = hc_bytealign_S (in1[3], in1[2], 2); + out0[2] = hc_bytealign_S ( 0, in1[3], 2); out0[3] = 0; out1[0] = 0; out1[1] = 0; out1[2] = 0; out1[3] = 0; break; - case 23: out0[0] = amd_bytealign_S (in1[2], in1[1], 3); - out0[1] = amd_bytealign_S (in1[3], in1[2], 3); - out0[2] = amd_bytealign_S ( 0, in1[3], 3); + case 23: out0[0] = hc_bytealign_S (in1[2], in1[1], 3); + out0[1] = hc_bytealign_S (in1[3], in1[2], 3); + out0[2] = hc_bytealign_S ( 0, in1[3], 3); out0[3] = 0; out1[0] = 0; out1[1] = 0; @@ -387,8 +387,8 @@ void lshift_block_N (const u32 *in0, const u32 *in1, u32 *out0, u32 *out1, const out1[2] = 0; out1[3] = 0; break; - case 25: out0[0] = amd_bytealign_S (in1[3], in1[2], 1); - out0[1] = amd_bytealign_S ( 0, in1[3], 1); + case 25: out0[0] = hc_bytealign_S (in1[3], in1[2], 1); + out0[1] = hc_bytealign_S ( 0, in1[3], 1); out0[2] = 0; out0[3] = 0; out1[0] = 0; @@ -396,8 +396,8 @@ void lshift_block_N (const u32 *in0, const u32 *in1, u32 *out0, u32 *out1, const out1[2] = 0; out1[3] = 0; break; - case 26: out0[0] = amd_bytealign_S (in1[3], in1[2], 2); - out0[1] = amd_bytealign_S ( 0, in1[3], 2); + case 26: out0[0] = hc_bytealign_S (in1[3], in1[2], 2); + out0[1] = hc_bytealign_S ( 0, in1[3], 2); out0[2] = 0; out0[3] = 0; out1[0] = 0; @@ -405,8 +405,8 @@ void lshift_block_N (const u32 *in0, const u32 *in1, u32 *out0, u32 *out1, const out1[2] = 0; out1[3] = 0; break; - case 27: out0[0] = amd_bytealign_S (in1[3], in1[2], 3); - out0[1] = amd_bytealign_S ( 0, in1[3], 3); + case 27: out0[0] = hc_bytealign_S (in1[3], in1[2], 3); + out0[1] = hc_bytealign_S ( 0, in1[3], 3); out0[2] = 0; out0[3] = 0; out1[0] = 0; @@ -423,7 +423,7 @@ void lshift_block_N (const u32 *in0, const u32 *in1, u32 *out0, u32 *out1, const out1[2] = 0; out1[3] = 0; break; - case 29: out0[0] = amd_bytealign_S ( 0, in1[3], 1); + case 29: out0[0] = hc_bytealign_S ( 0, in1[3], 1); out0[1] = 0; out0[2] = 0; out0[3] = 0; @@ -432,7 +432,7 @@ void lshift_block_N (const u32 *in0, const u32 *in1, u32 *out0, u32 *out1, const out1[2] = 0; out1[3] = 0; break; - case 30: out0[0] = amd_bytealign_S ( 0, in1[3], 2); + case 30: out0[0] = hc_bytealign_S ( 0, in1[3], 2); out0[1] = 0; out0[2] = 0; out0[3] = 0; @@ -441,7 +441,7 @@ void lshift_block_N (const u32 *in0, const u32 *in1, u32 *out0, u32 *out1, const out1[2] = 0; out1[3] = 0; break; - case 31: out0[0] = amd_bytealign_S ( 0, in1[3], 3); + case 31: out0[0] = hc_bytealign_S ( 0, in1[3], 3); out0[1] = 0; out0[2] = 0; out0[3] = 0; @@ -466,32 +466,32 @@ void rshift_block_N (const u32 *in0, const u32 *in1, u32 *out0, u32 *out1, const out0[1] = in0[1]; out0[0] = in0[0]; break; - case 1: out1[3] = amd_bytealign_S (in1[3], in1[2], 3); - out1[2] = amd_bytealign_S (in1[2], in1[1], 3); - out1[1] = amd_bytealign_S (in1[1], in1[0], 3); - out1[0] = amd_bytealign_S (in1[0], in0[3], 3); - out0[3] = amd_bytealign_S (in0[3], in0[2], 3); - out0[2] = amd_bytealign_S (in0[2], in0[1], 3); - out0[1] = amd_bytealign_S (in0[1], in0[0], 3); - out0[0] = amd_bytealign_S (in0[0], 0, 3); + case 1: out1[3] = hc_bytealign_S (in1[3], in1[2], 3); + out1[2] = hc_bytealign_S (in1[2], in1[1], 3); + out1[1] = hc_bytealign_S (in1[1], in1[0], 3); + out1[0] = hc_bytealign_S (in1[0], in0[3], 3); + out0[3] = hc_bytealign_S (in0[3], in0[2], 3); + out0[2] = hc_bytealign_S (in0[2], in0[1], 3); + out0[1] = hc_bytealign_S (in0[1], in0[0], 3); + out0[0] = hc_bytealign_S (in0[0], 0, 3); break; - case 2: out1[3] = amd_bytealign_S (in1[3], in1[2], 2); - out1[2] = amd_bytealign_S (in1[2], in1[1], 2); - out1[1] = amd_bytealign_S (in1[1], in1[0], 2); - out1[0] = amd_bytealign_S (in1[0], in0[3], 2); - out0[3] = amd_bytealign_S (in0[3], in0[2], 2); - out0[2] = amd_bytealign_S (in0[2], in0[1], 2); - out0[1] = amd_bytealign_S (in0[1], in0[0], 2); - out0[0] = amd_bytealign_S (in0[0], 0, 2); + case 2: out1[3] = hc_bytealign_S (in1[3], in1[2], 2); + out1[2] = hc_bytealign_S (in1[2], in1[1], 2); + out1[1] = hc_bytealign_S (in1[1], in1[0], 2); + out1[0] = hc_bytealign_S (in1[0], in0[3], 2); + out0[3] = hc_bytealign_S (in0[3], in0[2], 2); + out0[2] = hc_bytealign_S (in0[2], in0[1], 2); + out0[1] = hc_bytealign_S (in0[1], in0[0], 2); + out0[0] = hc_bytealign_S (in0[0], 0, 2); break; - case 3: out1[3] = amd_bytealign_S (in1[3], in1[2], 1); - out1[2] = amd_bytealign_S (in1[2], in1[1], 1); - out1[1] = amd_bytealign_S (in1[1], in1[0], 1); - out1[0] = amd_bytealign_S (in1[0], in0[3], 1); - out0[3] = amd_bytealign_S (in0[3], in0[2], 1); - out0[2] = amd_bytealign_S (in0[2], in0[1], 1); - out0[1] = amd_bytealign_S (in0[1], in0[0], 1); - out0[0] = amd_bytealign_S (in0[0], 0, 1); + case 3: out1[3] = hc_bytealign_S (in1[3], in1[2], 1); + out1[2] = hc_bytealign_S (in1[2], in1[1], 1); + out1[1] = hc_bytealign_S (in1[1], in1[0], 1); + out1[0] = hc_bytealign_S (in1[0], in0[3], 1); + out0[3] = hc_bytealign_S (in0[3], in0[2], 1); + out0[2] = hc_bytealign_S (in0[2], in0[1], 1); + out0[1] = hc_bytealign_S (in0[1], in0[0], 1); + out0[0] = hc_bytealign_S (in0[0], 0, 1); break; case 4: out1[3] = in1[2]; out1[2] = in1[1]; @@ -502,31 +502,31 @@ void rshift_block_N (const u32 *in0, const u32 *in1, u32 *out0, u32 *out1, const out0[1] = in0[0]; out0[0] = 0; break; - case 5: out1[3] = amd_bytealign_S (in1[2], in1[1], 3); - out1[2] = amd_bytealign_S (in1[1], in1[0], 3); - out1[1] = amd_bytealign_S (in1[0], in0[3], 3); - out1[0] = amd_bytealign_S (in0[3], in0[2], 3); - out0[3] = amd_bytealign_S (in0[2], in0[1], 3); - out0[2] = amd_bytealign_S (in0[1], in0[0], 3); - out0[1] = amd_bytealign_S (in0[0], 0, 3); + case 5: out1[3] = hc_bytealign_S (in1[2], in1[1], 3); + out1[2] = hc_bytealign_S (in1[1], in1[0], 3); + out1[1] = hc_bytealign_S (in1[0], in0[3], 3); + out1[0] = hc_bytealign_S (in0[3], in0[2], 3); + out0[3] = hc_bytealign_S (in0[2], in0[1], 3); + out0[2] = hc_bytealign_S (in0[1], in0[0], 3); + out0[1] = hc_bytealign_S (in0[0], 0, 3); out0[0] = 0; break; - case 6: out1[3] = amd_bytealign_S (in1[2], in1[1], 2); - out1[2] = amd_bytealign_S (in1[1], in1[0], 2); - out1[1] = amd_bytealign_S (in1[0], in0[3], 2); - out1[0] = amd_bytealign_S (in0[3], in0[2], 2); - out0[3] = amd_bytealign_S (in0[2], in0[1], 2); - out0[2] = amd_bytealign_S (in0[1], in0[0], 2); - out0[1] = amd_bytealign_S (in0[0], 0, 2); + case 6: out1[3] = hc_bytealign_S (in1[2], in1[1], 2); + out1[2] = hc_bytealign_S (in1[1], in1[0], 2); + out1[1] = hc_bytealign_S (in1[0], in0[3], 2); + out1[0] = hc_bytealign_S (in0[3], in0[2], 2); + out0[3] = hc_bytealign_S (in0[2], in0[1], 2); + out0[2] = hc_bytealign_S (in0[1], in0[0], 2); + out0[1] = hc_bytealign_S (in0[0], 0, 2); out0[0] = 0; break; - case 7: out1[3] = amd_bytealign_S (in1[2], in1[1], 1); - out1[2] = amd_bytealign_S (in1[1], in1[0], 1); - out1[1] = amd_bytealign_S (in1[0], in0[3], 1); - out1[0] = amd_bytealign_S (in0[3], in0[2], 1); - out0[3] = amd_bytealign_S (in0[2], in0[1], 1); - out0[2] = amd_bytealign_S (in0[1], in0[0], 1); - out0[1] = amd_bytealign_S (in0[0], 0, 1); + case 7: out1[3] = hc_bytealign_S (in1[2], in1[1], 1); + out1[2] = hc_bytealign_S (in1[1], in1[0], 1); + out1[1] = hc_bytealign_S (in1[0], in0[3], 1); + out1[0] = hc_bytealign_S (in0[3], in0[2], 1); + out0[3] = hc_bytealign_S (in0[2], in0[1], 1); + out0[2] = hc_bytealign_S (in0[1], in0[0], 1); + out0[1] = hc_bytealign_S (in0[0], 0, 1); out0[0] = 0; break; case 8: out1[3] = in1[1]; @@ -538,30 +538,30 @@ void rshift_block_N (const u32 *in0, const u32 *in1, u32 *out0, u32 *out1, const out0[1] = 0; out0[0] = 0; break; - case 9: out1[3] = amd_bytealign_S (in1[1], in1[0], 3); - out1[2] = amd_bytealign_S (in1[0], in0[3], 3); - out1[1] = amd_bytealign_S (in0[3], in0[2], 3); - out1[0] = amd_bytealign_S (in0[2], in0[1], 3); - out0[3] = amd_bytealign_S (in0[1], in0[0], 3); - out0[2] = amd_bytealign_S (in0[0], 0, 3); + case 9: out1[3] = hc_bytealign_S (in1[1], in1[0], 3); + out1[2] = hc_bytealign_S (in1[0], in0[3], 3); + out1[1] = hc_bytealign_S (in0[3], in0[2], 3); + out1[0] = hc_bytealign_S (in0[2], in0[1], 3); + out0[3] = hc_bytealign_S (in0[1], in0[0], 3); + out0[2] = hc_bytealign_S (in0[0], 0, 3); out0[1] = 0; out0[0] = 0; break; - case 10: out1[3] = amd_bytealign_S (in1[1], in1[0], 2); - out1[2] = amd_bytealign_S (in1[0], in0[3], 2); - out1[1] = amd_bytealign_S (in0[3], in0[2], 2); - out1[0] = amd_bytealign_S (in0[2], in0[1], 2); - out0[3] = amd_bytealign_S (in0[1], in0[0], 2); - out0[2] = amd_bytealign_S (in0[0], 0, 2); + case 10: out1[3] = hc_bytealign_S (in1[1], in1[0], 2); + out1[2] = hc_bytealign_S (in1[0], in0[3], 2); + out1[1] = hc_bytealign_S (in0[3], in0[2], 2); + out1[0] = hc_bytealign_S (in0[2], in0[1], 2); + out0[3] = hc_bytealign_S (in0[1], in0[0], 2); + out0[2] = hc_bytealign_S (in0[0], 0, 2); out0[1] = 0; out0[0] = 0; break; - case 11: out1[3] = amd_bytealign_S (in1[1], in1[0], 1); - out1[2] = amd_bytealign_S (in1[0], in0[3], 1); - out1[1] = amd_bytealign_S (in0[3], in0[2], 1); - out1[0] = amd_bytealign_S (in0[2], in0[1], 1); - out0[3] = amd_bytealign_S (in0[1], in0[0], 1); - out0[2] = amd_bytealign_S (in0[0], 0, 1); + case 11: out1[3] = hc_bytealign_S (in1[1], in1[0], 1); + out1[2] = hc_bytealign_S (in1[0], in0[3], 1); + out1[1] = hc_bytealign_S (in0[3], in0[2], 1); + out1[0] = hc_bytealign_S (in0[2], in0[1], 1); + out0[3] = hc_bytealign_S (in0[1], in0[0], 1); + out0[2] = hc_bytealign_S (in0[0], 0, 1); out0[1] = 0; out0[0] = 0; break; @@ -574,29 +574,29 @@ void rshift_block_N (const u32 *in0, const u32 *in1, u32 *out0, u32 *out1, const out0[1] = 0; out0[0] = 0; break; - case 13: out1[3] = amd_bytealign_S (in1[0], in0[3], 3); - out1[2] = amd_bytealign_S (in0[3], in0[2], 3); - out1[1] = amd_bytealign_S (in0[2], in0[1], 3); - out1[0] = amd_bytealign_S (in0[1], in0[0], 3); - out0[3] = amd_bytealign_S (in0[0], 0, 3); + case 13: out1[3] = hc_bytealign_S (in1[0], in0[3], 3); + out1[2] = hc_bytealign_S (in0[3], in0[2], 3); + out1[1] = hc_bytealign_S (in0[2], in0[1], 3); + out1[0] = hc_bytealign_S (in0[1], in0[0], 3); + out0[3] = hc_bytealign_S (in0[0], 0, 3); out0[2] = 0; out0[1] = 0; out0[0] = 0; break; - case 14: out1[3] = amd_bytealign_S (in1[0], in0[3], 2); - out1[2] = amd_bytealign_S (in0[3], in0[2], 2); - out1[1] = amd_bytealign_S (in0[2], in0[1], 2); - out1[0] = amd_bytealign_S (in0[1], in0[0], 2); - out0[3] = amd_bytealign_S (in0[0], 0, 2); + case 14: out1[3] = hc_bytealign_S (in1[0], in0[3], 2); + out1[2] = hc_bytealign_S (in0[3], in0[2], 2); + out1[1] = hc_bytealign_S (in0[2], in0[1], 2); + out1[0] = hc_bytealign_S (in0[1], in0[0], 2); + out0[3] = hc_bytealign_S (in0[0], 0, 2); out0[2] = 0; out0[1] = 0; out0[0] = 0; break; - case 15: out1[3] = amd_bytealign_S (in1[0], in0[3], 1); - out1[2] = amd_bytealign_S (in0[3], in0[2], 1); - out1[1] = amd_bytealign_S (in0[2], in0[1], 1); - out1[0] = amd_bytealign_S (in0[1], in0[0], 1); - out0[3] = amd_bytealign_S (in0[0], 0, 1); + case 15: out1[3] = hc_bytealign_S (in1[0], in0[3], 1); + out1[2] = hc_bytealign_S (in0[3], in0[2], 1); + out1[1] = hc_bytealign_S (in0[2], in0[1], 1); + out1[0] = hc_bytealign_S (in0[1], in0[0], 1); + out0[3] = hc_bytealign_S (in0[0], 0, 1); out0[2] = 0; out0[1] = 0; out0[0] = 0; @@ -610,28 +610,28 @@ void rshift_block_N (const u32 *in0, const u32 *in1, u32 *out0, u32 *out1, const out0[1] = 0; out0[0] = 0; break; - case 17: out1[3] = amd_bytealign_S (in0[3], in0[2], 3); - out1[2] = amd_bytealign_S (in0[2], in0[1], 3); - out1[1] = amd_bytealign_S (in0[1], in0[0], 3); - out1[0] = amd_bytealign_S (in0[0], 0, 3); + case 17: out1[3] = hc_bytealign_S (in0[3], in0[2], 3); + out1[2] = hc_bytealign_S (in0[2], in0[1], 3); + out1[1] = hc_bytealign_S (in0[1], in0[0], 3); + out1[0] = hc_bytealign_S (in0[0], 0, 3); out0[3] = 0; out0[2] = 0; out0[1] = 0; out0[0] = 0; break; - case 18: out1[3] = amd_bytealign_S (in0[3], in0[2], 2); - out1[2] = amd_bytealign_S (in0[2], in0[1], 2); - out1[1] = amd_bytealign_S (in0[1], in0[0], 2); - out1[0] = amd_bytealign_S (in0[0], 0, 2); + case 18: out1[3] = hc_bytealign_S (in0[3], in0[2], 2); + out1[2] = hc_bytealign_S (in0[2], in0[1], 2); + out1[1] = hc_bytealign_S (in0[1], in0[0], 2); + out1[0] = hc_bytealign_S (in0[0], 0, 2); out0[3] = 0; out0[2] = 0; out0[1] = 0; out0[0] = 0; break; - case 19: out1[3] = amd_bytealign_S (in0[3], in0[2], 1); - out1[2] = amd_bytealign_S (in0[2], in0[1], 1); - out1[1] = amd_bytealign_S (in0[1], in0[0], 1); - out1[0] = amd_bytealign_S (in0[0], 0, 1); + case 19: out1[3] = hc_bytealign_S (in0[3], in0[2], 1); + out1[2] = hc_bytealign_S (in0[2], in0[1], 1); + out1[1] = hc_bytealign_S (in0[1], in0[0], 1); + out1[0] = hc_bytealign_S (in0[0], 0, 1); out0[3] = 0; out0[2] = 0; out0[1] = 0; @@ -646,27 +646,27 @@ void rshift_block_N (const u32 *in0, const u32 *in1, u32 *out0, u32 *out1, const out0[1] = 0; out0[0] = 0; break; - case 21: out1[3] = amd_bytealign_S (in0[2], in0[1], 3); - out1[2] = amd_bytealign_S (in0[1], in0[0], 3); - out1[1] = amd_bytealign_S (in0[0], 0, 3); + case 21: out1[3] = hc_bytealign_S (in0[2], in0[1], 3); + out1[2] = hc_bytealign_S (in0[1], in0[0], 3); + out1[1] = hc_bytealign_S (in0[0], 0, 3); out1[0] = 0; out0[3] = 0; out0[2] = 0; out0[1] = 0; out0[0] = 0; break; - case 22: out1[3] = amd_bytealign_S (in0[2], in0[1], 2); - out1[2] = amd_bytealign_S (in0[1], in0[0], 2); - out1[1] = amd_bytealign_S (in0[0], 0, 2); + case 22: out1[3] = hc_bytealign_S (in0[2], in0[1], 2); + out1[2] = hc_bytealign_S (in0[1], in0[0], 2); + out1[1] = hc_bytealign_S (in0[0], 0, 2); out1[0] = 0; out0[3] = 0; out0[2] = 0; out0[1] = 0; out0[0] = 0; break; - case 23: out1[3] = amd_bytealign_S (in0[2], in0[1], 1); - out1[2] = amd_bytealign_S (in0[1], in0[0], 1); - out1[1] = amd_bytealign_S (in0[0], 0, 1); + case 23: out1[3] = hc_bytealign_S (in0[2], in0[1], 1); + out1[2] = hc_bytealign_S (in0[1], in0[0], 1); + out1[1] = hc_bytealign_S (in0[0], 0, 1); out1[0] = 0; out0[3] = 0; out0[2] = 0; @@ -682,8 +682,8 @@ void rshift_block_N (const u32 *in0, const u32 *in1, u32 *out0, u32 *out1, const out0[1] = 0; out0[0] = 0; break; - case 25: out1[3] = amd_bytealign_S (in0[1], in0[0], 3); - out1[2] = amd_bytealign_S (in0[0], 0, 3); + case 25: out1[3] = hc_bytealign_S (in0[1], in0[0], 3); + out1[2] = hc_bytealign_S (in0[0], 0, 3); out1[1] = 0; out1[0] = 0; out0[3] = 0; @@ -691,8 +691,8 @@ void rshift_block_N (const u32 *in0, const u32 *in1, u32 *out0, u32 *out1, const out0[1] = 0; out0[0] = 0; break; - case 26: out1[3] = amd_bytealign_S (in0[1], in0[0], 2); - out1[2] = amd_bytealign_S (in0[0], 0, 2); + case 26: out1[3] = hc_bytealign_S (in0[1], in0[0], 2); + out1[2] = hc_bytealign_S (in0[0], 0, 2); out1[1] = 0; out1[0] = 0; out0[3] = 0; @@ -700,8 +700,8 @@ void rshift_block_N (const u32 *in0, const u32 *in1, u32 *out0, u32 *out1, const out0[1] = 0; out0[0] = 0; break; - case 27: out1[3] = amd_bytealign_S (in0[1], in0[0], 1); - out1[2] = amd_bytealign_S (in0[0], 0, 1); + case 27: out1[3] = hc_bytealign_S (in0[1], in0[0], 1); + out1[2] = hc_bytealign_S (in0[0], 0, 1); out1[1] = 0; out1[0] = 0; out0[3] = 0; @@ -718,7 +718,7 @@ void rshift_block_N (const u32 *in0, const u32 *in1, u32 *out0, u32 *out1, const out0[1] = 0; out0[0] = 0; break; - case 29: out1[3] = amd_bytealign_S (in0[0], 0, 3); + case 29: out1[3] = hc_bytealign_S (in0[0], 0, 3); out1[2] = 0; out1[1] = 0; out1[0] = 0; @@ -727,7 +727,7 @@ void rshift_block_N (const u32 *in0, const u32 *in1, u32 *out0, u32 *out1, const out0[1] = 0; out0[0] = 0; break; - case 30: out1[3] = amd_bytealign_S (in0[0], 0, 2); + case 30: out1[3] = hc_bytealign_S (in0[0], 0, 2); out1[2] = 0; out1[1] = 0; out1[0] = 0; @@ -736,7 +736,7 @@ void rshift_block_N (const u32 *in0, const u32 *in1, u32 *out0, u32 *out1, const out0[1] = 0; out0[0] = 0; break; - case 31: out1[3] = amd_bytealign_S (in0[0], 0, 1); + case 31: out1[3] = hc_bytealign_S (in0[0], 0, 1); out1[2] = 0; out1[1] = 0; out1[0] = 0; @@ -803,44 +803,44 @@ void append_block8 (const u32 offset, u32 *buf0, u32 *buf1, const u32 *src_l0, c switch (offset_switch) { case 0: - s7 = amd_bytealign_S (src_r12, src_r13, offset); - s6 = amd_bytealign_S (src_r11, src_r12, offset); - s5 = amd_bytealign_S (src_r10, src_r11, offset); - s4 = amd_bytealign_S (src_r03, src_r10, offset); - s3 = amd_bytealign_S (src_r02, src_r03, offset); - s2 = amd_bytealign_S (src_r01, src_r02, offset); - s1 = amd_bytealign_S (src_r00, src_r01, offset); - s0 = amd_bytealign_S ( 0, src_r00, offset); + s7 = hc_bytealign_S (src_r12, src_r13, offset); + s6 = hc_bytealign_S (src_r11, src_r12, offset); + s5 = hc_bytealign_S (src_r10, src_r11, offset); + s4 = hc_bytealign_S (src_r03, src_r10, offset); + s3 = hc_bytealign_S (src_r02, src_r03, offset); + s2 = hc_bytealign_S (src_r01, src_r02, offset); + s1 = hc_bytealign_S (src_r00, src_r01, offset); + s0 = hc_bytealign_S ( 0, src_r00, offset); break; case 1: - s7 = amd_bytealign_S (src_r11, src_r12, offset); - s6 = amd_bytealign_S (src_r10, src_r11, offset); - s5 = amd_bytealign_S (src_r03, src_r10, offset); - s4 = amd_bytealign_S (src_r02, src_r03, offset); - s3 = amd_bytealign_S (src_r01, src_r02, offset); - s2 = amd_bytealign_S (src_r00, src_r01, offset); - s1 = amd_bytealign_S ( 0, src_r00, offset); + s7 = hc_bytealign_S (src_r11, src_r12, offset); + s6 = hc_bytealign_S (src_r10, src_r11, offset); + s5 = hc_bytealign_S (src_r03, src_r10, offset); + s4 = hc_bytealign_S (src_r02, src_r03, offset); + s3 = hc_bytealign_S (src_r01, src_r02, offset); + s2 = hc_bytealign_S (src_r00, src_r01, offset); + s1 = hc_bytealign_S ( 0, src_r00, offset); s0 = 0; break; case 2: - s7 = amd_bytealign_S (src_r10, src_r11, offset); - s6 = amd_bytealign_S (src_r03, src_r10, offset); - s5 = amd_bytealign_S (src_r02, src_r03, offset); - s4 = amd_bytealign_S (src_r01, src_r02, offset); - s3 = amd_bytealign_S (src_r00, src_r01, offset); - s2 = amd_bytealign_S ( 0, src_r00, offset); + s7 = hc_bytealign_S (src_r10, src_r11, offset); + s6 = hc_bytealign_S (src_r03, src_r10, offset); + s5 = hc_bytealign_S (src_r02, src_r03, offset); + s4 = hc_bytealign_S (src_r01, src_r02, offset); + s3 = hc_bytealign_S (src_r00, src_r01, offset); + s2 = hc_bytealign_S ( 0, src_r00, offset); s1 = 0; s0 = 0; break; case 3: - s7 = amd_bytealign_S (src_r03, src_r10, offset); - s6 = amd_bytealign_S (src_r02, src_r03, offset); - s5 = amd_bytealign_S (src_r01, src_r02, offset); - s4 = amd_bytealign_S (src_r00, src_r01, offset); - s3 = amd_bytealign_S ( 0, src_r00, offset); + s7 = hc_bytealign_S (src_r03, src_r10, offset); + s6 = hc_bytealign_S (src_r02, src_r03, offset); + s5 = hc_bytealign_S (src_r01, src_r02, offset); + s4 = hc_bytealign_S (src_r00, src_r01, offset); + s3 = hc_bytealign_S ( 0, src_r00, offset); s2 = 0; s1 = 0; s0 = 0; @@ -848,10 +848,10 @@ void append_block8 (const u32 offset, u32 *buf0, u32 *buf1, const u32 *src_l0, c break; case 4: - s7 = amd_bytealign_S (src_r02, src_r03, offset); - s6 = amd_bytealign_S (src_r01, src_r02, offset); - s5 = amd_bytealign_S (src_r00, src_r01, offset); - s4 = amd_bytealign_S ( 0, src_r00, offset); + s7 = hc_bytealign_S (src_r02, src_r03, offset); + s6 = hc_bytealign_S (src_r01, src_r02, offset); + s5 = hc_bytealign_S (src_r00, src_r01, offset); + s4 = hc_bytealign_S ( 0, src_r00, offset); s3 = 0; s2 = 0; s1 = 0; @@ -859,9 +859,9 @@ void append_block8 (const u32 offset, u32 *buf0, u32 *buf1, const u32 *src_l0, c break; case 5: - s7 = amd_bytealign_S (src_r01, src_r02, offset); - s6 = amd_bytealign_S (src_r00, src_r01, offset); - s5 = amd_bytealign_S ( 0, src_r00, offset); + s7 = hc_bytealign_S (src_r01, src_r02, offset); + s6 = hc_bytealign_S (src_r00, src_r01, offset); + s5 = hc_bytealign_S ( 0, src_r00, offset); s4 = 0; s3 = 0; s2 = 0; @@ -870,8 +870,8 @@ void append_block8 (const u32 offset, u32 *buf0, u32 *buf1, const u32 *src_l0, c break; case 6: - s7 = amd_bytealign_S (src_r00, src_r01, offset); - s6 = amd_bytealign_S ( 0, src_r00, offset); + s7 = hc_bytealign_S (src_r00, src_r01, offset); + s6 = hc_bytealign_S ( 0, src_r00, offset); s5 = 0; s4 = 0; s3 = 0; @@ -881,7 +881,7 @@ void append_block8 (const u32 offset, u32 *buf0, u32 *buf1, const u32 *src_l0, c break; case 7: - s7 = amd_bytealign_S ( 0, src_r00, offset); + s7 = hc_bytealign_S ( 0, src_r00, offset); s6 = 0; s5 = 0; s4 = 0; @@ -928,44 +928,44 @@ void append_block8 (const u32 offset, u32 *buf0, u32 *buf1, const u32 *src_l0, c switch (offset_switch) { case 0: - s7 = __byte_perm_S (src_r12, src_r13, selector); - s6 = __byte_perm_S (src_r11, src_r12, selector); - s5 = __byte_perm_S (src_r10, src_r11, selector); - s4 = __byte_perm_S (src_r03, src_r10, selector); - s3 = __byte_perm_S (src_r02, src_r03, selector); - s2 = __byte_perm_S (src_r01, src_r02, selector); - s1 = __byte_perm_S (src_r00, src_r01, selector); - s0 = __byte_perm_S ( 0, src_r00, selector); + s7 = hc_byte_perm_S (src_r12, src_r13, selector); + s6 = hc_byte_perm_S (src_r11, src_r12, selector); + s5 = hc_byte_perm_S (src_r10, src_r11, selector); + s4 = hc_byte_perm_S (src_r03, src_r10, selector); + s3 = hc_byte_perm_S (src_r02, src_r03, selector); + s2 = hc_byte_perm_S (src_r01, src_r02, selector); + s1 = hc_byte_perm_S (src_r00, src_r01, selector); + s0 = hc_byte_perm_S ( 0, src_r00, selector); break; case 1: - s7 = __byte_perm_S (src_r11, src_r12, selector); - s6 = __byte_perm_S (src_r10, src_r11, selector); - s5 = __byte_perm_S (src_r03, src_r10, selector); - s4 = __byte_perm_S (src_r02, src_r03, selector); - s3 = __byte_perm_S (src_r01, src_r02, selector); - s2 = __byte_perm_S (src_r00, src_r01, selector); - s1 = __byte_perm_S ( 0, src_r00, selector); + s7 = hc_byte_perm_S (src_r11, src_r12, selector); + s6 = hc_byte_perm_S (src_r10, src_r11, selector); + s5 = hc_byte_perm_S (src_r03, src_r10, selector); + s4 = hc_byte_perm_S (src_r02, src_r03, selector); + s3 = hc_byte_perm_S (src_r01, src_r02, selector); + s2 = hc_byte_perm_S (src_r00, src_r01, selector); + s1 = hc_byte_perm_S ( 0, src_r00, selector); s0 = 0; break; case 2: - s7 = __byte_perm_S (src_r10, src_r11, selector); - s6 = __byte_perm_S (src_r03, src_r10, selector); - s5 = __byte_perm_S (src_r02, src_r03, selector); - s4 = __byte_perm_S (src_r01, src_r02, selector); - s3 = __byte_perm_S (src_r00, src_r01, selector); - s2 = __byte_perm_S ( 0, src_r00, selector); + s7 = hc_byte_perm_S (src_r10, src_r11, selector); + s6 = hc_byte_perm_S (src_r03, src_r10, selector); + s5 = hc_byte_perm_S (src_r02, src_r03, selector); + s4 = hc_byte_perm_S (src_r01, src_r02, selector); + s3 = hc_byte_perm_S (src_r00, src_r01, selector); + s2 = hc_byte_perm_S ( 0, src_r00, selector); s1 = 0; s0 = 0; break; case 3: - s7 = __byte_perm_S (src_r03, src_r10, selector); - s6 = __byte_perm_S (src_r02, src_r03, selector); - s5 = __byte_perm_S (src_r01, src_r02, selector); - s4 = __byte_perm_S (src_r00, src_r01, selector); - s3 = __byte_perm_S ( 0, src_r00, selector); + s7 = hc_byte_perm_S (src_r03, src_r10, selector); + s6 = hc_byte_perm_S (src_r02, src_r03, selector); + s5 = hc_byte_perm_S (src_r01, src_r02, selector); + s4 = hc_byte_perm_S (src_r00, src_r01, selector); + s3 = hc_byte_perm_S ( 0, src_r00, selector); s2 = 0; s1 = 0; s0 = 0; @@ -973,10 +973,10 @@ void append_block8 (const u32 offset, u32 *buf0, u32 *buf1, const u32 *src_l0, c break; case 4: - s7 = __byte_perm_S (src_r02, src_r03, selector); - s6 = __byte_perm_S (src_r01, src_r02, selector); - s5 = __byte_perm_S (src_r00, src_r01, selector); - s4 = __byte_perm_S ( 0, src_r00, selector); + s7 = hc_byte_perm_S (src_r02, src_r03, selector); + s6 = hc_byte_perm_S (src_r01, src_r02, selector); + s5 = hc_byte_perm_S (src_r00, src_r01, selector); + s4 = hc_byte_perm_S ( 0, src_r00, selector); s3 = 0; s2 = 0; s1 = 0; @@ -984,9 +984,9 @@ void append_block8 (const u32 offset, u32 *buf0, u32 *buf1, const u32 *src_l0, c break; case 5: - s7 = __byte_perm_S (src_r01, src_r02, selector); - s6 = __byte_perm_S (src_r00, src_r01, selector); - s5 = __byte_perm_S ( 0, src_r00, selector); + s7 = hc_byte_perm_S (src_r01, src_r02, selector); + s6 = hc_byte_perm_S (src_r00, src_r01, selector); + s5 = hc_byte_perm_S ( 0, src_r00, selector); s4 = 0; s3 = 0; s2 = 0; @@ -995,8 +995,8 @@ void append_block8 (const u32 offset, u32 *buf0, u32 *buf1, const u32 *src_l0, c break; case 6: - s7 = __byte_perm_S (src_r00, src_r01, selector); - s6 = __byte_perm_S ( 0, src_r00, selector); + s7 = hc_byte_perm_S (src_r00, src_r01, selector); + s6 = hc_byte_perm_S ( 0, src_r00, selector); s5 = 0; s4 = 0; s3 = 0; @@ -1006,7 +1006,7 @@ void append_block8 (const u32 offset, u32 *buf0, u32 *buf1, const u32 *src_l0, c break; case 7: - s7 = __byte_perm_S ( 0, src_r00, selector); + s7 = hc_byte_perm_S ( 0, src_r00, selector); s6 = 0; s5 = 0; s4 = 0; @@ -1681,10 +1681,10 @@ u32 search_on_register (const u32 in, const u32 p0) { u32 r = 0; - if (__bfe_S (in, 0, 8) == p0) r |= 1; - if (__bfe_S (in, 8, 8) == p0) r |= 2; - if (__bfe_S (in, 16, 8) == p0) r |= 4; - if (__bfe_S (in, 24, 8) == p0) r |= 8; + if (hc_bfe_S (in, 0, 8) == p0) r |= 1; + if (hc_bfe_S (in, 8, 8) == p0) r |= 2; + if (hc_bfe_S (in, 16, 8) == p0) r |= 4; + if (hc_bfe_S (in, 24, 8) == p0) r |= 8; return r; } diff --git a/OpenCL/inc_types.cl b/OpenCL/inc_types.cl index 85501ccf1..3f19b82ed 100644 --- a/OpenCL/inc_types.cl +++ b/OpenCL/inc_types.cl @@ -351,23 +351,28 @@ DECLSPEC u64x rotl64 (const u64x a, const u32 n) return rotr64 (a, 64 - n); } -DECLSPEC u32x __bfe (const u32x a, const u32x b, const u32x c) +DECLSPEC u32x hc_bfe (const u32x a, const u32x b, const u32x c) { return amd_bfe (a, b, c); } -DECLSPEC u32 __bfe_S (const u32 a, const u32 b, const u32 c) +DECLSPEC u32 hc_bfe_S (const u32 a, const u32 b, const u32 c) { return amd_bfe (a, b, c); } -DECLSPEC u32 amd_bytealign_S (const u32 a, const u32 b, const u32 c) +DECLSPEC u32 hc_bytealign (const u32x a, const u32x b, const u32x c) +{ + return amd_bytealign (a, b, c); +} + +DECLSPEC u32 hc_bytealign_S (const u32 a, const u32 b, const u32 c) { return amd_bytealign (a, b, c); } #if AMD_GCN >= 3 -DECLSPEC u32x __byte_perm (const u32x a, const u32x b, const u32x c) +DECLSPEC u32x hc_byte_perm (const u32x a, const u32x b, const u32x c) { u32x r; @@ -420,7 +425,7 @@ DECLSPEC u32x __byte_perm (const u32x a, const u32x b, const u32x c) return r; } -DECLSPEC u32 __byte_perm_S (const u32 a, const u32 b, const u32 c) +DECLSPEC u32 hc_byte_perm_S (const u32 a, const u32 b, const u32 c) { u32 r; @@ -431,7 +436,7 @@ DECLSPEC u32 __byte_perm_S (const u32 a, const u32 b, const u32 c) #endif #if AMD_GCN >= 5 -DECLSPEC u32x __add3 (const u32x a, const u32x b, const u32x c) +DECLSPEC u32x hc_add3 (const u32x a, const u32x b, const u32x c) { u32x r; @@ -484,7 +489,7 @@ DECLSPEC u32x __add3 (const u32x a, const u32x b, const u32x c) return r; } -DECLSPEC u32 __add3_S (const u32 a, const u32 b, const u32 c) +DECLSPEC u32 hc_add3_S (const u32 a, const u32 b, const u32 c) { u32 r; @@ -493,12 +498,12 @@ DECLSPEC u32 __add3_S (const u32 a, const u32 b, const u32 c) return r; } #else -DECLSPEC u32x __add3 (const u32x a, const u32x b, const u32x c) +DECLSPEC u32x hc_add3 (const u32x a, const u32x b, const u32x c) { return a + b + c; } -DECLSPEC u32 __add3_S (const u32 a, const u32 b, const u32 c) +DECLSPEC u32 hc_add3_S (const u32 a, const u32 b, const u32 c) { return a + b + c; } @@ -741,7 +746,7 @@ DECLSPEC u64x rotl64 (const u64x a, const u32 n) return rotate (a, (u64x) n); } -DECLSPEC u32x __byte_perm (const u32x a, const u32x b, const u32x c) +DECLSPEC u32x hc_byte_perm (const u32x a, const u32x b, const u32x c) { u32x r; @@ -780,7 +785,7 @@ DECLSPEC u32x __byte_perm (const u32x a, const u32x b, const u32x c) return r; } -DECLSPEC u32 __byte_perm_S (const u32 a, const u32 b, const u32 c) +DECLSPEC u32 hc_byte_perm_S (const u32 a, const u32 b, const u32 c) { u32 r; @@ -789,7 +794,7 @@ DECLSPEC u32 __byte_perm_S (const u32 a, const u32 b, const u32 c) return r; } -DECLSPEC u32x __bfe (const u32x a, const u32x b, const u32x c) +DECLSPEC u32x hc_bfe (const u32x a, const u32x b, const u32x c) { u32x r; @@ -828,7 +833,7 @@ DECLSPEC u32x __bfe (const u32x a, const u32x b, const u32x c) return r; } -DECLSPEC u32 __bfe_S (const u32 a, const u32 b, const u32 c) +DECLSPEC u32 hc_bfe_S (const u32 a, const u32 b, const u32 c) { u32 r; @@ -837,7 +842,7 @@ DECLSPEC u32 __bfe_S (const u32 a, const u32 b, const u32 c) return r; } -DECLSPEC u32x amd_bytealign (const u32x a, const u32x b, const u32x c) +DECLSPEC u32x hc_bytealign (const u32x a, const u32x b, const u32x c) { u32x r; @@ -877,14 +882,14 @@ DECLSPEC u32x amd_bytealign (const u32x a, const u32x b, const u32x c) #else - r = __byte_perm (b, a, ((u32x) (0x76543210) >> ((c & 3) * 4)) & 0xffff); + r = hc_byte_perm (b, a, ((u32x) (0x76543210) >> ((c & 3) * 4)) & 0xffff); #endif return r; } -DECLSPEC u32 amd_bytealign_S (const u32 a, const u32 b, const u32 c) +DECLSPEC u32 hc_bytealign_S (const u32 a, const u32 b, const u32 c) { u32 r; @@ -894,19 +899,19 @@ DECLSPEC u32 amd_bytealign_S (const u32 a, const u32 b, const u32 c) #else - r = __byte_perm_S (b, a, (0x76543210 >> ((c & 3) * 4)) & 0xffff); + r = hc_byte_perm_S (b, a, (0x76543210 >> ((c & 3) * 4)) & 0xffff); #endif return r; } -DECLSPEC u32x __add3 (const u32x a, const u32x b, const u32x c) +DECLSPEC u32x hc_add3 (const u32x a, const u32x b, const u32x c) { return a + b + c; } -DECLSPEC u32 __add3_S (const u32 a, const u32 b, const u32 c) +DECLSPEC u32 hc_add3_S (const u32 a, const u32 b, const u32 c) { return a + b + c; } @@ -984,7 +989,7 @@ DECLSPEC u64x rotl64 (const u64x a, const u32 n) return rotate (a, (u64x) n); } -DECLSPEC u32x __bfe (const u32x a, const u32x b, const u32x c) +DECLSPEC u32x hc_bfe (const u32x a, const u32x b, const u32x c) { #define BIT(x) ((u32x) (1u) << (x)) #define BIT_MASK(x) (BIT (x) - 1) @@ -997,7 +1002,7 @@ DECLSPEC u32x __bfe (const u32x a, const u32x b, const u32x c) #undef BFE } -DECLSPEC u32 __bfe_S (const u32 a, const u32 b, const u32 c) +DECLSPEC u32 hc_bfe_S (const u32 a, const u32 b, const u32 c) { #define BIT(x) (1u << (x)) #define BIT_MASK(x) (BIT (x) - 1) @@ -1010,7 +1015,7 @@ DECLSPEC u32 __bfe_S (const u32 a, const u32 b, const u32 c) #undef BFE } -DECLSPEC u32x amd_bytealign (const u32x a, const u32x b, const u32 c) +DECLSPEC u32x hc_bytealign (const u32x a, const u32x b, const u32 c) { #if VECT_SIZE == 1 const u64x tmp = ((((u64x) (a)) << 32) | ((u64x) (b))) >> ((c & 3) * 8); @@ -1043,19 +1048,19 @@ DECLSPEC u32x amd_bytealign (const u32x a, const u32x b, const u32 c) #endif } -DECLSPEC u32 amd_bytealign_S (const u32 a, const u32 b, const u32 c) +DECLSPEC u32 hc_bytealign_S (const u32 a, const u32 b, const u32 c) { const u64 tmp = ((((u64) a) << 32) | ((u64) b)) >> ((c & 3) * 8); return (u32) (tmp); } -DECLSPEC u32x __add3 (const u32x a, const u32x b, const u32x c) +DECLSPEC u32x hc_add3 (const u32x a, const u32x b, const u32x c) { return a + b + c; } -DECLSPEC u32 __add3_S (const u32 a, const u32 b, const u32 c) +DECLSPEC u32 hc_add3_S (const u32 a, const u32 b, const u32 c) { return a + b + c; } diff --git a/OpenCL/inc_vendor.cl b/OpenCL/inc_vendor.cl index 0e35b6198..d488f1e8c 100644 --- a/OpenCL/inc_vendor.cl +++ b/OpenCL/inc_vendor.cl @@ -101,9 +101,13 @@ */ #ifdef IS_AMD +#if defined(cl_amd_media_ops) #pragma OPENCL EXTENSION cl_amd_media_ops : enable +#endif +#if defined(cl_amd_media_ops2) #pragma OPENCL EXTENSION cl_amd_media_ops2 : enable #endif +#endif /** * Unrolling is generally enabled, for all device types and hash modes diff --git a/OpenCL/m00500-optimized.cl b/OpenCL/m00500-optimized.cl index 50fbbc1b3..c171f20f2 100644 --- a/OpenCL/m00500-optimized.cl +++ b/OpenCL/m00500-optimized.cl @@ -35,11 +35,11 @@ DECLSPEC void memcat16 (u32 *block0, u32 *block1, u32 *block2, u32 *block3, cons u32 in2 = swap32_S (append[2]); u32 in3 = swap32_S (append[3]); - tmp0 = amd_bytealign ( 0, in0, offset); - tmp1 = amd_bytealign (in0, in1, offset); - tmp2 = amd_bytealign (in1, in2, offset); - tmp3 = amd_bytealign (in2, in3, offset); - tmp4 = amd_bytealign (in3, 0, offset); + tmp0 = hc_bytealign ( 0, in0, offset); + tmp1 = hc_bytealign (in0, in1, offset); + tmp2 = hc_bytealign (in1, in2, offset); + tmp3 = hc_bytealign (in2, in3, offset); + tmp4 = hc_bytealign (in3, 0, offset); tmp0 = swap32_S (tmp0); tmp1 = swap32_S (tmp1); @@ -56,11 +56,11 @@ DECLSPEC void memcat16 (u32 *block0, u32 *block1, u32 *block2, u32 *block3, cons u32 in2 = append[2]; u32 in3 = append[3]; - tmp0 = __byte_perm ( 0, in0, selector); - tmp1 = __byte_perm (in0, in1, selector); - tmp2 = __byte_perm (in1, in2, selector); - tmp3 = __byte_perm (in2, in3, selector); - tmp4 = __byte_perm (in3, 0, selector); + tmp0 = hc_byte_perm ( 0, in0, selector); + tmp1 = hc_byte_perm (in0, in1, selector); + tmp2 = hc_byte_perm (in1, in2, selector); + tmp3 = hc_byte_perm (in2, in3, selector); + tmp4 = hc_byte_perm (in3, 0, selector); #endif const u32 div = offset / 4; @@ -149,11 +149,11 @@ DECLSPEC void memcat16_x80 (u32 *block0, u32 *block1, u32 *block2, u32 *block3, u32 in3 = swap32_S (append[3]); u32 in4 = 0x80000000; - tmp0 = amd_bytealign ( 0, in0, offset); - tmp1 = amd_bytealign (in0, in1, offset); - tmp2 = amd_bytealign (in1, in2, offset); - tmp3 = amd_bytealign (in2, in3, offset); - tmp4 = amd_bytealign (in3, in4, offset); + tmp0 = hc_bytealign ( 0, in0, offset); + tmp1 = hc_bytealign (in0, in1, offset); + tmp2 = hc_bytealign (in1, in2, offset); + tmp3 = hc_bytealign (in2, in3, offset); + tmp4 = hc_bytealign (in3, in4, offset); tmp0 = swap32_S (tmp0); tmp1 = swap32_S (tmp1); @@ -171,11 +171,11 @@ DECLSPEC void memcat16_x80 (u32 *block0, u32 *block1, u32 *block2, u32 *block3, u32 in3 = append[3]; u32 in4 = 0x80; - tmp0 = __byte_perm ( 0, in0, selector); - tmp1 = __byte_perm (in0, in1, selector); - tmp2 = __byte_perm (in1, in2, selector); - tmp3 = __byte_perm (in2, in3, selector); - tmp4 = __byte_perm (in3, in4, selector); + tmp0 = hc_byte_perm ( 0, in0, selector); + tmp1 = hc_byte_perm (in0, in1, selector); + tmp2 = hc_byte_perm (in1, in2, selector); + tmp3 = hc_byte_perm (in2, in3, selector); + tmp4 = hc_byte_perm (in3, in4, selector); #endif const u32 div = offset / 4; @@ -259,9 +259,9 @@ DECLSPEC void memcat8 (u32 *block0, u32 *block1, u32 *block2, u32 *block3, const u32 in0 = swap32_S (append[0]); u32 in1 = swap32_S (append[1]); - tmp0 = amd_bytealign ( 0, in0, offset); - tmp1 = amd_bytealign (in0, in1, offset); - tmp2 = amd_bytealign (in1, 0, offset); + tmp0 = hc_bytealign ( 0, in0, offset); + tmp1 = hc_bytealign (in0, in1, offset); + tmp2 = hc_bytealign (in1, 0, offset); tmp0 = swap32_S (tmp0); tmp1 = swap32_S (tmp1); @@ -274,9 +274,9 @@ DECLSPEC void memcat8 (u32 *block0, u32 *block1, u32 *block2, u32 *block3, const u32 in0 = append[0]; u32 in1 = append[1]; - tmp0 = __byte_perm ( 0, in0, selector); - tmp1 = __byte_perm (in0, in1, selector); - tmp2 = __byte_perm (in1, 0, selector); + tmp0 = hc_byte_perm ( 0, in0, selector); + tmp1 = hc_byte_perm (in0, in1, selector); + tmp2 = hc_byte_perm (in1, 0, selector); #endif const u32 div = offset / 4; diff --git a/OpenCL/m01600-optimized.cl b/OpenCL/m01600-optimized.cl index af874359d..0c84e2ff0 100644 --- a/OpenCL/m01600-optimized.cl +++ b/OpenCL/m01600-optimized.cl @@ -34,11 +34,11 @@ DECLSPEC void memcat16 (u32 *block0, u32 *block1, u32 *block2, u32 *block3, cons u32 in2 = swap32_S (append[2]); u32 in3 = swap32_S (append[3]); - tmp0 = amd_bytealign ( 0, in0, offset); - tmp1 = amd_bytealign (in0, in1, offset); - tmp2 = amd_bytealign (in1, in2, offset); - tmp3 = amd_bytealign (in2, in3, offset); - tmp4 = amd_bytealign (in3, 0, offset); + tmp0 = hc_bytealign ( 0, in0, offset); + tmp1 = hc_bytealign (in0, in1, offset); + tmp2 = hc_bytealign (in1, in2, offset); + tmp3 = hc_bytealign (in2, in3, offset); + tmp4 = hc_bytealign (in3, 0, offset); tmp0 = swap32_S (tmp0); tmp1 = swap32_S (tmp1); @@ -55,11 +55,11 @@ DECLSPEC void memcat16 (u32 *block0, u32 *block1, u32 *block2, u32 *block3, cons u32 in2 = append[2]; u32 in3 = append[3]; - tmp0 = __byte_perm ( 0, in0, selector); - tmp1 = __byte_perm (in0, in1, selector); - tmp2 = __byte_perm (in1, in2, selector); - tmp3 = __byte_perm (in2, in3, selector); - tmp4 = __byte_perm (in3, 0, selector); + tmp0 = hc_byte_perm ( 0, in0, selector); + tmp1 = hc_byte_perm (in0, in1, selector); + tmp2 = hc_byte_perm (in1, in2, selector); + tmp3 = hc_byte_perm (in2, in3, selector); + tmp4 = hc_byte_perm (in3, 0, selector); #endif const u32 div = offset / 4; @@ -148,11 +148,11 @@ DECLSPEC void memcat16_x80 (u32 *block0, u32 *block1, u32 *block2, u32 *block3, u32 in3 = swap32_S (append[3]); u32 in4 = 0x80000000; - tmp0 = amd_bytealign ( 0, in0, offset); - tmp1 = amd_bytealign (in0, in1, offset); - tmp2 = amd_bytealign (in1, in2, offset); - tmp3 = amd_bytealign (in2, in3, offset); - tmp4 = amd_bytealign (in3, in4, offset); + tmp0 = hc_bytealign ( 0, in0, offset); + tmp1 = hc_bytealign (in0, in1, offset); + tmp2 = hc_bytealign (in1, in2, offset); + tmp3 = hc_bytealign (in2, in3, offset); + tmp4 = hc_bytealign (in3, in4, offset); tmp0 = swap32_S (tmp0); tmp1 = swap32_S (tmp1); @@ -170,11 +170,11 @@ DECLSPEC void memcat16_x80 (u32 *block0, u32 *block1, u32 *block2, u32 *block3, u32 in3 = append[3]; u32 in4 = 0x80; - tmp0 = __byte_perm ( 0, in0, selector); - tmp1 = __byte_perm (in0, in1, selector); - tmp2 = __byte_perm (in1, in2, selector); - tmp3 = __byte_perm (in2, in3, selector); - tmp4 = __byte_perm (in3, in4, selector); + tmp0 = hc_byte_perm ( 0, in0, selector); + tmp1 = hc_byte_perm (in0, in1, selector); + tmp2 = hc_byte_perm (in1, in2, selector); + tmp3 = hc_byte_perm (in2, in3, selector); + tmp4 = hc_byte_perm (in3, in4, selector); #endif const u32 div = offset / 4; @@ -258,9 +258,9 @@ DECLSPEC void memcat8 (u32 *block0, u32 *block1, u32 *block2, u32 *block3, const u32 in0 = swap32_S (append[0]); u32 in1 = swap32_S (append[1]); - tmp0 = amd_bytealign ( 0, in0, offset); - tmp1 = amd_bytealign (in0, in1, offset); - tmp2 = amd_bytealign (in1, 0, offset); + tmp0 = hc_bytealign ( 0, in0, offset); + tmp1 = hc_bytealign (in0, in1, offset); + tmp2 = hc_bytealign (in1, 0, offset); tmp0 = swap32_S (tmp0); tmp1 = swap32_S (tmp1); @@ -273,9 +273,9 @@ DECLSPEC void memcat8 (u32 *block0, u32 *block1, u32 *block2, u32 *block3, const u32 in0 = append[0]; u32 in1 = append[1]; - tmp0 = __byte_perm ( 0, in0, selector); - tmp1 = __byte_perm (in0, in1, selector); - tmp2 = __byte_perm (in1, 0, selector); + tmp0 = hc_byte_perm ( 0, in0, selector); + tmp1 = hc_byte_perm (in0, in1, selector); + tmp2 = hc_byte_perm (in1, 0, selector); #endif const u32 div = offset / 4; diff --git a/OpenCL/m03200-pure.cl b/OpenCL/m03200-pure.cl index b07264b36..ee400efbc 100644 --- a/OpenCL/m03200-pure.cl +++ b/OpenCL/m03200-pure.cl @@ -307,10 +307,10 @@ __constant u32a c_sbox3[256] = { \ u32 tmp; \ \ - tmp = S0[__bfe ((L), 24, 8)]; \ - tmp += S1[__bfe ((L), 16, 8)]; \ - tmp ^= S2[__bfe ((L), 8, 8)]; \ - tmp += S3[__bfe ((L), 0, 8)]; \ + tmp = S0[hc_bfe ((L), 24, 8)]; \ + tmp += S1[hc_bfe ((L), 16, 8)]; \ + tmp ^= S2[hc_bfe ((L), 8, 8)]; \ + tmp += S3[hc_bfe ((L), 0, 8)]; \ \ (R) ^= tmp ^ P[(N)]; \ } diff --git a/OpenCL/m05800-optimized.cl b/OpenCL/m05800-optimized.cl index 8f7acca6e..67dd05180 100644 --- a/OpenCL/m05800-optimized.cl +++ b/OpenCL/m05800-optimized.cl @@ -2123,12 +2123,12 @@ DECLSPEC void append_salt (u32 *w0, u32 *w1, u32 *w2, const u32 *append, const u u32 in3 = swap32_S (append[3]); u32 in4 = swap32_S (append[4]); - tmp0 = amd_bytealign ( 0, in0, offset); - tmp1 = amd_bytealign (in0, in1, offset); - tmp2 = amd_bytealign (in1, in2, offset); - tmp3 = amd_bytealign (in2, in3, offset); - tmp4 = amd_bytealign (in3, in4, offset); - tmp5 = amd_bytealign (in4, 0, offset); + tmp0 = hc_bytealign ( 0, in0, offset); + tmp1 = hc_bytealign (in0, in1, offset); + tmp2 = hc_bytealign (in1, in2, offset); + tmp3 = hc_bytealign (in2, in3, offset); + tmp4 = hc_bytealign (in3, in4, offset); + tmp5 = hc_bytealign (in4, 0, offset); tmp0 = swap32_S (tmp0); tmp1 = swap32_S (tmp1); @@ -2147,12 +2147,12 @@ DECLSPEC void append_salt (u32 *w0, u32 *w1, u32 *w2, const u32 *append, const u u32 in3 = append[3]; u32 in4 = append[4]; - tmp0 = __byte_perm ( 0, in0, selector); - tmp1 = __byte_perm (in0, in1, selector); - tmp2 = __byte_perm (in1, in2, selector); - tmp3 = __byte_perm (in2, in3, selector); - tmp4 = __byte_perm (in3, in4, selector); - tmp5 = __byte_perm (in4, 0, selector); + tmp0 = hc_byte_perm ( 0, in0, selector); + tmp1 = hc_byte_perm (in0, in1, selector); + tmp2 = hc_byte_perm (in1, in2, selector); + tmp3 = hc_byte_perm (in2, in3, selector); + tmp4 = hc_byte_perm (in3, in4, selector); + tmp5 = hc_byte_perm (in4, 0, selector); #endif const u32 div = offset / 4; diff --git a/OpenCL/m05800-pure.cl b/OpenCL/m05800-pure.cl index 75259500b..24610fc49 100644 --- a/OpenCL/m05800-pure.cl +++ b/OpenCL/m05800-pure.cl @@ -2123,12 +2123,12 @@ DECLSPEC void append_salt (u32 *w0, u32 *w1, u32 *w2, const u32 *append, const u u32 in3 = swap32_S (append[3]); u32 in4 = swap32_S (append[4]); - tmp0 = amd_bytealign ( 0, in0, offset); - tmp1 = amd_bytealign (in0, in1, offset); - tmp2 = amd_bytealign (in1, in2, offset); - tmp3 = amd_bytealign (in2, in3, offset); - tmp4 = amd_bytealign (in3, in4, offset); - tmp5 = amd_bytealign (in4, 0, offset); + tmp0 = hc_bytealign ( 0, in0, offset); + tmp1 = hc_bytealign (in0, in1, offset); + tmp2 = hc_bytealign (in1, in2, offset); + tmp3 = hc_bytealign (in2, in3, offset); + tmp4 = hc_bytealign (in3, in4, offset); + tmp5 = hc_bytealign (in4, 0, offset); tmp0 = swap32_S (tmp0); tmp1 = swap32_S (tmp1); @@ -2147,12 +2147,12 @@ DECLSPEC void append_salt (u32 *w0, u32 *w1, u32 *w2, const u32 *append, const u u32 in3 = append[3]; u32 in4 = append[4]; - tmp0 = __byte_perm ( 0, in0, selector); - tmp1 = __byte_perm (in0, in1, selector); - tmp2 = __byte_perm (in1, in2, selector); - tmp3 = __byte_perm (in2, in3, selector); - tmp3 = __byte_perm (in3, in4, selector); - tmp4 = __byte_perm (in4, 0, selector); + tmp0 = hc_byte_perm ( 0, in0, selector); + tmp1 = hc_byte_perm (in0, in1, selector); + tmp2 = hc_byte_perm (in1, in2, selector); + tmp3 = hc_byte_perm (in2, in3, selector); + tmp3 = hc_byte_perm (in3, in4, selector); + tmp4 = hc_byte_perm (in4, 0, selector); #endif const u32 div = offset / 4; diff --git a/OpenCL/m06300-optimized.cl b/OpenCL/m06300-optimized.cl index 9cc0f7855..40ec17509 100644 --- a/OpenCL/m06300-optimized.cl +++ b/OpenCL/m06300-optimized.cl @@ -31,11 +31,11 @@ DECLSPEC void memcat16 (u32 *block0, u32 *block1, u32 *block2, u32 *block3, cons u32 in2 = swap32_S (append[2]); u32 in3 = swap32_S (append[3]); - tmp0 = amd_bytealign ( 0, in0, offset); - tmp1 = amd_bytealign (in0, in1, offset); - tmp2 = amd_bytealign (in1, in2, offset); - tmp3 = amd_bytealign (in2, in3, offset); - tmp4 = amd_bytealign (in3, 0, offset); + tmp0 = hc_bytealign ( 0, in0, offset); + tmp1 = hc_bytealign (in0, in1, offset); + tmp2 = hc_bytealign (in1, in2, offset); + tmp3 = hc_bytealign (in2, in3, offset); + tmp4 = hc_bytealign (in3, 0, offset); tmp0 = swap32_S (tmp0); tmp1 = swap32_S (tmp1); @@ -52,11 +52,11 @@ DECLSPEC void memcat16 (u32 *block0, u32 *block1, u32 *block2, u32 *block3, cons u32 in2 = append[2]; u32 in3 = append[3]; - tmp0 = __byte_perm ( 0, in0, selector); - tmp1 = __byte_perm (in0, in1, selector); - tmp2 = __byte_perm (in1, in2, selector); - tmp3 = __byte_perm (in2, in3, selector); - tmp4 = __byte_perm (in3, 0, selector); + tmp0 = hc_byte_perm ( 0, in0, selector); + tmp1 = hc_byte_perm (in0, in1, selector); + tmp2 = hc_byte_perm (in1, in2, selector); + tmp3 = hc_byte_perm (in2, in3, selector); + tmp4 = hc_byte_perm (in3, 0, selector); #endif const u32 div = offset / 4; @@ -145,11 +145,11 @@ DECLSPEC void memcat16_x80 (u32 *block0, u32 *block1, u32 *block2, u32 *block3, u32 in3 = swap32_S (append[3]); u32 in4 = 0x80000000; - tmp0 = amd_bytealign ( 0, in0, offset); - tmp1 = amd_bytealign (in0, in1, offset); - tmp2 = amd_bytealign (in1, in2, offset); - tmp3 = amd_bytealign (in2, in3, offset); - tmp4 = amd_bytealign (in3, in4, offset); + tmp0 = hc_bytealign ( 0, in0, offset); + tmp1 = hc_bytealign (in0, in1, offset); + tmp2 = hc_bytealign (in1, in2, offset); + tmp3 = hc_bytealign (in2, in3, offset); + tmp4 = hc_bytealign (in3, in4, offset); tmp0 = swap32_S (tmp0); tmp1 = swap32_S (tmp1); @@ -167,11 +167,11 @@ DECLSPEC void memcat16_x80 (u32 *block0, u32 *block1, u32 *block2, u32 *block3, u32 in3 = append[3]; u32 in4 = 0x80; - tmp0 = __byte_perm ( 0, in0, selector); - tmp1 = __byte_perm (in0, in1, selector); - tmp2 = __byte_perm (in1, in2, selector); - tmp3 = __byte_perm (in2, in3, selector); - tmp4 = __byte_perm (in3, in4, selector); + tmp0 = hc_byte_perm ( 0, in0, selector); + tmp1 = hc_byte_perm (in0, in1, selector); + tmp2 = hc_byte_perm (in1, in2, selector); + tmp3 = hc_byte_perm (in2, in3, selector); + tmp4 = hc_byte_perm (in3, in4, selector); #endif const u32 div = offset / 4; @@ -255,9 +255,9 @@ DECLSPEC void memcat8 (u32 *block0, u32 *block1, u32 *block2, u32 *block3, const u32 in0 = swap32_S (append[0]); u32 in1 = swap32_S (append[1]); - tmp0 = amd_bytealign ( 0, in0, offset); - tmp1 = amd_bytealign (in0, in1, offset); - tmp2 = amd_bytealign (in1, 0, offset); + tmp0 = hc_bytealign ( 0, in0, offset); + tmp1 = hc_bytealign (in0, in1, offset); + tmp2 = hc_bytealign (in1, 0, offset); tmp0 = swap32_S (tmp0); tmp1 = swap32_S (tmp1); @@ -270,9 +270,9 @@ DECLSPEC void memcat8 (u32 *block0, u32 *block1, u32 *block2, u32 *block3, const u32 in0 = append[0]; u32 in1 = append[1]; - tmp0 = __byte_perm ( 0, in0, selector); - tmp1 = __byte_perm (in0, in1, selector); - tmp2 = __byte_perm (in1, 0, selector); + tmp0 = hc_byte_perm ( 0, in0, selector); + tmp1 = hc_byte_perm (in0, in1, selector); + tmp2 = hc_byte_perm (in1, 0, selector); #endif const u32 div = offset / 4; diff --git a/OpenCL/m07400-optimized.cl b/OpenCL/m07400-optimized.cl index 9c29b921b..f20e448c6 100644 --- a/OpenCL/m07400-optimized.cl +++ b/OpenCL/m07400-optimized.cl @@ -102,11 +102,11 @@ DECLSPEC u32 memcat16 (u32 *block, const u32 offset, const u32 *append, const u3 u32 in2 = swap32_S (append[2]); u32 in3 = swap32_S (append[3]); - tmp0 = amd_bytealign ( 0, in0, offset); - tmp1 = amd_bytealign (in0, in1, offset); - tmp2 = amd_bytealign (in1, in2, offset); - tmp3 = amd_bytealign (in2, in3, offset); - tmp4 = amd_bytealign (in3, 0, offset); + tmp0 = hc_bytealign ( 0, in0, offset); + tmp1 = hc_bytealign (in0, in1, offset); + tmp2 = hc_bytealign (in1, in2, offset); + tmp3 = hc_bytealign (in2, in3, offset); + tmp4 = hc_bytealign (in3, 0, offset); tmp0 = swap32_S (tmp0); tmp1 = swap32_S (tmp1); @@ -123,11 +123,11 @@ DECLSPEC u32 memcat16 (u32 *block, const u32 offset, const u32 *append, const u3 u32 in2 = append[2]; u32 in3 = append[3]; - tmp0 = __byte_perm ( 0, in0, selector); - tmp1 = __byte_perm (in0, in1, selector); - tmp2 = __byte_perm (in1, in2, selector); - tmp3 = __byte_perm (in2, in3, selector); - tmp4 = __byte_perm (in3, 0, selector); + tmp0 = hc_byte_perm ( 0, in0, selector); + tmp1 = hc_byte_perm (in0, in1, selector); + tmp2 = hc_byte_perm (in1, in2, selector); + tmp3 = hc_byte_perm (in2, in3, selector); + tmp4 = hc_byte_perm (in3, 0, selector); #endif switch (offset / 4) @@ -243,11 +243,11 @@ DECLSPEC u32 memcat16c (u32 *block, const u32 offset, const u32 *append, const u u32 in2 = swap32_S (append[2]); u32 in3 = swap32_S (append[3]); - tmp0 = amd_bytealign ( 0, in0, offset); - tmp1 = amd_bytealign (in0, in1, offset); - tmp2 = amd_bytealign (in1, in2, offset); - tmp3 = amd_bytealign (in2, in3, offset); - tmp4 = amd_bytealign (in3, 0, offset); + tmp0 = hc_bytealign ( 0, in0, offset); + tmp1 = hc_bytealign (in0, in1, offset); + tmp2 = hc_bytealign (in1, in2, offset); + tmp3 = hc_bytealign (in2, in3, offset); + tmp4 = hc_bytealign (in3, 0, offset); tmp0 = swap32_S (tmp0); tmp1 = swap32_S (tmp1); @@ -264,11 +264,11 @@ DECLSPEC u32 memcat16c (u32 *block, const u32 offset, const u32 *append, const u u32 in2 = append[2]; u32 in3 = append[3]; - tmp0 = __byte_perm ( 0, in0, selector); - tmp1 = __byte_perm (in0, in1, selector); - tmp2 = __byte_perm (in1, in2, selector); - tmp3 = __byte_perm (in2, in3, selector); - tmp4 = __byte_perm (in3, 0, selector); + tmp0 = hc_byte_perm ( 0, in0, selector); + tmp1 = hc_byte_perm (in0, in1, selector); + tmp2 = hc_byte_perm (in1, in2, selector); + tmp3 = hc_byte_perm (in2, in3, selector); + tmp4 = hc_byte_perm (in3, 0, selector); #endif u32 carry[4] = { 0, 0, 0, 0 }; @@ -410,11 +410,11 @@ DECLSPEC u32 memcat20 (u32 *block, const u32 offset, const u32 *append, const u3 u32 in2 = swap32_S (append[2]); u32 in3 = swap32_S (append[3]); - tmp0 = amd_bytealign ( 0, in0, offset); - tmp1 = amd_bytealign (in0, in1, offset); - tmp2 = amd_bytealign (in1, in2, offset); - tmp3 = amd_bytealign (in2, in3, offset); - tmp4 = amd_bytealign (in3, 0, offset); + tmp0 = hc_bytealign ( 0, in0, offset); + tmp1 = hc_bytealign (in0, in1, offset); + tmp2 = hc_bytealign (in1, in2, offset); + tmp3 = hc_bytealign (in2, in3, offset); + tmp4 = hc_bytealign (in3, 0, offset); tmp0 = swap32_S (tmp0); tmp1 = swap32_S (tmp1); @@ -431,11 +431,11 @@ DECLSPEC u32 memcat20 (u32 *block, const u32 offset, const u32 *append, const u3 u32 in2 = append[2]; u32 in3 = append[3]; - tmp0 = __byte_perm ( 0, in0, selector); - tmp1 = __byte_perm (in0, in1, selector); - tmp2 = __byte_perm (in1, in2, selector); - tmp3 = __byte_perm (in2, in3, selector); - tmp4 = __byte_perm (in3, 0, selector); + tmp0 = hc_byte_perm ( 0, in0, selector); + tmp1 = hc_byte_perm (in0, in1, selector); + tmp2 = hc_byte_perm (in1, in2, selector); + tmp3 = hc_byte_perm (in2, in3, selector); + tmp4 = hc_byte_perm (in3, 0, selector); #endif switch (offset / 4) @@ -560,11 +560,11 @@ DECLSPEC u32 memcat20_x80 (u32 *block, const u32 offset, const u32 *append, cons u32 in3 = swap32_S (append[3]); u32 in4 = 0x80000000; - tmp0 = amd_bytealign ( 0, in0, offset); - tmp1 = amd_bytealign (in0, in1, offset); - tmp2 = amd_bytealign (in1, in2, offset); - tmp3 = amd_bytealign (in2, in3, offset); - tmp4 = amd_bytealign (in3, in4, offset); + tmp0 = hc_bytealign ( 0, in0, offset); + tmp1 = hc_bytealign (in0, in1, offset); + tmp2 = hc_bytealign (in1, in2, offset); + tmp3 = hc_bytealign (in2, in3, offset); + tmp4 = hc_bytealign (in3, in4, offset); tmp0 = swap32_S (tmp0); tmp1 = swap32_S (tmp1); @@ -582,11 +582,11 @@ DECLSPEC u32 memcat20_x80 (u32 *block, const u32 offset, const u32 *append, cons u32 in3 = append[3]; u32 in4 = 0x80; - tmp0 = __byte_perm ( 0, in0, selector); - tmp1 = __byte_perm (in0, in1, selector); - tmp2 = __byte_perm (in1, in2, selector); - tmp3 = __byte_perm (in2, in3, selector); - tmp4 = __byte_perm (in3, in4, selector); + tmp0 = hc_byte_perm ( 0, in0, selector); + tmp1 = hc_byte_perm (in0, in1, selector); + tmp2 = hc_byte_perm (in1, in2, selector); + tmp3 = hc_byte_perm (in2, in3, selector); + tmp4 = hc_byte_perm (in3, in4, selector); #endif switch (offset / 4) diff --git a/OpenCL/m09000-pure.cl b/OpenCL/m09000-pure.cl index 14a874c27..89ad1be7c 100644 --- a/OpenCL/m09000-pure.cl +++ b/OpenCL/m09000-pure.cl @@ -316,10 +316,10 @@ __constant u32a c_pbox[18] = { \ u32 tmp; \ \ - tmp = S0[__bfe_S ((L), 24, 8)]; \ - tmp += S1[__bfe_S ((L), 16, 8)]; \ - tmp ^= S2[__bfe_S ((L), 8, 8)]; \ - tmp += S3[__bfe_S ((L), 0, 8)]; \ + tmp = S0[hc_bfe_S ((L), 24, 8)]; \ + tmp += S1[hc_bfe_S ((L), 16, 8)]; \ + tmp ^= S2[hc_bfe_S ((L), 8, 8)]; \ + tmp += S3[hc_bfe_S ((L), 0, 8)]; \ \ (R) ^= tmp ^ P[(N)]; \ } diff --git a/OpenCL/m10700-optimized.cl b/OpenCL/m10700-optimized.cl index 93310abf6..6a716ef74 100644 --- a/OpenCL/m10700-optimized.cl +++ b/OpenCL/m10700-optimized.cl @@ -200,11 +200,11 @@ DECLSPEC void make_sc (u32 *sc, const u32 *pw, const u32 pw_len, const u32 *bl, #if defined IS_AMD || defined IS_GENERIC for (i = 0; i < pd; i++) sc[idx++] = pw[i]; sc[idx++] = pw[i] - | amd_bytealign (bl[0], 0, pm4); - for (i = 1; i < bd; i++) sc[idx++] = amd_bytealign (bl[i], bl[i - 1], pm4); - sc[idx++] = amd_bytealign (sc[0], bl[i - 1], pm4); - for (i = 1; i < 4; i++) sc[idx++] = amd_bytealign (sc[i], sc[i - 1], pm4); - sc[idx++] = amd_bytealign ( 0, sc[i - 1], pm4); + | hc_bytealign (bl[0], 0, pm4); + for (i = 1; i < bd; i++) sc[idx++] = hc_bytealign (bl[i], bl[i - 1], pm4); + sc[idx++] = hc_bytealign (sc[0], bl[i - 1], pm4); + for (i = 1; i < 4; i++) sc[idx++] = hc_bytealign (sc[i], sc[i - 1], pm4); + sc[idx++] = hc_bytealign ( 0, sc[i - 1], pm4); #endif #ifdef IS_NV @@ -212,11 +212,11 @@ DECLSPEC void make_sc (u32 *sc, const u32 *pw, const u32 pw_len, const u32 *bl, for (i = 0; i < pd; i++) sc[idx++] = pw[i]; sc[idx++] = pw[i] - | __byte_perm ( 0, bl[0], selector); - for (i = 1; i < bd; i++) sc[idx++] = __byte_perm (bl[i - 1], bl[i], selector); - sc[idx++] = __byte_perm (bl[i - 1], sc[0], selector); - for (i = 1; i < 4; i++) sc[idx++] = __byte_perm (sc[i - 1], sc[i], selector); - sc[idx++] = __byte_perm (sc[i - 1], 0, selector); + | hc_byte_perm ( 0, bl[0], selector); + for (i = 1; i < bd; i++) sc[idx++] = hc_byte_perm (bl[i - 1], bl[i], selector); + sc[idx++] = hc_byte_perm (bl[i - 1], sc[0], selector); + for (i = 1; i < 4; i++) sc[idx++] = hc_byte_perm (sc[i - 1], sc[i], selector); + sc[idx++] = hc_byte_perm (sc[i - 1], 0, selector); #endif } } @@ -229,19 +229,19 @@ DECLSPEC void make_pt_with_offset (u32 *pt, const u32 offset, const u32 *sc, con const u32 od = m / 4; #if defined IS_AMD || defined IS_GENERIC - pt[0] = amd_bytealign (sc[od + 1], sc[od + 0], om); - pt[1] = amd_bytealign (sc[od + 2], sc[od + 1], om); - pt[2] = amd_bytealign (sc[od + 3], sc[od + 2], om); - pt[3] = amd_bytealign (sc[od + 4], sc[od + 3], om); + pt[0] = hc_bytealign (sc[od + 1], sc[od + 0], om); + pt[1] = hc_bytealign (sc[od + 2], sc[od + 1], om); + pt[2] = hc_bytealign (sc[od + 3], sc[od + 2], om); + pt[3] = hc_bytealign (sc[od + 4], sc[od + 3], om); #endif #ifdef IS_NV int selector = (0x76543210 >> (om * 4)) & 0xffff; - pt[0] = __byte_perm (sc[od + 0], sc[od + 1], selector); - pt[1] = __byte_perm (sc[od + 1], sc[od + 2], selector); - pt[2] = __byte_perm (sc[od + 2], sc[od + 3], selector); - pt[3] = __byte_perm (sc[od + 3], sc[od + 4], selector); + pt[0] = hc_byte_perm (sc[od + 0], sc[od + 1], selector); + pt[1] = hc_byte_perm (sc[od + 1], sc[od + 2], selector); + pt[2] = hc_byte_perm (sc[od + 2], sc[od + 3], selector); + pt[3] = hc_byte_perm (sc[od + 3], sc[od + 4], selector); #endif } diff --git a/OpenCL/m11600-pure.cl b/OpenCL/m11600-pure.cl index b001611da..43805f615 100644 --- a/OpenCL/m11600-pure.cl +++ b/OpenCL/m11600-pure.cl @@ -23,13 +23,13 @@ DECLSPEC void memcat8c_be (u32 *w0, u32 *w1, u32 *w2, u32 *w3, const u32 len, co #ifdef IS_NV const int selector = (0x76543210 >> ((func_len & 3) * 4)) & 0xffff; - tmp0 = __byte_perm (append, 0, selector); - tmp1 = __byte_perm (0, append, selector); + tmp0 = hc_byte_perm (append, 0, selector); + tmp1 = hc_byte_perm (0, append, selector); #endif #if defined IS_AMD || defined IS_GENERIC - tmp0 = amd_bytealign (0, append, func_len); - tmp1 = amd_bytealign (append, 0, func_len); + tmp0 = hc_bytealign (0, append, func_len); + tmp1 = hc_bytealign (append, 0, func_len); #endif u32 carry = 0; diff --git a/OpenCL/m13800_a0-optimized.cl b/OpenCL/m13800_a0-optimized.cl index b504673c3..6b5898bcf 100644 --- a/OpenCL/m13800_a0-optimized.cl +++ b/OpenCL/m13800_a0-optimized.cl @@ -45,45 +45,45 @@ DECLSPEC void memcat64c_be (u32x *block, const u32 offset, u32x *carry) u32x tmp16; #if defined IS_AMD || defined IS_GENERIC - tmp00 = amd_bytealign ( 0, carry[ 0], offset); - tmp01 = amd_bytealign (carry[ 0], carry[ 1], offset); - tmp02 = amd_bytealign (carry[ 1], carry[ 2], offset); - tmp03 = amd_bytealign (carry[ 2], carry[ 3], offset); - tmp04 = amd_bytealign (carry[ 3], carry[ 4], offset); - tmp05 = amd_bytealign (carry[ 4], carry[ 5], offset); - tmp06 = amd_bytealign (carry[ 5], carry[ 6], offset); - tmp07 = amd_bytealign (carry[ 6], carry[ 7], offset); - tmp08 = amd_bytealign (carry[ 7], carry[ 8], offset); - tmp09 = amd_bytealign (carry[ 8], carry[ 9], offset); - tmp10 = amd_bytealign (carry[ 9], carry[10], offset); - tmp11 = amd_bytealign (carry[10], carry[11], offset); - tmp12 = amd_bytealign (carry[11], carry[12], offset); - tmp13 = amd_bytealign (carry[12], carry[13], offset); - tmp14 = amd_bytealign (carry[13], carry[14], offset); - tmp15 = amd_bytealign (carry[14], carry[15], offset); - tmp16 = amd_bytealign (carry[15], 0, offset); + tmp00 = hc_bytealign ( 0, carry[ 0], offset); + tmp01 = hc_bytealign (carry[ 0], carry[ 1], offset); + tmp02 = hc_bytealign (carry[ 1], carry[ 2], offset); + tmp03 = hc_bytealign (carry[ 2], carry[ 3], offset); + tmp04 = hc_bytealign (carry[ 3], carry[ 4], offset); + tmp05 = hc_bytealign (carry[ 4], carry[ 5], offset); + tmp06 = hc_bytealign (carry[ 5], carry[ 6], offset); + tmp07 = hc_bytealign (carry[ 6], carry[ 7], offset); + tmp08 = hc_bytealign (carry[ 7], carry[ 8], offset); + tmp09 = hc_bytealign (carry[ 8], carry[ 9], offset); + tmp10 = hc_bytealign (carry[ 9], carry[10], offset); + tmp11 = hc_bytealign (carry[10], carry[11], offset); + tmp12 = hc_bytealign (carry[11], carry[12], offset); + tmp13 = hc_bytealign (carry[12], carry[13], offset); + tmp14 = hc_bytealign (carry[13], carry[14], offset); + tmp15 = hc_bytealign (carry[14], carry[15], offset); + tmp16 = hc_bytealign (carry[15], 0, offset); #endif #ifdef IS_NV const int selector = (0x76543210 >> ((offset & 3) * 4)) & 0xffff; - tmp00 = __byte_perm (carry[ 0], 0, selector); - tmp01 = __byte_perm (carry[ 1], carry[ 0], selector); - tmp02 = __byte_perm (carry[ 2], carry[ 1], selector); - tmp03 = __byte_perm (carry[ 3], carry[ 2], selector); - tmp04 = __byte_perm (carry[ 4], carry[ 3], selector); - tmp05 = __byte_perm (carry[ 5], carry[ 4], selector); - tmp06 = __byte_perm (carry[ 6], carry[ 5], selector); - tmp07 = __byte_perm (carry[ 7], carry[ 6], selector); - tmp08 = __byte_perm (carry[ 8], carry[ 7], selector); - tmp09 = __byte_perm (carry[ 9], carry[ 8], selector); - tmp10 = __byte_perm (carry[10], carry[ 9], selector); - tmp11 = __byte_perm (carry[11], carry[10], selector); - tmp12 = __byte_perm (carry[12], carry[11], selector); - tmp13 = __byte_perm (carry[13], carry[12], selector); - tmp14 = __byte_perm (carry[14], carry[13], selector); - tmp15 = __byte_perm (carry[15], carry[14], selector); - tmp16 = __byte_perm ( 0, carry[15], selector); + tmp00 = hc_byte_perm (carry[ 0], 0, selector); + tmp01 = hc_byte_perm (carry[ 1], carry[ 0], selector); + tmp02 = hc_byte_perm (carry[ 2], carry[ 1], selector); + tmp03 = hc_byte_perm (carry[ 3], carry[ 2], selector); + tmp04 = hc_byte_perm (carry[ 4], carry[ 3], selector); + tmp05 = hc_byte_perm (carry[ 5], carry[ 4], selector); + tmp06 = hc_byte_perm (carry[ 6], carry[ 5], selector); + tmp07 = hc_byte_perm (carry[ 7], carry[ 6], selector); + tmp08 = hc_byte_perm (carry[ 8], carry[ 7], selector); + tmp09 = hc_byte_perm (carry[ 9], carry[ 8], selector); + tmp10 = hc_byte_perm (carry[10], carry[ 9], selector); + tmp11 = hc_byte_perm (carry[11], carry[10], selector); + tmp12 = hc_byte_perm (carry[12], carry[11], selector); + tmp13 = hc_byte_perm (carry[13], carry[12], selector); + tmp14 = hc_byte_perm (carry[14], carry[13], selector); + tmp15 = hc_byte_perm (carry[15], carry[14], selector); + tmp16 = hc_byte_perm ( 0, carry[15], selector); #endif carry[ 0] = 0; diff --git a/OpenCL/m13800_a1-optimized.cl b/OpenCL/m13800_a1-optimized.cl index 1f4da9102..09acc73eb 100644 --- a/OpenCL/m13800_a1-optimized.cl +++ b/OpenCL/m13800_a1-optimized.cl @@ -43,45 +43,45 @@ DECLSPEC void memcat64c_be (u32x *block, const u32 offset, u32x *carry) u32x tmp16; #if defined IS_AMD || defined IS_GENERIC - tmp00 = amd_bytealign ( 0, carry[ 0], offset); - tmp01 = amd_bytealign (carry[ 0], carry[ 1], offset); - tmp02 = amd_bytealign (carry[ 1], carry[ 2], offset); - tmp03 = amd_bytealign (carry[ 2], carry[ 3], offset); - tmp04 = amd_bytealign (carry[ 3], carry[ 4], offset); - tmp05 = amd_bytealign (carry[ 4], carry[ 5], offset); - tmp06 = amd_bytealign (carry[ 5], carry[ 6], offset); - tmp07 = amd_bytealign (carry[ 6], carry[ 7], offset); - tmp08 = amd_bytealign (carry[ 7], carry[ 8], offset); - tmp09 = amd_bytealign (carry[ 8], carry[ 9], offset); - tmp10 = amd_bytealign (carry[ 9], carry[10], offset); - tmp11 = amd_bytealign (carry[10], carry[11], offset); - tmp12 = amd_bytealign (carry[11], carry[12], offset); - tmp13 = amd_bytealign (carry[12], carry[13], offset); - tmp14 = amd_bytealign (carry[13], carry[14], offset); - tmp15 = amd_bytealign (carry[14], carry[15], offset); - tmp16 = amd_bytealign (carry[15], 0, offset); + tmp00 = hc_bytealign ( 0, carry[ 0], offset); + tmp01 = hc_bytealign (carry[ 0], carry[ 1], offset); + tmp02 = hc_bytealign (carry[ 1], carry[ 2], offset); + tmp03 = hc_bytealign (carry[ 2], carry[ 3], offset); + tmp04 = hc_bytealign (carry[ 3], carry[ 4], offset); + tmp05 = hc_bytealign (carry[ 4], carry[ 5], offset); + tmp06 = hc_bytealign (carry[ 5], carry[ 6], offset); + tmp07 = hc_bytealign (carry[ 6], carry[ 7], offset); + tmp08 = hc_bytealign (carry[ 7], carry[ 8], offset); + tmp09 = hc_bytealign (carry[ 8], carry[ 9], offset); + tmp10 = hc_bytealign (carry[ 9], carry[10], offset); + tmp11 = hc_bytealign (carry[10], carry[11], offset); + tmp12 = hc_bytealign (carry[11], carry[12], offset); + tmp13 = hc_bytealign (carry[12], carry[13], offset); + tmp14 = hc_bytealign (carry[13], carry[14], offset); + tmp15 = hc_bytealign (carry[14], carry[15], offset); + tmp16 = hc_bytealign (carry[15], 0, offset); #endif #ifdef IS_NV const int selector = (0x76543210 >> ((offset & 3) * 4)) & 0xffff; - tmp00 = __byte_perm (carry[ 0], 0, selector); - tmp01 = __byte_perm (carry[ 1], carry[ 0], selector); - tmp02 = __byte_perm (carry[ 2], carry[ 1], selector); - tmp03 = __byte_perm (carry[ 3], carry[ 2], selector); - tmp04 = __byte_perm (carry[ 4], carry[ 3], selector); - tmp05 = __byte_perm (carry[ 5], carry[ 4], selector); - tmp06 = __byte_perm (carry[ 6], carry[ 5], selector); - tmp07 = __byte_perm (carry[ 7], carry[ 6], selector); - tmp08 = __byte_perm (carry[ 8], carry[ 7], selector); - tmp09 = __byte_perm (carry[ 9], carry[ 8], selector); - tmp10 = __byte_perm (carry[10], carry[ 9], selector); - tmp11 = __byte_perm (carry[11], carry[10], selector); - tmp12 = __byte_perm (carry[12], carry[11], selector); - tmp13 = __byte_perm (carry[13], carry[12], selector); - tmp14 = __byte_perm (carry[14], carry[13], selector); - tmp15 = __byte_perm (carry[15], carry[14], selector); - tmp16 = __byte_perm ( 0, carry[15], selector); + tmp00 = hc_byte_perm (carry[ 0], 0, selector); + tmp01 = hc_byte_perm (carry[ 1], carry[ 0], selector); + tmp02 = hc_byte_perm (carry[ 2], carry[ 1], selector); + tmp03 = hc_byte_perm (carry[ 3], carry[ 2], selector); + tmp04 = hc_byte_perm (carry[ 4], carry[ 3], selector); + tmp05 = hc_byte_perm (carry[ 5], carry[ 4], selector); + tmp06 = hc_byte_perm (carry[ 6], carry[ 5], selector); + tmp07 = hc_byte_perm (carry[ 7], carry[ 6], selector); + tmp08 = hc_byte_perm (carry[ 8], carry[ 7], selector); + tmp09 = hc_byte_perm (carry[ 9], carry[ 8], selector); + tmp10 = hc_byte_perm (carry[10], carry[ 9], selector); + tmp11 = hc_byte_perm (carry[11], carry[10], selector); + tmp12 = hc_byte_perm (carry[12], carry[11], selector); + tmp13 = hc_byte_perm (carry[13], carry[12], selector); + tmp14 = hc_byte_perm (carry[14], carry[13], selector); + tmp15 = hc_byte_perm (carry[15], carry[14], selector); + tmp16 = hc_byte_perm ( 0, carry[15], selector); #endif carry[ 0] = 0; diff --git a/OpenCL/m13800_a3-optimized.cl b/OpenCL/m13800_a3-optimized.cl index 1225a825c..4d73a7706 100644 --- a/OpenCL/m13800_a3-optimized.cl +++ b/OpenCL/m13800_a3-optimized.cl @@ -42,45 +42,45 @@ DECLSPEC void memcat64c_be (u32x *block, const u32 offset, u32x *carry) u32x tmp16; #if defined IS_AMD || defined IS_GENERIC - tmp00 = amd_bytealign ( 0, carry[ 0], offset); - tmp01 = amd_bytealign (carry[ 0], carry[ 1], offset); - tmp02 = amd_bytealign (carry[ 1], carry[ 2], offset); - tmp03 = amd_bytealign (carry[ 2], carry[ 3], offset); - tmp04 = amd_bytealign (carry[ 3], carry[ 4], offset); - tmp05 = amd_bytealign (carry[ 4], carry[ 5], offset); - tmp06 = amd_bytealign (carry[ 5], carry[ 6], offset); - tmp07 = amd_bytealign (carry[ 6], carry[ 7], offset); - tmp08 = amd_bytealign (carry[ 7], carry[ 8], offset); - tmp09 = amd_bytealign (carry[ 8], carry[ 9], offset); - tmp10 = amd_bytealign (carry[ 9], carry[10], offset); - tmp11 = amd_bytealign (carry[10], carry[11], offset); - tmp12 = amd_bytealign (carry[11], carry[12], offset); - tmp13 = amd_bytealign (carry[12], carry[13], offset); - tmp14 = amd_bytealign (carry[13], carry[14], offset); - tmp15 = amd_bytealign (carry[14], carry[15], offset); - tmp16 = amd_bytealign (carry[15], 0, offset); + tmp00 = hc_bytealign ( 0, carry[ 0], offset); + tmp01 = hc_bytealign (carry[ 0], carry[ 1], offset); + tmp02 = hc_bytealign (carry[ 1], carry[ 2], offset); + tmp03 = hc_bytealign (carry[ 2], carry[ 3], offset); + tmp04 = hc_bytealign (carry[ 3], carry[ 4], offset); + tmp05 = hc_bytealign (carry[ 4], carry[ 5], offset); + tmp06 = hc_bytealign (carry[ 5], carry[ 6], offset); + tmp07 = hc_bytealign (carry[ 6], carry[ 7], offset); + tmp08 = hc_bytealign (carry[ 7], carry[ 8], offset); + tmp09 = hc_bytealign (carry[ 8], carry[ 9], offset); + tmp10 = hc_bytealign (carry[ 9], carry[10], offset); + tmp11 = hc_bytealign (carry[10], carry[11], offset); + tmp12 = hc_bytealign (carry[11], carry[12], offset); + tmp13 = hc_bytealign (carry[12], carry[13], offset); + tmp14 = hc_bytealign (carry[13], carry[14], offset); + tmp15 = hc_bytealign (carry[14], carry[15], offset); + tmp16 = hc_bytealign (carry[15], 0, offset); #endif #ifdef IS_NV const int selector = (0x76543210 >> ((offset & 3) * 4)) & 0xffff; - tmp00 = __byte_perm (carry[ 0], 0, selector); - tmp01 = __byte_perm (carry[ 1], carry[ 0], selector); - tmp02 = __byte_perm (carry[ 2], carry[ 1], selector); - tmp03 = __byte_perm (carry[ 3], carry[ 2], selector); - tmp04 = __byte_perm (carry[ 4], carry[ 3], selector); - tmp05 = __byte_perm (carry[ 5], carry[ 4], selector); - tmp06 = __byte_perm (carry[ 6], carry[ 5], selector); - tmp07 = __byte_perm (carry[ 7], carry[ 6], selector); - tmp08 = __byte_perm (carry[ 8], carry[ 7], selector); - tmp09 = __byte_perm (carry[ 9], carry[ 8], selector); - tmp10 = __byte_perm (carry[10], carry[ 9], selector); - tmp11 = __byte_perm (carry[11], carry[10], selector); - tmp12 = __byte_perm (carry[12], carry[11], selector); - tmp13 = __byte_perm (carry[13], carry[12], selector); - tmp14 = __byte_perm (carry[14], carry[13], selector); - tmp15 = __byte_perm (carry[15], carry[14], selector); - tmp16 = __byte_perm ( 0, carry[15], selector); + tmp00 = hc_byte_perm (carry[ 0], 0, selector); + tmp01 = hc_byte_perm (carry[ 1], carry[ 0], selector); + tmp02 = hc_byte_perm (carry[ 2], carry[ 1], selector); + tmp03 = hc_byte_perm (carry[ 3], carry[ 2], selector); + tmp04 = hc_byte_perm (carry[ 4], carry[ 3], selector); + tmp05 = hc_byte_perm (carry[ 5], carry[ 4], selector); + tmp06 = hc_byte_perm (carry[ 6], carry[ 5], selector); + tmp07 = hc_byte_perm (carry[ 7], carry[ 6], selector); + tmp08 = hc_byte_perm (carry[ 8], carry[ 7], selector); + tmp09 = hc_byte_perm (carry[ 9], carry[ 8], selector); + tmp10 = hc_byte_perm (carry[10], carry[ 9], selector); + tmp11 = hc_byte_perm (carry[11], carry[10], selector); + tmp12 = hc_byte_perm (carry[12], carry[11], selector); + tmp13 = hc_byte_perm (carry[13], carry[12], selector); + tmp14 = hc_byte_perm (carry[14], carry[13], selector); + tmp15 = hc_byte_perm (carry[15], carry[14], selector); + tmp16 = hc_byte_perm ( 0, carry[15], selector); #endif carry[ 0] = 0;