diff --git a/OpenCL/inc_common.cl b/OpenCL/inc_common.cl index 407a24ef6..5c34a824f 100644 --- a/OpenCL/inc_common.cl +++ b/OpenCL/inc_common.cl @@ -7499,7 +7499,7 @@ DECLSPEC void switch_buffer_by_offset_8x4_le (u32x *w0, u32x *w1, u32x *w2, u32x #endif } -DECLSPEC void switch_buffer_by_offset_8x4_be (u32x *w0, u32x *w1, u32x *w2, u32x *w3, u32x *w4, u32x *w5, u32x *w6, u32x *w7, const u32 offset) +DECLSPEC void switch_buffer_by_offset_8x4_carry_le (u32x *w0, u32x *w1, u32x *w2, u32x *w3, u32x *w4, u32x *w5, u32x *w6, u32x *w7, u32x *c0, u32x *c1, u32x *c2, u32x *c3, u32x *c4, u32x *c5, u32x *c6, u32x *c7, const u32 offset) { const int offset_switch = offset / 4; @@ -7507,143 +7507,153 @@ DECLSPEC void switch_buffer_by_offset_8x4_be (u32x *w0, u32x *w1, u32x *w2, u32x switch (offset_switch) { case 0: - w7[3] = hc_bytealign_be (w7[2], w7[3], offset); - w7[2] = hc_bytealign_be (w7[1], w7[2], offset); - w7[1] = hc_bytealign_be (w7[0], w7[1], offset); - w7[0] = hc_bytealign_be (w6[3], w7[0], offset); - w6[3] = hc_bytealign_be (w6[2], w6[3], offset); - w6[2] = hc_bytealign_be (w6[1], w6[2], offset); - w6[1] = hc_bytealign_be (w6[0], w6[1], offset); - w6[0] = hc_bytealign_be (w5[3], w6[0], offset); - w5[3] = hc_bytealign_be (w5[2], w5[3], offset); - w5[2] = hc_bytealign_be (w5[1], w5[2], offset); - w5[1] = hc_bytealign_be (w5[0], w5[1], offset); - w5[0] = hc_bytealign_be (w4[3], w5[0], offset); - w4[3] = hc_bytealign_be (w4[2], w4[3], offset); - w4[2] = hc_bytealign_be (w4[1], w4[2], offset); - w4[1] = hc_bytealign_be (w4[0], w4[1], offset); - w4[0] = hc_bytealign_be (w3[3], w4[0], offset); - w3[3] = hc_bytealign_be (w3[2], w3[3], offset); - w3[2] = hc_bytealign_be (w3[1], w3[2], offset); - w3[1] = hc_bytealign_be (w3[0], w3[1], offset); - w3[0] = hc_bytealign_be (w2[3], w3[0], offset); - w2[3] = hc_bytealign_be (w2[2], w2[3], offset); - w2[2] = hc_bytealign_be (w2[1], w2[2], offset); - w2[1] = hc_bytealign_be (w2[0], w2[1], offset); - w2[0] = hc_bytealign_be (w1[3], 
w2[0], offset); - w1[3] = hc_bytealign_be (w1[2], w1[3], offset); - w1[2] = hc_bytealign_be (w1[1], w1[2], offset); - w1[1] = hc_bytealign_be (w1[0], w1[1], offset); - w1[0] = hc_bytealign_be (w0[3], w1[0], offset); - w0[3] = hc_bytealign_be (w0[2], w0[3], offset); - w0[2] = hc_bytealign_be (w0[1], w0[2], offset); - w0[1] = hc_bytealign_be (w0[0], w0[1], offset); - w0[0] = hc_bytealign_be ( 0, w0[0], offset); + c0[0] = hc_bytealign (w7[3], 0, offset); + w7[3] = hc_bytealign (w7[2], w7[3], offset); + w7[2] = hc_bytealign (w7[1], w7[2], offset); + w7[1] = hc_bytealign (w7[0], w7[1], offset); + w7[0] = hc_bytealign (w6[3], w7[0], offset); + w6[3] = hc_bytealign (w6[2], w6[3], offset); + w6[2] = hc_bytealign (w6[1], w6[2], offset); + w6[1] = hc_bytealign (w6[0], w6[1], offset); + w6[0] = hc_bytealign (w5[3], w6[0], offset); + w5[3] = hc_bytealign (w5[2], w5[3], offset); + w5[2] = hc_bytealign (w5[1], w5[2], offset); + w5[1] = hc_bytealign (w5[0], w5[1], offset); + w5[0] = hc_bytealign (w4[3], w5[0], offset); + w4[3] = hc_bytealign (w4[2], w4[3], offset); + w4[2] = hc_bytealign (w4[1], w4[2], offset); + w4[1] = hc_bytealign (w4[0], w4[1], offset); + w4[0] = hc_bytealign (w3[3], w4[0], offset); + w3[3] = hc_bytealign (w3[2], w3[3], offset); + w3[2] = hc_bytealign (w3[1], w3[2], offset); + w3[1] = hc_bytealign (w3[0], w3[1], offset); + w3[0] = hc_bytealign (w2[3], w3[0], offset); + w2[3] = hc_bytealign (w2[2], w2[3], offset); + w2[2] = hc_bytealign (w2[1], w2[2], offset); + w2[1] = hc_bytealign (w2[0], w2[1], offset); + w2[0] = hc_bytealign (w1[3], w2[0], offset); + w1[3] = hc_bytealign (w1[2], w1[3], offset); + w1[2] = hc_bytealign (w1[1], w1[2], offset); + w1[1] = hc_bytealign (w1[0], w1[1], offset); + w1[0] = hc_bytealign (w0[3], w1[0], offset); + w0[3] = hc_bytealign (w0[2], w0[3], offset); + w0[2] = hc_bytealign (w0[1], w0[2], offset); + w0[1] = hc_bytealign (w0[0], w0[1], offset); + w0[0] = hc_bytealign ( 0, w0[0], offset); break; case 1: - w7[3] = hc_bytealign_be 
(w7[1], w7[2], offset); - w7[2] = hc_bytealign_be (w7[0], w7[1], offset); - w7[1] = hc_bytealign_be (w6[3], w7[0], offset); - w7[0] = hc_bytealign_be (w6[2], w6[3], offset); - w6[3] = hc_bytealign_be (w6[1], w6[2], offset); - w6[2] = hc_bytealign_be (w6[0], w6[1], offset); - w6[1] = hc_bytealign_be (w5[3], w6[0], offset); - w6[0] = hc_bytealign_be (w5[2], w5[3], offset); - w5[3] = hc_bytealign_be (w5[1], w5[2], offset); - w5[2] = hc_bytealign_be (w5[0], w5[1], offset); - w5[1] = hc_bytealign_be (w4[3], w5[0], offset); - w5[0] = hc_bytealign_be (w4[2], w4[3], offset); - w4[3] = hc_bytealign_be (w4[1], w4[2], offset); - w4[2] = hc_bytealign_be (w4[0], w4[1], offset); - w4[1] = hc_bytealign_be (w3[3], w4[0], offset); - w4[0] = hc_bytealign_be (w3[2], w3[3], offset); - w3[3] = hc_bytealign_be (w3[1], w3[2], offset); - w3[2] = hc_bytealign_be (w3[0], w3[1], offset); - w3[1] = hc_bytealign_be (w2[3], w3[0], offset); - w3[0] = hc_bytealign_be (w2[2], w2[3], offset); - w2[3] = hc_bytealign_be (w2[1], w2[2], offset); - w2[2] = hc_bytealign_be (w2[0], w2[1], offset); - w2[1] = hc_bytealign_be (w1[3], w2[0], offset); - w2[0] = hc_bytealign_be (w1[2], w1[3], offset); - w1[3] = hc_bytealign_be (w1[1], w1[2], offset); - w1[2] = hc_bytealign_be (w1[0], w1[1], offset); - w1[1] = hc_bytealign_be (w0[3], w1[0], offset); - w1[0] = hc_bytealign_be (w0[2], w0[3], offset); - w0[3] = hc_bytealign_be (w0[1], w0[2], offset); - w0[2] = hc_bytealign_be (w0[0], w0[1], offset); - w0[1] = hc_bytealign_be ( 0, w0[0], offset); + c0[1] = hc_bytealign (w7[3], 0, offset); + c0[0] = hc_bytealign (w7[2], w7[3], offset); + w7[3] = hc_bytealign (w7[1], w7[2], offset); + w7[2] = hc_bytealign (w7[0], w7[1], offset); + w7[1] = hc_bytealign (w6[3], w7[0], offset); + w7[0] = hc_bytealign (w6[2], w6[3], offset); + w6[3] = hc_bytealign (w6[1], w6[2], offset); + w6[2] = hc_bytealign (w6[0], w6[1], offset); + w6[1] = hc_bytealign (w5[3], w6[0], offset); + w6[0] = hc_bytealign (w5[2], w5[3], offset); + w5[3] = 
hc_bytealign (w5[1], w5[2], offset); + w5[2] = hc_bytealign (w5[0], w5[1], offset); + w5[1] = hc_bytealign (w4[3], w5[0], offset); + w5[0] = hc_bytealign (w4[2], w4[3], offset); + w4[3] = hc_bytealign (w4[1], w4[2], offset); + w4[2] = hc_bytealign (w4[0], w4[1], offset); + w4[1] = hc_bytealign (w3[3], w4[0], offset); + w4[0] = hc_bytealign (w3[2], w3[3], offset); + w3[3] = hc_bytealign (w3[1], w3[2], offset); + w3[2] = hc_bytealign (w3[0], w3[1], offset); + w3[1] = hc_bytealign (w2[3], w3[0], offset); + w3[0] = hc_bytealign (w2[2], w2[3], offset); + w2[3] = hc_bytealign (w2[1], w2[2], offset); + w2[2] = hc_bytealign (w2[0], w2[1], offset); + w2[1] = hc_bytealign (w1[3], w2[0], offset); + w2[0] = hc_bytealign (w1[2], w1[3], offset); + w1[3] = hc_bytealign (w1[1], w1[2], offset); + w1[2] = hc_bytealign (w1[0], w1[1], offset); + w1[1] = hc_bytealign (w0[3], w1[0], offset); + w1[0] = hc_bytealign (w0[2], w0[3], offset); + w0[3] = hc_bytealign (w0[1], w0[2], offset); + w0[2] = hc_bytealign (w0[0], w0[1], offset); + w0[1] = hc_bytealign ( 0, w0[0], offset); w0[0] = 0; break; case 2: - w7[3] = hc_bytealign_be (w7[0], w7[1], offset); - w7[2] = hc_bytealign_be (w6[3], w7[0], offset); - w7[1] = hc_bytealign_be (w6[2], w6[3], offset); - w7[0] = hc_bytealign_be (w6[1], w6[2], offset); - w6[3] = hc_bytealign_be (w6[0], w6[1], offset); - w6[2] = hc_bytealign_be (w5[3], w6[0], offset); - w6[1] = hc_bytealign_be (w5[2], w5[3], offset); - w6[0] = hc_bytealign_be (w5[1], w5[2], offset); - w5[3] = hc_bytealign_be (w5[0], w5[1], offset); - w5[2] = hc_bytealign_be (w4[3], w5[0], offset); - w5[1] = hc_bytealign_be (w4[2], w4[3], offset); - w5[0] = hc_bytealign_be (w4[1], w4[2], offset); - w4[3] = hc_bytealign_be (w4[0], w4[1], offset); - w4[2] = hc_bytealign_be (w3[3], w4[0], offset); - w4[1] = hc_bytealign_be (w3[2], w3[3], offset); - w4[0] = hc_bytealign_be (w3[1], w3[2], offset); - w3[3] = hc_bytealign_be (w3[0], w3[1], offset); - w3[2] = hc_bytealign_be (w2[3], w3[0], offset); - 
w3[1] = hc_bytealign_be (w2[2], w2[3], offset); - w3[0] = hc_bytealign_be (w2[1], w2[2], offset); - w2[3] = hc_bytealign_be (w2[0], w2[1], offset); - w2[2] = hc_bytealign_be (w1[3], w2[0], offset); - w2[1] = hc_bytealign_be (w1[2], w1[3], offset); - w2[0] = hc_bytealign_be (w1[1], w1[2], offset); - w1[3] = hc_bytealign_be (w1[0], w1[1], offset); - w1[2] = hc_bytealign_be (w0[3], w1[0], offset); - w1[1] = hc_bytealign_be (w0[2], w0[3], offset); - w1[0] = hc_bytealign_be (w0[1], w0[2], offset); - w0[3] = hc_bytealign_be (w0[0], w0[1], offset); - w0[2] = hc_bytealign_be ( 0, w0[0], offset); + c0[2] = hc_bytealign (w7[3], 0, offset); + c0[1] = hc_bytealign (w7[2], w7[3], offset); + c0[0] = hc_bytealign (w7[1], w7[2], offset); + w7[3] = hc_bytealign (w7[0], w7[1], offset); + w7[2] = hc_bytealign (w6[3], w7[0], offset); + w7[1] = hc_bytealign (w6[2], w6[3], offset); + w7[0] = hc_bytealign (w6[1], w6[2], offset); + w6[3] = hc_bytealign (w6[0], w6[1], offset); + w6[2] = hc_bytealign (w5[3], w6[0], offset); + w6[1] = hc_bytealign (w5[2], w5[3], offset); + w6[0] = hc_bytealign (w5[1], w5[2], offset); + w5[3] = hc_bytealign (w5[0], w5[1], offset); + w5[2] = hc_bytealign (w4[3], w5[0], offset); + w5[1] = hc_bytealign (w4[2], w4[3], offset); + w5[0] = hc_bytealign (w4[1], w4[2], offset); + w4[3] = hc_bytealign (w4[0], w4[1], offset); + w4[2] = hc_bytealign (w3[3], w4[0], offset); + w4[1] = hc_bytealign (w3[2], w3[3], offset); + w4[0] = hc_bytealign (w3[1], w3[2], offset); + w3[3] = hc_bytealign (w3[0], w3[1], offset); + w3[2] = hc_bytealign (w2[3], w3[0], offset); + w3[1] = hc_bytealign (w2[2], w2[3], offset); + w3[0] = hc_bytealign (w2[1], w2[2], offset); + w2[3] = hc_bytealign (w2[0], w2[1], offset); + w2[2] = hc_bytealign (w1[3], w2[0], offset); + w2[1] = hc_bytealign (w1[2], w1[3], offset); + w2[0] = hc_bytealign (w1[1], w1[2], offset); + w1[3] = hc_bytealign (w1[0], w1[1], offset); + w1[2] = hc_bytealign (w0[3], w1[0], offset); + w1[1] = hc_bytealign (w0[2], w0[3], 
offset); + w1[0] = hc_bytealign (w0[1], w0[2], offset); + w0[3] = hc_bytealign (w0[0], w0[1], offset); + w0[2] = hc_bytealign ( 0, w0[0], offset); w0[1] = 0; w0[0] = 0; break; case 3: - w7[3] = hc_bytealign_be (w6[3], w7[0], offset); - w7[2] = hc_bytealign_be (w6[2], w6[3], offset); - w7[1] = hc_bytealign_be (w6[1], w6[2], offset); - w7[0] = hc_bytealign_be (w6[0], w6[1], offset); - w6[3] = hc_bytealign_be (w5[3], w6[0], offset); - w6[2] = hc_bytealign_be (w5[2], w5[3], offset); - w6[1] = hc_bytealign_be (w5[1], w5[2], offset); - w6[0] = hc_bytealign_be (w5[0], w5[1], offset); - w5[3] = hc_bytealign_be (w4[3], w5[0], offset); - w5[2] = hc_bytealign_be (w4[2], w4[3], offset); - w5[1] = hc_bytealign_be (w4[1], w4[2], offset); - w5[0] = hc_bytealign_be (w4[0], w4[1], offset); - w4[3] = hc_bytealign_be (w3[3], w4[0], offset); - w4[2] = hc_bytealign_be (w3[2], w3[3], offset); - w4[1] = hc_bytealign_be (w3[1], w3[2], offset); - w4[0] = hc_bytealign_be (w3[0], w3[1], offset); - w3[3] = hc_bytealign_be (w2[3], w3[0], offset); - w3[2] = hc_bytealign_be (w2[2], w2[3], offset); - w3[1] = hc_bytealign_be (w2[1], w2[2], offset); - w3[0] = hc_bytealign_be (w2[0], w2[1], offset); - w2[3] = hc_bytealign_be (w1[3], w2[0], offset); - w2[2] = hc_bytealign_be (w1[2], w1[3], offset); - w2[1] = hc_bytealign_be (w1[1], w1[2], offset); - w2[0] = hc_bytealign_be (w1[0], w1[1], offset); - w1[3] = hc_bytealign_be (w0[3], w1[0], offset); - w1[2] = hc_bytealign_be (w0[2], w0[3], offset); - w1[1] = hc_bytealign_be (w0[1], w0[2], offset); - w1[0] = hc_bytealign_be (w0[0], w0[1], offset); - w0[3] = hc_bytealign_be ( 0, w0[0], offset); + c0[3] = hc_bytealign (w7[3], 0, offset); + c0[2] = hc_bytealign (w7[2], w7[3], offset); + c0[1] = hc_bytealign (w7[1], w7[2], offset); + c0[0] = hc_bytealign (w7[0], w7[1], offset); + w7[3] = hc_bytealign (w6[3], w7[0], offset); + w7[2] = hc_bytealign (w6[2], w6[3], offset); + w7[1] = hc_bytealign (w6[1], w6[2], offset); + w7[0] = hc_bytealign (w6[0], w6[1], 
offset); + w6[3] = hc_bytealign (w5[3], w6[0], offset); + w6[2] = hc_bytealign (w5[2], w5[3], offset); + w6[1] = hc_bytealign (w5[1], w5[2], offset); + w6[0] = hc_bytealign (w5[0], w5[1], offset); + w5[3] = hc_bytealign (w4[3], w5[0], offset); + w5[2] = hc_bytealign (w4[2], w4[3], offset); + w5[1] = hc_bytealign (w4[1], w4[2], offset); + w5[0] = hc_bytealign (w4[0], w4[1], offset); + w4[3] = hc_bytealign (w3[3], w4[0], offset); + w4[2] = hc_bytealign (w3[2], w3[3], offset); + w4[1] = hc_bytealign (w3[1], w3[2], offset); + w4[0] = hc_bytealign (w3[0], w3[1], offset); + w3[3] = hc_bytealign (w2[3], w3[0], offset); + w3[2] = hc_bytealign (w2[2], w2[3], offset); + w3[1] = hc_bytealign (w2[1], w2[2], offset); + w3[0] = hc_bytealign (w2[0], w2[1], offset); + w2[3] = hc_bytealign (w1[3], w2[0], offset); + w2[2] = hc_bytealign (w1[2], w1[3], offset); + w2[1] = hc_bytealign (w1[1], w1[2], offset); + w2[0] = hc_bytealign (w1[0], w1[1], offset); + w1[3] = hc_bytealign (w0[3], w1[0], offset); + w1[2] = hc_bytealign (w0[2], w0[3], offset); + w1[1] = hc_bytealign (w0[1], w0[2], offset); + w1[0] = hc_bytealign (w0[0], w0[1], offset); + w0[3] = hc_bytealign ( 0, w0[0], offset); w0[2] = 0; w0[1] = 0; w0[0] = 0; @@ -7651,34 +7661,39 @@ DECLSPEC void switch_buffer_by_offset_8x4_be (u32x *w0, u32x *w1, u32x *w2, u32x break; case 4: - w7[3] = hc_bytealign_be (w6[2], w6[3], offset); - w7[2] = hc_bytealign_be (w6[1], w6[2], offset); - w7[1] = hc_bytealign_be (w6[0], w6[1], offset); - w7[0] = hc_bytealign_be (w5[3], w6[0], offset); - w6[3] = hc_bytealign_be (w5[2], w5[3], offset); - w6[2] = hc_bytealign_be (w5[1], w5[2], offset); - w6[1] = hc_bytealign_be (w5[0], w5[1], offset); - w6[0] = hc_bytealign_be (w4[3], w5[0], offset); - w5[3] = hc_bytealign_be (w4[2], w4[3], offset); - w5[2] = hc_bytealign_be (w4[1], w4[2], offset); - w5[1] = hc_bytealign_be (w4[0], w4[1], offset); - w5[0] = hc_bytealign_be (w3[3], w4[0], offset); - w4[3] = hc_bytealign_be (w3[2], w3[3], offset); - w4[2] = 
hc_bytealign_be (w3[1], w3[2], offset); - w4[1] = hc_bytealign_be (w3[0], w3[1], offset); - w4[0] = hc_bytealign_be (w2[3], w3[0], offset); - w3[3] = hc_bytealign_be (w2[2], w2[3], offset); - w3[2] = hc_bytealign_be (w2[1], w2[2], offset); - w3[1] = hc_bytealign_be (w2[0], w2[1], offset); - w3[0] = hc_bytealign_be (w1[3], w2[0], offset); - w2[3] = hc_bytealign_be (w1[2], w1[3], offset); - w2[2] = hc_bytealign_be (w1[1], w1[2], offset); - w2[1] = hc_bytealign_be (w1[0], w1[1], offset); - w2[0] = hc_bytealign_be (w0[3], w1[0], offset); - w1[3] = hc_bytealign_be (w0[2], w0[3], offset); - w1[2] = hc_bytealign_be (w0[1], w0[2], offset); - w1[1] = hc_bytealign_be (w0[0], w0[1], offset); - w1[0] = hc_bytealign_be ( 0, w0[0], offset); + c1[0] = hc_bytealign (w7[3], 0, offset); + c0[3] = hc_bytealign (w7[2], w7[3], offset); + c0[2] = hc_bytealign (w7[1], w7[2], offset); + c0[1] = hc_bytealign (w7[0], w7[1], offset); + c0[0] = hc_bytealign (w6[3], w7[0], offset); + w7[3] = hc_bytealign (w6[2], w6[3], offset); + w7[2] = hc_bytealign (w6[1], w6[2], offset); + w7[1] = hc_bytealign (w6[0], w6[1], offset); + w7[0] = hc_bytealign (w5[3], w6[0], offset); + w6[3] = hc_bytealign (w5[2], w5[3], offset); + w6[2] = hc_bytealign (w5[1], w5[2], offset); + w6[1] = hc_bytealign (w5[0], w5[1], offset); + w6[0] = hc_bytealign (w4[3], w5[0], offset); + w5[3] = hc_bytealign (w4[2], w4[3], offset); + w5[2] = hc_bytealign (w4[1], w4[2], offset); + w5[1] = hc_bytealign (w4[0], w4[1], offset); + w5[0] = hc_bytealign (w3[3], w4[0], offset); + w4[3] = hc_bytealign (w3[2], w3[3], offset); + w4[2] = hc_bytealign (w3[1], w3[2], offset); + w4[1] = hc_bytealign (w3[0], w3[1], offset); + w4[0] = hc_bytealign (w2[3], w3[0], offset); + w3[3] = hc_bytealign (w2[2], w2[3], offset); + w3[2] = hc_bytealign (w2[1], w2[2], offset); + w3[1] = hc_bytealign (w2[0], w2[1], offset); + w3[0] = hc_bytealign (w1[3], w2[0], offset); + w2[3] = hc_bytealign (w1[2], w1[3], offset); + w2[2] = hc_bytealign (w1[1], w1[2], 
offset); + w2[1] = hc_bytealign (w1[0], w1[1], offset); + w2[0] = hc_bytealign (w0[3], w1[0], offset); + w1[3] = hc_bytealign (w0[2], w0[3], offset); + w1[2] = hc_bytealign (w0[1], w0[2], offset); + w1[1] = hc_bytealign (w0[0], w0[1], offset); + w1[0] = hc_bytealign ( 0, w0[0], offset); w0[3] = 0; w0[2] = 0; w0[1] = 0; @@ -7687,33 +7702,39 @@ DECLSPEC void switch_buffer_by_offset_8x4_be (u32x *w0, u32x *w1, u32x *w2, u32x break; case 5: - w7[3] = hc_bytealign_be (w6[1], w6[2], offset); - w7[2] = hc_bytealign_be (w6[0], w6[1], offset); - w7[1] = hc_bytealign_be (w5[3], w6[0], offset); - w7[0] = hc_bytealign_be (w5[2], w5[3], offset); - w6[3] = hc_bytealign_be (w5[1], w5[2], offset); - w6[2] = hc_bytealign_be (w5[0], w5[1], offset); - w6[1] = hc_bytealign_be (w4[3], w5[0], offset); - w6[0] = hc_bytealign_be (w4[2], w4[3], offset); - w5[3] = hc_bytealign_be (w4[1], w4[2], offset); - w5[2] = hc_bytealign_be (w4[0], w4[1], offset); - w5[1] = hc_bytealign_be (w3[3], w4[0], offset); - w5[0] = hc_bytealign_be (w3[2], w3[3], offset); - w4[3] = hc_bytealign_be (w3[1], w3[2], offset); - w4[2] = hc_bytealign_be (w3[0], w3[1], offset); - w4[1] = hc_bytealign_be (w2[3], w3[0], offset); - w4[0] = hc_bytealign_be (w2[2], w2[3], offset); - w3[3] = hc_bytealign_be (w2[1], w2[2], offset); - w3[2] = hc_bytealign_be (w2[0], w2[1], offset); - w3[1] = hc_bytealign_be (w1[3], w2[0], offset); - w3[0] = hc_bytealign_be (w1[2], w1[3], offset); - w2[3] = hc_bytealign_be (w1[1], w1[2], offset); - w2[2] = hc_bytealign_be (w1[0], w1[1], offset); - w2[1] = hc_bytealign_be (w0[3], w1[0], offset); - w2[0] = hc_bytealign_be (w0[2], w0[3], offset); - w1[3] = hc_bytealign_be (w0[1], w0[2], offset); - w1[2] = hc_bytealign_be (w0[0], w0[1], offset); - w1[1] = hc_bytealign_be ( 0, w0[0], offset); + c1[1] = hc_bytealign (w7[3], 0, offset); + c1[0] = hc_bytealign (w7[2], w7[3], offset); + c0[3] = hc_bytealign (w7[1], w7[2], offset); + c0[2] = hc_bytealign (w7[0], w7[1], offset); + c0[1] = hc_bytealign 
(w6[3], w7[0], offset); + c0[0] = hc_bytealign (w6[2], w6[3], offset); + w7[3] = hc_bytealign (w6[1], w6[2], offset); + w7[2] = hc_bytealign (w6[0], w6[1], offset); + w7[1] = hc_bytealign (w5[3], w6[0], offset); + w7[0] = hc_bytealign (w5[2], w5[3], offset); + w6[3] = hc_bytealign (w5[1], w5[2], offset); + w6[2] = hc_bytealign (w5[0], w5[1], offset); + w6[1] = hc_bytealign (w4[3], w5[0], offset); + w6[0] = hc_bytealign (w4[2], w4[3], offset); + w5[3] = hc_bytealign (w4[1], w4[2], offset); + w5[2] = hc_bytealign (w4[0], w4[1], offset); + w5[1] = hc_bytealign (w3[3], w4[0], offset); + w5[0] = hc_bytealign (w3[2], w3[3], offset); + w4[3] = hc_bytealign (w3[1], w3[2], offset); + w4[2] = hc_bytealign (w3[0], w3[1], offset); + w4[1] = hc_bytealign (w2[3], w3[0], offset); + w4[0] = hc_bytealign (w2[2], w2[3], offset); + w3[3] = hc_bytealign (w2[1], w2[2], offset); + w3[2] = hc_bytealign (w2[0], w2[1], offset); + w3[1] = hc_bytealign (w1[3], w2[0], offset); + w3[0] = hc_bytealign (w1[2], w1[3], offset); + w2[3] = hc_bytealign (w1[1], w1[2], offset); + w2[2] = hc_bytealign (w1[0], w1[1], offset); + w2[1] = hc_bytealign (w0[3], w1[0], offset); + w2[0] = hc_bytealign (w0[2], w0[3], offset); + w1[3] = hc_bytealign (w0[1], w0[2], offset); + w1[2] = hc_bytealign (w0[0], w0[1], offset); + w1[1] = hc_bytealign ( 0, w0[0], offset); w1[0] = 0; w0[3] = 0; w0[2] = 0; @@ -7723,32 +7744,39 @@ DECLSPEC void switch_buffer_by_offset_8x4_be (u32x *w0, u32x *w1, u32x *w2, u32x break; case 6: - w7[3] = hc_bytealign_be (w6[0], w6[1], offset); - w7[2] = hc_bytealign_be (w5[3], w6[0], offset); - w7[1] = hc_bytealign_be (w5[2], w5[3], offset); - w7[0] = hc_bytealign_be (w5[1], w5[2], offset); - w6[3] = hc_bytealign_be (w5[0], w5[1], offset); - w6[2] = hc_bytealign_be (w4[3], w5[0], offset); - w6[1] = hc_bytealign_be (w4[2], w4[3], offset); - w6[0] = hc_bytealign_be (w4[1], w4[2], offset); - w5[3] = hc_bytealign_be (w4[0], w4[1], offset); - w5[2] = hc_bytealign_be (w3[3], w4[0], offset); - w5[1] = 
hc_bytealign_be (w3[2], w3[3], offset); - w5[0] = hc_bytealign_be (w3[1], w3[2], offset); - w4[3] = hc_bytealign_be (w3[0], w3[1], offset); - w4[2] = hc_bytealign_be (w2[3], w3[0], offset); - w4[1] = hc_bytealign_be (w2[2], w2[3], offset); - w4[0] = hc_bytealign_be (w2[1], w2[2], offset); - w3[3] = hc_bytealign_be (w2[0], w2[1], offset); - w3[2] = hc_bytealign_be (w1[3], w2[0], offset); - w3[1] = hc_bytealign_be (w1[2], w1[3], offset); - w3[0] = hc_bytealign_be (w1[1], w1[2], offset); - w2[3] = hc_bytealign_be (w1[0], w1[1], offset); - w2[2] = hc_bytealign_be (w0[3], w1[0], offset); - w2[1] = hc_bytealign_be (w0[2], w0[3], offset); - w2[0] = hc_bytealign_be (w0[1], w0[2], offset); - w1[3] = hc_bytealign_be (w0[0], w0[1], offset); - w1[2] = hc_bytealign_be ( 0, w0[0], offset); + c1[2] = hc_bytealign (w7[3], 0, offset); + c1[1] = hc_bytealign (w7[2], w7[3], offset); + c1[0] = hc_bytealign (w7[1], w7[2], offset); + c0[3] = hc_bytealign (w7[0], w7[1], offset); + c0[2] = hc_bytealign (w6[3], w7[0], offset); + c0[1] = hc_bytealign (w6[2], w6[3], offset); + c0[0] = hc_bytealign (w6[1], w6[2], offset); + w7[3] = hc_bytealign (w6[0], w6[1], offset); + w7[2] = hc_bytealign (w5[3], w6[0], offset); + w7[1] = hc_bytealign (w5[2], w5[3], offset); + w7[0] = hc_bytealign (w5[1], w5[2], offset); + w6[3] = hc_bytealign (w5[0], w5[1], offset); + w6[2] = hc_bytealign (w4[3], w5[0], offset); + w6[1] = hc_bytealign (w4[2], w4[3], offset); + w6[0] = hc_bytealign (w4[1], w4[2], offset); + w5[3] = hc_bytealign (w4[0], w4[1], offset); + w5[2] = hc_bytealign (w3[3], w4[0], offset); + w5[1] = hc_bytealign (w3[2], w3[3], offset); + w5[0] = hc_bytealign (w3[1], w3[2], offset); + w4[3] = hc_bytealign (w3[0], w3[1], offset); + w4[2] = hc_bytealign (w2[3], w3[0], offset); + w4[1] = hc_bytealign (w2[2], w2[3], offset); + w4[0] = hc_bytealign (w2[1], w2[2], offset); + w3[3] = hc_bytealign (w2[0], w2[1], offset); + w3[2] = hc_bytealign (w1[3], w2[0], offset); + w3[1] = hc_bytealign (w1[2], w1[3], 
offset); + w3[0] = hc_bytealign (w1[1], w1[2], offset); + w2[3] = hc_bytealign (w1[0], w1[1], offset); + w2[2] = hc_bytealign (w0[3], w1[0], offset); + w2[1] = hc_bytealign (w0[2], w0[3], offset); + w2[0] = hc_bytealign (w0[1], w0[2], offset); + w1[3] = hc_bytealign (w0[0], w0[1], offset); + w1[2] = hc_bytealign ( 0, w0[0], offset); w1[1] = 0; w1[0] = 0; w0[3] = 0; @@ -7759,31 +7787,39 @@ DECLSPEC void switch_buffer_by_offset_8x4_be (u32x *w0, u32x *w1, u32x *w2, u32x break; case 7: - w7[3] = hc_bytealign_be (w5[3], w6[0], offset); - w7[2] = hc_bytealign_be (w5[2], w5[3], offset); - w7[1] = hc_bytealign_be (w5[1], w5[2], offset); - w7[0] = hc_bytealign_be (w5[0], w5[1], offset); - w6[3] = hc_bytealign_be (w4[3], w5[0], offset); - w6[2] = hc_bytealign_be (w4[2], w4[3], offset); - w6[1] = hc_bytealign_be (w4[1], w4[2], offset); - w6[0] = hc_bytealign_be (w4[0], w4[1], offset); - w5[3] = hc_bytealign_be (w3[3], w4[0], offset); - w5[2] = hc_bytealign_be (w3[2], w3[3], offset); - w5[1] = hc_bytealign_be (w3[1], w3[2], offset); - w5[0] = hc_bytealign_be (w3[0], w3[1], offset); - w4[3] = hc_bytealign_be (w2[3], w3[0], offset); - w4[2] = hc_bytealign_be (w2[2], w2[3], offset); - w4[1] = hc_bytealign_be (w2[1], w2[2], offset); - w4[0] = hc_bytealign_be (w2[0], w2[1], offset); - w3[3] = hc_bytealign_be (w1[3], w2[0], offset); - w3[2] = hc_bytealign_be (w1[2], w1[3], offset); - w3[1] = hc_bytealign_be (w1[1], w1[2], offset); - w3[0] = hc_bytealign_be (w1[0], w1[1], offset); - w2[3] = hc_bytealign_be (w0[3], w1[0], offset); - w2[2] = hc_bytealign_be (w0[2], w0[3], offset); - w2[1] = hc_bytealign_be (w0[1], w0[2], offset); - w2[0] = hc_bytealign_be (w0[0], w0[1], offset); - w1[3] = hc_bytealign_be ( 0, w0[0], offset); + c1[3] = hc_bytealign (w7[3], 0, offset); + c1[2] = hc_bytealign (w7[2], w7[3], offset); + c1[1] = hc_bytealign (w7[1], w7[2], offset); + c1[0] = hc_bytealign (w7[0], w7[1], offset); + c0[3] = hc_bytealign (w6[3], w7[0], offset); + c0[2] = hc_bytealign (w6[2], 
w6[3], offset); + c0[1] = hc_bytealign (w6[1], w6[2], offset); + c0[0] = hc_bytealign (w6[0], w6[1], offset); + w7[3] = hc_bytealign (w5[3], w6[0], offset); + w7[2] = hc_bytealign (w5[2], w5[3], offset); + w7[1] = hc_bytealign (w5[1], w5[2], offset); + w7[0] = hc_bytealign (w5[0], w5[1], offset); + w6[3] = hc_bytealign (w4[3], w5[0], offset); + w6[2] = hc_bytealign (w4[2], w4[3], offset); + w6[1] = hc_bytealign (w4[1], w4[2], offset); + w6[0] = hc_bytealign (w4[0], w4[1], offset); + w5[3] = hc_bytealign (w3[3], w4[0], offset); + w5[2] = hc_bytealign (w3[2], w3[3], offset); + w5[1] = hc_bytealign (w3[1], w3[2], offset); + w5[0] = hc_bytealign (w3[0], w3[1], offset); + w4[3] = hc_bytealign (w2[3], w3[0], offset); + w4[2] = hc_bytealign (w2[2], w2[3], offset); + w4[1] = hc_bytealign (w2[1], w2[2], offset); + w4[0] = hc_bytealign (w2[0], w2[1], offset); + w3[3] = hc_bytealign (w1[3], w2[0], offset); + w3[2] = hc_bytealign (w1[2], w1[3], offset); + w3[1] = hc_bytealign (w1[1], w1[2], offset); + w3[0] = hc_bytealign (w1[0], w1[1], offset); + w2[3] = hc_bytealign (w0[3], w1[0], offset); + w2[2] = hc_bytealign (w0[2], w0[3], offset); + w2[1] = hc_bytealign (w0[1], w0[2], offset); + w2[0] = hc_bytealign (w0[0], w0[1], offset); + w1[3] = hc_bytealign ( 0, w0[0], offset); w1[2] = 0; w1[1] = 0; w1[0] = 0; @@ -7795,30 +7831,39 @@ DECLSPEC void switch_buffer_by_offset_8x4_be (u32x *w0, u32x *w1, u32x *w2, u32x break; case 8: - w7[3] = hc_bytealign_be (w5[2], w5[3], offset); - w7[2] = hc_bytealign_be (w5[1], w5[2], offset); - w7[1] = hc_bytealign_be (w5[0], w5[1], offset); - w7[0] = hc_bytealign_be (w4[3], w5[0], offset); - w6[3] = hc_bytealign_be (w4[2], w4[3], offset); - w6[2] = hc_bytealign_be (w4[1], w4[2], offset); - w6[1] = hc_bytealign_be (w4[0], w4[1], offset); - w6[0] = hc_bytealign_be (w3[3], w4[0], offset); - w5[3] = hc_bytealign_be (w3[2], w3[3], offset); - w5[2] = hc_bytealign_be (w3[1], w3[2], offset); - w5[1] = hc_bytealign_be (w3[0], w3[1], offset); - w5[0] = 
hc_bytealign_be (w2[3], w3[0], offset); - w4[3] = hc_bytealign_be (w2[2], w2[3], offset); - w4[2] = hc_bytealign_be (w2[1], w2[2], offset); - w4[1] = hc_bytealign_be (w2[0], w2[1], offset); - w4[0] = hc_bytealign_be (w1[3], w2[0], offset); - w3[3] = hc_bytealign_be (w1[2], w1[3], offset); - w3[2] = hc_bytealign_be (w1[1], w1[2], offset); - w3[1] = hc_bytealign_be (w1[0], w1[1], offset); - w3[0] = hc_bytealign_be (w0[3], w1[0], offset); - w2[3] = hc_bytealign_be (w0[2], w0[3], offset); - w2[2] = hc_bytealign_be (w0[1], w0[2], offset); - w2[1] = hc_bytealign_be (w0[0], w0[1], offset); - w2[0] = hc_bytealign_be ( 0, w0[0], offset); + c2[0] = hc_bytealign (w7[3], 0, offset); + c1[3] = hc_bytealign (w7[2], w7[3], offset); + c1[2] = hc_bytealign (w7[1], w7[2], offset); + c1[1] = hc_bytealign (w7[0], w7[1], offset); + c1[0] = hc_bytealign (w6[3], w7[0], offset); + c0[3] = hc_bytealign (w6[2], w6[3], offset); + c0[2] = hc_bytealign (w6[1], w6[2], offset); + c0[1] = hc_bytealign (w6[0], w6[1], offset); + c0[0] = hc_bytealign (w5[3], w6[0], offset); + w7[3] = hc_bytealign (w5[2], w5[3], offset); + w7[2] = hc_bytealign (w5[1], w5[2], offset); + w7[1] = hc_bytealign (w5[0], w5[1], offset); + w7[0] = hc_bytealign (w4[3], w5[0], offset); + w6[3] = hc_bytealign (w4[2], w4[3], offset); + w6[2] = hc_bytealign (w4[1], w4[2], offset); + w6[1] = hc_bytealign (w4[0], w4[1], offset); + w6[0] = hc_bytealign (w3[3], w4[0], offset); + w5[3] = hc_bytealign (w3[2], w3[3], offset); + w5[2] = hc_bytealign (w3[1], w3[2], offset); + w5[1] = hc_bytealign (w3[0], w3[1], offset); + w5[0] = hc_bytealign (w2[3], w3[0], offset); + w4[3] = hc_bytealign (w2[2], w2[3], offset); + w4[2] = hc_bytealign (w2[1], w2[2], offset); + w4[1] = hc_bytealign (w2[0], w2[1], offset); + w4[0] = hc_bytealign (w1[3], w2[0], offset); + w3[3] = hc_bytealign (w1[2], w1[3], offset); + w3[2] = hc_bytealign (w1[1], w1[2], offset); + w3[1] = hc_bytealign (w1[0], w1[1], offset); + w3[0] = hc_bytealign (w0[3], w1[0], offset); + 
w2[3] = hc_bytealign (w0[2], w0[3], offset); + w2[2] = hc_bytealign (w0[1], w0[2], offset); + w2[1] = hc_bytealign (w0[0], w0[1], offset); + w2[0] = hc_bytealign ( 0, w0[0], offset); w1[3] = 0; w1[2] = 0; w1[1] = 0; @@ -7831,29 +7876,39 @@ DECLSPEC void switch_buffer_by_offset_8x4_be (u32x *w0, u32x *w1, u32x *w2, u32x break; case 9: - w7[3] = hc_bytealign_be (w5[1], w5[2], offset); - w7[2] = hc_bytealign_be (w5[0], w5[1], offset); - w7[1] = hc_bytealign_be (w4[3], w5[0], offset); - w7[0] = hc_bytealign_be (w4[2], w4[3], offset); - w6[3] = hc_bytealign_be (w4[1], w4[2], offset); - w6[2] = hc_bytealign_be (w4[0], w4[1], offset); - w6[1] = hc_bytealign_be (w3[3], w4[0], offset); - w6[0] = hc_bytealign_be (w3[2], w3[3], offset); - w5[3] = hc_bytealign_be (w3[1], w3[2], offset); - w5[2] = hc_bytealign_be (w3[0], w3[1], offset); - w5[1] = hc_bytealign_be (w2[3], w3[0], offset); - w5[0] = hc_bytealign_be (w2[2], w2[3], offset); - w4[3] = hc_bytealign_be (w2[1], w2[2], offset); - w4[2] = hc_bytealign_be (w2[0], w2[1], offset); - w4[1] = hc_bytealign_be (w1[3], w2[0], offset); - w4[0] = hc_bytealign_be (w1[2], w1[3], offset); - w3[3] = hc_bytealign_be (w1[1], w1[2], offset); - w3[2] = hc_bytealign_be (w1[0], w1[1], offset); - w3[1] = hc_bytealign_be (w0[3], w1[0], offset); - w3[0] = hc_bytealign_be (w0[2], w0[3], offset); - w2[3] = hc_bytealign_be (w0[1], w0[2], offset); - w2[2] = hc_bytealign_be (w0[0], w0[1], offset); - w2[1] = hc_bytealign_be ( 0, w0[0], offset); + c2[1] = hc_bytealign (w7[3], 0, offset); + c2[0] = hc_bytealign (w7[2], w7[3], offset); + c1[3] = hc_bytealign (w7[1], w7[2], offset); + c1[2] = hc_bytealign (w7[0], w7[1], offset); + c1[1] = hc_bytealign (w6[3], w7[0], offset); + c1[0] = hc_bytealign (w6[2], w6[3], offset); + c0[3] = hc_bytealign (w6[1], w6[2], offset); + c0[2] = hc_bytealign (w6[0], w6[1], offset); + c0[1] = hc_bytealign (w5[3], w6[0], offset); + c0[0] = hc_bytealign (w5[2], w5[3], offset); + w7[3] = hc_bytealign (w5[1], w5[2], offset); + 
w7[2] = hc_bytealign (w5[0], w5[1], offset); + w7[1] = hc_bytealign (w4[3], w5[0], offset); + w7[0] = hc_bytealign (w4[2], w4[3], offset); + w6[3] = hc_bytealign (w4[1], w4[2], offset); + w6[2] = hc_bytealign (w4[0], w4[1], offset); + w6[1] = hc_bytealign (w3[3], w4[0], offset); + w6[0] = hc_bytealign (w3[2], w3[3], offset); + w5[3] = hc_bytealign (w3[1], w3[2], offset); + w5[2] = hc_bytealign (w3[0], w3[1], offset); + w5[1] = hc_bytealign (w2[3], w3[0], offset); + w5[0] = hc_bytealign (w2[2], w2[3], offset); + w4[3] = hc_bytealign (w2[1], w2[2], offset); + w4[2] = hc_bytealign (w2[0], w2[1], offset); + w4[1] = hc_bytealign (w1[3], w2[0], offset); + w4[0] = hc_bytealign (w1[2], w1[3], offset); + w3[3] = hc_bytealign (w1[1], w1[2], offset); + w3[2] = hc_bytealign (w1[0], w1[1], offset); + w3[1] = hc_bytealign (w0[3], w1[0], offset); + w3[0] = hc_bytealign (w0[2], w0[3], offset); + w2[3] = hc_bytealign (w0[1], w0[2], offset); + w2[2] = hc_bytealign (w0[0], w0[1], offset); + w2[1] = hc_bytealign ( 0, w0[0], offset); w2[0] = 0; w1[3] = 0; w1[2] = 0; @@ -7867,28 +7922,39 @@ DECLSPEC void switch_buffer_by_offset_8x4_be (u32x *w0, u32x *w1, u32x *w2, u32x break; case 10: - w7[3] = hc_bytealign_be (w5[0], w5[1], offset); - w7[2] = hc_bytealign_be (w4[3], w5[0], offset); - w7[1] = hc_bytealign_be (w4[2], w4[3], offset); - w7[0] = hc_bytealign_be (w4[1], w4[2], offset); - w6[3] = hc_bytealign_be (w4[0], w4[1], offset); - w6[2] = hc_bytealign_be (w3[3], w4[0], offset); - w6[1] = hc_bytealign_be (w3[2], w3[3], offset); - w6[0] = hc_bytealign_be (w3[1], w3[2], offset); - w5[3] = hc_bytealign_be (w3[0], w3[1], offset); - w5[2] = hc_bytealign_be (w2[3], w3[0], offset); - w5[1] = hc_bytealign_be (w2[2], w2[3], offset); - w5[0] = hc_bytealign_be (w2[1], w2[2], offset); - w4[3] = hc_bytealign_be (w2[0], w2[1], offset); - w4[2] = hc_bytealign_be (w1[3], w2[0], offset); - w4[1] = hc_bytealign_be (w1[2], w1[3], offset); - w4[0] = hc_bytealign_be (w1[1], w1[2], offset); - w3[3] = 
hc_bytealign_be (w1[0], w1[1], offset); - w3[2] = hc_bytealign_be (w0[3], w1[0], offset); - w3[1] = hc_bytealign_be (w0[2], w0[3], offset); - w3[0] = hc_bytealign_be (w0[1], w0[2], offset); - w2[3] = hc_bytealign_be (w0[0], w0[1], offset); - w2[2] = hc_bytealign_be ( 0, w0[0], offset); + c2[2] = hc_bytealign (w7[3], 0, offset); + c2[1] = hc_bytealign (w7[2], w7[3], offset); + c2[0] = hc_bytealign (w7[1], w7[2], offset); + c1[3] = hc_bytealign (w7[0], w7[1], offset); + c1[2] = hc_bytealign (w6[3], w7[0], offset); + c1[1] = hc_bytealign (w6[2], w6[3], offset); + c1[0] = hc_bytealign (w6[1], w6[2], offset); + c0[3] = hc_bytealign (w6[0], w6[1], offset); + c0[2] = hc_bytealign (w5[3], w6[0], offset); + c0[1] = hc_bytealign (w5[2], w5[3], offset); + c0[0] = hc_bytealign (w5[1], w5[2], offset); + w7[3] = hc_bytealign (w5[0], w5[1], offset); + w7[2] = hc_bytealign (w4[3], w5[0], offset); + w7[1] = hc_bytealign (w4[2], w4[3], offset); + w7[0] = hc_bytealign (w4[1], w4[2], offset); + w6[3] = hc_bytealign (w4[0], w4[1], offset); + w6[2] = hc_bytealign (w3[3], w4[0], offset); + w6[1] = hc_bytealign (w3[2], w3[3], offset); + w6[0] = hc_bytealign (w3[1], w3[2], offset); + w5[3] = hc_bytealign (w3[0], w3[1], offset); + w5[2] = hc_bytealign (w2[3], w3[0], offset); + w5[1] = hc_bytealign (w2[2], w2[3], offset); + w5[0] = hc_bytealign (w2[1], w2[2], offset); + w4[3] = hc_bytealign (w2[0], w2[1], offset); + w4[2] = hc_bytealign (w1[3], w2[0], offset); + w4[1] = hc_bytealign (w1[2], w1[3], offset); + w4[0] = hc_bytealign (w1[1], w1[2], offset); + w3[3] = hc_bytealign (w1[0], w1[1], offset); + w3[2] = hc_bytealign (w0[3], w1[0], offset); + w3[1] = hc_bytealign (w0[2], w0[3], offset); + w3[0] = hc_bytealign (w0[1], w0[2], offset); + w2[3] = hc_bytealign (w0[0], w0[1], offset); + w2[2] = hc_bytealign ( 0, w0[0], offset); w2[1] = 0; w2[0] = 0; w1[3] = 0; @@ -7903,27 +7969,39 @@ DECLSPEC void switch_buffer_by_offset_8x4_be (u32x *w0, u32x *w1, u32x *w2, u32x break; case 11: - w7[3] = 
hc_bytealign_be (w4[3], w5[0], offset); - w7[2] = hc_bytealign_be (w4[2], w4[3], offset); - w7[1] = hc_bytealign_be (w4[1], w4[2], offset); - w7[0] = hc_bytealign_be (w4[0], w4[1], offset); - w6[3] = hc_bytealign_be (w3[3], w4[0], offset); - w6[2] = hc_bytealign_be (w3[2], w3[3], offset); - w6[1] = hc_bytealign_be (w3[1], w3[2], offset); - w6[0] = hc_bytealign_be (w3[0], w3[1], offset); - w5[3] = hc_bytealign_be (w2[3], w3[0], offset); - w5[2] = hc_bytealign_be (w2[2], w2[3], offset); - w5[1] = hc_bytealign_be (w2[1], w2[2], offset); - w5[0] = hc_bytealign_be (w2[0], w2[1], offset); - w4[3] = hc_bytealign_be (w1[3], w2[0], offset); - w4[2] = hc_bytealign_be (w1[2], w1[3], offset); - w4[1] = hc_bytealign_be (w1[1], w1[2], offset); - w4[0] = hc_bytealign_be (w1[0], w1[1], offset); - w3[3] = hc_bytealign_be (w0[3], w1[0], offset); - w3[2] = hc_bytealign_be (w0[2], w0[3], offset); - w3[1] = hc_bytealign_be (w0[1], w0[2], offset); - w3[0] = hc_bytealign_be (w0[0], w0[1], offset); - w2[3] = hc_bytealign_be ( 0, w0[0], offset); + c2[3] = hc_bytealign (w7[3], 0, offset); + c2[2] = hc_bytealign (w7[2], w7[3], offset); + c2[1] = hc_bytealign (w7[1], w7[2], offset); + c2[0] = hc_bytealign (w7[0], w7[1], offset); + c1[3] = hc_bytealign (w6[3], w7[0], offset); + c1[2] = hc_bytealign (w6[2], w6[3], offset); + c1[1] = hc_bytealign (w6[1], w6[2], offset); + c1[0] = hc_bytealign (w6[0], w6[1], offset); + c0[3] = hc_bytealign (w5[3], w6[0], offset); + c0[2] = hc_bytealign (w5[2], w5[3], offset); + c0[1] = hc_bytealign (w5[1], w5[2], offset); + c0[0] = hc_bytealign (w5[0], w5[1], offset); + w7[3] = hc_bytealign (w4[3], w5[0], offset); + w7[2] = hc_bytealign (w4[2], w4[3], offset); + w7[1] = hc_bytealign (w4[1], w4[2], offset); + w7[0] = hc_bytealign (w4[0], w4[1], offset); + w6[3] = hc_bytealign (w3[3], w4[0], offset); + w6[2] = hc_bytealign (w3[2], w3[3], offset); + w6[1] = hc_bytealign (w3[1], w3[2], offset); + w6[0] = hc_bytealign (w3[0], w3[1], offset); + w5[3] = hc_bytealign 
(w2[3], w3[0], offset); + w5[2] = hc_bytealign (w2[2], w2[3], offset); + w5[1] = hc_bytealign (w2[1], w2[2], offset); + w5[0] = hc_bytealign (w2[0], w2[1], offset); + w4[3] = hc_bytealign (w1[3], w2[0], offset); + w4[2] = hc_bytealign (w1[2], w1[3], offset); + w4[1] = hc_bytealign (w1[1], w1[2], offset); + w4[0] = hc_bytealign (w1[0], w1[1], offset); + w3[3] = hc_bytealign (w0[3], w1[0], offset); + w3[2] = hc_bytealign (w0[2], w0[3], offset); + w3[1] = hc_bytealign (w0[1], w0[2], offset); + w3[0] = hc_bytealign (w0[0], w0[1], offset); + w2[3] = hc_bytealign ( 0, w0[0], offset); w2[2] = 0; w2[1] = 0; w2[0] = 0; @@ -7939,26 +8017,39 @@ DECLSPEC void switch_buffer_by_offset_8x4_be (u32x *w0, u32x *w1, u32x *w2, u32x break; case 12: - w7[3] = hc_bytealign_be (w4[2], w4[3], offset); - w7[2] = hc_bytealign_be (w4[1], w4[2], offset); - w7[1] = hc_bytealign_be (w4[0], w4[1], offset); - w7[0] = hc_bytealign_be (w3[3], w4[0], offset); - w6[3] = hc_bytealign_be (w3[2], w3[3], offset); - w6[2] = hc_bytealign_be (w3[1], w3[2], offset); - w6[1] = hc_bytealign_be (w3[0], w3[1], offset); - w6[0] = hc_bytealign_be (w2[3], w3[0], offset); - w5[3] = hc_bytealign_be (w2[2], w2[3], offset); - w5[2] = hc_bytealign_be (w2[1], w2[2], offset); - w5[1] = hc_bytealign_be (w2[0], w2[1], offset); - w5[0] = hc_bytealign_be (w1[3], w2[0], offset); - w4[3] = hc_bytealign_be (w1[2], w1[3], offset); - w4[2] = hc_bytealign_be (w1[1], w1[2], offset); - w4[1] = hc_bytealign_be (w1[0], w1[1], offset); - w4[0] = hc_bytealign_be (w0[3], w1[0], offset); - w3[3] = hc_bytealign_be (w0[2], w0[3], offset); - w3[2] = hc_bytealign_be (w0[1], w0[2], offset); - w3[1] = hc_bytealign_be (w0[0], w0[1], offset); - w3[0] = hc_bytealign_be ( 0, w0[0], offset); + c3[0] = hc_bytealign (w7[3], 0, offset); + c2[3] = hc_bytealign (w7[2], w7[3], offset); + c2[2] = hc_bytealign (w7[1], w7[2], offset); + c2[1] = hc_bytealign (w7[0], w7[1], offset); + c2[0] = hc_bytealign (w6[3], w7[0], offset); + c1[3] = hc_bytealign (w6[2], 
w6[3], offset); + c1[2] = hc_bytealign (w6[1], w6[2], offset); + c1[1] = hc_bytealign (w6[0], w6[1], offset); + c1[0] = hc_bytealign (w5[3], w6[0], offset); + c0[3] = hc_bytealign (w5[2], w5[3], offset); + c0[2] = hc_bytealign (w5[1], w5[2], offset); + c0[1] = hc_bytealign (w5[0], w5[1], offset); + c0[0] = hc_bytealign (w4[3], w5[0], offset); + w7[3] = hc_bytealign (w4[2], w4[3], offset); + w7[2] = hc_bytealign (w4[1], w4[2], offset); + w7[1] = hc_bytealign (w4[0], w4[1], offset); + w7[0] = hc_bytealign (w3[3], w4[0], offset); + w6[3] = hc_bytealign (w3[2], w3[3], offset); + w6[2] = hc_bytealign (w3[1], w3[2], offset); + w6[1] = hc_bytealign (w3[0], w3[1], offset); + w6[0] = hc_bytealign (w2[3], w3[0], offset); + w5[3] = hc_bytealign (w2[2], w2[3], offset); + w5[2] = hc_bytealign (w2[1], w2[2], offset); + w5[1] = hc_bytealign (w2[0], w2[1], offset); + w5[0] = hc_bytealign (w1[3], w2[0], offset); + w4[3] = hc_bytealign (w1[2], w1[3], offset); + w4[2] = hc_bytealign (w1[1], w1[2], offset); + w4[1] = hc_bytealign (w1[0], w1[1], offset); + w4[0] = hc_bytealign (w0[3], w1[0], offset); + w3[3] = hc_bytealign (w0[2], w0[3], offset); + w3[2] = hc_bytealign (w0[1], w0[2], offset); + w3[1] = hc_bytealign (w0[0], w0[1], offset); + w3[0] = hc_bytealign ( 0, w0[0], offset); w2[3] = 0; w2[2] = 0; w2[1] = 0; @@ -7975,25 +8066,39 @@ DECLSPEC void switch_buffer_by_offset_8x4_be (u32x *w0, u32x *w1, u32x *w2, u32x break; case 13: - w7[3] = hc_bytealign_be (w4[1], w4[2], offset); - w7[2] = hc_bytealign_be (w4[0], w4[1], offset); - w7[1] = hc_bytealign_be (w3[3], w4[0], offset); - w7[0] = hc_bytealign_be (w3[2], w3[3], offset); - w6[3] = hc_bytealign_be (w3[1], w3[2], offset); - w6[2] = hc_bytealign_be (w3[0], w3[1], offset); - w6[1] = hc_bytealign_be (w2[3], w3[0], offset); - w6[0] = hc_bytealign_be (w2[2], w2[3], offset); - w5[3] = hc_bytealign_be (w2[1], w2[2], offset); - w5[2] = hc_bytealign_be (w2[0], w2[1], offset); - w5[1] = hc_bytealign_be (w1[3], w2[0], offset); - w5[0] = 
hc_bytealign_be (w1[2], w1[3], offset); - w4[3] = hc_bytealign_be (w1[1], w1[2], offset); - w4[2] = hc_bytealign_be (w1[0], w1[1], offset); - w4[1] = hc_bytealign_be (w0[3], w1[0], offset); - w4[0] = hc_bytealign_be (w0[2], w0[3], offset); - w3[3] = hc_bytealign_be (w0[1], w0[2], offset); - w3[2] = hc_bytealign_be (w0[0], w0[1], offset); - w3[1] = hc_bytealign_be ( 0, w0[0], offset); + c3[1] = hc_bytealign (w7[3], 0, offset); + c3[0] = hc_bytealign (w7[2], w7[3], offset); + c2[3] = hc_bytealign (w7[1], w7[2], offset); + c2[2] = hc_bytealign (w7[0], w7[1], offset); + c2[1] = hc_bytealign (w6[3], w7[0], offset); + c2[0] = hc_bytealign (w6[2], w6[3], offset); + c1[3] = hc_bytealign (w6[1], w6[2], offset); + c1[2] = hc_bytealign (w6[0], w6[1], offset); + c1[1] = hc_bytealign (w5[3], w6[0], offset); + c1[0] = hc_bytealign (w5[2], w5[3], offset); + c0[3] = hc_bytealign (w5[1], w5[2], offset); + c0[2] = hc_bytealign (w5[0], w5[1], offset); + c0[1] = hc_bytealign (w4[3], w5[0], offset); + c0[0] = hc_bytealign (w4[2], w4[3], offset); + w7[3] = hc_bytealign (w4[1], w4[2], offset); + w7[2] = hc_bytealign (w4[0], w4[1], offset); + w7[1] = hc_bytealign (w3[3], w4[0], offset); + w7[0] = hc_bytealign (w3[2], w3[3], offset); + w6[3] = hc_bytealign (w3[1], w3[2], offset); + w6[2] = hc_bytealign (w3[0], w3[1], offset); + w6[1] = hc_bytealign (w2[3], w3[0], offset); + w6[0] = hc_bytealign (w2[2], w2[3], offset); + w5[3] = hc_bytealign (w2[1], w2[2], offset); + w5[2] = hc_bytealign (w2[0], w2[1], offset); + w5[1] = hc_bytealign (w1[3], w2[0], offset); + w5[0] = hc_bytealign (w1[2], w1[3], offset); + w4[3] = hc_bytealign (w1[1], w1[2], offset); + w4[2] = hc_bytealign (w1[0], w1[1], offset); + w4[1] = hc_bytealign (w0[3], w1[0], offset); + w4[0] = hc_bytealign (w0[2], w0[3], offset); + w3[3] = hc_bytealign (w0[1], w0[2], offset); + w3[2] = hc_bytealign (w0[0], w0[1], offset); + w3[1] = hc_bytealign ( 0, w0[0], offset); w3[0] = 0; w2[3] = 0; w2[2] = 0; @@ -8011,24 +8116,39 @@ DECLSPEC 
void switch_buffer_by_offset_8x4_be (u32x *w0, u32x *w1, u32x *w2, u32x break; case 14: - w7[3] = hc_bytealign_be (w4[0], w4[1], offset); - w7[2] = hc_bytealign_be (w3[3], w4[0], offset); - w7[1] = hc_bytealign_be (w3[2], w3[3], offset); - w7[0] = hc_bytealign_be (w3[1], w3[2], offset); - w6[3] = hc_bytealign_be (w3[0], w3[1], offset); - w6[2] = hc_bytealign_be (w2[3], w3[0], offset); - w6[1] = hc_bytealign_be (w2[2], w2[3], offset); - w6[0] = hc_bytealign_be (w2[1], w2[2], offset); - w5[3] = hc_bytealign_be (w2[0], w2[1], offset); - w5[2] = hc_bytealign_be (w1[3], w2[0], offset); - w5[1] = hc_bytealign_be (w1[2], w1[3], offset); - w5[0] = hc_bytealign_be (w1[1], w1[2], offset); - w4[3] = hc_bytealign_be (w1[0], w1[1], offset); - w4[2] = hc_bytealign_be (w0[3], w1[0], offset); - w4[1] = hc_bytealign_be (w0[2], w0[3], offset); - w4[0] = hc_bytealign_be (w0[1], w0[2], offset); - w3[3] = hc_bytealign_be (w0[0], w0[1], offset); - w3[2] = hc_bytealign_be ( 0, w0[0], offset); + c3[2] = hc_bytealign (w7[3], 0, offset); + c3[1] = hc_bytealign (w7[2], w7[3], offset); + c3[0] = hc_bytealign (w7[1], w7[2], offset); + c2[3] = hc_bytealign (w7[0], w7[1], offset); + c2[2] = hc_bytealign (w6[3], w7[0], offset); + c2[1] = hc_bytealign (w6[2], w6[3], offset); + c2[0] = hc_bytealign (w6[1], w6[2], offset); + c1[3] = hc_bytealign (w6[0], w6[1], offset); + c1[2] = hc_bytealign (w5[3], w6[0], offset); + c1[1] = hc_bytealign (w5[2], w5[3], offset); + c1[0] = hc_bytealign (w5[1], w5[2], offset); + c0[3] = hc_bytealign (w5[0], w5[1], offset); + c0[2] = hc_bytealign (w4[3], w5[0], offset); + c0[1] = hc_bytealign (w4[2], w4[3], offset); + c0[0] = hc_bytealign (w4[1], w4[2], offset); + w7[3] = hc_bytealign (w4[0], w4[1], offset); + w7[2] = hc_bytealign (w3[3], w4[0], offset); + w7[1] = hc_bytealign (w3[2], w3[3], offset); + w7[0] = hc_bytealign (w3[1], w3[2], offset); + w6[3] = hc_bytealign (w3[0], w3[1], offset); + w6[2] = hc_bytealign (w2[3], w3[0], offset); + w6[1] = hc_bytealign (w2[2], 
w2[3], offset); + w6[0] = hc_bytealign (w2[1], w2[2], offset); + w5[3] = hc_bytealign (w2[0], w2[1], offset); + w5[2] = hc_bytealign (w1[3], w2[0], offset); + w5[1] = hc_bytealign (w1[2], w1[3], offset); + w5[0] = hc_bytealign (w1[1], w1[2], offset); + w4[3] = hc_bytealign (w1[0], w1[1], offset); + w4[2] = hc_bytealign (w0[3], w1[0], offset); + w4[1] = hc_bytealign (w0[2], w0[3], offset); + w4[0] = hc_bytealign (w0[1], w0[2], offset); + w3[3] = hc_bytealign (w0[0], w0[1], offset); + w3[2] = hc_bytealign ( 0, w0[0], offset); w3[1] = 0; w3[0] = 0; w2[3] = 0; @@ -8047,23 +8167,39 @@ DECLSPEC void switch_buffer_by_offset_8x4_be (u32x *w0, u32x *w1, u32x *w2, u32x break; case 15: - w7[3] = hc_bytealign_be (w3[3], w4[0], offset); - w7[2] = hc_bytealign_be (w3[2], w3[3], offset); - w7[1] = hc_bytealign_be (w3[1], w3[2], offset); - w7[0] = hc_bytealign_be (w3[0], w3[1], offset); - w6[3] = hc_bytealign_be (w2[3], w3[0], offset); - w6[2] = hc_bytealign_be (w2[2], w2[3], offset); - w6[1] = hc_bytealign_be (w2[1], w2[2], offset); - w6[0] = hc_bytealign_be (w2[0], w2[1], offset); - w5[3] = hc_bytealign_be (w1[3], w2[0], offset); - w5[2] = hc_bytealign_be (w1[2], w1[3], offset); - w5[1] = hc_bytealign_be (w1[1], w1[2], offset); - w5[0] = hc_bytealign_be (w1[0], w1[1], offset); - w4[3] = hc_bytealign_be (w0[3], w1[0], offset); - w4[2] = hc_bytealign_be (w0[2], w0[3], offset); - w4[1] = hc_bytealign_be (w0[1], w0[2], offset); - w4[0] = hc_bytealign_be (w0[0], w0[1], offset); - w3[3] = hc_bytealign_be ( 0, w0[0], offset); + c3[3] = hc_bytealign (w7[3], 0, offset); + c3[2] = hc_bytealign (w7[2], w7[3], offset); + c3[1] = hc_bytealign (w7[1], w7[2], offset); + c3[0] = hc_bytealign (w7[0], w7[1], offset); + c2[3] = hc_bytealign (w6[3], w7[0], offset); + c2[2] = hc_bytealign (w6[2], w6[3], offset); + c2[1] = hc_bytealign (w6[1], w6[2], offset); + c2[0] = hc_bytealign (w6[0], w6[1], offset); + c1[3] = hc_bytealign (w5[3], w6[0], offset); + c1[2] = hc_bytealign (w5[2], w5[3], offset); + 
c1[1] = hc_bytealign (w5[1], w5[2], offset); + c1[0] = hc_bytealign (w5[0], w5[1], offset); + c0[3] = hc_bytealign (w4[3], w5[0], offset); + c0[2] = hc_bytealign (w4[2], w4[3], offset); + c0[1] = hc_bytealign (w4[1], w4[2], offset); + c0[0] = hc_bytealign (w4[0], w4[1], offset); + w7[3] = hc_bytealign (w3[3], w4[0], offset); + w7[2] = hc_bytealign (w3[2], w3[3], offset); + w7[1] = hc_bytealign (w3[1], w3[2], offset); + w7[0] = hc_bytealign (w3[0], w3[1], offset); + w6[3] = hc_bytealign (w2[3], w3[0], offset); + w6[2] = hc_bytealign (w2[2], w2[3], offset); + w6[1] = hc_bytealign (w2[1], w2[2], offset); + w6[0] = hc_bytealign (w2[0], w2[1], offset); + w5[3] = hc_bytealign (w1[3], w2[0], offset); + w5[2] = hc_bytealign (w1[2], w1[3], offset); + w5[1] = hc_bytealign (w1[1], w1[2], offset); + w5[0] = hc_bytealign (w1[0], w1[1], offset); + w4[3] = hc_bytealign (w0[3], w1[0], offset); + w4[2] = hc_bytealign (w0[2], w0[3], offset); + w4[1] = hc_bytealign (w0[1], w0[2], offset); + w4[0] = hc_bytealign (w0[0], w0[1], offset); + w3[3] = hc_bytealign ( 0, w0[0], offset); w3[2] = 0; w3[1] = 0; w3[0] = 0; @@ -8083,22 +8219,39 @@ DECLSPEC void switch_buffer_by_offset_8x4_be (u32x *w0, u32x *w1, u32x *w2, u32x break; case 16: - w7[3] = hc_bytealign_be (w3[2], w3[3], offset); - w7[2] = hc_bytealign_be (w3[1], w3[2], offset); - w7[1] = hc_bytealign_be (w3[0], w3[1], offset); - w7[0] = hc_bytealign_be (w2[3], w3[0], offset); - w6[3] = hc_bytealign_be (w2[2], w2[3], offset); - w6[2] = hc_bytealign_be (w2[1], w2[2], offset); - w6[1] = hc_bytealign_be (w2[0], w2[1], offset); - w6[0] = hc_bytealign_be (w1[3], w2[0], offset); - w5[3] = hc_bytealign_be (w1[2], w1[3], offset); - w5[2] = hc_bytealign_be (w1[1], w1[2], offset); - w5[1] = hc_bytealign_be (w1[0], w1[1], offset); - w5[0] = hc_bytealign_be (w0[3], w1[0], offset); - w4[3] = hc_bytealign_be (w0[2], w0[3], offset); - w4[2] = hc_bytealign_be (w0[1], w0[2], offset); - w4[1] = hc_bytealign_be (w0[0], w0[1], offset); - w4[0] = 
hc_bytealign_be ( 0, w0[0], offset); + c4[0] = hc_bytealign (w7[3], 0, offset); + c3[3] = hc_bytealign (w7[2], w7[3], offset); + c3[2] = hc_bytealign (w7[1], w7[2], offset); + c3[1] = hc_bytealign (w7[0], w7[1], offset); + c3[0] = hc_bytealign (w6[3], w7[0], offset); + c2[3] = hc_bytealign (w6[2], w6[3], offset); + c2[2] = hc_bytealign (w6[1], w6[2], offset); + c2[1] = hc_bytealign (w6[0], w6[1], offset); + c2[0] = hc_bytealign (w5[3], w6[0], offset); + c1[3] = hc_bytealign (w5[2], w5[3], offset); + c1[2] = hc_bytealign (w5[1], w5[2], offset); + c1[1] = hc_bytealign (w5[0], w5[1], offset); + c1[0] = hc_bytealign (w4[3], w5[0], offset); + c0[3] = hc_bytealign (w4[2], w4[3], offset); + c0[2] = hc_bytealign (w4[1], w4[2], offset); + c0[1] = hc_bytealign (w4[0], w4[1], offset); + c0[0] = hc_bytealign (w3[3], w4[0], offset); + w7[3] = hc_bytealign (w3[2], w3[3], offset); + w7[2] = hc_bytealign (w3[1], w3[2], offset); + w7[1] = hc_bytealign (w3[0], w3[1], offset); + w7[0] = hc_bytealign (w2[3], w3[0], offset); + w6[3] = hc_bytealign (w2[2], w2[3], offset); + w6[2] = hc_bytealign (w2[1], w2[2], offset); + w6[1] = hc_bytealign (w2[0], w2[1], offset); + w6[0] = hc_bytealign (w1[3], w2[0], offset); + w5[3] = hc_bytealign (w1[2], w1[3], offset); + w5[2] = hc_bytealign (w1[1], w1[2], offset); + w5[1] = hc_bytealign (w1[0], w1[1], offset); + w5[0] = hc_bytealign (w0[3], w1[0], offset); + w4[3] = hc_bytealign (w0[2], w0[3], offset); + w4[2] = hc_bytealign (w0[1], w0[2], offset); + w4[1] = hc_bytealign (w0[0], w0[1], offset); + w4[0] = hc_bytealign ( 0, w0[0], offset); w3[3] = 0; w3[2] = 0; w3[1] = 0; @@ -8119,21 +8272,39 @@ DECLSPEC void switch_buffer_by_offset_8x4_be (u32x *w0, u32x *w1, u32x *w2, u32x break; case 17: - w7[3] = hc_bytealign_be (w3[1], w3[2], offset); - w7[2] = hc_bytealign_be (w3[0], w3[1], offset); - w7[1] = hc_bytealign_be (w2[3], w3[0], offset); - w7[0] = hc_bytealign_be (w2[2], w2[3], offset); - w6[3] = hc_bytealign_be (w2[1], w2[2], offset); - w6[2] = 
hc_bytealign_be (w2[0], w2[1], offset); - w6[1] = hc_bytealign_be (w1[3], w2[0], offset); - w6[0] = hc_bytealign_be (w1[2], w1[3], offset); - w5[3] = hc_bytealign_be (w1[1], w1[2], offset); - w5[2] = hc_bytealign_be (w1[0], w1[1], offset); - w5[1] = hc_bytealign_be (w0[3], w1[0], offset); - w5[0] = hc_bytealign_be (w0[2], w0[3], offset); - w4[3] = hc_bytealign_be (w0[1], w0[2], offset); - w4[2] = hc_bytealign_be (w0[0], w0[1], offset); - w4[1] = hc_bytealign_be ( 0, w0[0], offset); + c4[1] = hc_bytealign (w7[3], 0, offset); + c4[0] = hc_bytealign (w7[2], w7[3], offset); + c3[3] = hc_bytealign (w7[1], w7[2], offset); + c3[2] = hc_bytealign (w7[0], w7[1], offset); + c3[1] = hc_bytealign (w6[3], w7[0], offset); + c3[0] = hc_bytealign (w6[2], w6[3], offset); + c2[3] = hc_bytealign (w6[1], w6[2], offset); + c2[2] = hc_bytealign (w6[0], w6[1], offset); + c2[1] = hc_bytealign (w5[3], w6[0], offset); + c2[0] = hc_bytealign (w5[2], w5[3], offset); + c1[3] = hc_bytealign (w5[1], w5[2], offset); + c1[2] = hc_bytealign (w5[0], w5[1], offset); + c1[1] = hc_bytealign (w4[3], w5[0], offset); + c1[0] = hc_bytealign (w4[2], w4[3], offset); + c0[3] = hc_bytealign (w4[1], w4[2], offset); + c0[2] = hc_bytealign (w4[0], w4[1], offset); + c0[1] = hc_bytealign (w3[3], w4[0], offset); + c0[0] = hc_bytealign (w3[2], w3[3], offset); + w7[3] = hc_bytealign (w3[1], w3[2], offset); + w7[2] = hc_bytealign (w3[0], w3[1], offset); + w7[1] = hc_bytealign (w2[3], w3[0], offset); + w7[0] = hc_bytealign (w2[2], w2[3], offset); + w6[3] = hc_bytealign (w2[1], w2[2], offset); + w6[2] = hc_bytealign (w2[0], w2[1], offset); + w6[1] = hc_bytealign (w1[3], w2[0], offset); + w6[0] = hc_bytealign (w1[2], w1[3], offset); + w5[3] = hc_bytealign (w1[1], w1[2], offset); + w5[2] = hc_bytealign (w1[0], w1[1], offset); + w5[1] = hc_bytealign (w0[3], w1[0], offset); + w5[0] = hc_bytealign (w0[2], w0[3], offset); + w4[3] = hc_bytealign (w0[1], w0[2], offset); + w4[2] = hc_bytealign (w0[0], w0[1], offset); + w4[1] = 
hc_bytealign ( 0, w0[0], offset); w4[0] = 0; w3[3] = 0; w3[2] = 0; @@ -8155,20 +8326,39 @@ DECLSPEC void switch_buffer_by_offset_8x4_be (u32x *w0, u32x *w1, u32x *w2, u32x break; case 18: - w7[3] = hc_bytealign_be (w3[0], w3[1], offset); - w7[2] = hc_bytealign_be (w2[3], w3[0], offset); - w7[1] = hc_bytealign_be (w2[2], w2[3], offset); - w7[0] = hc_bytealign_be (w2[1], w2[2], offset); - w6[3] = hc_bytealign_be (w2[0], w2[1], offset); - w6[2] = hc_bytealign_be (w1[3], w2[0], offset); - w6[1] = hc_bytealign_be (w1[2], w1[3], offset); - w6[0] = hc_bytealign_be (w1[1], w1[2], offset); - w5[3] = hc_bytealign_be (w1[0], w1[1], offset); - w5[2] = hc_bytealign_be (w0[3], w1[0], offset); - w5[1] = hc_bytealign_be (w0[2], w0[3], offset); - w5[0] = hc_bytealign_be (w0[1], w0[2], offset); - w4[3] = hc_bytealign_be (w0[0], w0[1], offset); - w4[2] = hc_bytealign_be ( 0, w0[0], offset); + c4[2] = hc_bytealign (w7[3], 0, offset); + c4[1] = hc_bytealign (w7[2], w7[3], offset); + c4[0] = hc_bytealign (w7[1], w7[2], offset); + c3[3] = hc_bytealign (w7[0], w7[1], offset); + c3[2] = hc_bytealign (w6[3], w7[0], offset); + c3[1] = hc_bytealign (w6[2], w6[3], offset); + c3[0] = hc_bytealign (w6[1], w6[2], offset); + c2[3] = hc_bytealign (w6[0], w6[1], offset); + c2[2] = hc_bytealign (w5[3], w6[0], offset); + c2[1] = hc_bytealign (w5[2], w5[3], offset); + c2[0] = hc_bytealign (w5[1], w5[2], offset); + c1[3] = hc_bytealign (w5[0], w5[1], offset); + c1[2] = hc_bytealign (w4[3], w5[0], offset); + c1[1] = hc_bytealign (w4[2], w4[3], offset); + c1[0] = hc_bytealign (w4[1], w4[2], offset); + c0[3] = hc_bytealign (w4[0], w4[1], offset); + c0[2] = hc_bytealign (w3[3], w4[0], offset); + c0[1] = hc_bytealign (w3[2], w3[3], offset); + c0[0] = hc_bytealign (w3[1], w3[2], offset); + w7[3] = hc_bytealign (w3[0], w3[1], offset); + w7[2] = hc_bytealign (w2[3], w3[0], offset); + w7[1] = hc_bytealign (w2[2], w2[3], offset); + w7[0] = hc_bytealign (w2[1], w2[2], offset); + w6[3] = hc_bytealign (w2[0], w2[1], 
offset); + w6[2] = hc_bytealign (w1[3], w2[0], offset); + w6[1] = hc_bytealign (w1[2], w1[3], offset); + w6[0] = hc_bytealign (w1[1], w1[2], offset); + w5[3] = hc_bytealign (w1[0], w1[1], offset); + w5[2] = hc_bytealign (w0[3], w1[0], offset); + w5[1] = hc_bytealign (w0[2], w0[3], offset); + w5[0] = hc_bytealign (w0[1], w0[2], offset); + w4[3] = hc_bytealign (w0[0], w0[1], offset); + w4[2] = hc_bytealign ( 0, w0[0], offset); w4[1] = 0; w4[0] = 0; w3[3] = 0; @@ -8191,19 +8381,39 @@ DECLSPEC void switch_buffer_by_offset_8x4_be (u32x *w0, u32x *w1, u32x *w2, u32x break; case 19: - w7[3] = hc_bytealign_be (w2[3], w3[0], offset); - w7[2] = hc_bytealign_be (w2[2], w2[3], offset); - w7[1] = hc_bytealign_be (w2[1], w2[2], offset); - w7[0] = hc_bytealign_be (w2[0], w2[1], offset); - w6[3] = hc_bytealign_be (w1[3], w2[0], offset); - w6[2] = hc_bytealign_be (w1[2], w1[3], offset); - w6[1] = hc_bytealign_be (w1[1], w1[2], offset); - w6[0] = hc_bytealign_be (w1[0], w1[1], offset); - w5[3] = hc_bytealign_be (w0[3], w1[0], offset); - w5[2] = hc_bytealign_be (w0[2], w0[3], offset); - w5[1] = hc_bytealign_be (w0[1], w0[2], offset); - w5[0] = hc_bytealign_be (w0[0], w0[1], offset); - w4[3] = hc_bytealign_be ( 0, w0[0], offset); + c4[3] = hc_bytealign (w7[3], 0, offset); + c4[2] = hc_bytealign (w7[2], w7[3], offset); + c4[1] = hc_bytealign (w7[1], w7[2], offset); + c4[0] = hc_bytealign (w7[0], w7[1], offset); + c3[3] = hc_bytealign (w6[3], w7[0], offset); + c3[2] = hc_bytealign (w6[2], w6[3], offset); + c3[1] = hc_bytealign (w6[1], w6[2], offset); + c3[0] = hc_bytealign (w6[0], w6[1], offset); + c2[3] = hc_bytealign (w5[3], w6[0], offset); + c2[2] = hc_bytealign (w5[2], w5[3], offset); + c2[1] = hc_bytealign (w5[1], w5[2], offset); + c2[0] = hc_bytealign (w5[0], w5[1], offset); + c1[3] = hc_bytealign (w4[3], w5[0], offset); + c1[2] = hc_bytealign (w4[2], w4[3], offset); + c1[1] = hc_bytealign (w4[1], w4[2], offset); + c1[0] = hc_bytealign (w4[0], w4[1], offset); + c0[3] = 
hc_bytealign (w3[3], w4[0], offset); + c0[2] = hc_bytealign (w3[2], w3[3], offset); + c0[1] = hc_bytealign (w3[1], w3[2], offset); + c0[0] = hc_bytealign (w3[0], w3[1], offset); + w7[3] = hc_bytealign (w2[3], w3[0], offset); + w7[2] = hc_bytealign (w2[2], w2[3], offset); + w7[1] = hc_bytealign (w2[1], w2[2], offset); + w7[0] = hc_bytealign (w2[0], w2[1], offset); + w6[3] = hc_bytealign (w1[3], w2[0], offset); + w6[2] = hc_bytealign (w1[2], w1[3], offset); + w6[1] = hc_bytealign (w1[1], w1[2], offset); + w6[0] = hc_bytealign (w1[0], w1[1], offset); + w5[3] = hc_bytealign (w0[3], w1[0], offset); + w5[2] = hc_bytealign (w0[2], w0[3], offset); + w5[1] = hc_bytealign (w0[1], w0[2], offset); + w5[0] = hc_bytealign (w0[0], w0[1], offset); + w4[3] = hc_bytealign ( 0, w0[0], offset); w4[2] = 0; w4[1] = 0; w4[0] = 0; @@ -8227,18 +8437,39 @@ DECLSPEC void switch_buffer_by_offset_8x4_be (u32x *w0, u32x *w1, u32x *w2, u32x break; case 20: - w7[3] = hc_bytealign_be (w2[2], w2[3], offset); - w7[2] = hc_bytealign_be (w2[1], w2[2], offset); - w7[1] = hc_bytealign_be (w2[0], w2[1], offset); - w7[0] = hc_bytealign_be (w1[3], w2[0], offset); - w6[3] = hc_bytealign_be (w1[2], w1[3], offset); - w6[2] = hc_bytealign_be (w1[1], w1[2], offset); - w6[1] = hc_bytealign_be (w1[0], w1[1], offset); - w6[0] = hc_bytealign_be (w0[3], w1[0], offset); - w5[3] = hc_bytealign_be (w0[2], w0[3], offset); - w5[2] = hc_bytealign_be (w0[1], w0[2], offset); - w5[1] = hc_bytealign_be (w0[0], w0[1], offset); - w5[0] = hc_bytealign_be ( 0, w0[0], offset); + c5[0] = hc_bytealign (w7[3], 0, offset); + c4[3] = hc_bytealign (w7[2], w7[3], offset); + c4[2] = hc_bytealign (w7[1], w7[2], offset); + c4[1] = hc_bytealign (w7[0], w7[1], offset); + c4[0] = hc_bytealign (w6[3], w7[0], offset); + c3[3] = hc_bytealign (w6[2], w6[3], offset); + c3[2] = hc_bytealign (w6[1], w6[2], offset); + c3[1] = hc_bytealign (w6[0], w6[1], offset); + c3[0] = hc_bytealign (w5[3], w6[0], offset); + c2[3] = hc_bytealign (w5[2], w5[3], 
offset); + c2[2] = hc_bytealign (w5[1], w5[2], offset); + c2[1] = hc_bytealign (w5[0], w5[1], offset); + c2[0] = hc_bytealign (w4[3], w5[0], offset); + c1[3] = hc_bytealign (w4[2], w4[3], offset); + c1[2] = hc_bytealign (w4[1], w4[2], offset); + c1[1] = hc_bytealign (w4[0], w4[1], offset); + c1[0] = hc_bytealign (w3[3], w4[0], offset); + c0[3] = hc_bytealign (w3[2], w3[3], offset); + c0[2] = hc_bytealign (w3[1], w3[2], offset); + c0[1] = hc_bytealign (w3[0], w3[1], offset); + c0[0] = hc_bytealign (w2[3], w3[0], offset); + w7[3] = hc_bytealign (w2[2], w2[3], offset); + w7[2] = hc_bytealign (w2[1], w2[2], offset); + w7[1] = hc_bytealign (w2[0], w2[1], offset); + w7[0] = hc_bytealign (w1[3], w2[0], offset); + w6[3] = hc_bytealign (w1[2], w1[3], offset); + w6[2] = hc_bytealign (w1[1], w1[2], offset); + w6[1] = hc_bytealign (w1[0], w1[1], offset); + w6[0] = hc_bytealign (w0[3], w1[0], offset); + w5[3] = hc_bytealign (w0[2], w0[3], offset); + w5[2] = hc_bytealign (w0[1], w0[2], offset); + w5[1] = hc_bytealign (w0[0], w0[1], offset); + w5[0] = hc_bytealign ( 0, w0[0], offset); w4[3] = 0; w4[2] = 0; w4[1] = 0; @@ -8263,17 +8494,39 @@ DECLSPEC void switch_buffer_by_offset_8x4_be (u32x *w0, u32x *w1, u32x *w2, u32x break; case 21: - w7[3] = hc_bytealign_be (w2[1], w2[2], offset); - w7[2] = hc_bytealign_be (w2[0], w2[1], offset); - w7[1] = hc_bytealign_be (w1[3], w2[0], offset); - w7[0] = hc_bytealign_be (w1[2], w1[3], offset); - w6[3] = hc_bytealign_be (w1[1], w1[2], offset); - w6[2] = hc_bytealign_be (w1[0], w1[1], offset); - w6[1] = hc_bytealign_be (w0[3], w1[0], offset); - w6[0] = hc_bytealign_be (w0[2], w0[3], offset); - w5[3] = hc_bytealign_be (w0[1], w0[2], offset); - w5[2] = hc_bytealign_be (w0[0], w0[1], offset); - w5[1] = hc_bytealign_be ( 0, w0[0], offset); + c5[1] = hc_bytealign (w7[3], 0, offset); + c5[0] = hc_bytealign (w7[2], w7[3], offset); + c4[3] = hc_bytealign (w7[1], w7[2], offset); + c4[2] = hc_bytealign (w7[0], w7[1], offset); + c4[1] = hc_bytealign 
(w6[3], w7[0], offset); + c4[0] = hc_bytealign (w6[2], w6[3], offset); + c3[3] = hc_bytealign (w6[1], w6[2], offset); + c3[2] = hc_bytealign (w6[0], w6[1], offset); + c3[1] = hc_bytealign (w5[3], w6[0], offset); + c3[0] = hc_bytealign (w5[2], w5[3], offset); + c2[3] = hc_bytealign (w5[1], w5[2], offset); + c2[2] = hc_bytealign (w5[0], w5[1], offset); + c2[1] = hc_bytealign (w4[3], w5[0], offset); + c2[0] = hc_bytealign (w4[2], w4[3], offset); + c1[3] = hc_bytealign (w4[1], w4[2], offset); + c1[2] = hc_bytealign (w4[0], w4[1], offset); + c1[1] = hc_bytealign (w3[3], w4[0], offset); + c1[0] = hc_bytealign (w3[2], w3[3], offset); + c0[3] = hc_bytealign (w3[1], w3[2], offset); + c0[2] = hc_bytealign (w3[0], w3[1], offset); + c0[1] = hc_bytealign (w2[3], w3[0], offset); + c0[0] = hc_bytealign (w2[2], w2[3], offset); + w7[3] = hc_bytealign (w2[1], w2[2], offset); + w7[2] = hc_bytealign (w2[0], w2[1], offset); + w7[1] = hc_bytealign (w1[3], w2[0], offset); + w7[0] = hc_bytealign (w1[2], w1[3], offset); + w6[3] = hc_bytealign (w1[1], w1[2], offset); + w6[2] = hc_bytealign (w1[0], w1[1], offset); + w6[1] = hc_bytealign (w0[3], w1[0], offset); + w6[0] = hc_bytealign (w0[2], w0[3], offset); + w5[3] = hc_bytealign (w0[1], w0[2], offset); + w5[2] = hc_bytealign (w0[0], w0[1], offset); + w5[1] = hc_bytealign ( 0, w0[0], offset); w5[0] = 0; w4[3] = 0; w4[2] = 0; @@ -8299,16 +8552,39 @@ DECLSPEC void switch_buffer_by_offset_8x4_be (u32x *w0, u32x *w1, u32x *w2, u32x break; case 22: - w7[3] = hc_bytealign_be (w2[0], w2[1], offset); - w7[2] = hc_bytealign_be (w1[3], w2[0], offset); - w7[1] = hc_bytealign_be (w1[2], w1[3], offset); - w7[0] = hc_bytealign_be (w1[1], w1[2], offset); - w6[3] = hc_bytealign_be (w1[0], w1[1], offset); - w6[2] = hc_bytealign_be (w0[3], w1[0], offset); - w6[1] = hc_bytealign_be (w0[2], w0[3], offset); - w6[0] = hc_bytealign_be (w0[1], w0[2], offset); - w5[3] = hc_bytealign_be (w0[0], w0[1], offset); - w5[2] = hc_bytealign_be ( 0, w0[0], offset); + c5[2] = 
hc_bytealign (w7[3], 0, offset); + c5[1] = hc_bytealign (w7[2], w7[3], offset); + c5[0] = hc_bytealign (w7[1], w7[2], offset); + c4[3] = hc_bytealign (w7[0], w7[1], offset); + c4[2] = hc_bytealign (w6[3], w7[0], offset); + c4[1] = hc_bytealign (w6[2], w6[3], offset); + c4[0] = hc_bytealign (w6[1], w6[2], offset); + c3[3] = hc_bytealign (w6[0], w6[1], offset); + c3[2] = hc_bytealign (w5[3], w6[0], offset); + c3[1] = hc_bytealign (w5[2], w5[3], offset); + c3[0] = hc_bytealign (w5[1], w5[2], offset); + c2[3] = hc_bytealign (w5[0], w5[1], offset); + c2[2] = hc_bytealign (w4[3], w5[0], offset); + c2[1] = hc_bytealign (w4[2], w4[3], offset); + c2[0] = hc_bytealign (w4[1], w4[2], offset); + c1[3] = hc_bytealign (w4[0], w4[1], offset); + c1[2] = hc_bytealign (w3[3], w4[0], offset); + c1[1] = hc_bytealign (w3[2], w3[3], offset); + c1[0] = hc_bytealign (w3[1], w3[2], offset); + c0[3] = hc_bytealign (w3[0], w3[1], offset); + c0[2] = hc_bytealign (w2[3], w3[0], offset); + c0[1] = hc_bytealign (w2[2], w2[3], offset); + c0[0] = hc_bytealign (w2[1], w2[2], offset); + w7[3] = hc_bytealign (w2[0], w2[1], offset); + w7[2] = hc_bytealign (w1[3], w2[0], offset); + w7[1] = hc_bytealign (w1[2], w1[3], offset); + w7[0] = hc_bytealign (w1[1], w1[2], offset); + w6[3] = hc_bytealign (w1[0], w1[1], offset); + w6[2] = hc_bytealign (w0[3], w1[0], offset); + w6[1] = hc_bytealign (w0[2], w0[3], offset); + w6[0] = hc_bytealign (w0[1], w0[2], offset); + w5[3] = hc_bytealign (w0[0], w0[1], offset); + w5[2] = hc_bytealign ( 0, w0[0], offset); w5[1] = 0; w5[0] = 0; w4[3] = 0; @@ -8335,15 +8611,39 @@ DECLSPEC void switch_buffer_by_offset_8x4_be (u32x *w0, u32x *w1, u32x *w2, u32x break; case 23: - w7[3] = hc_bytealign_be (w1[3], w2[0], offset); - w7[2] = hc_bytealign_be (w1[2], w1[3], offset); - w7[1] = hc_bytealign_be (w1[1], w1[2], offset); - w7[0] = hc_bytealign_be (w1[0], w1[1], offset); - w6[3] = hc_bytealign_be (w0[3], w1[0], offset); - w6[2] = hc_bytealign_be (w0[2], w0[3], offset); - w6[1] = 
hc_bytealign_be (w0[1], w0[2], offset); - w6[0] = hc_bytealign_be (w0[0], w0[1], offset); - w5[3] = hc_bytealign_be ( 0, w0[0], offset); + c5[3] = hc_bytealign (w7[3], 0, offset); + c5[2] = hc_bytealign (w7[2], w7[3], offset); + c5[1] = hc_bytealign (w7[1], w7[2], offset); + c5[0] = hc_bytealign (w7[0], w7[1], offset); + c4[3] = hc_bytealign (w6[3], w7[0], offset); + c4[2] = hc_bytealign (w6[2], w6[3], offset); + c4[1] = hc_bytealign (w6[1], w6[2], offset); + c4[0] = hc_bytealign (w6[0], w6[1], offset); + c3[3] = hc_bytealign (w5[3], w6[0], offset); + c3[2] = hc_bytealign (w5[2], w5[3], offset); + c3[1] = hc_bytealign (w5[1], w5[2], offset); + c3[0] = hc_bytealign (w5[0], w5[1], offset); + c2[3] = hc_bytealign (w4[3], w5[0], offset); + c2[2] = hc_bytealign (w4[2], w4[3], offset); + c2[1] = hc_bytealign (w4[1], w4[2], offset); + c2[0] = hc_bytealign (w4[0], w4[1], offset); + c1[3] = hc_bytealign (w3[3], w4[0], offset); + c1[2] = hc_bytealign (w3[2], w3[3], offset); + c1[1] = hc_bytealign (w3[1], w3[2], offset); + c1[0] = hc_bytealign (w3[0], w3[1], offset); + c0[3] = hc_bytealign (w2[3], w3[0], offset); + c0[2] = hc_bytealign (w2[2], w2[3], offset); + c0[1] = hc_bytealign (w2[1], w2[2], offset); + c0[0] = hc_bytealign (w2[0], w2[1], offset); + w7[3] = hc_bytealign (w1[3], w2[0], offset); + w7[2] = hc_bytealign (w1[2], w1[3], offset); + w7[1] = hc_bytealign (w1[1], w1[2], offset); + w7[0] = hc_bytealign (w1[0], w1[1], offset); + w6[3] = hc_bytealign (w0[3], w1[0], offset); + w6[2] = hc_bytealign (w0[2], w0[3], offset); + w6[1] = hc_bytealign (w0[1], w0[2], offset); + w6[0] = hc_bytealign (w0[0], w0[1], offset); + w5[3] = hc_bytealign ( 0, w0[0], offset); w5[2] = 0; w5[1] = 0; w5[0] = 0; @@ -8371,17 +8671,42 @@ DECLSPEC void switch_buffer_by_offset_8x4_be (u32x *w0, u32x *w1, u32x *w2, u32x break; case 24: - w7[3] = hc_bytealign_be (w1[2], w1[3], offset); - w7[2] = hc_bytealign_be (w1[1], w1[2], offset); - w7[1] = hc_bytealign_be (w1[0], w1[1], offset); - w7[0] = 
hc_bytealign_be (w0[3], w1[0], offset); - w6[3] = hc_bytealign_be (w0[2], w0[3], offset); - w6[2] = hc_bytealign_be (w0[1], w0[2], offset); - w6[1] = hc_bytealign_be (w0[0], w0[1], offset); - w6[0] = hc_bytealign_be ( 0, w0[0], offset); - w5[3] = 0; - w5[2] = 0; - w5[1] = 0; + c6[0] = hc_bytealign (w7[3], 0, offset); + c5[3] = hc_bytealign (w7[2], w7[3], offset); + c5[2] = hc_bytealign (w7[1], w7[2], offset); + c5[1] = hc_bytealign (w7[0], w7[1], offset); + c5[0] = hc_bytealign (w6[3], w7[0], offset); + c4[3] = hc_bytealign (w6[2], w6[3], offset); + c4[2] = hc_bytealign (w6[1], w6[2], offset); + c4[1] = hc_bytealign (w6[0], w6[1], offset); + c4[0] = hc_bytealign (w5[3], w6[0], offset); + c3[3] = hc_bytealign (w5[2], w5[3], offset); + c3[2] = hc_bytealign (w5[1], w5[2], offset); + c3[1] = hc_bytealign (w5[0], w5[1], offset); + c3[0] = hc_bytealign (w4[3], w5[0], offset); + c2[3] = hc_bytealign (w4[2], w4[3], offset); + c2[2] = hc_bytealign (w4[1], w4[2], offset); + c2[1] = hc_bytealign (w4[0], w4[1], offset); + c2[0] = hc_bytealign (w3[3], w4[0], offset); + c1[3] = hc_bytealign (w3[2], w3[3], offset); + c1[2] = hc_bytealign (w3[1], w3[2], offset); + c1[1] = hc_bytealign (w3[0], w3[1], offset); + c1[0] = hc_bytealign (w2[3], w3[0], offset); + c0[3] = hc_bytealign (w2[2], w2[3], offset); + c0[2] = hc_bytealign (w2[1], w2[2], offset); + c0[1] = hc_bytealign (w2[0], w2[1], offset); + c0[0] = hc_bytealign (w1[3], w2[0], offset); + w7[3] = hc_bytealign (w1[2], w1[3], offset); + w7[2] = hc_bytealign (w1[1], w1[2], offset); + w7[1] = hc_bytealign (w1[0], w1[1], offset); + w7[0] = hc_bytealign (w0[3], w1[0], offset); + w6[3] = hc_bytealign (w0[2], w0[3], offset); + w6[2] = hc_bytealign (w0[1], w0[2], offset); + w6[1] = hc_bytealign (w0[0], w0[1], offset); + w6[0] = hc_bytealign ( 0, w0[0], offset); + w5[3] = 0; + w5[2] = 0; + w5[1] = 0; w5[0] = 0; w4[3] = 0; w4[2] = 0; @@ -8407,13 +8732,39 @@ DECLSPEC void switch_buffer_by_offset_8x4_be (u32x *w0, u32x *w1, u32x *w2, u32x 
break; case 25: - w7[3] = hc_bytealign_be (w1[1], w1[2], offset); - w7[2] = hc_bytealign_be (w1[0], w1[1], offset); - w7[1] = hc_bytealign_be (w0[3], w1[0], offset); - w7[0] = hc_bytealign_be (w0[2], w0[3], offset); - w6[3] = hc_bytealign_be (w0[1], w0[2], offset); - w6[2] = hc_bytealign_be (w0[0], w0[1], offset); - w6[1] = hc_bytealign_be ( 0, w0[0], offset); + c6[1] = hc_bytealign (w7[3], 0, offset); + c6[0] = hc_bytealign (w7[2], w7[3], offset); + c5[3] = hc_bytealign (w7[1], w7[2], offset); + c5[2] = hc_bytealign (w7[0], w7[1], offset); + c5[1] = hc_bytealign (w6[3], w7[0], offset); + c5[0] = hc_bytealign (w6[2], w6[3], offset); + c4[3] = hc_bytealign (w6[1], w6[2], offset); + c4[2] = hc_bytealign (w6[0], w6[1], offset); + c4[1] = hc_bytealign (w5[3], w6[0], offset); + c4[0] = hc_bytealign (w5[2], w5[3], offset); + c3[3] = hc_bytealign (w5[1], w5[2], offset); + c3[2] = hc_bytealign (w5[0], w5[1], offset); + c3[1] = hc_bytealign (w4[3], w5[0], offset); + c3[0] = hc_bytealign (w4[2], w4[3], offset); + c2[3] = hc_bytealign (w4[1], w4[2], offset); + c2[2] = hc_bytealign (w4[0], w4[1], offset); + c2[1] = hc_bytealign (w3[3], w4[0], offset); + c2[0] = hc_bytealign (w3[2], w3[3], offset); + c1[3] = hc_bytealign (w3[1], w3[2], offset); + c1[2] = hc_bytealign (w3[0], w3[1], offset); + c1[1] = hc_bytealign (w2[3], w3[0], offset); + c1[0] = hc_bytealign (w2[2], w2[3], offset); + c0[3] = hc_bytealign (w2[1], w2[2], offset); + c0[2] = hc_bytealign (w2[0], w2[1], offset); + c0[1] = hc_bytealign (w1[3], w2[0], offset); + c0[0] = hc_bytealign (w1[2], w1[3], offset); + w7[3] = hc_bytealign (w1[1], w1[2], offset); + w7[2] = hc_bytealign (w1[0], w1[1], offset); + w7[1] = hc_bytealign (w0[3], w1[0], offset); + w7[0] = hc_bytealign (w0[2], w0[3], offset); + w6[3] = hc_bytealign (w0[1], w0[2], offset); + w6[2] = hc_bytealign (w0[0], w0[1], offset); + w6[1] = hc_bytealign ( 0, w0[0], offset); w6[0] = 0; w5[3] = 0; w5[2] = 0; @@ -8443,12 +8794,39 @@ DECLSPEC void 
switch_buffer_by_offset_8x4_be (u32x *w0, u32x *w1, u32x *w2, u32x break; case 26: - w7[3] = hc_bytealign_be (w1[0], w1[1], offset); - w7[2] = hc_bytealign_be (w0[3], w1[0], offset); - w7[1] = hc_bytealign_be (w0[2], w0[3], offset); - w7[0] = hc_bytealign_be (w0[1], w0[2], offset); - w6[3] = hc_bytealign_be (w0[0], w0[1], offset); - w6[2] = hc_bytealign_be ( 0, w0[0], offset); + c6[2] = hc_bytealign (w7[3], 0, offset); + c6[1] = hc_bytealign (w7[2], w7[3], offset); + c6[0] = hc_bytealign (w7[1], w7[2], offset); + c5[3] = hc_bytealign (w7[0], w7[1], offset); + c5[2] = hc_bytealign (w6[3], w7[0], offset); + c5[1] = hc_bytealign (w6[2], w6[3], offset); + c5[0] = hc_bytealign (w6[1], w6[2], offset); + c4[3] = hc_bytealign (w6[0], w6[1], offset); + c4[2] = hc_bytealign (w5[3], w6[0], offset); + c4[1] = hc_bytealign (w5[2], w5[3], offset); + c4[0] = hc_bytealign (w5[1], w5[2], offset); + c3[3] = hc_bytealign (w5[0], w5[1], offset); + c3[2] = hc_bytealign (w4[3], w5[0], offset); + c3[1] = hc_bytealign (w4[2], w4[3], offset); + c3[0] = hc_bytealign (w4[1], w4[2], offset); + c2[3] = hc_bytealign (w4[0], w4[1], offset); + c2[2] = hc_bytealign (w3[3], w4[0], offset); + c2[1] = hc_bytealign (w3[2], w3[3], offset); + c2[0] = hc_bytealign (w3[1], w3[2], offset); + c1[3] = hc_bytealign (w3[0], w3[1], offset); + c1[2] = hc_bytealign (w2[3], w3[0], offset); + c1[1] = hc_bytealign (w2[2], w2[3], offset); + c1[0] = hc_bytealign (w2[1], w2[2], offset); + c0[3] = hc_bytealign (w2[0], w2[1], offset); + c0[2] = hc_bytealign (w1[3], w2[0], offset); + c0[1] = hc_bytealign (w1[2], w1[3], offset); + c0[0] = hc_bytealign (w1[1], w1[2], offset); + w7[3] = hc_bytealign (w1[0], w1[1], offset); + w7[2] = hc_bytealign (w0[3], w1[0], offset); + w7[1] = hc_bytealign (w0[2], w0[3], offset); + w7[0] = hc_bytealign (w0[1], w0[2], offset); + w6[3] = hc_bytealign (w0[0], w0[1], offset); + w6[2] = hc_bytealign ( 0, w0[0], offset); w6[1] = 0; w6[0] = 0; w5[3] = 0; @@ -8479,11 +8857,39 @@ DECLSPEC void 
switch_buffer_by_offset_8x4_be (u32x *w0, u32x *w1, u32x *w2, u32x break; case 27: - w7[3] = hc_bytealign_be (w0[3], w1[0], offset); - w7[2] = hc_bytealign_be (w0[2], w0[3], offset); - w7[1] = hc_bytealign_be (w0[1], w0[2], offset); - w7[0] = hc_bytealign_be (w0[0], w0[1], offset); - w6[3] = hc_bytealign_be ( 0, w0[0], offset); + c6[3] = hc_bytealign (w7[3], 0, offset); + c6[2] = hc_bytealign (w7[2], w7[3], offset); + c6[1] = hc_bytealign (w7[1], w7[2], offset); + c6[0] = hc_bytealign (w7[0], w7[1], offset); + c5[3] = hc_bytealign (w6[3], w7[0], offset); + c5[2] = hc_bytealign (w6[2], w6[3], offset); + c5[1] = hc_bytealign (w6[1], w6[2], offset); + c5[0] = hc_bytealign (w6[0], w6[1], offset); + c4[3] = hc_bytealign (w5[3], w6[0], offset); + c4[2] = hc_bytealign (w5[2], w5[3], offset); + c4[1] = hc_bytealign (w5[1], w5[2], offset); + c4[0] = hc_bytealign (w5[0], w5[1], offset); + c3[3] = hc_bytealign (w4[3], w5[0], offset); + c3[2] = hc_bytealign (w4[2], w4[3], offset); + c3[1] = hc_bytealign (w4[1], w4[2], offset); + c3[0] = hc_bytealign (w4[0], w4[1], offset); + c2[3] = hc_bytealign (w3[3], w4[0], offset); + c2[2] = hc_bytealign (w3[2], w3[3], offset); + c2[1] = hc_bytealign (w3[1], w3[2], offset); + c2[0] = hc_bytealign (w3[0], w3[1], offset); + c1[3] = hc_bytealign (w2[3], w3[0], offset); + c1[2] = hc_bytealign (w2[2], w2[3], offset); + c1[1] = hc_bytealign (w2[1], w2[2], offset); + c1[0] = hc_bytealign (w2[0], w2[1], offset); + c0[3] = hc_bytealign (w1[3], w2[0], offset); + c0[2] = hc_bytealign (w1[2], w1[3], offset); + c0[1] = hc_bytealign (w1[1], w1[2], offset); + c0[0] = hc_bytealign (w1[0], w1[1], offset); + w7[3] = hc_bytealign (w0[3], w1[0], offset); + w7[2] = hc_bytealign (w0[2], w0[3], offset); + w7[1] = hc_bytealign (w0[1], w0[2], offset); + w7[0] = hc_bytealign (w0[0], w0[1], offset); + w6[3] = hc_bytealign ( 0, w0[0], offset); w6[2] = 0; w6[1] = 0; w6[0] = 0; @@ -8515,10 +8921,39 @@ DECLSPEC void switch_buffer_by_offset_8x4_be (u32x *w0, u32x *w1, 
u32x *w2, u32x break; case 28: - w7[3] = hc_bytealign_be (w0[2], w0[3], offset); - w7[2] = hc_bytealign_be (w0[1], w0[2], offset); - w7[1] = hc_bytealign_be (w0[0], w0[1], offset); - w7[0] = hc_bytealign_be ( 0, w0[0], offset); + c7[0] = hc_bytealign (w7[3], 0, offset); + c6[3] = hc_bytealign (w7[2], w7[3], offset); + c6[2] = hc_bytealign (w7[1], w7[2], offset); + c6[1] = hc_bytealign (w7[0], w7[1], offset); + c6[0] = hc_bytealign (w6[3], w7[0], offset); + c5[3] = hc_bytealign (w6[2], w6[3], offset); + c5[2] = hc_bytealign (w6[1], w6[2], offset); + c5[1] = hc_bytealign (w6[0], w6[1], offset); + c5[0] = hc_bytealign (w5[3], w6[0], offset); + c4[3] = hc_bytealign (w5[2], w5[3], offset); + c4[2] = hc_bytealign (w5[1], w5[2], offset); + c4[1] = hc_bytealign (w5[0], w5[1], offset); + c4[0] = hc_bytealign (w4[3], w5[0], offset); + c3[3] = hc_bytealign (w4[2], w4[3], offset); + c3[2] = hc_bytealign (w4[1], w4[2], offset); + c3[1] = hc_bytealign (w4[0], w4[1], offset); + c3[0] = hc_bytealign (w3[3], w4[0], offset); + c2[3] = hc_bytealign (w3[2], w3[3], offset); + c2[2] = hc_bytealign (w3[1], w3[2], offset); + c2[1] = hc_bytealign (w3[0], w3[1], offset); + c2[0] = hc_bytealign (w2[3], w3[0], offset); + c1[3] = hc_bytealign (w2[2], w2[3], offset); + c1[2] = hc_bytealign (w2[1], w2[2], offset); + c1[1] = hc_bytealign (w2[0], w2[1], offset); + c1[0] = hc_bytealign (w1[3], w2[0], offset); + c0[3] = hc_bytealign (w1[2], w1[3], offset); + c0[2] = hc_bytealign (w1[1], w1[2], offset); + c0[1] = hc_bytealign (w1[0], w1[1], offset); + c0[0] = hc_bytealign (w0[3], w1[0], offset); + w7[3] = hc_bytealign (w0[2], w0[3], offset); + w7[2] = hc_bytealign (w0[1], w0[2], offset); + w7[1] = hc_bytealign (w0[0], w0[1], offset); + w7[0] = hc_bytealign ( 0, w0[0], offset); w6[3] = 0; w6[2] = 0; w6[1] = 0; @@ -8551,9 +8986,39 @@ DECLSPEC void switch_buffer_by_offset_8x4_be (u32x *w0, u32x *w1, u32x *w2, u32x break; case 29: - w7[3] = hc_bytealign_be (w0[1], w0[2], offset); - w7[2] = 
hc_bytealign_be (w0[0], w0[1], offset); - w7[1] = hc_bytealign_be ( 0, w0[0], offset); + c7[1] = hc_bytealign (w7[3], 0, offset); + c7[0] = hc_bytealign (w7[2], w7[3], offset); + c6[3] = hc_bytealign (w7[1], w7[2], offset); + c6[2] = hc_bytealign (w7[0], w7[1], offset); + c6[1] = hc_bytealign (w6[3], w7[0], offset); + c6[0] = hc_bytealign (w6[2], w6[3], offset); + c5[3] = hc_bytealign (w6[1], w6[2], offset); + c5[2] = hc_bytealign (w6[0], w6[1], offset); + c5[1] = hc_bytealign (w5[3], w6[0], offset); + c5[0] = hc_bytealign (w5[2], w5[3], offset); + c4[3] = hc_bytealign (w5[1], w5[2], offset); + c4[2] = hc_bytealign (w5[0], w5[1], offset); + c4[1] = hc_bytealign (w4[3], w5[0], offset); + c4[0] = hc_bytealign (w4[2], w4[3], offset); + c3[3] = hc_bytealign (w4[1], w4[2], offset); + c3[2] = hc_bytealign (w4[0], w4[1], offset); + c3[1] = hc_bytealign (w3[3], w4[0], offset); + c3[0] = hc_bytealign (w3[2], w3[3], offset); + c2[3] = hc_bytealign (w3[1], w3[2], offset); + c2[2] = hc_bytealign (w3[0], w3[1], offset); + c2[1] = hc_bytealign (w2[3], w3[0], offset); + c2[0] = hc_bytealign (w2[2], w2[3], offset); + c1[3] = hc_bytealign (w2[1], w2[2], offset); + c1[2] = hc_bytealign (w2[0], w2[1], offset); + c1[1] = hc_bytealign (w1[3], w2[0], offset); + c1[0] = hc_bytealign (w1[2], w1[3], offset); + c0[3] = hc_bytealign (w1[1], w1[2], offset); + c0[2] = hc_bytealign (w1[0], w1[1], offset); + c0[1] = hc_bytealign (w0[3], w1[0], offset); + c0[0] = hc_bytealign (w0[2], w0[3], offset); + w7[3] = hc_bytealign (w0[1], w0[2], offset); + w7[2] = hc_bytealign (w0[0], w0[1], offset); + w7[1] = hc_bytealign ( 0, w0[0], offset); w7[0] = 0; w6[3] = 0; w6[2] = 0; @@ -8587,8 +9052,39 @@ DECLSPEC void switch_buffer_by_offset_8x4_be (u32x *w0, u32x *w1, u32x *w2, u32x break; case 30: - w7[3] = hc_bytealign_be (w0[0], w0[1], offset); - w7[2] = hc_bytealign_be ( 0, w0[0], offset); + c7[2] = hc_bytealign (w7[3], 0, offset); + c7[1] = hc_bytealign (w7[2], w7[3], offset); + c7[0] = hc_bytealign 
(w7[1], w7[2], offset); + c6[3] = hc_bytealign (w7[0], w7[1], offset); + c6[2] = hc_bytealign (w6[3], w7[0], offset); + c6[1] = hc_bytealign (w6[2], w6[3], offset); + c6[0] = hc_bytealign (w6[1], w6[2], offset); + c5[3] = hc_bytealign (w6[0], w6[1], offset); + c5[2] = hc_bytealign (w5[3], w6[0], offset); + c5[1] = hc_bytealign (w5[2], w5[3], offset); + c5[0] = hc_bytealign (w5[1], w5[2], offset); + c4[3] = hc_bytealign (w5[0], w5[1], offset); + c4[2] = hc_bytealign (w4[3], w5[0], offset); + c4[1] = hc_bytealign (w4[2], w4[3], offset); + c4[0] = hc_bytealign (w4[1], w4[2], offset); + c3[3] = hc_bytealign (w4[0], w4[1], offset); + c3[2] = hc_bytealign (w3[3], w4[0], offset); + c3[1] = hc_bytealign (w3[2], w3[3], offset); + c3[0] = hc_bytealign (w3[1], w3[2], offset); + c2[3] = hc_bytealign (w3[0], w3[1], offset); + c2[2] = hc_bytealign (w2[3], w3[0], offset); + c2[1] = hc_bytealign (w2[2], w2[3], offset); + c2[0] = hc_bytealign (w2[1], w2[2], offset); + c1[3] = hc_bytealign (w2[0], w2[1], offset); + c1[2] = hc_bytealign (w1[3], w2[0], offset); + c1[1] = hc_bytealign (w1[2], w1[3], offset); + c1[0] = hc_bytealign (w1[1], w1[2], offset); + c0[3] = hc_bytealign (w1[0], w1[1], offset); + c0[2] = hc_bytealign (w0[3], w1[0], offset); + c0[1] = hc_bytealign (w0[2], w0[3], offset); + c0[0] = hc_bytealign (w0[1], w0[2], offset); + w7[3] = hc_bytealign (w0[0], w0[1], offset); + w7[2] = hc_bytealign ( 0, w0[0], offset); w7[1] = 0; w7[0] = 0; w6[3] = 0; @@ -8623,7 +9119,39 @@ DECLSPEC void switch_buffer_by_offset_8x4_be (u32x *w0, u32x *w1, u32x *w2, u32x break; case 31: - w7[3] = hc_bytealign_be ( 0, w0[0], offset); + c7[3] = hc_bytealign (w7[3], 0, offset); + c7[2] = hc_bytealign (w7[2], w7[3], offset); + c7[1] = hc_bytealign (w7[1], w7[2], offset); + c7[0] = hc_bytealign (w7[0], w7[1], offset); + c6[3] = hc_bytealign (w6[3], w7[0], offset); + c6[2] = hc_bytealign (w6[2], w6[3], offset); + c6[1] = hc_bytealign (w6[1], w6[2], offset); + c6[0] = hc_bytealign (w6[0], w6[1], 
offset); + c5[3] = hc_bytealign (w5[3], w6[0], offset); + c5[2] = hc_bytealign (w5[2], w5[3], offset); + c5[1] = hc_bytealign (w5[1], w5[2], offset); + c5[0] = hc_bytealign (w5[0], w5[1], offset); + c4[3] = hc_bytealign (w4[3], w5[0], offset); + c4[2] = hc_bytealign (w4[2], w4[3], offset); + c4[1] = hc_bytealign (w4[1], w4[2], offset); + c4[0] = hc_bytealign (w4[0], w4[1], offset); + c3[3] = hc_bytealign (w3[3], w4[0], offset); + c3[2] = hc_bytealign (w3[2], w3[3], offset); + c3[1] = hc_bytealign (w3[1], w3[2], offset); + c3[0] = hc_bytealign (w3[0], w3[1], offset); + c2[3] = hc_bytealign (w2[3], w3[0], offset); + c2[2] = hc_bytealign (w2[2], w2[3], offset); + c2[1] = hc_bytealign (w2[1], w2[2], offset); + c2[0] = hc_bytealign (w2[0], w2[1], offset); + c1[3] = hc_bytealign (w1[3], w2[0], offset); + c1[2] = hc_bytealign (w1[2], w1[3], offset); + c1[1] = hc_bytealign (w1[1], w1[2], offset); + c1[0] = hc_bytealign (w1[0], w1[1], offset); + c0[3] = hc_bytealign (w0[3], w1[0], offset); + c0[2] = hc_bytealign (w0[2], w0[3], offset); + c0[1] = hc_bytealign (w0[1], w0[2], offset); + c0[0] = hc_bytealign (w0[0], w0[1], offset); + w7[3] = hc_bytealign ( 0, w0[0], offset); w7[2] = 0; w7[1] = 0; w7[0] = 0; @@ -8662,154 +9190,168 @@ DECLSPEC void switch_buffer_by_offset_8x4_be (u32x *w0, u32x *w1, u32x *w2, u32x #if (defined IS_AMD && HAS_VPERM == 1) || defined IS_NV + const int offset_mod_4 = offset & 3; + + const int offset_minus_4 = 4 - offset_mod_4; + #if defined IS_NV - const int selector = (0x76543210 >> ((offset & 3) * 4)) & 0xffff; + const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; #endif #if defined IS_AMD - const int selector = 0x0706050403020100 >> ((offset & 3) * 8); + const int selector = 0x0706050403020100 >> (offset_minus_4 * 8); #endif switch (offset_switch) { case 0: - w7[3] = hc_byte_perm (w7[3], w7[2], selector); - w7[2] = hc_byte_perm (w7[2], w7[1], selector); - w7[1] = hc_byte_perm (w7[1], w7[0], selector); - w7[0] = hc_byte_perm (w7[0], 
w6[3], selector); - w6[3] = hc_byte_perm (w6[3], w6[2], selector); - w6[2] = hc_byte_perm (w6[2], w6[1], selector); - w6[1] = hc_byte_perm (w6[1], w6[0], selector); - w6[0] = hc_byte_perm (w6[0], w5[3], selector); - w5[3] = hc_byte_perm (w5[3], w5[2], selector); - w5[2] = hc_byte_perm (w5[2], w5[1], selector); - w5[1] = hc_byte_perm (w5[1], w5[0], selector); - w5[0] = hc_byte_perm (w5[0], w4[3], selector); - w4[3] = hc_byte_perm (w4[3], w4[2], selector); - w4[2] = hc_byte_perm (w4[2], w4[1], selector); - w4[1] = hc_byte_perm (w4[1], w4[0], selector); - w4[0] = hc_byte_perm (w4[0], w3[3], selector); - w3[3] = hc_byte_perm (w3[3], w3[2], selector); - w3[2] = hc_byte_perm (w3[2], w3[1], selector); - w3[1] = hc_byte_perm (w3[1], w3[0], selector); - w3[0] = hc_byte_perm (w3[0], w2[3], selector); - w2[3] = hc_byte_perm (w2[3], w2[2], selector); - w2[2] = hc_byte_perm (w2[2], w2[1], selector); - w2[1] = hc_byte_perm (w2[1], w2[0], selector); - w2[0] = hc_byte_perm (w2[0], w1[3], selector); - w1[3] = hc_byte_perm (w1[3], w1[2], selector); - w1[2] = hc_byte_perm (w1[2], w1[1], selector); - w1[1] = hc_byte_perm (w1[1], w1[0], selector); - w1[0] = hc_byte_perm (w1[0], w0[3], selector); - w0[3] = hc_byte_perm (w0[3], w0[2], selector); - w0[2] = hc_byte_perm (w0[2], w0[1], selector); - w0[1] = hc_byte_perm (w0[1], w0[0], selector); - w0[0] = hc_byte_perm (w0[0], 0, selector); + c0[0] = hc_byte_perm (w7[3], 0, selector); + w7[3] = hc_byte_perm (w7[2], w7[3], selector); + w7[2] = hc_byte_perm (w7[1], w7[2], selector); + w7[1] = hc_byte_perm (w7[0], w7[1], selector); + w7[0] = hc_byte_perm (w6[3], w7[0], selector); + w6[3] = hc_byte_perm (w6[2], w6[3], selector); + w6[2] = hc_byte_perm (w6[1], w6[2], selector); + w6[1] = hc_byte_perm (w6[0], w6[1], selector); + w6[0] = hc_byte_perm (w5[3], w6[0], selector); + w5[3] = hc_byte_perm (w5[2], w5[3], selector); + w5[2] = hc_byte_perm (w5[1], w5[2], selector); + w5[1] = hc_byte_perm (w5[0], w5[1], selector); + w5[0] = hc_byte_perm 
(w4[3], w5[0], selector); + w4[3] = hc_byte_perm (w4[2], w4[3], selector); + w4[2] = hc_byte_perm (w4[1], w4[2], selector); + w4[1] = hc_byte_perm (w4[0], w4[1], selector); + w4[0] = hc_byte_perm (w3[3], w4[0], selector); + w3[3] = hc_byte_perm (w3[2], w3[3], selector); + w3[2] = hc_byte_perm (w3[1], w3[2], selector); + w3[1] = hc_byte_perm (w3[0], w3[1], selector); + w3[0] = hc_byte_perm (w2[3], w3[0], selector); + w2[3] = hc_byte_perm (w2[2], w2[3], selector); + w2[2] = hc_byte_perm (w2[1], w2[2], selector); + w2[1] = hc_byte_perm (w2[0], w2[1], selector); + w2[0] = hc_byte_perm (w1[3], w2[0], selector); + w1[3] = hc_byte_perm (w1[2], w1[3], selector); + w1[2] = hc_byte_perm (w1[1], w1[2], selector); + w1[1] = hc_byte_perm (w1[0], w1[1], selector); + w1[0] = hc_byte_perm (w0[3], w1[0], selector); + w0[3] = hc_byte_perm (w0[2], w0[3], selector); + w0[2] = hc_byte_perm (w0[1], w0[2], selector); + w0[1] = hc_byte_perm (w0[0], w0[1], selector); + w0[0] = hc_byte_perm ( 0, w0[0], selector); break; case 1: - w7[3] = hc_byte_perm (w7[2], w7[1], selector); - w7[2] = hc_byte_perm (w7[1], w7[0], selector); - w7[1] = hc_byte_perm (w7[0], w6[3], selector); - w7[0] = hc_byte_perm (w6[3], w6[2], selector); - w6[3] = hc_byte_perm (w6[2], w6[1], selector); - w6[2] = hc_byte_perm (w6[1], w6[0], selector); - w6[1] = hc_byte_perm (w6[0], w5[3], selector); - w6[0] = hc_byte_perm (w5[3], w5[2], selector); - w5[3] = hc_byte_perm (w5[2], w5[1], selector); - w5[2] = hc_byte_perm (w5[1], w5[0], selector); - w5[1] = hc_byte_perm (w5[0], w4[3], selector); - w5[0] = hc_byte_perm (w4[3], w4[2], selector); - w4[3] = hc_byte_perm (w4[2], w4[1], selector); - w4[2] = hc_byte_perm (w4[1], w4[0], selector); - w4[1] = hc_byte_perm (w4[0], w3[3], selector); - w4[0] = hc_byte_perm (w3[3], w3[2], selector); - w3[3] = hc_byte_perm (w3[2], w3[1], selector); - w3[2] = hc_byte_perm (w3[1], w3[0], selector); - w3[1] = hc_byte_perm (w3[0], w2[3], selector); - w3[0] = hc_byte_perm (w2[3], w2[2], selector); - 
w2[3] = hc_byte_perm (w2[2], w2[1], selector); - w2[2] = hc_byte_perm (w2[1], w2[0], selector); - w2[1] = hc_byte_perm (w2[0], w1[3], selector); - w2[0] = hc_byte_perm (w1[3], w1[2], selector); - w1[3] = hc_byte_perm (w1[2], w1[1], selector); - w1[2] = hc_byte_perm (w1[1], w1[0], selector); - w1[1] = hc_byte_perm (w1[0], w0[3], selector); - w1[0] = hc_byte_perm (w0[3], w0[2], selector); - w0[3] = hc_byte_perm (w0[2], w0[1], selector); - w0[2] = hc_byte_perm (w0[1], w0[0], selector); - w0[1] = hc_byte_perm (w0[0], 0, selector); + c0[1] = hc_byte_perm (w7[3], 0, selector); + c0[0] = hc_byte_perm (w7[2], w7[3], selector); + w7[3] = hc_byte_perm (w7[1], w7[2], selector); + w7[2] = hc_byte_perm (w7[0], w7[1], selector); + w7[1] = hc_byte_perm (w6[3], w7[0], selector); + w7[0] = hc_byte_perm (w6[2], w6[3], selector); + w6[3] = hc_byte_perm (w6[1], w6[2], selector); + w6[2] = hc_byte_perm (w6[0], w6[1], selector); + w6[1] = hc_byte_perm (w5[3], w6[0], selector); + w6[0] = hc_byte_perm (w5[2], w5[3], selector); + w5[3] = hc_byte_perm (w5[1], w5[2], selector); + w5[2] = hc_byte_perm (w5[0], w5[1], selector); + w5[1] = hc_byte_perm (w4[3], w5[0], selector); + w5[0] = hc_byte_perm (w4[2], w4[3], selector); + w4[3] = hc_byte_perm (w4[1], w4[2], selector); + w4[2] = hc_byte_perm (w4[0], w4[1], selector); + w4[1] = hc_byte_perm (w3[3], w4[0], selector); + w4[0] = hc_byte_perm (w3[2], w3[3], selector); + w3[3] = hc_byte_perm (w3[1], w3[2], selector); + w3[2] = hc_byte_perm (w3[0], w3[1], selector); + w3[1] = hc_byte_perm (w2[3], w3[0], selector); + w3[0] = hc_byte_perm (w2[2], w2[3], selector); + w2[3] = hc_byte_perm (w2[1], w2[2], selector); + w2[2] = hc_byte_perm (w2[0], w2[1], selector); + w2[1] = hc_byte_perm (w1[3], w2[0], selector); + w2[0] = hc_byte_perm (w1[2], w1[3], selector); + w1[3] = hc_byte_perm (w1[1], w1[2], selector); + w1[2] = hc_byte_perm (w1[0], w1[1], selector); + w1[1] = hc_byte_perm (w0[3], w1[0], selector); + w1[0] = hc_byte_perm (w0[2], w0[3], selector); 
+ w0[3] = hc_byte_perm (w0[1], w0[2], selector); + w0[2] = hc_byte_perm (w0[0], w0[1], selector); + w0[1] = hc_byte_perm ( 0, w0[0], selector); w0[0] = 0; break; case 2: - w7[3] = hc_byte_perm (w7[1], w7[0], selector); - w7[2] = hc_byte_perm (w7[0], w6[3], selector); - w7[1] = hc_byte_perm (w6[3], w6[2], selector); - w7[0] = hc_byte_perm (w6[2], w6[1], selector); - w6[3] = hc_byte_perm (w6[1], w6[0], selector); - w6[2] = hc_byte_perm (w6[0], w5[3], selector); - w6[1] = hc_byte_perm (w5[3], w5[2], selector); - w6[0] = hc_byte_perm (w5[2], w5[1], selector); - w5[3] = hc_byte_perm (w5[1], w5[0], selector); - w5[2] = hc_byte_perm (w5[0], w4[3], selector); - w5[1] = hc_byte_perm (w4[3], w4[2], selector); - w5[0] = hc_byte_perm (w4[2], w4[1], selector); - w4[3] = hc_byte_perm (w4[1], w4[0], selector); - w4[2] = hc_byte_perm (w4[0], w3[3], selector); - w4[1] = hc_byte_perm (w3[3], w3[2], selector); - w4[0] = hc_byte_perm (w3[2], w3[1], selector); - w3[3] = hc_byte_perm (w3[1], w3[0], selector); - w3[2] = hc_byte_perm (w3[0], w2[3], selector); - w3[1] = hc_byte_perm (w2[3], w2[2], selector); - w3[0] = hc_byte_perm (w2[2], w2[1], selector); - w2[3] = hc_byte_perm (w2[1], w2[0], selector); - w2[2] = hc_byte_perm (w2[0], w1[3], selector); - w2[1] = hc_byte_perm (w1[3], w1[2], selector); - w2[0] = hc_byte_perm (w1[2], w1[1], selector); - w1[3] = hc_byte_perm (w1[1], w1[0], selector); - w1[2] = hc_byte_perm (w1[0], w0[3], selector); - w1[1] = hc_byte_perm (w0[3], w0[2], selector); - w1[0] = hc_byte_perm (w0[2], w0[1], selector); - w0[3] = hc_byte_perm (w0[1], w0[0], selector); - w0[2] = hc_byte_perm (w0[0], 0, selector); + c0[2] = hc_byte_perm (w7[3], 0, selector); + c0[1] = hc_byte_perm (w7[2], w7[3], selector); + c0[0] = hc_byte_perm (w7[1], w7[2], selector); + w7[3] = hc_byte_perm (w7[0], w7[1], selector); + w7[2] = hc_byte_perm (w6[3], w7[0], selector); + w7[1] = hc_byte_perm (w6[2], w6[3], selector); + w7[0] = hc_byte_perm (w6[1], w6[2], selector); + w6[3] = hc_byte_perm 
(w6[0], w6[1], selector); + w6[2] = hc_byte_perm (w5[3], w6[0], selector); + w6[1] = hc_byte_perm (w5[2], w5[3], selector); + w6[0] = hc_byte_perm (w5[1], w5[2], selector); + w5[3] = hc_byte_perm (w5[0], w5[1], selector); + w5[2] = hc_byte_perm (w4[3], w5[0], selector); + w5[1] = hc_byte_perm (w4[2], w4[3], selector); + w5[0] = hc_byte_perm (w4[1], w4[2], selector); + w4[3] = hc_byte_perm (w4[0], w4[1], selector); + w4[2] = hc_byte_perm (w3[3], w4[0], selector); + w4[1] = hc_byte_perm (w3[2], w3[3], selector); + w4[0] = hc_byte_perm (w3[1], w3[2], selector); + w3[3] = hc_byte_perm (w3[0], w3[1], selector); + w3[2] = hc_byte_perm (w2[3], w3[0], selector); + w3[1] = hc_byte_perm (w2[2], w2[3], selector); + w3[0] = hc_byte_perm (w2[1], w2[2], selector); + w2[3] = hc_byte_perm (w2[0], w2[1], selector); + w2[2] = hc_byte_perm (w1[3], w2[0], selector); + w2[1] = hc_byte_perm (w1[2], w1[3], selector); + w2[0] = hc_byte_perm (w1[1], w1[2], selector); + w1[3] = hc_byte_perm (w1[0], w1[1], selector); + w1[2] = hc_byte_perm (w0[3], w1[0], selector); + w1[1] = hc_byte_perm (w0[2], w0[3], selector); + w1[0] = hc_byte_perm (w0[1], w0[2], selector); + w0[3] = hc_byte_perm (w0[0], w0[1], selector); + w0[2] = hc_byte_perm ( 0, w0[0], selector); w0[1] = 0; w0[0] = 0; break; case 3: - w7[3] = hc_byte_perm (w7[0], w6[3], selector); - w7[2] = hc_byte_perm (w6[3], w6[2], selector); - w7[1] = hc_byte_perm (w6[2], w6[1], selector); - w7[0] = hc_byte_perm (w6[1], w6[0], selector); - w6[3] = hc_byte_perm (w6[0], w5[3], selector); - w6[2] = hc_byte_perm (w5[3], w5[2], selector); - w6[1] = hc_byte_perm (w5[2], w5[1], selector); - w6[0] = hc_byte_perm (w5[1], w5[0], selector); - w5[3] = hc_byte_perm (w5[0], w4[3], selector); - w5[2] = hc_byte_perm (w4[3], w4[2], selector); - w5[1] = hc_byte_perm (w4[2], w4[1], selector); - w5[0] = hc_byte_perm (w4[1], w4[0], selector); - w4[3] = hc_byte_perm (w4[0], w3[3], selector); - w4[2] = hc_byte_perm (w3[3], w3[2], selector); - w4[1] = hc_byte_perm 
(w3[2], w3[1], selector); - w4[0] = hc_byte_perm (w3[1], w3[0], selector); - w3[3] = hc_byte_perm (w3[0], w2[3], selector); - w3[2] = hc_byte_perm (w2[3], w2[2], selector); - w3[1] = hc_byte_perm (w2[2], w2[1], selector); - w3[0] = hc_byte_perm (w2[1], w2[0], selector); - w2[3] = hc_byte_perm (w2[0], w1[3], selector); - w2[2] = hc_byte_perm (w1[3], w1[2], selector); - w2[1] = hc_byte_perm (w1[2], w1[1], selector); - w2[0] = hc_byte_perm (w1[1], w1[0], selector); - w1[3] = hc_byte_perm (w1[0], w0[3], selector); - w1[2] = hc_byte_perm (w0[3], w0[2], selector); - w1[1] = hc_byte_perm (w0[2], w0[1], selector); - w1[0] = hc_byte_perm (w0[1], w0[0], selector); - w0[3] = hc_byte_perm (w0[0], 0, selector); + c0[3] = hc_byte_perm (w7[3], 0, selector); + c0[2] = hc_byte_perm (w7[2], w7[3], selector); + c0[1] = hc_byte_perm (w7[1], w7[2], selector); + c0[0] = hc_byte_perm (w7[0], w7[1], selector); + w7[3] = hc_byte_perm (w6[3], w7[0], selector); + w7[2] = hc_byte_perm (w6[2], w6[3], selector); + w7[1] = hc_byte_perm (w6[1], w6[2], selector); + w7[0] = hc_byte_perm (w6[0], w6[1], selector); + w6[3] = hc_byte_perm (w5[3], w6[0], selector); + w6[2] = hc_byte_perm (w5[2], w5[3], selector); + w6[1] = hc_byte_perm (w5[1], w5[2], selector); + w6[0] = hc_byte_perm (w5[0], w5[1], selector); + w5[3] = hc_byte_perm (w4[3], w5[0], selector); + w5[2] = hc_byte_perm (w4[2], w4[3], selector); + w5[1] = hc_byte_perm (w4[1], w4[2], selector); + w5[0] = hc_byte_perm (w4[0], w4[1], selector); + w4[3] = hc_byte_perm (w3[3], w4[0], selector); + w4[2] = hc_byte_perm (w3[2], w3[3], selector); + w4[1] = hc_byte_perm (w3[1], w3[2], selector); + w4[0] = hc_byte_perm (w3[0], w3[1], selector); + w3[3] = hc_byte_perm (w2[3], w3[0], selector); + w3[2] = hc_byte_perm (w2[2], w2[3], selector); + w3[1] = hc_byte_perm (w2[1], w2[2], selector); + w3[0] = hc_byte_perm (w2[0], w2[1], selector); + w2[3] = hc_byte_perm (w1[3], w2[0], selector); + w2[2] = hc_byte_perm (w1[2], w1[3], selector); + w2[1] = 
hc_byte_perm (w1[1], w1[2], selector); + w2[0] = hc_byte_perm (w1[0], w1[1], selector); + w1[3] = hc_byte_perm (w0[3], w1[0], selector); + w1[2] = hc_byte_perm (w0[2], w0[3], selector); + w1[1] = hc_byte_perm (w0[1], w0[2], selector); + w1[0] = hc_byte_perm (w0[0], w0[1], selector); + w0[3] = hc_byte_perm ( 0, w0[0], selector); w0[2] = 0; w0[1] = 0; w0[0] = 0; @@ -8817,34 +9359,39 @@ DECLSPEC void switch_buffer_by_offset_8x4_be (u32x *w0, u32x *w1, u32x *w2, u32x break; case 4: - w7[3] = hc_byte_perm (w6[3], w6[2], selector); - w7[2] = hc_byte_perm (w6[2], w6[1], selector); - w7[1] = hc_byte_perm (w6[1], w6[0], selector); - w7[0] = hc_byte_perm (w6[0], w5[3], selector); - w6[3] = hc_byte_perm (w5[3], w5[2], selector); - w6[2] = hc_byte_perm (w5[2], w5[1], selector); - w6[1] = hc_byte_perm (w5[1], w5[0], selector); - w6[0] = hc_byte_perm (w5[0], w4[3], selector); - w5[3] = hc_byte_perm (w4[3], w4[2], selector); - w5[2] = hc_byte_perm (w4[2], w4[1], selector); - w5[1] = hc_byte_perm (w4[1], w4[0], selector); - w5[0] = hc_byte_perm (w4[0], w3[3], selector); - w4[3] = hc_byte_perm (w3[3], w3[2], selector); - w4[2] = hc_byte_perm (w3[2], w3[1], selector); - w4[1] = hc_byte_perm (w3[1], w3[0], selector); - w4[0] = hc_byte_perm (w3[0], w2[3], selector); - w3[3] = hc_byte_perm (w2[3], w2[2], selector); - w3[2] = hc_byte_perm (w2[2], w2[1], selector); - w3[1] = hc_byte_perm (w2[1], w2[0], selector); - w3[0] = hc_byte_perm (w2[0], w1[3], selector); - w2[3] = hc_byte_perm (w1[3], w1[2], selector); - w2[2] = hc_byte_perm (w1[2], w1[1], selector); - w2[1] = hc_byte_perm (w1[1], w1[0], selector); - w2[0] = hc_byte_perm (w1[0], w0[3], selector); - w1[3] = hc_byte_perm (w0[3], w0[2], selector); - w1[2] = hc_byte_perm (w0[2], w0[1], selector); - w1[1] = hc_byte_perm (w0[1], w0[0], selector); - w1[0] = hc_byte_perm (w0[0], 0, selector); + c1[0] = hc_byte_perm (w7[3], 0, selector); + c0[3] = hc_byte_perm (w7[2], w7[3], selector); + c0[2] = hc_byte_perm (w7[1], w7[2], selector); + 
c0[1] = hc_byte_perm (w7[0], w7[1], selector); + c0[0] = hc_byte_perm (w6[3], w7[0], selector); + w7[3] = hc_byte_perm (w6[2], w6[3], selector); + w7[2] = hc_byte_perm (w6[1], w6[2], selector); + w7[1] = hc_byte_perm (w6[0], w6[1], selector); + w7[0] = hc_byte_perm (w5[3], w6[0], selector); + w6[3] = hc_byte_perm (w5[2], w5[3], selector); + w6[2] = hc_byte_perm (w5[1], w5[2], selector); + w6[1] = hc_byte_perm (w5[0], w5[1], selector); + w6[0] = hc_byte_perm (w4[3], w5[0], selector); + w5[3] = hc_byte_perm (w4[2], w4[3], selector); + w5[2] = hc_byte_perm (w4[1], w4[2], selector); + w5[1] = hc_byte_perm (w4[0], w4[1], selector); + w5[0] = hc_byte_perm (w3[3], w4[0], selector); + w4[3] = hc_byte_perm (w3[2], w3[3], selector); + w4[2] = hc_byte_perm (w3[1], w3[2], selector); + w4[1] = hc_byte_perm (w3[0], w3[1], selector); + w4[0] = hc_byte_perm (w2[3], w3[0], selector); + w3[3] = hc_byte_perm (w2[2], w2[3], selector); + w3[2] = hc_byte_perm (w2[1], w2[2], selector); + w3[1] = hc_byte_perm (w2[0], w2[1], selector); + w3[0] = hc_byte_perm (w1[3], w2[0], selector); + w2[3] = hc_byte_perm (w1[2], w1[3], selector); + w2[2] = hc_byte_perm (w1[1], w1[2], selector); + w2[1] = hc_byte_perm (w1[0], w1[1], selector); + w2[0] = hc_byte_perm (w0[3], w1[0], selector); + w1[3] = hc_byte_perm (w0[2], w0[3], selector); + w1[2] = hc_byte_perm (w0[1], w0[2], selector); + w1[1] = hc_byte_perm (w0[0], w0[1], selector); + w1[0] = hc_byte_perm ( 0, w0[0], selector); w0[3] = 0; w0[2] = 0; w0[1] = 0; @@ -8853,69 +9400,39 @@ DECLSPEC void switch_buffer_by_offset_8x4_be (u32x *w0, u32x *w1, u32x *w2, u32x break; case 5: - w7[3] = hc_byte_perm (w6[2], w6[1], selector); - w7[2] = hc_byte_perm (w6[1], w6[0], selector); - w7[1] = hc_byte_perm (w6[0], w5[3], selector); - w7[0] = hc_byte_perm (w5[3], w5[2], selector); - w6[3] = hc_byte_perm (w5[2], w5[1], selector); - w6[2] = hc_byte_perm (w5[1], w5[0], selector); - w6[1] = hc_byte_perm (w5[0], w4[3], selector); - w6[0] = hc_byte_perm (w4[3], w4[2], 
selector); - w5[3] = hc_byte_perm (w4[2], w4[1], selector); - w5[2] = hc_byte_perm (w4[1], w4[0], selector); - w5[1] = hc_byte_perm (w4[0], w3[3], selector); - w5[0] = hc_byte_perm (w3[3], w3[2], selector); - w4[3] = hc_byte_perm (w3[2], w3[1], selector); - w4[2] = hc_byte_perm (w3[1], w3[0], selector); - w4[1] = hc_byte_perm (w3[0], w2[3], selector); - w4[0] = hc_byte_perm (w2[3], w2[2], selector); - w3[3] = hc_byte_perm (w2[2], w2[1], selector); - w3[2] = hc_byte_perm (w2[1], w2[0], selector); - w3[1] = hc_byte_perm (w2[0], w1[3], selector); - w3[0] = hc_byte_perm (w1[3], w1[2], selector); - w2[3] = hc_byte_perm (w1[2], w1[1], selector); - w2[2] = hc_byte_perm (w1[1], w1[0], selector); - w2[1] = hc_byte_perm (w1[0], w0[3], selector); - w2[0] = hc_byte_perm (w0[3], w0[2], selector); - w1[3] = hc_byte_perm (w0[2], w0[1], selector); - w1[2] = hc_byte_perm (w0[1], w0[0], selector); - w1[1] = hc_byte_perm (w0[0], 0, selector); - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; - - break; - - case 6: - w7[3] = hc_byte_perm (w6[1], w6[0], selector); - w7[2] = hc_byte_perm (w6[0], w5[3], selector); - w7[1] = hc_byte_perm (w5[3], w5[2], selector); - w7[0] = hc_byte_perm (w5[2], w5[1], selector); - w6[3] = hc_byte_perm (w5[1], w5[0], selector); - w6[2] = hc_byte_perm (w5[0], w4[3], selector); - w6[1] = hc_byte_perm (w4[3], w4[2], selector); - w6[0] = hc_byte_perm (w4[2], w4[1], selector); - w5[3] = hc_byte_perm (w4[1], w4[0], selector); - w5[2] = hc_byte_perm (w4[0], w3[3], selector); - w5[1] = hc_byte_perm (w3[3], w3[2], selector); - w5[0] = hc_byte_perm (w3[2], w3[1], selector); - w4[3] = hc_byte_perm (w3[1], w3[0], selector); - w4[2] = hc_byte_perm (w3[0], w2[3], selector); - w4[1] = hc_byte_perm (w2[3], w2[2], selector); - w4[0] = hc_byte_perm (w2[2], w2[1], selector); - w3[3] = hc_byte_perm (w2[1], w2[0], selector); - w3[2] = hc_byte_perm (w2[0], w1[3], selector); - w3[1] = hc_byte_perm (w1[3], w1[2], selector); - w3[0] = hc_byte_perm (w1[2], w1[1], 
selector); - w2[3] = hc_byte_perm (w1[1], w1[0], selector); - w2[2] = hc_byte_perm (w1[0], w0[3], selector); - w2[1] = hc_byte_perm (w0[3], w0[2], selector); - w2[0] = hc_byte_perm (w0[2], w0[1], selector); - w1[3] = hc_byte_perm (w0[1], w0[0], selector); - w1[2] = hc_byte_perm (w0[0], 0, selector); - w1[1] = 0; + c1[1] = hc_byte_perm (w7[3], 0, selector); + c1[0] = hc_byte_perm (w7[2], w7[3], selector); + c0[3] = hc_byte_perm (w7[1], w7[2], selector); + c0[2] = hc_byte_perm (w7[0], w7[1], selector); + c0[1] = hc_byte_perm (w6[3], w7[0], selector); + c0[0] = hc_byte_perm (w6[2], w6[3], selector); + w7[3] = hc_byte_perm (w6[1], w6[2], selector); + w7[2] = hc_byte_perm (w6[0], w6[1], selector); + w7[1] = hc_byte_perm (w5[3], w6[0], selector); + w7[0] = hc_byte_perm (w5[2], w5[3], selector); + w6[3] = hc_byte_perm (w5[1], w5[2], selector); + w6[2] = hc_byte_perm (w5[0], w5[1], selector); + w6[1] = hc_byte_perm (w4[3], w5[0], selector); + w6[0] = hc_byte_perm (w4[2], w4[3], selector); + w5[3] = hc_byte_perm (w4[1], w4[2], selector); + w5[2] = hc_byte_perm (w4[0], w4[1], selector); + w5[1] = hc_byte_perm (w3[3], w4[0], selector); + w5[0] = hc_byte_perm (w3[2], w3[3], selector); + w4[3] = hc_byte_perm (w3[1], w3[2], selector); + w4[2] = hc_byte_perm (w3[0], w3[1], selector); + w4[1] = hc_byte_perm (w2[3], w3[0], selector); + w4[0] = hc_byte_perm (w2[2], w2[3], selector); + w3[3] = hc_byte_perm (w2[1], w2[2], selector); + w3[2] = hc_byte_perm (w2[0], w2[1], selector); + w3[1] = hc_byte_perm (w1[3], w2[0], selector); + w3[0] = hc_byte_perm (w1[2], w1[3], selector); + w2[3] = hc_byte_perm (w1[1], w1[2], selector); + w2[2] = hc_byte_perm (w1[0], w1[1], selector); + w2[1] = hc_byte_perm (w0[3], w1[0], selector); + w2[0] = hc_byte_perm (w0[2], w0[3], selector); + w1[3] = hc_byte_perm (w0[1], w0[2], selector); + w1[2] = hc_byte_perm (w0[0], w0[1], selector); + w1[1] = hc_byte_perm ( 0, w0[0], selector); w1[0] = 0; w0[3] = 0; w0[2] = 0; @@ -8924,33 +9441,40 @@ DECLSPEC void 
switch_buffer_by_offset_8x4_be (u32x *w0, u32x *w1, u32x *w2, u32x break; - case 7: - w7[3] = hc_byte_perm (w6[0], w5[3], selector); - w7[2] = hc_byte_perm (w5[3], w5[2], selector); - w7[1] = hc_byte_perm (w5[2], w5[1], selector); - w7[0] = hc_byte_perm (w5[1], w5[0], selector); - w6[3] = hc_byte_perm (w5[0], w4[3], selector); - w6[2] = hc_byte_perm (w4[3], w4[2], selector); - w6[1] = hc_byte_perm (w4[2], w4[1], selector); - w6[0] = hc_byte_perm (w4[1], w4[0], selector); - w5[3] = hc_byte_perm (w4[0], w3[3], selector); - w5[2] = hc_byte_perm (w3[3], w3[2], selector); - w5[1] = hc_byte_perm (w3[2], w3[1], selector); - w5[0] = hc_byte_perm (w3[1], w3[0], selector); - w4[3] = hc_byte_perm (w3[0], w2[3], selector); - w4[2] = hc_byte_perm (w2[3], w2[2], selector); - w4[1] = hc_byte_perm (w2[2], w2[1], selector); - w4[0] = hc_byte_perm (w2[1], w2[0], selector); - w3[3] = hc_byte_perm (w2[0], w1[3], selector); - w3[2] = hc_byte_perm (w1[3], w1[2], selector); - w3[1] = hc_byte_perm (w1[2], w1[1], selector); - w3[0] = hc_byte_perm (w1[1], w1[0], selector); - w2[3] = hc_byte_perm (w1[0], w0[3], selector); - w2[2] = hc_byte_perm (w0[3], w0[2], selector); - w2[1] = hc_byte_perm (w0[2], w0[1], selector); - w2[0] = hc_byte_perm (w0[1], w0[0], selector); - w1[3] = hc_byte_perm (w0[0], 0, selector); - w1[2] = 0; + case 6: + c1[2] = hc_byte_perm (w7[3], 0, selector); + c1[1] = hc_byte_perm (w7[2], w7[3], selector); + c1[0] = hc_byte_perm (w7[1], w7[2], selector); + c0[3] = hc_byte_perm (w7[0], w7[1], selector); + c0[2] = hc_byte_perm (w6[3], w7[0], selector); + c0[1] = hc_byte_perm (w6[2], w6[3], selector); + c0[0] = hc_byte_perm (w6[1], w6[2], selector); + w7[3] = hc_byte_perm (w6[0], w6[1], selector); + w7[2] = hc_byte_perm (w5[3], w6[0], selector); + w7[1] = hc_byte_perm (w5[2], w5[3], selector); + w7[0] = hc_byte_perm (w5[1], w5[2], selector); + w6[3] = hc_byte_perm (w5[0], w5[1], selector); + w6[2] = hc_byte_perm (w4[3], w5[0], selector); + w6[1] = hc_byte_perm (w4[2], w4[3], 
selector); + w6[0] = hc_byte_perm (w4[1], w4[2], selector); + w5[3] = hc_byte_perm (w4[0], w4[1], selector); + w5[2] = hc_byte_perm (w3[3], w4[0], selector); + w5[1] = hc_byte_perm (w3[2], w3[3], selector); + w5[0] = hc_byte_perm (w3[1], w3[2], selector); + w4[3] = hc_byte_perm (w3[0], w3[1], selector); + w4[2] = hc_byte_perm (w2[3], w3[0], selector); + w4[1] = hc_byte_perm (w2[2], w2[3], selector); + w4[0] = hc_byte_perm (w2[1], w2[2], selector); + w3[3] = hc_byte_perm (w2[0], w2[1], selector); + w3[2] = hc_byte_perm (w1[3], w2[0], selector); + w3[1] = hc_byte_perm (w1[2], w1[3], selector); + w3[0] = hc_byte_perm (w1[1], w1[2], selector); + w2[3] = hc_byte_perm (w1[0], w1[1], selector); + w2[2] = hc_byte_perm (w0[3], w1[0], selector); + w2[1] = hc_byte_perm (w0[2], w0[3], selector); + w2[0] = hc_byte_perm (w0[1], w0[2], selector); + w1[3] = hc_byte_perm (w0[0], w0[1], selector); + w1[2] = hc_byte_perm ( 0, w0[0], selector); w1[1] = 0; w1[0] = 0; w0[3] = 0; @@ -8960,32 +9484,40 @@ DECLSPEC void switch_buffer_by_offset_8x4_be (u32x *w0, u32x *w1, u32x *w2, u32x break; - case 8: - w7[3] = hc_byte_perm (w5[3], w5[2], selector); - w7[2] = hc_byte_perm (w5[2], w5[1], selector); - w7[1] = hc_byte_perm (w5[1], w5[0], selector); - w7[0] = hc_byte_perm (w5[0], w4[3], selector); - w6[3] = hc_byte_perm (w4[3], w4[2], selector); - w6[2] = hc_byte_perm (w4[2], w4[1], selector); - w6[1] = hc_byte_perm (w4[1], w4[0], selector); - w6[0] = hc_byte_perm (w4[0], w3[3], selector); - w5[3] = hc_byte_perm (w3[3], w3[2], selector); - w5[2] = hc_byte_perm (w3[2], w3[1], selector); - w5[1] = hc_byte_perm (w3[1], w3[0], selector); - w5[0] = hc_byte_perm (w3[0], w2[3], selector); - w4[3] = hc_byte_perm (w2[3], w2[2], selector); - w4[2] = hc_byte_perm (w2[2], w2[1], selector); - w4[1] = hc_byte_perm (w2[1], w2[0], selector); - w4[0] = hc_byte_perm (w2[0], w1[3], selector); - w3[3] = hc_byte_perm (w1[3], w1[2], selector); - w3[2] = hc_byte_perm (w1[2], w1[1], selector); - w3[1] = hc_byte_perm 
(w1[1], w1[0], selector); - w3[0] = hc_byte_perm (w1[0], w0[3], selector); - w2[3] = hc_byte_perm (w0[3], w0[2], selector); - w2[2] = hc_byte_perm (w0[2], w0[1], selector); - w2[1] = hc_byte_perm (w0[1], w0[0], selector); - w2[0] = hc_byte_perm (w0[0], 0, selector); - w1[3] = 0; + case 7: + c1[3] = hc_byte_perm (w7[3], 0, selector); + c1[2] = hc_byte_perm (w7[2], w7[3], selector); + c1[1] = hc_byte_perm (w7[1], w7[2], selector); + c1[0] = hc_byte_perm (w7[0], w7[1], selector); + c0[3] = hc_byte_perm (w6[3], w7[0], selector); + c0[2] = hc_byte_perm (w6[2], w6[3], selector); + c0[1] = hc_byte_perm (w6[1], w6[2], selector); + c0[0] = hc_byte_perm (w6[0], w6[1], selector); + w7[3] = hc_byte_perm (w5[3], w6[0], selector); + w7[2] = hc_byte_perm (w5[2], w5[3], selector); + w7[1] = hc_byte_perm (w5[1], w5[2], selector); + w7[0] = hc_byte_perm (w5[0], w5[1], selector); + w6[3] = hc_byte_perm (w4[3], w5[0], selector); + w6[2] = hc_byte_perm (w4[2], w4[3], selector); + w6[1] = hc_byte_perm (w4[1], w4[2], selector); + w6[0] = hc_byte_perm (w4[0], w4[1], selector); + w5[3] = hc_byte_perm (w3[3], w4[0], selector); + w5[2] = hc_byte_perm (w3[2], w3[3], selector); + w5[1] = hc_byte_perm (w3[1], w3[2], selector); + w5[0] = hc_byte_perm (w3[0], w3[1], selector); + w4[3] = hc_byte_perm (w2[3], w3[0], selector); + w4[2] = hc_byte_perm (w2[2], w2[3], selector); + w4[1] = hc_byte_perm (w2[1], w2[2], selector); + w4[0] = hc_byte_perm (w2[0], w2[1], selector); + w3[3] = hc_byte_perm (w1[3], w2[0], selector); + w3[2] = hc_byte_perm (w1[2], w1[3], selector); + w3[1] = hc_byte_perm (w1[1], w1[2], selector); + w3[0] = hc_byte_perm (w1[0], w1[1], selector); + w2[3] = hc_byte_perm (w0[3], w1[0], selector); + w2[2] = hc_byte_perm (w0[2], w0[3], selector); + w2[1] = hc_byte_perm (w0[1], w0[2], selector); + w2[0] = hc_byte_perm (w0[0], w0[1], selector); + w1[3] = hc_byte_perm ( 0, w0[0], selector); w1[2] = 0; w1[1] = 0; w1[0] = 0; @@ -8996,31 +9528,40 @@ DECLSPEC void 
switch_buffer_by_offset_8x4_be (u32x *w0, u32x *w1, u32x *w2, u32x break; - case 9: - w7[3] = hc_byte_perm (w5[2], w5[1], selector); - w7[2] = hc_byte_perm (w5[1], w5[0], selector); - w7[1] = hc_byte_perm (w5[0], w4[3], selector); - w7[0] = hc_byte_perm (w4[3], w4[2], selector); - w6[3] = hc_byte_perm (w4[2], w4[1], selector); - w6[2] = hc_byte_perm (w4[1], w4[0], selector); - w6[1] = hc_byte_perm (w4[0], w3[3], selector); - w6[0] = hc_byte_perm (w3[3], w3[2], selector); - w5[3] = hc_byte_perm (w3[2], w3[1], selector); - w5[2] = hc_byte_perm (w3[1], w3[0], selector); - w5[1] = hc_byte_perm (w3[0], w2[3], selector); - w5[0] = hc_byte_perm (w2[3], w2[2], selector); - w4[3] = hc_byte_perm (w2[2], w2[1], selector); - w4[2] = hc_byte_perm (w2[1], w2[0], selector); - w4[1] = hc_byte_perm (w2[0], w1[3], selector); - w4[0] = hc_byte_perm (w1[3], w1[2], selector); - w3[3] = hc_byte_perm (w1[2], w1[1], selector); - w3[2] = hc_byte_perm (w1[1], w1[0], selector); - w3[1] = hc_byte_perm (w1[0], w0[3], selector); - w3[0] = hc_byte_perm (w0[3], w0[2], selector); - w2[3] = hc_byte_perm (w0[2], w0[1], selector); - w2[2] = hc_byte_perm (w0[1], w0[0], selector); - w2[1] = hc_byte_perm (w0[0], 0, selector); - w2[0] = 0; + case 8: + c2[0] = hc_byte_perm (w7[3], 0, selector); + c1[3] = hc_byte_perm (w7[2], w7[3], selector); + c1[2] = hc_byte_perm (w7[1], w7[2], selector); + c1[1] = hc_byte_perm (w7[0], w7[1], selector); + c1[0] = hc_byte_perm (w6[3], w7[0], selector); + c0[3] = hc_byte_perm (w6[2], w6[3], selector); + c0[2] = hc_byte_perm (w6[1], w6[2], selector); + c0[1] = hc_byte_perm (w6[0], w6[1], selector); + c0[0] = hc_byte_perm (w5[3], w6[0], selector); + w7[3] = hc_byte_perm (w5[2], w5[3], selector); + w7[2] = hc_byte_perm (w5[1], w5[2], selector); + w7[1] = hc_byte_perm (w5[0], w5[1], selector); + w7[0] = hc_byte_perm (w4[3], w5[0], selector); + w6[3] = hc_byte_perm (w4[2], w4[3], selector); + w6[2] = hc_byte_perm (w4[1], w4[2], selector); + w6[1] = hc_byte_perm (w4[0], w4[1], 
selector); + w6[0] = hc_byte_perm (w3[3], w4[0], selector); + w5[3] = hc_byte_perm (w3[2], w3[3], selector); + w5[2] = hc_byte_perm (w3[1], w3[2], selector); + w5[1] = hc_byte_perm (w3[0], w3[1], selector); + w5[0] = hc_byte_perm (w2[3], w3[0], selector); + w4[3] = hc_byte_perm (w2[2], w2[3], selector); + w4[2] = hc_byte_perm (w2[1], w2[2], selector); + w4[1] = hc_byte_perm (w2[0], w2[1], selector); + w4[0] = hc_byte_perm (w1[3], w2[0], selector); + w3[3] = hc_byte_perm (w1[2], w1[3], selector); + w3[2] = hc_byte_perm (w1[1], w1[2], selector); + w3[1] = hc_byte_perm (w1[0], w1[1], selector); + w3[0] = hc_byte_perm (w0[3], w1[0], selector); + w2[3] = hc_byte_perm (w0[2], w0[3], selector); + w2[2] = hc_byte_perm (w0[1], w0[2], selector); + w2[1] = hc_byte_perm (w0[0], w0[1], selector); + w2[0] = hc_byte_perm ( 0, w0[0], selector); w1[3] = 0; w1[2] = 0; w1[1] = 0; @@ -9032,30 +9573,40 @@ DECLSPEC void switch_buffer_by_offset_8x4_be (u32x *w0, u32x *w1, u32x *w2, u32x break; - case 10: - w7[3] = hc_byte_perm (w5[1], w5[0], selector); - w7[2] = hc_byte_perm (w5[0], w4[3], selector); - w7[1] = hc_byte_perm (w4[3], w4[2], selector); - w7[0] = hc_byte_perm (w4[2], w4[1], selector); - w6[3] = hc_byte_perm (w4[1], w4[0], selector); - w6[2] = hc_byte_perm (w4[0], w3[3], selector); - w6[1] = hc_byte_perm (w3[3], w3[2], selector); - w6[0] = hc_byte_perm (w3[2], w3[1], selector); - w5[3] = hc_byte_perm (w3[1], w3[0], selector); - w5[2] = hc_byte_perm (w3[0], w2[3], selector); - w5[1] = hc_byte_perm (w2[3], w2[2], selector); - w5[0] = hc_byte_perm (w2[2], w2[1], selector); - w4[3] = hc_byte_perm (w2[1], w2[0], selector); - w4[2] = hc_byte_perm (w2[0], w1[3], selector); - w4[1] = hc_byte_perm (w1[3], w1[2], selector); - w4[0] = hc_byte_perm (w1[2], w1[1], selector); - w3[3] = hc_byte_perm (w1[1], w1[0], selector); - w3[2] = hc_byte_perm (w1[0], w0[3], selector); - w3[1] = hc_byte_perm (w0[3], w0[2], selector); - w3[0] = hc_byte_perm (w0[2], w0[1], selector); - w2[3] = hc_byte_perm 
(w0[1], w0[0], selector); - w2[2] = hc_byte_perm (w0[0], 0, selector); - w2[1] = 0; + case 9: + c2[1] = hc_byte_perm (w7[3], 0, selector); + c2[0] = hc_byte_perm (w7[2], w7[3], selector); + c1[3] = hc_byte_perm (w7[1], w7[2], selector); + c1[2] = hc_byte_perm (w7[0], w7[1], selector); + c1[1] = hc_byte_perm (w6[3], w7[0], selector); + c1[0] = hc_byte_perm (w6[2], w6[3], selector); + c0[3] = hc_byte_perm (w6[1], w6[2], selector); + c0[2] = hc_byte_perm (w6[0], w6[1], selector); + c0[1] = hc_byte_perm (w5[3], w6[0], selector); + c0[0] = hc_byte_perm (w5[2], w5[3], selector); + w7[3] = hc_byte_perm (w5[1], w5[2], selector); + w7[2] = hc_byte_perm (w5[0], w5[1], selector); + w7[1] = hc_byte_perm (w4[3], w5[0], selector); + w7[0] = hc_byte_perm (w4[2], w4[3], selector); + w6[3] = hc_byte_perm (w4[1], w4[2], selector); + w6[2] = hc_byte_perm (w4[0], w4[1], selector); + w6[1] = hc_byte_perm (w3[3], w4[0], selector); + w6[0] = hc_byte_perm (w3[2], w3[3], selector); + w5[3] = hc_byte_perm (w3[1], w3[2], selector); + w5[2] = hc_byte_perm (w3[0], w3[1], selector); + w5[1] = hc_byte_perm (w2[3], w3[0], selector); + w5[0] = hc_byte_perm (w2[2], w2[3], selector); + w4[3] = hc_byte_perm (w2[1], w2[2], selector); + w4[2] = hc_byte_perm (w2[0], w2[1], selector); + w4[1] = hc_byte_perm (w1[3], w2[0], selector); + w4[0] = hc_byte_perm (w1[2], w1[3], selector); + w3[3] = hc_byte_perm (w1[1], w1[2], selector); + w3[2] = hc_byte_perm (w1[0], w1[1], selector); + w3[1] = hc_byte_perm (w0[3], w1[0], selector); + w3[0] = hc_byte_perm (w0[2], w0[3], selector); + w2[3] = hc_byte_perm (w0[1], w0[2], selector); + w2[2] = hc_byte_perm (w0[0], w0[1], selector); + w2[1] = hc_byte_perm ( 0, w0[0], selector); w2[0] = 0; w1[3] = 0; w1[2] = 0; @@ -9068,28 +9619,87 @@ DECLSPEC void switch_buffer_by_offset_8x4_be (u32x *w0, u32x *w1, u32x *w2, u32x break; - case 11: - w7[3] = hc_byte_perm (w5[0], w4[3], selector); - w7[2] = hc_byte_perm (w4[3], w4[2], selector); - w7[1] = hc_byte_perm (w4[2], w4[1], 
selector); - w7[0] = hc_byte_perm (w4[1], w4[0], selector); - w6[3] = hc_byte_perm (w4[0], w3[3], selector); - w6[2] = hc_byte_perm (w3[3], w3[2], selector); - w6[1] = hc_byte_perm (w3[2], w3[1], selector); - w6[0] = hc_byte_perm (w3[1], w3[0], selector); - w5[3] = hc_byte_perm (w3[0], w2[3], selector); - w5[2] = hc_byte_perm (w2[3], w2[2], selector); - w5[1] = hc_byte_perm (w2[2], w2[1], selector); - w5[0] = hc_byte_perm (w2[1], w2[0], selector); - w4[3] = hc_byte_perm (w2[0], w1[3], selector); - w4[2] = hc_byte_perm (w1[3], w1[2], selector); - w4[1] = hc_byte_perm (w1[2], w1[1], selector); - w4[0] = hc_byte_perm (w1[1], w1[0], selector); - w3[3] = hc_byte_perm (w1[0], w0[3], selector); - w3[2] = hc_byte_perm (w0[3], w0[2], selector); - w3[1] = hc_byte_perm (w0[2], w0[1], selector); - w3[0] = hc_byte_perm (w0[1], w0[0], selector); - w2[3] = hc_byte_perm (w0[0], 0, selector); + case 10: + c2[2] = hc_byte_perm (w7[3], 0, selector); + c2[1] = hc_byte_perm (w7[2], w7[3], selector); + c2[0] = hc_byte_perm (w7[1], w7[2], selector); + c1[3] = hc_byte_perm (w7[0], w7[1], selector); + c1[2] = hc_byte_perm (w6[3], w7[0], selector); + c1[1] = hc_byte_perm (w6[2], w6[3], selector); + c1[0] = hc_byte_perm (w6[1], w6[2], selector); + c0[3] = hc_byte_perm (w6[0], w6[1], selector); + c0[2] = hc_byte_perm (w5[3], w6[0], selector); + c0[1] = hc_byte_perm (w5[2], w5[3], selector); + c0[0] = hc_byte_perm (w5[1], w5[2], selector); + w7[3] = hc_byte_perm (w5[0], w5[1], selector); + w7[2] = hc_byte_perm (w4[3], w5[0], selector); + w7[1] = hc_byte_perm (w4[2], w4[3], selector); + w7[0] = hc_byte_perm (w4[1], w4[2], selector); + w6[3] = hc_byte_perm (w4[0], w4[1], selector); + w6[2] = hc_byte_perm (w3[3], w4[0], selector); + w6[1] = hc_byte_perm (w3[2], w3[3], selector); + w6[0] = hc_byte_perm (w3[1], w3[2], selector); + w5[3] = hc_byte_perm (w3[0], w3[1], selector); + w5[2] = hc_byte_perm (w2[3], w3[0], selector); + w5[1] = hc_byte_perm (w2[2], w2[3], selector); + w5[0] = hc_byte_perm 
(w2[1], w2[2], selector); + w4[3] = hc_byte_perm (w2[0], w2[1], selector); + w4[2] = hc_byte_perm (w1[3], w2[0], selector); + w4[1] = hc_byte_perm (w1[2], w1[3], selector); + w4[0] = hc_byte_perm (w1[1], w1[2], selector); + w3[3] = hc_byte_perm (w1[0], w1[1], selector); + w3[2] = hc_byte_perm (w0[3], w1[0], selector); + w3[1] = hc_byte_perm (w0[2], w0[3], selector); + w3[0] = hc_byte_perm (w0[1], w0[2], selector); + w2[3] = hc_byte_perm (w0[0], w0[1], selector); + w2[2] = hc_byte_perm ( 0, w0[0], selector); + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 11: + c2[3] = hc_byte_perm (w7[3], 0, selector); + c2[2] = hc_byte_perm (w7[2], w7[3], selector); + c2[1] = hc_byte_perm (w7[1], w7[2], selector); + c2[0] = hc_byte_perm (w7[0], w7[1], selector); + c1[3] = hc_byte_perm (w6[3], w7[0], selector); + c1[2] = hc_byte_perm (w6[2], w6[3], selector); + c1[1] = hc_byte_perm (w6[1], w6[2], selector); + c1[0] = hc_byte_perm (w6[0], w6[1], selector); + c0[3] = hc_byte_perm (w5[3], w6[0], selector); + c0[2] = hc_byte_perm (w5[2], w5[3], selector); + c0[1] = hc_byte_perm (w5[1], w5[2], selector); + c0[0] = hc_byte_perm (w5[0], w5[1], selector); + w7[3] = hc_byte_perm (w4[3], w5[0], selector); + w7[2] = hc_byte_perm (w4[2], w4[3], selector); + w7[1] = hc_byte_perm (w4[1], w4[2], selector); + w7[0] = hc_byte_perm (w4[0], w4[1], selector); + w6[3] = hc_byte_perm (w3[3], w4[0], selector); + w6[2] = hc_byte_perm (w3[2], w3[3], selector); + w6[1] = hc_byte_perm (w3[1], w3[2], selector); + w6[0] = hc_byte_perm (w3[0], w3[1], selector); + w5[3] = hc_byte_perm (w2[3], w3[0], selector); + w5[2] = hc_byte_perm (w2[2], w2[3], selector); + w5[1] = hc_byte_perm (w2[1], w2[2], selector); + w5[0] = hc_byte_perm (w2[0], w2[1], selector); + w4[3] = hc_byte_perm (w1[3], w2[0], selector); + w4[2] = hc_byte_perm (w1[2], w1[3], selector); + w4[1] = hc_byte_perm (w1[1], w1[2], selector); + w4[0] = 
hc_byte_perm (w1[0], w1[1], selector); + w3[3] = hc_byte_perm (w0[3], w1[0], selector); + w3[2] = hc_byte_perm (w0[2], w0[3], selector); + w3[1] = hc_byte_perm (w0[1], w0[2], selector); + w3[0] = hc_byte_perm (w0[0], w0[1], selector); + w2[3] = hc_byte_perm ( 0, w0[0], selector); w2[2] = 0; w2[1] = 0; w2[0] = 0; @@ -9105,26 +9715,39 @@ DECLSPEC void switch_buffer_by_offset_8x4_be (u32x *w0, u32x *w1, u32x *w2, u32x break; case 12: - w7[3] = hc_byte_perm (w4[3], w4[2], selector); - w7[2] = hc_byte_perm (w4[2], w4[1], selector); - w7[1] = hc_byte_perm (w4[1], w4[0], selector); - w7[0] = hc_byte_perm (w4[0], w3[3], selector); - w6[3] = hc_byte_perm (w3[3], w3[2], selector); - w6[2] = hc_byte_perm (w3[2], w3[1], selector); - w6[1] = hc_byte_perm (w3[1], w3[0], selector); - w6[0] = hc_byte_perm (w3[0], w2[3], selector); - w5[3] = hc_byte_perm (w2[3], w2[2], selector); - w5[2] = hc_byte_perm (w2[2], w2[1], selector); - w5[1] = hc_byte_perm (w2[1], w2[0], selector); - w5[0] = hc_byte_perm (w2[0], w1[3], selector); - w4[3] = hc_byte_perm (w1[3], w1[2], selector); - w4[2] = hc_byte_perm (w1[2], w1[1], selector); - w4[1] = hc_byte_perm (w1[1], w1[0], selector); - w4[0] = hc_byte_perm (w1[0], w0[3], selector); - w3[3] = hc_byte_perm (w0[3], w0[2], selector); - w3[2] = hc_byte_perm (w0[2], w0[1], selector); - w3[1] = hc_byte_perm (w0[1], w0[0], selector); - w3[0] = hc_byte_perm (w0[0], 0, selector); + c3[0] = hc_byte_perm (w7[3], 0, selector); + c2[3] = hc_byte_perm (w7[2], w7[3], selector); + c2[2] = hc_byte_perm (w7[1], w7[2], selector); + c2[1] = hc_byte_perm (w7[0], w7[1], selector); + c2[0] = hc_byte_perm (w6[3], w7[0], selector); + c1[3] = hc_byte_perm (w6[2], w6[3], selector); + c1[2] = hc_byte_perm (w6[1], w6[2], selector); + c1[1] = hc_byte_perm (w6[0], w6[1], selector); + c1[0] = hc_byte_perm (w5[3], w6[0], selector); + c0[3] = hc_byte_perm (w5[2], w5[3], selector); + c0[2] = hc_byte_perm (w5[1], w5[2], selector); + c0[1] = hc_byte_perm (w5[0], w5[1], selector); + 
c0[0] = hc_byte_perm (w4[3], w5[0], selector); + w7[3] = hc_byte_perm (w4[2], w4[3], selector); + w7[2] = hc_byte_perm (w4[1], w4[2], selector); + w7[1] = hc_byte_perm (w4[0], w4[1], selector); + w7[0] = hc_byte_perm (w3[3], w4[0], selector); + w6[3] = hc_byte_perm (w3[2], w3[3], selector); + w6[2] = hc_byte_perm (w3[1], w3[2], selector); + w6[1] = hc_byte_perm (w3[0], w3[1], selector); + w6[0] = hc_byte_perm (w2[3], w3[0], selector); + w5[3] = hc_byte_perm (w2[2], w2[3], selector); + w5[2] = hc_byte_perm (w2[1], w2[2], selector); + w5[1] = hc_byte_perm (w2[0], w2[1], selector); + w5[0] = hc_byte_perm (w1[3], w2[0], selector); + w4[3] = hc_byte_perm (w1[2], w1[3], selector); + w4[2] = hc_byte_perm (w1[1], w1[2], selector); + w4[1] = hc_byte_perm (w1[0], w1[1], selector); + w4[0] = hc_byte_perm (w0[3], w1[0], selector); + w3[3] = hc_byte_perm (w0[2], w0[3], selector); + w3[2] = hc_byte_perm (w0[1], w0[2], selector); + w3[1] = hc_byte_perm (w0[0], w0[1], selector); + w3[0] = hc_byte_perm ( 0, w0[0], selector); w2[3] = 0; w2[2] = 0; w2[1] = 0; @@ -9141,25 +9764,39 @@ DECLSPEC void switch_buffer_by_offset_8x4_be (u32x *w0, u32x *w1, u32x *w2, u32x break; case 13: - w7[3] = hc_byte_perm (w4[2], w4[1], selector); - w7[2] = hc_byte_perm (w4[1], w4[0], selector); - w7[1] = hc_byte_perm (w4[0], w3[3], selector); - w7[0] = hc_byte_perm (w3[3], w3[2], selector); - w6[3] = hc_byte_perm (w3[2], w3[1], selector); - w6[2] = hc_byte_perm (w3[1], w3[0], selector); - w6[1] = hc_byte_perm (w3[0], w2[3], selector); - w6[0] = hc_byte_perm (w2[3], w2[2], selector); - w5[3] = hc_byte_perm (w2[2], w2[1], selector); - w5[2] = hc_byte_perm (w2[1], w2[0], selector); - w5[1] = hc_byte_perm (w2[0], w1[3], selector); - w5[0] = hc_byte_perm (w1[3], w1[2], selector); - w4[3] = hc_byte_perm (w1[2], w1[1], selector); - w4[2] = hc_byte_perm (w1[1], w1[0], selector); - w4[1] = hc_byte_perm (w1[0], w0[3], selector); - w4[0] = hc_byte_perm (w0[3], w0[2], selector); - w3[3] = hc_byte_perm (w0[2], w0[1], 
selector); - w3[2] = hc_byte_perm (w0[1], w0[0], selector); - w3[1] = hc_byte_perm (w0[0], 0, selector); + c3[1] = hc_byte_perm (w7[3], 0, selector); + c3[0] = hc_byte_perm (w7[2], w7[3], selector); + c2[3] = hc_byte_perm (w7[1], w7[2], selector); + c2[2] = hc_byte_perm (w7[0], w7[1], selector); + c2[1] = hc_byte_perm (w6[3], w7[0], selector); + c2[0] = hc_byte_perm (w6[2], w6[3], selector); + c1[3] = hc_byte_perm (w6[1], w6[2], selector); + c1[2] = hc_byte_perm (w6[0], w6[1], selector); + c1[1] = hc_byte_perm (w5[3], w6[0], selector); + c1[0] = hc_byte_perm (w5[2], w5[3], selector); + c0[3] = hc_byte_perm (w5[1], w5[2], selector); + c0[2] = hc_byte_perm (w5[0], w5[1], selector); + c0[1] = hc_byte_perm (w4[3], w5[0], selector); + c0[0] = hc_byte_perm (w4[2], w4[3], selector); + w7[3] = hc_byte_perm (w4[1], w4[2], selector); + w7[2] = hc_byte_perm (w4[0], w4[1], selector); + w7[1] = hc_byte_perm (w3[3], w4[0], selector); + w7[0] = hc_byte_perm (w3[2], w3[3], selector); + w6[3] = hc_byte_perm (w3[1], w3[2], selector); + w6[2] = hc_byte_perm (w3[0], w3[1], selector); + w6[1] = hc_byte_perm (w2[3], w3[0], selector); + w6[0] = hc_byte_perm (w2[2], w2[3], selector); + w5[3] = hc_byte_perm (w2[1], w2[2], selector); + w5[2] = hc_byte_perm (w2[0], w2[1], selector); + w5[1] = hc_byte_perm (w1[3], w2[0], selector); + w5[0] = hc_byte_perm (w1[2], w1[3], selector); + w4[3] = hc_byte_perm (w1[1], w1[2], selector); + w4[2] = hc_byte_perm (w1[0], w1[1], selector); + w4[1] = hc_byte_perm (w0[3], w1[0], selector); + w4[0] = hc_byte_perm (w0[2], w0[3], selector); + w3[3] = hc_byte_perm (w0[1], w0[2], selector); + w3[2] = hc_byte_perm (w0[0], w0[1], selector); + w3[1] = hc_byte_perm ( 0, w0[0], selector); w3[0] = 0; w2[3] = 0; w2[2] = 0; @@ -9177,24 +9814,39 @@ DECLSPEC void switch_buffer_by_offset_8x4_be (u32x *w0, u32x *w1, u32x *w2, u32x break; case 14: - w7[3] = hc_byte_perm (w4[1], w4[0], selector); - w7[2] = hc_byte_perm (w4[0], w3[3], selector); - w7[1] = hc_byte_perm (w3[3], 
w3[2], selector); - w7[0] = hc_byte_perm (w3[2], w3[1], selector); - w6[3] = hc_byte_perm (w3[1], w3[0], selector); - w6[2] = hc_byte_perm (w3[0], w2[3], selector); - w6[1] = hc_byte_perm (w2[3], w2[2], selector); - w6[0] = hc_byte_perm (w2[2], w2[1], selector); - w5[3] = hc_byte_perm (w2[1], w2[0], selector); - w5[2] = hc_byte_perm (w2[0], w1[3], selector); - w5[1] = hc_byte_perm (w1[3], w1[2], selector); - w5[0] = hc_byte_perm (w1[2], w1[1], selector); - w4[3] = hc_byte_perm (w1[1], w1[0], selector); - w4[2] = hc_byte_perm (w1[0], w0[3], selector); - w4[1] = hc_byte_perm (w0[3], w0[2], selector); - w4[0] = hc_byte_perm (w0[2], w0[1], selector); - w3[3] = hc_byte_perm (w0[1], w0[0], selector); - w3[2] = hc_byte_perm (w0[0], 0, selector); + c3[2] = hc_byte_perm (w7[3], 0, selector); + c3[1] = hc_byte_perm (w7[2], w7[3], selector); + c3[0] = hc_byte_perm (w7[1], w7[2], selector); + c2[3] = hc_byte_perm (w7[0], w7[1], selector); + c2[2] = hc_byte_perm (w6[3], w7[0], selector); + c2[1] = hc_byte_perm (w6[2], w6[3], selector); + c2[0] = hc_byte_perm (w6[1], w6[2], selector); + c1[3] = hc_byte_perm (w6[0], w6[1], selector); + c1[2] = hc_byte_perm (w5[3], w6[0], selector); + c1[1] = hc_byte_perm (w5[2], w5[3], selector); + c1[0] = hc_byte_perm (w5[1], w5[2], selector); + c0[3] = hc_byte_perm (w5[0], w5[1], selector); + c0[2] = hc_byte_perm (w4[3], w5[0], selector); + c0[1] = hc_byte_perm (w4[2], w4[3], selector); + c0[0] = hc_byte_perm (w4[1], w4[2], selector); + w7[3] = hc_byte_perm (w4[0], w4[1], selector); + w7[2] = hc_byte_perm (w3[3], w4[0], selector); + w7[1] = hc_byte_perm (w3[2], w3[3], selector); + w7[0] = hc_byte_perm (w3[1], w3[2], selector); + w6[3] = hc_byte_perm (w3[0], w3[1], selector); + w6[2] = hc_byte_perm (w2[3], w3[0], selector); + w6[1] = hc_byte_perm (w2[2], w2[3], selector); + w6[0] = hc_byte_perm (w2[1], w2[2], selector); + w5[3] = hc_byte_perm (w2[0], w2[1], selector); + w5[2] = hc_byte_perm (w1[3], w2[0], selector); + w5[1] = hc_byte_perm 
(w1[2], w1[3], selector); + w5[0] = hc_byte_perm (w1[1], w1[2], selector); + w4[3] = hc_byte_perm (w1[0], w1[1], selector); + w4[2] = hc_byte_perm (w0[3], w1[0], selector); + w4[1] = hc_byte_perm (w0[2], w0[3], selector); + w4[0] = hc_byte_perm (w0[1], w0[2], selector); + w3[3] = hc_byte_perm (w0[0], w0[1], selector); + w3[2] = hc_byte_perm ( 0, w0[0], selector); w3[1] = 0; w3[0] = 0; w2[3] = 0; @@ -9213,23 +9865,39 @@ DECLSPEC void switch_buffer_by_offset_8x4_be (u32x *w0, u32x *w1, u32x *w2, u32x break; case 15: - w7[3] = hc_byte_perm (w4[0], w3[3], selector); - w7[2] = hc_byte_perm (w3[3], w3[2], selector); - w7[1] = hc_byte_perm (w3[2], w3[1], selector); - w7[0] = hc_byte_perm (w3[1], w3[0], selector); - w6[3] = hc_byte_perm (w3[0], w2[3], selector); - w6[2] = hc_byte_perm (w2[3], w2[2], selector); - w6[1] = hc_byte_perm (w2[2], w2[1], selector); - w6[0] = hc_byte_perm (w2[1], w2[0], selector); - w5[3] = hc_byte_perm (w2[0], w1[3], selector); - w5[2] = hc_byte_perm (w1[3], w1[2], selector); - w5[1] = hc_byte_perm (w1[2], w1[1], selector); - w5[0] = hc_byte_perm (w1[1], w1[0], selector); - w4[3] = hc_byte_perm (w1[0], w0[3], selector); - w4[2] = hc_byte_perm (w0[3], w0[2], selector); - w4[1] = hc_byte_perm (w0[2], w0[1], selector); - w4[0] = hc_byte_perm (w0[1], w0[0], selector); - w3[3] = hc_byte_perm (w0[0], 0, selector); + c3[3] = hc_byte_perm (w7[3], 0, selector); + c3[2] = hc_byte_perm (w7[2], w7[3], selector); + c3[1] = hc_byte_perm (w7[1], w7[2], selector); + c3[0] = hc_byte_perm (w7[0], w7[1], selector); + c2[3] = hc_byte_perm (w6[3], w7[0], selector); + c2[2] = hc_byte_perm (w6[2], w6[3], selector); + c2[1] = hc_byte_perm (w6[1], w6[2], selector); + c2[0] = hc_byte_perm (w6[0], w6[1], selector); + c1[3] = hc_byte_perm (w5[3], w6[0], selector); + c1[2] = hc_byte_perm (w5[2], w5[3], selector); + c1[1] = hc_byte_perm (w5[1], w5[2], selector); + c1[0] = hc_byte_perm (w5[0], w5[1], selector); + c0[3] = hc_byte_perm (w4[3], w5[0], selector); + c0[2] = 
hc_byte_perm (w4[2], w4[3], selector); + c0[1] = hc_byte_perm (w4[1], w4[2], selector); + c0[0] = hc_byte_perm (w4[0], w4[1], selector); + w7[3] = hc_byte_perm (w3[3], w4[0], selector); + w7[2] = hc_byte_perm (w3[2], w3[3], selector); + w7[1] = hc_byte_perm (w3[1], w3[2], selector); + w7[0] = hc_byte_perm (w3[0], w3[1], selector); + w6[3] = hc_byte_perm (w2[3], w3[0], selector); + w6[2] = hc_byte_perm (w2[2], w2[3], selector); + w6[1] = hc_byte_perm (w2[1], w2[2], selector); + w6[0] = hc_byte_perm (w2[0], w2[1], selector); + w5[3] = hc_byte_perm (w1[3], w2[0], selector); + w5[2] = hc_byte_perm (w1[2], w1[3], selector); + w5[1] = hc_byte_perm (w1[1], w1[2], selector); + w5[0] = hc_byte_perm (w1[0], w1[1], selector); + w4[3] = hc_byte_perm (w0[3], w1[0], selector); + w4[2] = hc_byte_perm (w0[2], w0[3], selector); + w4[1] = hc_byte_perm (w0[1], w0[2], selector); + w4[0] = hc_byte_perm (w0[0], w0[1], selector); + w3[3] = hc_byte_perm ( 0, w0[0], selector); w3[2] = 0; w3[1] = 0; w3[0] = 0; @@ -9249,22 +9917,39 @@ DECLSPEC void switch_buffer_by_offset_8x4_be (u32x *w0, u32x *w1, u32x *w2, u32x break; case 16: - w7[3] = hc_byte_perm (w3[3], w3[2], selector); - w7[2] = hc_byte_perm (w3[2], w3[1], selector); - w7[1] = hc_byte_perm (w3[1], w3[0], selector); - w7[0] = hc_byte_perm (w3[0], w2[3], selector); - w6[3] = hc_byte_perm (w2[3], w2[2], selector); - w6[2] = hc_byte_perm (w2[2], w2[1], selector); - w6[1] = hc_byte_perm (w2[1], w2[0], selector); - w6[0] = hc_byte_perm (w2[0], w1[3], selector); - w5[3] = hc_byte_perm (w1[3], w1[2], selector); - w5[2] = hc_byte_perm (w1[2], w1[1], selector); - w5[1] = hc_byte_perm (w1[1], w1[0], selector); - w5[0] = hc_byte_perm (w1[0], w0[3], selector); - w4[3] = hc_byte_perm (w0[3], w0[2], selector); - w4[2] = hc_byte_perm (w0[2], w0[1], selector); - w4[1] = hc_byte_perm (w0[1], w0[0], selector); - w4[0] = hc_byte_perm (w0[0], 0, selector); + c4[0] = hc_byte_perm (w7[3], 0, selector); + c3[3] = hc_byte_perm (w7[2], w7[3], selector); + 
c3[2] = hc_byte_perm (w7[1], w7[2], selector); + c3[1] = hc_byte_perm (w7[0], w7[1], selector); + c3[0] = hc_byte_perm (w6[3], w7[0], selector); + c2[3] = hc_byte_perm (w6[2], w6[3], selector); + c2[2] = hc_byte_perm (w6[1], w6[2], selector); + c2[1] = hc_byte_perm (w6[0], w6[1], selector); + c2[0] = hc_byte_perm (w5[3], w6[0], selector); + c1[3] = hc_byte_perm (w5[2], w5[3], selector); + c1[2] = hc_byte_perm (w5[1], w5[2], selector); + c1[1] = hc_byte_perm (w5[0], w5[1], selector); + c1[0] = hc_byte_perm (w4[3], w5[0], selector); + c0[3] = hc_byte_perm (w4[2], w4[3], selector); + c0[2] = hc_byte_perm (w4[1], w4[2], selector); + c0[1] = hc_byte_perm (w4[0], w4[1], selector); + c0[0] = hc_byte_perm (w3[3], w4[0], selector); + w7[3] = hc_byte_perm (w3[2], w3[3], selector); + w7[2] = hc_byte_perm (w3[1], w3[2], selector); + w7[1] = hc_byte_perm (w3[0], w3[1], selector); + w7[0] = hc_byte_perm (w2[3], w3[0], selector); + w6[3] = hc_byte_perm (w2[2], w2[3], selector); + w6[2] = hc_byte_perm (w2[1], w2[2], selector); + w6[1] = hc_byte_perm (w2[0], w2[1], selector); + w6[0] = hc_byte_perm (w1[3], w2[0], selector); + w5[3] = hc_byte_perm (w1[2], w1[3], selector); + w5[2] = hc_byte_perm (w1[1], w1[2], selector); + w5[1] = hc_byte_perm (w1[0], w1[1], selector); + w5[0] = hc_byte_perm (w0[3], w1[0], selector); + w4[3] = hc_byte_perm (w0[2], w0[3], selector); + w4[2] = hc_byte_perm (w0[1], w0[2], selector); + w4[1] = hc_byte_perm (w0[0], w0[1], selector); + w4[0] = hc_byte_perm ( 0, w0[0], selector); w3[3] = 0; w3[2] = 0; w3[1] = 0; @@ -9285,21 +9970,39 @@ DECLSPEC void switch_buffer_by_offset_8x4_be (u32x *w0, u32x *w1, u32x *w2, u32x break; case 17: - w7[3] = hc_byte_perm (w3[2], w3[1], selector); - w7[2] = hc_byte_perm (w3[1], w3[0], selector); - w7[1] = hc_byte_perm (w3[0], w2[3], selector); - w7[0] = hc_byte_perm (w2[3], w2[2], selector); - w6[3] = hc_byte_perm (w2[2], w2[1], selector); - w6[2] = hc_byte_perm (w2[1], w2[0], selector); - w6[1] = hc_byte_perm (w2[0], w1[3], 
selector); - w6[0] = hc_byte_perm (w1[3], w1[2], selector); - w5[3] = hc_byte_perm (w1[2], w1[1], selector); - w5[2] = hc_byte_perm (w1[1], w1[0], selector); - w5[1] = hc_byte_perm (w1[0], w0[3], selector); - w5[0] = hc_byte_perm (w0[3], w0[2], selector); - w4[3] = hc_byte_perm (w0[2], w0[1], selector); - w4[2] = hc_byte_perm (w0[1], w0[0], selector); - w4[1] = hc_byte_perm (w0[0], 0, selector); + c4[1] = hc_byte_perm (w7[3], 0, selector); + c4[0] = hc_byte_perm (w7[2], w7[3], selector); + c3[3] = hc_byte_perm (w7[1], w7[2], selector); + c3[2] = hc_byte_perm (w7[0], w7[1], selector); + c3[1] = hc_byte_perm (w6[3], w7[0], selector); + c3[0] = hc_byte_perm (w6[2], w6[3], selector); + c2[3] = hc_byte_perm (w6[1], w6[2], selector); + c2[2] = hc_byte_perm (w6[0], w6[1], selector); + c2[1] = hc_byte_perm (w5[3], w6[0], selector); + c2[0] = hc_byte_perm (w5[2], w5[3], selector); + c1[3] = hc_byte_perm (w5[1], w5[2], selector); + c1[2] = hc_byte_perm (w5[0], w5[1], selector); + c1[1] = hc_byte_perm (w4[3], w5[0], selector); + c1[0] = hc_byte_perm (w4[2], w4[3], selector); + c0[3] = hc_byte_perm (w4[1], w4[2], selector); + c0[2] = hc_byte_perm (w4[0], w4[1], selector); + c0[1] = hc_byte_perm (w3[3], w4[0], selector); + c0[0] = hc_byte_perm (w3[2], w3[3], selector); + w7[3] = hc_byte_perm (w3[1], w3[2], selector); + w7[2] = hc_byte_perm (w3[0], w3[1], selector); + w7[1] = hc_byte_perm (w2[3], w3[0], selector); + w7[0] = hc_byte_perm (w2[2], w2[3], selector); + w6[3] = hc_byte_perm (w2[1], w2[2], selector); + w6[2] = hc_byte_perm (w2[0], w2[1], selector); + w6[1] = hc_byte_perm (w1[3], w2[0], selector); + w6[0] = hc_byte_perm (w1[2], w1[3], selector); + w5[3] = hc_byte_perm (w1[1], w1[2], selector); + w5[2] = hc_byte_perm (w1[0], w1[1], selector); + w5[1] = hc_byte_perm (w0[3], w1[0], selector); + w5[0] = hc_byte_perm (w0[2], w0[3], selector); + w4[3] = hc_byte_perm (w0[1], w0[2], selector); + w4[2] = hc_byte_perm (w0[0], w0[1], selector); + w4[1] = hc_byte_perm ( 0, w0[0], 
selector); w4[0] = 0; w3[3] = 0; w3[2] = 0; @@ -9321,20 +10024,39 @@ DECLSPEC void switch_buffer_by_offset_8x4_be (u32x *w0, u32x *w1, u32x *w2, u32x break; case 18: - w7[3] = hc_byte_perm (w3[1], w3[0], selector); - w7[2] = hc_byte_perm (w3[0], w2[3], selector); - w7[1] = hc_byte_perm (w2[3], w2[2], selector); - w7[0] = hc_byte_perm (w2[2], w2[1], selector); - w6[3] = hc_byte_perm (w2[1], w2[0], selector); - w6[2] = hc_byte_perm (w2[0], w1[3], selector); - w6[1] = hc_byte_perm (w1[3], w1[2], selector); - w6[0] = hc_byte_perm (w1[2], w1[1], selector); - w5[3] = hc_byte_perm (w1[1], w1[0], selector); - w5[2] = hc_byte_perm (w1[0], w0[3], selector); - w5[1] = hc_byte_perm (w0[3], w0[2], selector); - w5[0] = hc_byte_perm (w0[2], w0[1], selector); - w4[3] = hc_byte_perm (w0[1], w0[0], selector); - w4[2] = hc_byte_perm (w0[0], 0, selector); + c4[2] = hc_byte_perm (w7[3], 0, selector); + c4[1] = hc_byte_perm (w7[2], w7[3], selector); + c4[0] = hc_byte_perm (w7[1], w7[2], selector); + c3[3] = hc_byte_perm (w7[0], w7[1], selector); + c3[2] = hc_byte_perm (w6[3], w7[0], selector); + c3[1] = hc_byte_perm (w6[2], w6[3], selector); + c3[0] = hc_byte_perm (w6[1], w6[2], selector); + c2[3] = hc_byte_perm (w6[0], w6[1], selector); + c2[2] = hc_byte_perm (w5[3], w6[0], selector); + c2[1] = hc_byte_perm (w5[2], w5[3], selector); + c2[0] = hc_byte_perm (w5[1], w5[2], selector); + c1[3] = hc_byte_perm (w5[0], w5[1], selector); + c1[2] = hc_byte_perm (w4[3], w5[0], selector); + c1[1] = hc_byte_perm (w4[2], w4[3], selector); + c1[0] = hc_byte_perm (w4[1], w4[2], selector); + c0[3] = hc_byte_perm (w4[0], w4[1], selector); + c0[2] = hc_byte_perm (w3[3], w4[0], selector); + c0[1] = hc_byte_perm (w3[2], w3[3], selector); + c0[0] = hc_byte_perm (w3[1], w3[2], selector); + w7[3] = hc_byte_perm (w3[0], w3[1], selector); + w7[2] = hc_byte_perm (w2[3], w3[0], selector); + w7[1] = hc_byte_perm (w2[2], w2[3], selector); + w7[0] = hc_byte_perm (w2[1], w2[2], selector); + w6[3] = hc_byte_perm 
(w2[0], w2[1], selector); + w6[2] = hc_byte_perm (w1[3], w2[0], selector); + w6[1] = hc_byte_perm (w1[2], w1[3], selector); + w6[0] = hc_byte_perm (w1[1], w1[2], selector); + w5[3] = hc_byte_perm (w1[0], w1[1], selector); + w5[2] = hc_byte_perm (w0[3], w1[0], selector); + w5[1] = hc_byte_perm (w0[2], w0[3], selector); + w5[0] = hc_byte_perm (w0[1], w0[2], selector); + w4[3] = hc_byte_perm (w0[0], w0[1], selector); + w4[2] = hc_byte_perm ( 0, w0[0], selector); w4[1] = 0; w4[0] = 0; w3[3] = 0; @@ -9357,19 +10079,39 @@ DECLSPEC void switch_buffer_by_offset_8x4_be (u32x *w0, u32x *w1, u32x *w2, u32x break; case 19: - w7[3] = hc_byte_perm (w3[0], w2[3], selector); - w7[2] = hc_byte_perm (w2[3], w2[2], selector); - w7[1] = hc_byte_perm (w2[2], w2[1], selector); - w7[0] = hc_byte_perm (w2[1], w2[0], selector); - w6[3] = hc_byte_perm (w2[0], w1[3], selector); - w6[2] = hc_byte_perm (w1[3], w1[2], selector); - w6[1] = hc_byte_perm (w1[2], w1[1], selector); - w6[0] = hc_byte_perm (w1[1], w1[0], selector); - w5[3] = hc_byte_perm (w1[0], w0[3], selector); - w5[2] = hc_byte_perm (w0[3], w0[2], selector); - w5[1] = hc_byte_perm (w0[2], w0[1], selector); - w5[0] = hc_byte_perm (w0[1], w0[0], selector); - w4[3] = hc_byte_perm (w0[0], 0, selector); + c4[3] = hc_byte_perm (w7[3], 0, selector); + c4[2] = hc_byte_perm (w7[2], w7[3], selector); + c4[1] = hc_byte_perm (w7[1], w7[2], selector); + c4[0] = hc_byte_perm (w7[0], w7[1], selector); + c3[3] = hc_byte_perm (w6[3], w7[0], selector); + c3[2] = hc_byte_perm (w6[2], w6[3], selector); + c3[1] = hc_byte_perm (w6[1], w6[2], selector); + c3[0] = hc_byte_perm (w6[0], w6[1], selector); + c2[3] = hc_byte_perm (w5[3], w6[0], selector); + c2[2] = hc_byte_perm (w5[2], w5[3], selector); + c2[1] = hc_byte_perm (w5[1], w5[2], selector); + c2[0] = hc_byte_perm (w5[0], w5[1], selector); + c1[3] = hc_byte_perm (w4[3], w5[0], selector); + c1[2] = hc_byte_perm (w4[2], w4[3], selector); + c1[1] = hc_byte_perm (w4[1], w4[2], selector); + c1[0] = 
hc_byte_perm (w4[0], w4[1], selector); + c0[3] = hc_byte_perm (w3[3], w4[0], selector); + c0[2] = hc_byte_perm (w3[2], w3[3], selector); + c0[1] = hc_byte_perm (w3[1], w3[2], selector); + c0[0] = hc_byte_perm (w3[0], w3[1], selector); + w7[3] = hc_byte_perm (w2[3], w3[0], selector); + w7[2] = hc_byte_perm (w2[2], w2[3], selector); + w7[1] = hc_byte_perm (w2[1], w2[2], selector); + w7[0] = hc_byte_perm (w2[0], w2[1], selector); + w6[3] = hc_byte_perm (w1[3], w2[0], selector); + w6[2] = hc_byte_perm (w1[2], w1[3], selector); + w6[1] = hc_byte_perm (w1[1], w1[2], selector); + w6[0] = hc_byte_perm (w1[0], w1[1], selector); + w5[3] = hc_byte_perm (w0[3], w1[0], selector); + w5[2] = hc_byte_perm (w0[2], w0[3], selector); + w5[1] = hc_byte_perm (w0[1], w0[2], selector); + w5[0] = hc_byte_perm (w0[0], w0[1], selector); + w4[3] = hc_byte_perm ( 0, w0[0], selector); w4[2] = 0; w4[1] = 0; w4[0] = 0; @@ -9393,18 +10135,39 @@ DECLSPEC void switch_buffer_by_offset_8x4_be (u32x *w0, u32x *w1, u32x *w2, u32x break; case 20: - w7[3] = hc_byte_perm (w2[3], w2[2], selector); - w7[2] = hc_byte_perm (w2[2], w2[1], selector); - w7[1] = hc_byte_perm (w2[1], w2[0], selector); - w7[0] = hc_byte_perm (w2[0], w1[3], selector); - w6[3] = hc_byte_perm (w1[3], w1[2], selector); - w6[2] = hc_byte_perm (w1[2], w1[1], selector); - w6[1] = hc_byte_perm (w1[1], w1[0], selector); - w6[0] = hc_byte_perm (w1[0], w0[3], selector); - w5[3] = hc_byte_perm (w0[3], w0[2], selector); - w5[2] = hc_byte_perm (w0[2], w0[1], selector); - w5[1] = hc_byte_perm (w0[1], w0[0], selector); - w5[0] = hc_byte_perm (w0[0], 0, selector); + c5[0] = hc_byte_perm (w7[3], 0, selector); + c4[3] = hc_byte_perm (w7[2], w7[3], selector); + c4[2] = hc_byte_perm (w7[1], w7[2], selector); + c4[1] = hc_byte_perm (w7[0], w7[1], selector); + c4[0] = hc_byte_perm (w6[3], w7[0], selector); + c3[3] = hc_byte_perm (w6[2], w6[3], selector); + c3[2] = hc_byte_perm (w6[1], w6[2], selector); + c3[1] = hc_byte_perm (w6[0], w6[1], selector); + 
c3[0] = hc_byte_perm (w5[3], w6[0], selector); + c2[3] = hc_byte_perm (w5[2], w5[3], selector); + c2[2] = hc_byte_perm (w5[1], w5[2], selector); + c2[1] = hc_byte_perm (w5[0], w5[1], selector); + c2[0] = hc_byte_perm (w4[3], w5[0], selector); + c1[3] = hc_byte_perm (w4[2], w4[3], selector); + c1[2] = hc_byte_perm (w4[1], w4[2], selector); + c1[1] = hc_byte_perm (w4[0], w4[1], selector); + c1[0] = hc_byte_perm (w3[3], w4[0], selector); + c0[3] = hc_byte_perm (w3[2], w3[3], selector); + c0[2] = hc_byte_perm (w3[1], w3[2], selector); + c0[1] = hc_byte_perm (w3[0], w3[1], selector); + c0[0] = hc_byte_perm (w2[3], w3[0], selector); + w7[3] = hc_byte_perm (w2[2], w2[3], selector); + w7[2] = hc_byte_perm (w2[1], w2[2], selector); + w7[1] = hc_byte_perm (w2[0], w2[1], selector); + w7[0] = hc_byte_perm (w1[3], w2[0], selector); + w6[3] = hc_byte_perm (w1[2], w1[3], selector); + w6[2] = hc_byte_perm (w1[1], w1[2], selector); + w6[1] = hc_byte_perm (w1[0], w1[1], selector); + w6[0] = hc_byte_perm (w0[3], w1[0], selector); + w5[3] = hc_byte_perm (w0[2], w0[3], selector); + w5[2] = hc_byte_perm (w0[1], w0[2], selector); + w5[1] = hc_byte_perm (w0[0], w0[1], selector); + w5[0] = hc_byte_perm ( 0, w0[0], selector); w4[3] = 0; w4[2] = 0; w4[1] = 0; @@ -9429,17 +10192,39 @@ DECLSPEC void switch_buffer_by_offset_8x4_be (u32x *w0, u32x *w1, u32x *w2, u32x break; case 21: - w7[3] = hc_byte_perm (w2[2], w2[1], selector); - w7[2] = hc_byte_perm (w2[1], w2[0], selector); - w7[1] = hc_byte_perm (w2[0], w1[3], selector); - w7[0] = hc_byte_perm (w1[3], w1[2], selector); - w6[3] = hc_byte_perm (w1[2], w1[1], selector); - w6[2] = hc_byte_perm (w1[1], w1[0], selector); - w6[1] = hc_byte_perm (w1[0], w0[3], selector); - w6[0] = hc_byte_perm (w0[3], w0[2], selector); - w5[3] = hc_byte_perm (w0[2], w0[1], selector); - w5[2] = hc_byte_perm (w0[1], w0[0], selector); - w5[1] = hc_byte_perm (w0[0], 0, selector); + c5[1] = hc_byte_perm (w7[3], 0, selector); + c5[0] = hc_byte_perm (w7[2], w7[3], 
selector); + c4[3] = hc_byte_perm (w7[1], w7[2], selector); + c4[2] = hc_byte_perm (w7[0], w7[1], selector); + c4[1] = hc_byte_perm (w6[3], w7[0], selector); + c4[0] = hc_byte_perm (w6[2], w6[3], selector); + c3[3] = hc_byte_perm (w6[1], w6[2], selector); + c3[2] = hc_byte_perm (w6[0], w6[1], selector); + c3[1] = hc_byte_perm (w5[3], w6[0], selector); + c3[0] = hc_byte_perm (w5[2], w5[3], selector); + c2[3] = hc_byte_perm (w5[1], w5[2], selector); + c2[2] = hc_byte_perm (w5[0], w5[1], selector); + c2[1] = hc_byte_perm (w4[3], w5[0], selector); + c2[0] = hc_byte_perm (w4[2], w4[3], selector); + c1[3] = hc_byte_perm (w4[1], w4[2], selector); + c1[2] = hc_byte_perm (w4[0], w4[1], selector); + c1[1] = hc_byte_perm (w3[3], w4[0], selector); + c1[0] = hc_byte_perm (w3[2], w3[3], selector); + c0[3] = hc_byte_perm (w3[1], w3[2], selector); + c0[2] = hc_byte_perm (w3[0], w3[1], selector); + c0[1] = hc_byte_perm (w2[3], w3[0], selector); + c0[0] = hc_byte_perm (w2[2], w2[3], selector); + w7[3] = hc_byte_perm (w2[1], w2[2], selector); + w7[2] = hc_byte_perm (w2[0], w2[1], selector); + w7[1] = hc_byte_perm (w1[3], w2[0], selector); + w7[0] = hc_byte_perm (w1[2], w1[3], selector); + w6[3] = hc_byte_perm (w1[1], w1[2], selector); + w6[2] = hc_byte_perm (w1[0], w1[1], selector); + w6[1] = hc_byte_perm (w0[3], w1[0], selector); + w6[0] = hc_byte_perm (w0[2], w0[3], selector); + w5[3] = hc_byte_perm (w0[1], w0[2], selector); + w5[2] = hc_byte_perm (w0[0], w0[1], selector); + w5[1] = hc_byte_perm ( 0, w0[0], selector); w5[0] = 0; w4[3] = 0; w4[2] = 0; @@ -9465,16 +10250,39 @@ DECLSPEC void switch_buffer_by_offset_8x4_be (u32x *w0, u32x *w1, u32x *w2, u32x break; case 22: - w7[3] = hc_byte_perm (w2[1], w2[0], selector); - w7[2] = hc_byte_perm (w2[0], w1[3], selector); - w7[1] = hc_byte_perm (w1[3], w1[2], selector); - w7[0] = hc_byte_perm (w1[2], w1[1], selector); - w6[3] = hc_byte_perm (w1[1], w1[0], selector); - w6[2] = hc_byte_perm (w1[0], w0[3], selector); - w6[1] = hc_byte_perm 
(w0[3], w0[2], selector); - w6[0] = hc_byte_perm (w0[2], w0[1], selector); - w5[3] = hc_byte_perm (w0[1], w0[0], selector); - w5[2] = hc_byte_perm (w0[0], 0, selector); + c5[2] = hc_byte_perm (w7[3], 0, selector); + c5[1] = hc_byte_perm (w7[2], w7[3], selector); + c5[0] = hc_byte_perm (w7[1], w7[2], selector); + c4[3] = hc_byte_perm (w7[0], w7[1], selector); + c4[2] = hc_byte_perm (w6[3], w7[0], selector); + c4[1] = hc_byte_perm (w6[2], w6[3], selector); + c4[0] = hc_byte_perm (w6[1], w6[2], selector); + c3[3] = hc_byte_perm (w6[0], w6[1], selector); + c3[2] = hc_byte_perm (w5[3], w6[0], selector); + c3[1] = hc_byte_perm (w5[2], w5[3], selector); + c3[0] = hc_byte_perm (w5[1], w5[2], selector); + c2[3] = hc_byte_perm (w5[0], w5[1], selector); + c2[2] = hc_byte_perm (w4[3], w5[0], selector); + c2[1] = hc_byte_perm (w4[2], w4[3], selector); + c2[0] = hc_byte_perm (w4[1], w4[2], selector); + c1[3] = hc_byte_perm (w4[0], w4[1], selector); + c1[2] = hc_byte_perm (w3[3], w4[0], selector); + c1[1] = hc_byte_perm (w3[2], w3[3], selector); + c1[0] = hc_byte_perm (w3[1], w3[2], selector); + c0[3] = hc_byte_perm (w3[0], w3[1], selector); + c0[2] = hc_byte_perm (w2[3], w3[0], selector); + c0[1] = hc_byte_perm (w2[2], w2[3], selector); + c0[0] = hc_byte_perm (w2[1], w2[2], selector); + w7[3] = hc_byte_perm (w2[0], w2[1], selector); + w7[2] = hc_byte_perm (w1[3], w2[0], selector); + w7[1] = hc_byte_perm (w1[2], w1[3], selector); + w7[0] = hc_byte_perm (w1[1], w1[2], selector); + w6[3] = hc_byte_perm (w1[0], w1[1], selector); + w6[2] = hc_byte_perm (w0[3], w1[0], selector); + w6[1] = hc_byte_perm (w0[2], w0[3], selector); + w6[0] = hc_byte_perm (w0[1], w0[2], selector); + w5[3] = hc_byte_perm (w0[0], w0[1], selector); + w5[2] = hc_byte_perm ( 0, w0[0], selector); w5[1] = 0; w5[0] = 0; w4[3] = 0; @@ -9501,15 +10309,39 @@ DECLSPEC void switch_buffer_by_offset_8x4_be (u32x *w0, u32x *w1, u32x *w2, u32x break; case 23: - w7[3] = hc_byte_perm (w2[0], w1[3], selector); - w7[2] = 
hc_byte_perm (w1[3], w1[2], selector); - w7[1] = hc_byte_perm (w1[2], w1[1], selector); - w7[0] = hc_byte_perm (w1[1], w1[0], selector); - w6[3] = hc_byte_perm (w1[0], w0[3], selector); - w6[2] = hc_byte_perm (w0[3], w0[2], selector); - w6[1] = hc_byte_perm (w0[2], w0[1], selector); - w6[0] = hc_byte_perm (w0[1], w0[0], selector); - w5[3] = hc_byte_perm (w0[0], 0, selector); + c5[3] = hc_byte_perm (w7[3], 0, selector); + c5[2] = hc_byte_perm (w7[2], w7[3], selector); + c5[1] = hc_byte_perm (w7[1], w7[2], selector); + c5[0] = hc_byte_perm (w7[0], w7[1], selector); + c4[3] = hc_byte_perm (w6[3], w7[0], selector); + c4[2] = hc_byte_perm (w6[2], w6[3], selector); + c4[1] = hc_byte_perm (w6[1], w6[2], selector); + c4[0] = hc_byte_perm (w6[0], w6[1], selector); + c3[3] = hc_byte_perm (w5[3], w6[0], selector); + c3[2] = hc_byte_perm (w5[2], w5[3], selector); + c3[1] = hc_byte_perm (w5[1], w5[2], selector); + c3[0] = hc_byte_perm (w5[0], w5[1], selector); + c2[3] = hc_byte_perm (w4[3], w5[0], selector); + c2[2] = hc_byte_perm (w4[2], w4[3], selector); + c2[1] = hc_byte_perm (w4[1], w4[2], selector); + c2[0] = hc_byte_perm (w4[0], w4[1], selector); + c1[3] = hc_byte_perm (w3[3], w4[0], selector); + c1[2] = hc_byte_perm (w3[2], w3[3], selector); + c1[1] = hc_byte_perm (w3[1], w3[2], selector); + c1[0] = hc_byte_perm (w3[0], w3[1], selector); + c0[3] = hc_byte_perm (w2[3], w3[0], selector); + c0[2] = hc_byte_perm (w2[2], w2[3], selector); + c0[1] = hc_byte_perm (w2[1], w2[2], selector); + c0[0] = hc_byte_perm (w2[0], w2[1], selector); + w7[3] = hc_byte_perm (w1[3], w2[0], selector); + w7[2] = hc_byte_perm (w1[2], w1[3], selector); + w7[1] = hc_byte_perm (w1[1], w1[2], selector); + w7[0] = hc_byte_perm (w1[0], w1[1], selector); + w6[3] = hc_byte_perm (w0[3], w1[0], selector); + w6[2] = hc_byte_perm (w0[2], w0[3], selector); + w6[1] = hc_byte_perm (w0[1], w0[2], selector); + w6[0] = hc_byte_perm (w0[0], w0[1], selector); + w5[3] = hc_byte_perm ( 0, w0[0], selector); w5[2] = 0; 
w5[1] = 0; w5[0] = 0; @@ -9537,14 +10369,39 @@ DECLSPEC void switch_buffer_by_offset_8x4_be (u32x *w0, u32x *w1, u32x *w2, u32x break; case 24: - w7[3] = hc_byte_perm (w1[3], w1[2], selector); - w7[2] = hc_byte_perm (w1[2], w1[1], selector); - w7[1] = hc_byte_perm (w1[1], w1[0], selector); - w7[0] = hc_byte_perm (w1[0], w0[3], selector); - w6[3] = hc_byte_perm (w0[3], w0[2], selector); - w6[2] = hc_byte_perm (w0[2], w0[1], selector); - w6[1] = hc_byte_perm (w0[1], w0[0], selector); - w6[0] = hc_byte_perm (w0[0], 0, selector); + c6[0] = hc_byte_perm (w7[3], 0, selector); + c5[3] = hc_byte_perm (w7[2], w7[3], selector); + c5[2] = hc_byte_perm (w7[1], w7[2], selector); + c5[1] = hc_byte_perm (w7[0], w7[1], selector); + c5[0] = hc_byte_perm (w6[3], w7[0], selector); + c4[3] = hc_byte_perm (w6[2], w6[3], selector); + c4[2] = hc_byte_perm (w6[1], w6[2], selector); + c4[1] = hc_byte_perm (w6[0], w6[1], selector); + c4[0] = hc_byte_perm (w5[3], w6[0], selector); + c3[3] = hc_byte_perm (w5[2], w5[3], selector); + c3[2] = hc_byte_perm (w5[1], w5[2], selector); + c3[1] = hc_byte_perm (w5[0], w5[1], selector); + c3[0] = hc_byte_perm (w4[3], w5[0], selector); + c2[3] = hc_byte_perm (w4[2], w4[3], selector); + c2[2] = hc_byte_perm (w4[1], w4[2], selector); + c2[1] = hc_byte_perm (w4[0], w4[1], selector); + c2[0] = hc_byte_perm (w3[3], w4[0], selector); + c1[3] = hc_byte_perm (w3[2], w3[3], selector); + c1[2] = hc_byte_perm (w3[1], w3[2], selector); + c1[1] = hc_byte_perm (w3[0], w3[1], selector); + c1[0] = hc_byte_perm (w2[3], w3[0], selector); + c0[3] = hc_byte_perm (w2[2], w2[3], selector); + c0[2] = hc_byte_perm (w2[1], w2[2], selector); + c0[1] = hc_byte_perm (w2[0], w2[1], selector); + c0[0] = hc_byte_perm (w1[3], w2[0], selector); + w7[3] = hc_byte_perm (w1[2], w1[3], selector); + w7[2] = hc_byte_perm (w1[1], w1[2], selector); + w7[1] = hc_byte_perm (w1[0], w1[1], selector); + w7[0] = hc_byte_perm (w0[3], w1[0], selector); + w6[3] = hc_byte_perm (w0[2], w0[3], selector); + 
w6[2] = hc_byte_perm (w0[1], w0[2], selector); + w6[1] = hc_byte_perm (w0[0], w0[1], selector); + w6[0] = hc_byte_perm ( 0, w0[0], selector); w5[3] = 0; w5[2] = 0; w5[1] = 0; @@ -9573,13 +10430,39 @@ DECLSPEC void switch_buffer_by_offset_8x4_be (u32x *w0, u32x *w1, u32x *w2, u32x break; case 25: - w7[3] = hc_byte_perm (w1[2], w1[1], selector); - w7[2] = hc_byte_perm (w1[1], w1[0], selector); - w7[1] = hc_byte_perm (w1[0], w0[3], selector); - w7[0] = hc_byte_perm (w0[3], w0[2], selector); - w6[3] = hc_byte_perm (w0[2], w0[1], selector); - w6[2] = hc_byte_perm (w0[1], w0[0], selector); - w6[1] = hc_byte_perm (w0[0], 0, selector); + c6[1] = hc_byte_perm (w7[3], 0, selector); + c6[0] = hc_byte_perm (w7[2], w7[3], selector); + c5[3] = hc_byte_perm (w7[1], w7[2], selector); + c5[2] = hc_byte_perm (w7[0], w7[1], selector); + c5[1] = hc_byte_perm (w6[3], w7[0], selector); + c5[0] = hc_byte_perm (w6[2], w6[3], selector); + c4[3] = hc_byte_perm (w6[1], w6[2], selector); + c4[2] = hc_byte_perm (w6[0], w6[1], selector); + c4[1] = hc_byte_perm (w5[3], w6[0], selector); + c4[0] = hc_byte_perm (w5[2], w5[3], selector); + c3[3] = hc_byte_perm (w5[1], w5[2], selector); + c3[2] = hc_byte_perm (w5[0], w5[1], selector); + c3[1] = hc_byte_perm (w4[3], w5[0], selector); + c3[0] = hc_byte_perm (w4[2], w4[3], selector); + c2[3] = hc_byte_perm (w4[1], w4[2], selector); + c2[2] = hc_byte_perm (w4[0], w4[1], selector); + c2[1] = hc_byte_perm (w3[3], w4[0], selector); + c2[0] = hc_byte_perm (w3[2], w3[3], selector); + c1[3] = hc_byte_perm (w3[1], w3[2], selector); + c1[2] = hc_byte_perm (w3[0], w3[1], selector); + c1[1] = hc_byte_perm (w2[3], w3[0], selector); + c1[0] = hc_byte_perm (w2[2], w2[3], selector); + c0[3] = hc_byte_perm (w2[1], w2[2], selector); + c0[2] = hc_byte_perm (w2[0], w2[1], selector); + c0[1] = hc_byte_perm (w1[3], w2[0], selector); + c0[0] = hc_byte_perm (w1[2], w1[3], selector); + w7[3] = hc_byte_perm (w1[1], w1[2], selector); + w7[2] = hc_byte_perm (w1[0], w1[1], 
selector); + w7[1] = hc_byte_perm (w0[3], w1[0], selector); + w7[0] = hc_byte_perm (w0[2], w0[3], selector); + w6[3] = hc_byte_perm (w0[1], w0[2], selector); + w6[2] = hc_byte_perm (w0[0], w0[1], selector); + w6[1] = hc_byte_perm ( 0, w0[0], selector); w6[0] = 0; w5[3] = 0; w5[2] = 0; @@ -9609,12 +10492,39 @@ DECLSPEC void switch_buffer_by_offset_8x4_be (u32x *w0, u32x *w1, u32x *w2, u32x break; case 26: - w7[3] = hc_byte_perm (w1[1], w1[0], selector); - w7[2] = hc_byte_perm (w1[0], w0[3], selector); - w7[1] = hc_byte_perm (w0[3], w0[2], selector); - w7[0] = hc_byte_perm (w0[2], w0[1], selector); - w6[3] = hc_byte_perm (w0[1], w0[0], selector); - w6[2] = hc_byte_perm (w0[0], 0, selector); + c6[2] = hc_byte_perm (w7[3], 0, selector); + c6[1] = hc_byte_perm (w7[2], w7[3], selector); + c6[0] = hc_byte_perm (w7[1], w7[2], selector); + c5[3] = hc_byte_perm (w7[0], w7[1], selector); + c5[2] = hc_byte_perm (w6[3], w7[0], selector); + c5[1] = hc_byte_perm (w6[2], w6[3], selector); + c5[0] = hc_byte_perm (w6[1], w6[2], selector); + c4[3] = hc_byte_perm (w6[0], w6[1], selector); + c4[2] = hc_byte_perm (w5[3], w6[0], selector); + c4[1] = hc_byte_perm (w5[2], w5[3], selector); + c4[0] = hc_byte_perm (w5[1], w5[2], selector); + c3[3] = hc_byte_perm (w5[0], w5[1], selector); + c3[2] = hc_byte_perm (w4[3], w5[0], selector); + c3[1] = hc_byte_perm (w4[2], w4[3], selector); + c3[0] = hc_byte_perm (w4[1], w4[2], selector); + c2[3] = hc_byte_perm (w4[0], w4[1], selector); + c2[2] = hc_byte_perm (w3[3], w4[0], selector); + c2[1] = hc_byte_perm (w3[2], w3[3], selector); + c2[0] = hc_byte_perm (w3[1], w3[2], selector); + c1[3] = hc_byte_perm (w3[0], w3[1], selector); + c1[2] = hc_byte_perm (w2[3], w3[0], selector); + c1[1] = hc_byte_perm (w2[2], w2[3], selector); + c1[0] = hc_byte_perm (w2[1], w2[2], selector); + c0[3] = hc_byte_perm (w2[0], w2[1], selector); + c0[2] = hc_byte_perm (w1[3], w2[0], selector); + c0[1] = hc_byte_perm (w1[2], w1[3], selector); + c0[0] = hc_byte_perm (w1[1], 
w1[2], selector); + w7[3] = hc_byte_perm (w1[0], w1[1], selector); + w7[2] = hc_byte_perm (w0[3], w1[0], selector); + w7[1] = hc_byte_perm (w0[2], w0[3], selector); + w7[0] = hc_byte_perm (w0[1], w0[2], selector); + w6[3] = hc_byte_perm (w0[0], w0[1], selector); + w6[2] = hc_byte_perm ( 0, w0[0], selector); w6[1] = 0; w6[0] = 0; w5[3] = 0; @@ -9645,11 +10555,39 @@ DECLSPEC void switch_buffer_by_offset_8x4_be (u32x *w0, u32x *w1, u32x *w2, u32x break; case 27: - w7[3] = hc_byte_perm (w1[0], w0[3], selector); - w7[2] = hc_byte_perm (w0[3], w0[2], selector); - w7[1] = hc_byte_perm (w0[2], w0[1], selector); - w7[0] = hc_byte_perm (w0[1], w0[0], selector); - w6[3] = hc_byte_perm (w0[0], 0, selector); + c6[3] = hc_byte_perm (w7[3], 0, selector); + c6[2] = hc_byte_perm (w7[2], w7[3], selector); + c6[1] = hc_byte_perm (w7[1], w7[2], selector); + c6[0] = hc_byte_perm (w7[0], w7[1], selector); + c5[3] = hc_byte_perm (w6[3], w7[0], selector); + c5[2] = hc_byte_perm (w6[2], w6[3], selector); + c5[1] = hc_byte_perm (w6[1], w6[2], selector); + c5[0] = hc_byte_perm (w6[0], w6[1], selector); + c4[3] = hc_byte_perm (w5[3], w6[0], selector); + c4[2] = hc_byte_perm (w5[2], w5[3], selector); + c4[1] = hc_byte_perm (w5[1], w5[2], selector); + c4[0] = hc_byte_perm (w5[0], w5[1], selector); + c3[3] = hc_byte_perm (w4[3], w5[0], selector); + c3[2] = hc_byte_perm (w4[2], w4[3], selector); + c3[1] = hc_byte_perm (w4[1], w4[2], selector); + c3[0] = hc_byte_perm (w4[0], w4[1], selector); + c2[3] = hc_byte_perm (w3[3], w4[0], selector); + c2[2] = hc_byte_perm (w3[2], w3[3], selector); + c2[1] = hc_byte_perm (w3[1], w3[2], selector); + c2[0] = hc_byte_perm (w3[0], w3[1], selector); + c1[3] = hc_byte_perm (w2[3], w3[0], selector); + c1[2] = hc_byte_perm (w2[2], w2[3], selector); + c1[1] = hc_byte_perm (w2[1], w2[2], selector); + c1[0] = hc_byte_perm (w2[0], w2[1], selector); + c0[3] = hc_byte_perm (w1[3], w2[0], selector); + c0[2] = hc_byte_perm (w1[2], w1[3], selector); + c0[1] = hc_byte_perm 
(w1[1], w1[2], selector); + c0[0] = hc_byte_perm (w1[0], w1[1], selector); + w7[3] = hc_byte_perm (w0[3], w1[0], selector); + w7[2] = hc_byte_perm (w0[2], w0[3], selector); + w7[1] = hc_byte_perm (w0[1], w0[2], selector); + w7[0] = hc_byte_perm (w0[0], w0[1], selector); + w6[3] = hc_byte_perm ( 0, w0[0], selector); w6[2] = 0; w6[1] = 0; w6[0] = 0; @@ -9681,10 +10619,39 @@ DECLSPEC void switch_buffer_by_offset_8x4_be (u32x *w0, u32x *w1, u32x *w2, u32x break; case 28: - w7[3] = hc_byte_perm (w0[3], w0[2], selector); - w7[2] = hc_byte_perm (w0[2], w0[1], selector); - w7[1] = hc_byte_perm (w0[1], w0[0], selector); - w7[0] = hc_byte_perm (w0[0], 0, selector); + c7[0] = hc_byte_perm (w7[3], 0, selector); + c6[3] = hc_byte_perm (w7[2], w7[3], selector); + c6[2] = hc_byte_perm (w7[1], w7[2], selector); + c6[1] = hc_byte_perm (w7[0], w7[1], selector); + c6[0] = hc_byte_perm (w6[3], w7[0], selector); + c5[3] = hc_byte_perm (w6[2], w6[3], selector); + c5[2] = hc_byte_perm (w6[1], w6[2], selector); + c5[1] = hc_byte_perm (w6[0], w6[1], selector); + c5[0] = hc_byte_perm (w5[3], w6[0], selector); + c4[3] = hc_byte_perm (w5[2], w5[3], selector); + c4[2] = hc_byte_perm (w5[1], w5[2], selector); + c4[1] = hc_byte_perm (w5[0], w5[1], selector); + c4[0] = hc_byte_perm (w4[3], w5[0], selector); + c3[3] = hc_byte_perm (w4[2], w4[3], selector); + c3[2] = hc_byte_perm (w4[1], w4[2], selector); + c3[1] = hc_byte_perm (w4[0], w4[1], selector); + c3[0] = hc_byte_perm (w3[3], w4[0], selector); + c2[3] = hc_byte_perm (w3[2], w3[3], selector); + c2[2] = hc_byte_perm (w3[1], w3[2], selector); + c2[1] = hc_byte_perm (w3[0], w3[1], selector); + c2[0] = hc_byte_perm (w2[3], w3[0], selector); + c1[3] = hc_byte_perm (w2[2], w2[3], selector); + c1[2] = hc_byte_perm (w2[1], w2[2], selector); + c1[1] = hc_byte_perm (w2[0], w2[1], selector); + c1[0] = hc_byte_perm (w1[3], w2[0], selector); + c0[3] = hc_byte_perm (w1[2], w1[3], selector); + c0[2] = hc_byte_perm (w1[1], w1[2], selector); + c0[1] = 
hc_byte_perm (w1[0], w1[1], selector); + c0[0] = hc_byte_perm (w0[3], w1[0], selector); + w7[3] = hc_byte_perm (w0[2], w0[3], selector); + w7[2] = hc_byte_perm (w0[1], w0[2], selector); + w7[1] = hc_byte_perm (w0[0], w0[1], selector); + w7[0] = hc_byte_perm ( 0, w0[0], selector); w6[3] = 0; w6[2] = 0; w6[1] = 0; @@ -9717,9 +10684,39 @@ DECLSPEC void switch_buffer_by_offset_8x4_be (u32x *w0, u32x *w1, u32x *w2, u32x break; case 29: - w7[3] = hc_byte_perm (w0[2], w0[1], selector); - w7[2] = hc_byte_perm (w0[1], w0[0], selector); - w7[1] = hc_byte_perm (w0[0], 0, selector); + c7[1] = hc_byte_perm (w7[3], 0, selector); + c7[0] = hc_byte_perm (w7[2], w7[3], selector); + c6[3] = hc_byte_perm (w7[1], w7[2], selector); + c6[2] = hc_byte_perm (w7[0], w7[1], selector); + c6[1] = hc_byte_perm (w6[3], w7[0], selector); + c6[0] = hc_byte_perm (w6[2], w6[3], selector); + c5[3] = hc_byte_perm (w6[1], w6[2], selector); + c5[2] = hc_byte_perm (w6[0], w6[1], selector); + c5[1] = hc_byte_perm (w5[3], w6[0], selector); + c5[0] = hc_byte_perm (w5[2], w5[3], selector); + c4[3] = hc_byte_perm (w5[1], w5[2], selector); + c4[2] = hc_byte_perm (w5[0], w5[1], selector); + c4[1] = hc_byte_perm (w4[3], w5[0], selector); + c4[0] = hc_byte_perm (w4[2], w4[3], selector); + c3[3] = hc_byte_perm (w4[1], w4[2], selector); + c3[2] = hc_byte_perm (w4[0], w4[1], selector); + c3[1] = hc_byte_perm (w3[3], w4[0], selector); + c3[0] = hc_byte_perm (w3[2], w3[3], selector); + c2[3] = hc_byte_perm (w3[1], w3[2], selector); + c2[2] = hc_byte_perm (w3[0], w3[1], selector); + c2[1] = hc_byte_perm (w2[3], w3[0], selector); + c2[0] = hc_byte_perm (w2[2], w2[3], selector); + c1[3] = hc_byte_perm (w2[1], w2[2], selector); + c1[2] = hc_byte_perm (w2[0], w2[1], selector); + c1[1] = hc_byte_perm (w1[3], w2[0], selector); + c1[0] = hc_byte_perm (w1[2], w1[3], selector); + c0[3] = hc_byte_perm (w1[1], w1[2], selector); + c0[2] = hc_byte_perm (w1[0], w1[1], selector); + c0[1] = hc_byte_perm (w0[3], w1[0], selector); + 
c0[0] = hc_byte_perm (w0[2], w0[3], selector); + w7[3] = hc_byte_perm (w0[1], w0[2], selector); + w7[2] = hc_byte_perm (w0[0], w0[1], selector); + w7[1] = hc_byte_perm ( 0, w0[0], selector); w7[0] = 0; w6[3] = 0; w6[2] = 0; @@ -9753,8 +10750,39 @@ DECLSPEC void switch_buffer_by_offset_8x4_be (u32x *w0, u32x *w1, u32x *w2, u32x break; case 30: - w7[3] = hc_byte_perm (w0[1], w0[0], selector); - w7[2] = hc_byte_perm (w0[0], 0, selector); + c7[2] = hc_byte_perm (w7[3], 0, selector); + c7[1] = hc_byte_perm (w7[2], w7[3], selector); + c7[0] = hc_byte_perm (w7[1], w7[2], selector); + c6[3] = hc_byte_perm (w7[0], w7[1], selector); + c6[2] = hc_byte_perm (w6[3], w7[0], selector); + c6[1] = hc_byte_perm (w6[2], w6[3], selector); + c6[0] = hc_byte_perm (w6[1], w6[2], selector); + c5[3] = hc_byte_perm (w6[0], w6[1], selector); + c5[2] = hc_byte_perm (w5[3], w6[0], selector); + c5[1] = hc_byte_perm (w5[2], w5[3], selector); + c5[0] = hc_byte_perm (w5[1], w5[2], selector); + c4[3] = hc_byte_perm (w5[0], w5[1], selector); + c4[2] = hc_byte_perm (w4[3], w5[0], selector); + c4[1] = hc_byte_perm (w4[2], w4[3], selector); + c4[0] = hc_byte_perm (w4[1], w4[2], selector); + c3[3] = hc_byte_perm (w4[0], w4[1], selector); + c3[2] = hc_byte_perm (w3[3], w4[0], selector); + c3[1] = hc_byte_perm (w3[2], w3[3], selector); + c3[0] = hc_byte_perm (w3[1], w3[2], selector); + c2[3] = hc_byte_perm (w3[0], w3[1], selector); + c2[2] = hc_byte_perm (w2[3], w3[0], selector); + c2[1] = hc_byte_perm (w2[2], w2[3], selector); + c2[0] = hc_byte_perm (w2[1], w2[2], selector); + c1[3] = hc_byte_perm (w2[0], w2[1], selector); + c1[2] = hc_byte_perm (w1[3], w2[0], selector); + c1[1] = hc_byte_perm (w1[2], w1[3], selector); + c1[0] = hc_byte_perm (w1[1], w1[2], selector); + c0[3] = hc_byte_perm (w1[0], w1[1], selector); + c0[2] = hc_byte_perm (w0[3], w1[0], selector); + c0[1] = hc_byte_perm (w0[2], w0[3], selector); + c0[0] = hc_byte_perm (w0[1], w0[2], selector); + w7[3] = hc_byte_perm (w0[0], w0[1], 
selector); + w7[2] = hc_byte_perm ( 0, w0[0], selector); w7[1] = 0; w7[0] = 0; w6[3] = 0; @@ -9789,7 +10817,39 @@ DECLSPEC void switch_buffer_by_offset_8x4_be (u32x *w0, u32x *w1, u32x *w2, u32x break; case 31: - w7[3] = hc_byte_perm (w0[0], 0, selector); + c7[3] = hc_byte_perm (w7[3], 0, selector); + c7[2] = hc_byte_perm (w7[2], w7[3], selector); + c7[1] = hc_byte_perm (w7[1], w7[2], selector); + c7[0] = hc_byte_perm (w7[0], w7[1], selector); + c6[3] = hc_byte_perm (w6[3], w7[0], selector); + c6[2] = hc_byte_perm (w6[2], w6[3], selector); + c6[1] = hc_byte_perm (w6[1], w6[2], selector); + c6[0] = hc_byte_perm (w6[0], w6[1], selector); + c5[3] = hc_byte_perm (w5[3], w6[0], selector); + c5[2] = hc_byte_perm (w5[2], w5[3], selector); + c5[1] = hc_byte_perm (w5[1], w5[2], selector); + c5[0] = hc_byte_perm (w5[0], w5[1], selector); + c4[3] = hc_byte_perm (w4[3], w5[0], selector); + c4[2] = hc_byte_perm (w4[2], w4[3], selector); + c4[1] = hc_byte_perm (w4[1], w4[2], selector); + c4[0] = hc_byte_perm (w4[0], w4[1], selector); + c3[3] = hc_byte_perm (w3[3], w4[0], selector); + c3[2] = hc_byte_perm (w3[2], w3[3], selector); + c3[1] = hc_byte_perm (w3[1], w3[2], selector); + c3[0] = hc_byte_perm (w3[0], w3[1], selector); + c2[3] = hc_byte_perm (w2[3], w3[0], selector); + c2[2] = hc_byte_perm (w2[2], w2[3], selector); + c2[1] = hc_byte_perm (w2[1], w2[2], selector); + c2[0] = hc_byte_perm (w2[0], w2[1], selector); + c1[3] = hc_byte_perm (w1[3], w2[0], selector); + c1[2] = hc_byte_perm (w1[2], w1[3], selector); + c1[1] = hc_byte_perm (w1[1], w1[2], selector); + c1[0] = hc_byte_perm (w1[0], w1[1], selector); + c0[3] = hc_byte_perm (w0[3], w1[0], selector); + c0[2] = hc_byte_perm (w0[2], w0[3], selector); + c0[1] = hc_byte_perm (w0[1], w0[2], selector); + c0[0] = hc_byte_perm (w0[0], w0[1], selector); + w7[3] = hc_byte_perm ( 0, w0[0], selector); w7[2] = 0; w7[1] = 0; w7[0] = 0; @@ -9827,7 +10887,7 @@ DECLSPEC void switch_buffer_by_offset_8x4_be (u32x *w0, u32x *w1, u32x *w2, 
u32x #endif } -DECLSPEC void switch_buffer_by_offset_8x4_carry_be (u32x *w0, u32x *w1, u32x *w2, u32x *w3, u32x *w4, u32x *w5, u32x *w6, u32x *w7, u32x *c0, u32x *c1, u32x *c2, u32x *c3, u32x *c4, u32x *c5, u32x *c6, u32x *c7, const u32 offset) +DECLSPEC void switch_buffer_by_offset_8x4_be (u32x *w0, u32x *w1, u32x *w2, u32x *w3, u32x *w4, u32x *w5, u32x *w6, u32x *w7, const u32 offset) { const int offset_switch = offset / 4; @@ -9835,7 +10895,6 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be (u32x *w0, u32x *w1, u32x *w2 switch (offset_switch) { case 0: - c0[0] = hc_bytealign_be (w7[3], 0, offset); w7[3] = hc_bytealign_be (w7[2], w7[3], offset); w7[2] = hc_bytealign_be (w7[1], w7[2], offset); w7[1] = hc_bytealign_be (w7[0], w7[1], offset); @@ -9872,8 +10931,6 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be (u32x *w0, u32x *w1, u32x *w2 break; case 1: - c0[1] = hc_bytealign_be (w7[3], 0, offset); - c0[0] = hc_bytealign_be (w7[2], w7[3], offset); w7[3] = hc_bytealign_be (w7[1], w7[2], offset); w7[2] = hc_bytealign_be (w7[0], w7[1], offset); w7[1] = hc_bytealign_be (w6[3], w7[0], offset); @@ -9910,9 +10967,6 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be (u32x *w0, u32x *w1, u32x *w2 break; case 2: - c0[2] = hc_bytealign_be (w7[3], 0, offset); - c0[1] = hc_bytealign_be (w7[2], w7[3], offset); - c0[0] = hc_bytealign_be (w7[1], w7[2], offset); w7[3] = hc_bytealign_be (w7[0], w7[1], offset); w7[2] = hc_bytealign_be (w6[3], w7[0], offset); w7[1] = hc_bytealign_be (w6[2], w6[3], offset); @@ -9949,10 +11003,6 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be (u32x *w0, u32x *w1, u32x *w2 break; case 3: - c0[3] = hc_bytealign_be (w7[3], 0, offset); - c0[2] = hc_bytealign_be (w7[2], w7[3], offset); - c0[1] = hc_bytealign_be (w7[1], w7[2], offset); - c0[0] = hc_bytealign_be (w7[0], w7[1], offset); w7[3] = hc_bytealign_be (w6[3], w7[0], offset); w7[2] = hc_bytealign_be (w6[2], w6[3], offset); w7[1] = hc_bytealign_be (w6[1], w6[2], offset); @@ -9989,11 
+11039,6 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be (u32x *w0, u32x *w1, u32x *w2 break; case 4: - c1[0] = hc_bytealign_be (w7[3], 0, offset); - c0[3] = hc_bytealign_be (w7[2], w7[3], offset); - c0[2] = hc_bytealign_be (w7[1], w7[2], offset); - c0[1] = hc_bytealign_be (w7[0], w7[1], offset); - c0[0] = hc_bytealign_be (w6[3], w7[0], offset); w7[3] = hc_bytealign_be (w6[2], w6[3], offset); w7[2] = hc_bytealign_be (w6[1], w6[2], offset); w7[1] = hc_bytealign_be (w6[0], w6[1], offset); @@ -10030,12 +11075,6 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be (u32x *w0, u32x *w1, u32x *w2 break; case 5: - c1[1] = hc_bytealign_be (w7[3], 0, offset); - c1[0] = hc_bytealign_be (w7[2], w7[3], offset); - c0[3] = hc_bytealign_be (w7[1], w7[2], offset); - c0[2] = hc_bytealign_be (w7[0], w7[1], offset); - c0[1] = hc_bytealign_be (w6[3], w7[0], offset); - c0[0] = hc_bytealign_be (w6[2], w6[3], offset); w7[3] = hc_bytealign_be (w6[1], w6[2], offset); w7[2] = hc_bytealign_be (w6[0], w6[1], offset); w7[1] = hc_bytealign_be (w5[3], w6[0], offset); @@ -10072,13 +11111,6 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be (u32x *w0, u32x *w1, u32x *w2 break; case 6: - c1[2] = hc_bytealign_be (w7[3], 0, offset); - c1[1] = hc_bytealign_be (w7[2], w7[3], offset); - c1[0] = hc_bytealign_be (w7[1], w7[2], offset); - c0[3] = hc_bytealign_be (w7[0], w7[1], offset); - c0[2] = hc_bytealign_be (w6[3], w7[0], offset); - c0[1] = hc_bytealign_be (w6[2], w6[3], offset); - c0[0] = hc_bytealign_be (w6[1], w6[2], offset); w7[3] = hc_bytealign_be (w6[0], w6[1], offset); w7[2] = hc_bytealign_be (w5[3], w6[0], offset); w7[1] = hc_bytealign_be (w5[2], w5[3], offset); @@ -10115,14 +11147,6 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be (u32x *w0, u32x *w1, u32x *w2 break; case 7: - c1[3] = hc_bytealign_be (w7[3], 0, offset); - c1[2] = hc_bytealign_be (w7[2], w7[3], offset); - c1[1] = hc_bytealign_be (w7[1], w7[2], offset); - c1[0] = hc_bytealign_be (w7[0], w7[1], offset); - c0[3] = 
hc_bytealign_be (w6[3], w7[0], offset); - c0[2] = hc_bytealign_be (w6[2], w6[3], offset); - c0[1] = hc_bytealign_be (w6[1], w6[2], offset); - c0[0] = hc_bytealign_be (w6[0], w6[1], offset); w7[3] = hc_bytealign_be (w5[3], w6[0], offset); w7[2] = hc_bytealign_be (w5[2], w5[3], offset); w7[1] = hc_bytealign_be (w5[1], w5[2], offset); @@ -10159,15 +11183,6 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be (u32x *w0, u32x *w1, u32x *w2 break; case 8: - c2[0] = hc_bytealign_be (w7[3], 0, offset); - c1[3] = hc_bytealign_be (w7[2], w7[3], offset); - c1[2] = hc_bytealign_be (w7[1], w7[2], offset); - c1[1] = hc_bytealign_be (w7[0], w7[1], offset); - c1[0] = hc_bytealign_be (w6[3], w7[0], offset); - c0[3] = hc_bytealign_be (w6[2], w6[3], offset); - c0[2] = hc_bytealign_be (w6[1], w6[2], offset); - c0[1] = hc_bytealign_be (w6[0], w6[1], offset); - c0[0] = hc_bytealign_be (w5[3], w6[0], offset); w7[3] = hc_bytealign_be (w5[2], w5[3], offset); w7[2] = hc_bytealign_be (w5[1], w5[2], offset); w7[1] = hc_bytealign_be (w5[0], w5[1], offset); @@ -10204,16 +11219,6 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be (u32x *w0, u32x *w1, u32x *w2 break; case 9: - c2[1] = hc_bytealign_be (w7[3], 0, offset); - c2[0] = hc_bytealign_be (w7[2], w7[3], offset); - c1[3] = hc_bytealign_be (w7[1], w7[2], offset); - c1[2] = hc_bytealign_be (w7[0], w7[1], offset); - c1[1] = hc_bytealign_be (w6[3], w7[0], offset); - c1[0] = hc_bytealign_be (w6[2], w6[3], offset); - c0[3] = hc_bytealign_be (w6[1], w6[2], offset); - c0[2] = hc_bytealign_be (w6[0], w6[1], offset); - c0[1] = hc_bytealign_be (w5[3], w6[0], offset); - c0[0] = hc_bytealign_be (w5[2], w5[3], offset); w7[3] = hc_bytealign_be (w5[1], w5[2], offset); w7[2] = hc_bytealign_be (w5[0], w5[1], offset); w7[1] = hc_bytealign_be (w4[3], w5[0], offset); @@ -10250,17 +11255,6 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be (u32x *w0, u32x *w1, u32x *w2 break; case 10: - c2[2] = hc_bytealign_be (w7[3], 0, offset); - c2[1] = hc_bytealign_be 
(w7[2], w7[3], offset); - c2[0] = hc_bytealign_be (w7[1], w7[2], offset); - c1[3] = hc_bytealign_be (w7[0], w7[1], offset); - c1[2] = hc_bytealign_be (w6[3], w7[0], offset); - c1[1] = hc_bytealign_be (w6[2], w6[3], offset); - c1[0] = hc_bytealign_be (w6[1], w6[2], offset); - c0[3] = hc_bytealign_be (w6[0], w6[1], offset); - c0[2] = hc_bytealign_be (w5[3], w6[0], offset); - c0[1] = hc_bytealign_be (w5[2], w5[3], offset); - c0[0] = hc_bytealign_be (w5[1], w5[2], offset); w7[3] = hc_bytealign_be (w5[0], w5[1], offset); w7[2] = hc_bytealign_be (w4[3], w5[0], offset); w7[1] = hc_bytealign_be (w4[2], w4[3], offset); @@ -10297,18 +11291,6 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be (u32x *w0, u32x *w1, u32x *w2 break; case 11: - c2[3] = hc_bytealign_be (w7[3], 0, offset); - c2[2] = hc_bytealign_be (w7[2], w7[3], offset); - c2[1] = hc_bytealign_be (w7[1], w7[2], offset); - c2[0] = hc_bytealign_be (w7[0], w7[1], offset); - c1[3] = hc_bytealign_be (w6[3], w7[0], offset); - c1[2] = hc_bytealign_be (w6[2], w6[3], offset); - c1[1] = hc_bytealign_be (w6[1], w6[2], offset); - c1[0] = hc_bytealign_be (w6[0], w6[1], offset); - c0[3] = hc_bytealign_be (w5[3], w6[0], offset); - c0[2] = hc_bytealign_be (w5[2], w5[3], offset); - c0[1] = hc_bytealign_be (w5[1], w5[2], offset); - c0[0] = hc_bytealign_be (w5[0], w5[1], offset); w7[3] = hc_bytealign_be (w4[3], w5[0], offset); w7[2] = hc_bytealign_be (w4[2], w4[3], offset); w7[1] = hc_bytealign_be (w4[1], w4[2], offset); @@ -10345,19 +11327,6 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be (u32x *w0, u32x *w1, u32x *w2 break; case 12: - c3[0] = hc_bytealign_be (w7[3], 0, offset); - c2[3] = hc_bytealign_be (w7[2], w7[3], offset); - c2[2] = hc_bytealign_be (w7[1], w7[2], offset); - c2[1] = hc_bytealign_be (w7[0], w7[1], offset); - c2[0] = hc_bytealign_be (w6[3], w7[0], offset); - c1[3] = hc_bytealign_be (w6[2], w6[3], offset); - c1[2] = hc_bytealign_be (w6[1], w6[2], offset); - c1[1] = hc_bytealign_be (w6[0], w6[1], offset); - 
c1[0] = hc_bytealign_be (w5[3], w6[0], offset); - c0[3] = hc_bytealign_be (w5[2], w5[3], offset); - c0[2] = hc_bytealign_be (w5[1], w5[2], offset); - c0[1] = hc_bytealign_be (w5[0], w5[1], offset); - c0[0] = hc_bytealign_be (w4[3], w5[0], offset); w7[3] = hc_bytealign_be (w4[2], w4[3], offset); w7[2] = hc_bytealign_be (w4[1], w4[2], offset); w7[1] = hc_bytealign_be (w4[0], w4[1], offset); @@ -10394,20 +11363,6 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be (u32x *w0, u32x *w1, u32x *w2 break; case 13: - c3[1] = hc_bytealign_be (w7[3], 0, offset); - c3[0] = hc_bytealign_be (w7[2], w7[3], offset); - c2[3] = hc_bytealign_be (w7[1], w7[2], offset); - c2[2] = hc_bytealign_be (w7[0], w7[1], offset); - c2[1] = hc_bytealign_be (w6[3], w7[0], offset); - c2[0] = hc_bytealign_be (w6[2], w6[3], offset); - c1[3] = hc_bytealign_be (w6[1], w6[2], offset); - c1[2] = hc_bytealign_be (w6[0], w6[1], offset); - c1[1] = hc_bytealign_be (w5[3], w6[0], offset); - c1[0] = hc_bytealign_be (w5[2], w5[3], offset); - c0[3] = hc_bytealign_be (w5[1], w5[2], offset); - c0[2] = hc_bytealign_be (w5[0], w5[1], offset); - c0[1] = hc_bytealign_be (w4[3], w5[0], offset); - c0[0] = hc_bytealign_be (w4[2], w4[3], offset); w7[3] = hc_bytealign_be (w4[1], w4[2], offset); w7[2] = hc_bytealign_be (w4[0], w4[1], offset); w7[1] = hc_bytealign_be (w3[3], w4[0], offset); @@ -10444,21 +11399,6 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be (u32x *w0, u32x *w1, u32x *w2 break; case 14: - c3[2] = hc_bytealign_be (w7[3], 0, offset); - c3[1] = hc_bytealign_be (w7[2], w7[3], offset); - c3[0] = hc_bytealign_be (w7[1], w7[2], offset); - c2[3] = hc_bytealign_be (w7[0], w7[1], offset); - c2[2] = hc_bytealign_be (w6[3], w7[0], offset); - c2[1] = hc_bytealign_be (w6[2], w6[3], offset); - c2[0] = hc_bytealign_be (w6[1], w6[2], offset); - c1[3] = hc_bytealign_be (w6[0], w6[1], offset); - c1[2] = hc_bytealign_be (w5[3], w6[0], offset); - c1[1] = hc_bytealign_be (w5[2], w5[3], offset); - c1[0] = hc_bytealign_be 
(w5[1], w5[2], offset); - c0[3] = hc_bytealign_be (w5[0], w5[1], offset); - c0[2] = hc_bytealign_be (w4[3], w5[0], offset); - c0[1] = hc_bytealign_be (w4[2], w4[3], offset); - c0[0] = hc_bytealign_be (w4[1], w4[2], offset); w7[3] = hc_bytealign_be (w4[0], w4[1], offset); w7[2] = hc_bytealign_be (w3[3], w4[0], offset); w7[1] = hc_bytealign_be (w3[2], w3[3], offset); @@ -10495,22 +11435,6 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be (u32x *w0, u32x *w1, u32x *w2 break; case 15: - c3[3] = hc_bytealign_be (w7[3], 0, offset); - c3[2] = hc_bytealign_be (w7[2], w7[3], offset); - c3[1] = hc_bytealign_be (w7[1], w7[2], offset); - c3[0] = hc_bytealign_be (w7[0], w7[1], offset); - c2[3] = hc_bytealign_be (w6[3], w7[0], offset); - c2[2] = hc_bytealign_be (w6[2], w6[3], offset); - c2[1] = hc_bytealign_be (w6[1], w6[2], offset); - c2[0] = hc_bytealign_be (w6[0], w6[1], offset); - c1[3] = hc_bytealign_be (w5[3], w6[0], offset); - c1[2] = hc_bytealign_be (w5[2], w5[3], offset); - c1[1] = hc_bytealign_be (w5[1], w5[2], offset); - c1[0] = hc_bytealign_be (w5[0], w5[1], offset); - c0[3] = hc_bytealign_be (w4[3], w5[0], offset); - c0[2] = hc_bytealign_be (w4[2], w4[3], offset); - c0[1] = hc_bytealign_be (w4[1], w4[2], offset); - c0[0] = hc_bytealign_be (w4[0], w4[1], offset); w7[3] = hc_bytealign_be (w3[3], w4[0], offset); w7[2] = hc_bytealign_be (w3[2], w3[3], offset); w7[1] = hc_bytealign_be (w3[1], w3[2], offset); @@ -10547,23 +11471,6 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be (u32x *w0, u32x *w1, u32x *w2 break; case 16: - c4[0] = hc_bytealign_be (w7[3], 0, offset); - c3[3] = hc_bytealign_be (w7[2], w7[3], offset); - c3[2] = hc_bytealign_be (w7[1], w7[2], offset); - c3[1] = hc_bytealign_be (w7[0], w7[1], offset); - c3[0] = hc_bytealign_be (w6[3], w7[0], offset); - c2[3] = hc_bytealign_be (w6[2], w6[3], offset); - c2[2] = hc_bytealign_be (w6[1], w6[2], offset); - c2[1] = hc_bytealign_be (w6[0], w6[1], offset); - c2[0] = hc_bytealign_be (w5[3], w6[0], offset); - 
c1[3] = hc_bytealign_be (w5[2], w5[3], offset); - c1[2] = hc_bytealign_be (w5[1], w5[2], offset); - c1[1] = hc_bytealign_be (w5[0], w5[1], offset); - c1[0] = hc_bytealign_be (w4[3], w5[0], offset); - c0[3] = hc_bytealign_be (w4[2], w4[3], offset); - c0[2] = hc_bytealign_be (w4[1], w4[2], offset); - c0[1] = hc_bytealign_be (w4[0], w4[1], offset); - c0[0] = hc_bytealign_be (w3[3], w4[0], offset); w7[3] = hc_bytealign_be (w3[2], w3[3], offset); w7[2] = hc_bytealign_be (w3[1], w3[2], offset); w7[1] = hc_bytealign_be (w3[0], w3[1], offset); @@ -10600,24 +11507,6 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be (u32x *w0, u32x *w1, u32x *w2 break; case 17: - c4[1] = hc_bytealign_be (w7[3], 0, offset); - c4[0] = hc_bytealign_be (w7[2], w7[3], offset); - c3[3] = hc_bytealign_be (w7[1], w7[2], offset); - c3[2] = hc_bytealign_be (w7[0], w7[1], offset); - c3[1] = hc_bytealign_be (w6[3], w7[0], offset); - c3[0] = hc_bytealign_be (w6[2], w6[3], offset); - c2[3] = hc_bytealign_be (w6[1], w6[2], offset); - c2[2] = hc_bytealign_be (w6[0], w6[1], offset); - c2[1] = hc_bytealign_be (w5[3], w6[0], offset); - c2[0] = hc_bytealign_be (w5[2], w5[3], offset); - c1[3] = hc_bytealign_be (w5[1], w5[2], offset); - c1[2] = hc_bytealign_be (w5[0], w5[1], offset); - c1[1] = hc_bytealign_be (w4[3], w5[0], offset); - c1[0] = hc_bytealign_be (w4[2], w4[3], offset); - c0[3] = hc_bytealign_be (w4[1], w4[2], offset); - c0[2] = hc_bytealign_be (w4[0], w4[1], offset); - c0[1] = hc_bytealign_be (w3[3], w4[0], offset); - c0[0] = hc_bytealign_be (w3[2], w3[3], offset); w7[3] = hc_bytealign_be (w3[1], w3[2], offset); w7[2] = hc_bytealign_be (w3[0], w3[1], offset); w7[1] = hc_bytealign_be (w2[3], w3[0], offset); @@ -10654,25 +11543,6 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be (u32x *w0, u32x *w1, u32x *w2 break; case 18: - c4[2] = hc_bytealign_be (w7[3], 0, offset); - c4[1] = hc_bytealign_be (w7[2], w7[3], offset); - c4[0] = hc_bytealign_be (w7[1], w7[2], offset); - c3[3] = hc_bytealign_be 
(w7[0], w7[1], offset); - c3[2] = hc_bytealign_be (w6[3], w7[0], offset); - c3[1] = hc_bytealign_be (w6[2], w6[3], offset); - c3[0] = hc_bytealign_be (w6[1], w6[2], offset); - c2[3] = hc_bytealign_be (w6[0], w6[1], offset); - c2[2] = hc_bytealign_be (w5[3], w6[0], offset); - c2[1] = hc_bytealign_be (w5[2], w5[3], offset); - c2[0] = hc_bytealign_be (w5[1], w5[2], offset); - c1[3] = hc_bytealign_be (w5[0], w5[1], offset); - c1[2] = hc_bytealign_be (w4[3], w5[0], offset); - c1[1] = hc_bytealign_be (w4[2], w4[3], offset); - c1[0] = hc_bytealign_be (w4[1], w4[2], offset); - c0[3] = hc_bytealign_be (w4[0], w4[1], offset); - c0[2] = hc_bytealign_be (w3[3], w4[0], offset); - c0[1] = hc_bytealign_be (w3[2], w3[3], offset); - c0[0] = hc_bytealign_be (w3[1], w3[2], offset); w7[3] = hc_bytealign_be (w3[0], w3[1], offset); w7[2] = hc_bytealign_be (w2[3], w3[0], offset); w7[1] = hc_bytealign_be (w2[2], w2[3], offset); @@ -10709,26 +11579,6 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be (u32x *w0, u32x *w1, u32x *w2 break; case 19: - c4[3] = hc_bytealign_be (w7[3], 0, offset); - c4[2] = hc_bytealign_be (w7[2], w7[3], offset); - c4[1] = hc_bytealign_be (w7[1], w7[2], offset); - c4[0] = hc_bytealign_be (w7[0], w7[1], offset); - c3[3] = hc_bytealign_be (w6[3], w7[0], offset); - c3[2] = hc_bytealign_be (w6[2], w6[3], offset); - c3[1] = hc_bytealign_be (w6[1], w6[2], offset); - c3[0] = hc_bytealign_be (w6[0], w6[1], offset); - c2[3] = hc_bytealign_be (w5[3], w6[0], offset); - c2[2] = hc_bytealign_be (w5[2], w5[3], offset); - c2[1] = hc_bytealign_be (w5[1], w5[2], offset); - c2[0] = hc_bytealign_be (w5[0], w5[1], offset); - c1[3] = hc_bytealign_be (w4[3], w5[0], offset); - c1[2] = hc_bytealign_be (w4[2], w4[3], offset); - c1[1] = hc_bytealign_be (w4[1], w4[2], offset); - c1[0] = hc_bytealign_be (w4[0], w4[1], offset); - c0[3] = hc_bytealign_be (w3[3], w4[0], offset); - c0[2] = hc_bytealign_be (w3[2], w3[3], offset); - c0[1] = hc_bytealign_be (w3[1], w3[2], offset); - c0[0] = 
hc_bytealign_be (w3[0], w3[1], offset); w7[3] = hc_bytealign_be (w2[3], w3[0], offset); w7[2] = hc_bytealign_be (w2[2], w2[3], offset); w7[1] = hc_bytealign_be (w2[1], w2[2], offset); @@ -10765,27 +11615,6 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be (u32x *w0, u32x *w1, u32x *w2 break; case 20: - c5[0] = hc_bytealign_be (w7[3], 0, offset); - c4[3] = hc_bytealign_be (w7[2], w7[3], offset); - c4[2] = hc_bytealign_be (w7[1], w7[2], offset); - c4[1] = hc_bytealign_be (w7[0], w7[1], offset); - c4[0] = hc_bytealign_be (w6[3], w7[0], offset); - c3[3] = hc_bytealign_be (w6[2], w6[3], offset); - c3[2] = hc_bytealign_be (w6[1], w6[2], offset); - c3[1] = hc_bytealign_be (w6[0], w6[1], offset); - c3[0] = hc_bytealign_be (w5[3], w6[0], offset); - c2[3] = hc_bytealign_be (w5[2], w5[3], offset); - c2[2] = hc_bytealign_be (w5[1], w5[2], offset); - c2[1] = hc_bytealign_be (w5[0], w5[1], offset); - c2[0] = hc_bytealign_be (w4[3], w5[0], offset); - c1[3] = hc_bytealign_be (w4[2], w4[3], offset); - c1[2] = hc_bytealign_be (w4[1], w4[2], offset); - c1[1] = hc_bytealign_be (w4[0], w4[1], offset); - c1[0] = hc_bytealign_be (w3[3], w4[0], offset); - c0[3] = hc_bytealign_be (w3[2], w3[3], offset); - c0[2] = hc_bytealign_be (w3[1], w3[2], offset); - c0[1] = hc_bytealign_be (w3[0], w3[1], offset); - c0[0] = hc_bytealign_be (w2[3], w3[0], offset); w7[3] = hc_bytealign_be (w2[2], w2[3], offset); w7[2] = hc_bytealign_be (w2[1], w2[2], offset); w7[1] = hc_bytealign_be (w2[0], w2[1], offset); @@ -10822,28 +11651,6 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be (u32x *w0, u32x *w1, u32x *w2 break; case 21: - c5[1] = hc_bytealign_be (w7[3], 0, offset); - c5[0] = hc_bytealign_be (w7[2], w7[3], offset); - c4[3] = hc_bytealign_be (w7[1], w7[2], offset); - c4[2] = hc_bytealign_be (w7[0], w7[1], offset); - c4[1] = hc_bytealign_be (w6[3], w7[0], offset); - c4[0] = hc_bytealign_be (w6[2], w6[3], offset); - c3[3] = hc_bytealign_be (w6[1], w6[2], offset); - c3[2] = hc_bytealign_be (w6[0], 
w6[1], offset); - c3[1] = hc_bytealign_be (w5[3], w6[0], offset); - c3[0] = hc_bytealign_be (w5[2], w5[3], offset); - c2[3] = hc_bytealign_be (w5[1], w5[2], offset); - c2[2] = hc_bytealign_be (w5[0], w5[1], offset); - c2[1] = hc_bytealign_be (w4[3], w5[0], offset); - c2[0] = hc_bytealign_be (w4[2], w4[3], offset); - c1[3] = hc_bytealign_be (w4[1], w4[2], offset); - c1[2] = hc_bytealign_be (w4[0], w4[1], offset); - c1[1] = hc_bytealign_be (w3[3], w4[0], offset); - c1[0] = hc_bytealign_be (w3[2], w3[3], offset); - c0[3] = hc_bytealign_be (w3[1], w3[2], offset); - c0[2] = hc_bytealign_be (w3[0], w3[1], offset); - c0[1] = hc_bytealign_be (w2[3], w3[0], offset); - c0[0] = hc_bytealign_be (w2[2], w2[3], offset); w7[3] = hc_bytealign_be (w2[1], w2[2], offset); w7[2] = hc_bytealign_be (w2[0], w2[1], offset); w7[1] = hc_bytealign_be (w1[3], w2[0], offset); @@ -10880,29 +11687,6 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be (u32x *w0, u32x *w1, u32x *w2 break; case 22: - c5[2] = hc_bytealign_be (w7[3], 0, offset); - c5[1] = hc_bytealign_be (w7[2], w7[3], offset); - c5[0] = hc_bytealign_be (w7[1], w7[2], offset); - c4[3] = hc_bytealign_be (w7[0], w7[1], offset); - c4[2] = hc_bytealign_be (w6[3], w7[0], offset); - c4[1] = hc_bytealign_be (w6[2], w6[3], offset); - c4[0] = hc_bytealign_be (w6[1], w6[2], offset); - c3[3] = hc_bytealign_be (w6[0], w6[1], offset); - c3[2] = hc_bytealign_be (w5[3], w6[0], offset); - c3[1] = hc_bytealign_be (w5[2], w5[3], offset); - c3[0] = hc_bytealign_be (w5[1], w5[2], offset); - c2[3] = hc_bytealign_be (w5[0], w5[1], offset); - c2[2] = hc_bytealign_be (w4[3], w5[0], offset); - c2[1] = hc_bytealign_be (w4[2], w4[3], offset); - c2[0] = hc_bytealign_be (w4[1], w4[2], offset); - c1[3] = hc_bytealign_be (w4[0], w4[1], offset); - c1[2] = hc_bytealign_be (w3[3], w4[0], offset); - c1[1] = hc_bytealign_be (w3[2], w3[3], offset); - c1[0] = hc_bytealign_be (w3[1], w3[2], offset); - c0[3] = hc_bytealign_be (w3[0], w3[1], offset); - c0[2] = 
hc_bytealign_be (w2[3], w3[0], offset); - c0[1] = hc_bytealign_be (w2[2], w2[3], offset); - c0[0] = hc_bytealign_be (w2[1], w2[2], offset); w7[3] = hc_bytealign_be (w2[0], w2[1], offset); w7[2] = hc_bytealign_be (w1[3], w2[0], offset); w7[1] = hc_bytealign_be (w1[2], w1[3], offset); @@ -10939,30 +11723,6 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be (u32x *w0, u32x *w1, u32x *w2 break; case 23: - c5[3] = hc_bytealign_be (w7[3], 0, offset); - c5[2] = hc_bytealign_be (w7[2], w7[3], offset); - c5[1] = hc_bytealign_be (w7[1], w7[2], offset); - c5[0] = hc_bytealign_be (w7[0], w7[1], offset); - c4[3] = hc_bytealign_be (w6[3], w7[0], offset); - c4[2] = hc_bytealign_be (w6[2], w6[3], offset); - c4[1] = hc_bytealign_be (w6[1], w6[2], offset); - c4[0] = hc_bytealign_be (w6[0], w6[1], offset); - c3[3] = hc_bytealign_be (w5[3], w6[0], offset); - c3[2] = hc_bytealign_be (w5[2], w5[3], offset); - c3[1] = hc_bytealign_be (w5[1], w5[2], offset); - c3[0] = hc_bytealign_be (w5[0], w5[1], offset); - c2[3] = hc_bytealign_be (w4[3], w5[0], offset); - c2[2] = hc_bytealign_be (w4[2], w4[3], offset); - c2[1] = hc_bytealign_be (w4[1], w4[2], offset); - c2[0] = hc_bytealign_be (w4[0], w4[1], offset); - c1[3] = hc_bytealign_be (w3[3], w4[0], offset); - c1[2] = hc_bytealign_be (w3[2], w3[3], offset); - c1[1] = hc_bytealign_be (w3[1], w3[2], offset); - c1[0] = hc_bytealign_be (w3[0], w3[1], offset); - c0[3] = hc_bytealign_be (w2[3], w3[0], offset); - c0[2] = hc_bytealign_be (w2[2], w2[3], offset); - c0[1] = hc_bytealign_be (w2[1], w2[2], offset); - c0[0] = hc_bytealign_be (w2[0], w2[1], offset); w7[3] = hc_bytealign_be (w1[3], w2[0], offset); w7[2] = hc_bytealign_be (w1[2], w1[3], offset); w7[1] = hc_bytealign_be (w1[1], w1[2], offset); @@ -10999,31 +11759,6 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be (u32x *w0, u32x *w1, u32x *w2 break; case 24: - c6[0] = hc_bytealign_be (w7[3], 0, offset); - c5[3] = hc_bytealign_be (w7[2], w7[3], offset); - c5[2] = hc_bytealign_be (w7[1], 
w7[2], offset); - c5[1] = hc_bytealign_be (w7[0], w7[1], offset); - c5[0] = hc_bytealign_be (w6[3], w7[0], offset); - c4[3] = hc_bytealign_be (w6[2], w6[3], offset); - c4[2] = hc_bytealign_be (w6[1], w6[2], offset); - c4[1] = hc_bytealign_be (w6[0], w6[1], offset); - c4[0] = hc_bytealign_be (w5[3], w6[0], offset); - c3[3] = hc_bytealign_be (w5[2], w5[3], offset); - c3[2] = hc_bytealign_be (w5[1], w5[2], offset); - c3[1] = hc_bytealign_be (w5[0], w5[1], offset); - c3[0] = hc_bytealign_be (w4[3], w5[0], offset); - c2[3] = hc_bytealign_be (w4[2], w4[3], offset); - c2[2] = hc_bytealign_be (w4[1], w4[2], offset); - c2[1] = hc_bytealign_be (w4[0], w4[1], offset); - c2[0] = hc_bytealign_be (w3[3], w4[0], offset); - c1[3] = hc_bytealign_be (w3[2], w3[3], offset); - c1[2] = hc_bytealign_be (w3[1], w3[2], offset); - c1[1] = hc_bytealign_be (w3[0], w3[1], offset); - c1[0] = hc_bytealign_be (w2[3], w3[0], offset); - c0[3] = hc_bytealign_be (w2[2], w2[3], offset); - c0[2] = hc_bytealign_be (w2[1], w2[2], offset); - c0[1] = hc_bytealign_be (w2[0], w2[1], offset); - c0[0] = hc_bytealign_be (w1[3], w2[0], offset); w7[3] = hc_bytealign_be (w1[2], w1[3], offset); w7[2] = hc_bytealign_be (w1[1], w1[2], offset); w7[1] = hc_bytealign_be (w1[0], w1[1], offset); @@ -11060,32 +11795,6 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be (u32x *w0, u32x *w1, u32x *w2 break; case 25: - c6[1] = hc_bytealign_be (w7[3], 0, offset); - c6[0] = hc_bytealign_be (w7[2], w7[3], offset); - c5[3] = hc_bytealign_be (w7[1], w7[2], offset); - c5[2] = hc_bytealign_be (w7[0], w7[1], offset); - c5[1] = hc_bytealign_be (w6[3], w7[0], offset); - c5[0] = hc_bytealign_be (w6[2], w6[3], offset); - c4[3] = hc_bytealign_be (w6[1], w6[2], offset); - c4[2] = hc_bytealign_be (w6[0], w6[1], offset); - c4[1] = hc_bytealign_be (w5[3], w6[0], offset); - c4[0] = hc_bytealign_be (w5[2], w5[3], offset); - c3[3] = hc_bytealign_be (w5[1], w5[2], offset); - c3[2] = hc_bytealign_be (w5[0], w5[1], offset); - c3[1] = 
hc_bytealign_be (w4[3], w5[0], offset); - c3[0] = hc_bytealign_be (w4[2], w4[3], offset); - c2[3] = hc_bytealign_be (w4[1], w4[2], offset); - c2[2] = hc_bytealign_be (w4[0], w4[1], offset); - c2[1] = hc_bytealign_be (w3[3], w4[0], offset); - c2[0] = hc_bytealign_be (w3[2], w3[3], offset); - c1[3] = hc_bytealign_be (w3[1], w3[2], offset); - c1[2] = hc_bytealign_be (w3[0], w3[1], offset); - c1[1] = hc_bytealign_be (w2[3], w3[0], offset); - c1[0] = hc_bytealign_be (w2[2], w2[3], offset); - c0[3] = hc_bytealign_be (w2[1], w2[2], offset); - c0[2] = hc_bytealign_be (w2[0], w2[1], offset); - c0[1] = hc_bytealign_be (w1[3], w2[0], offset); - c0[0] = hc_bytealign_be (w1[2], w1[3], offset); w7[3] = hc_bytealign_be (w1[1], w1[2], offset); w7[2] = hc_bytealign_be (w1[0], w1[1], offset); w7[1] = hc_bytealign_be (w0[3], w1[0], offset); @@ -11122,33 +11831,6 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be (u32x *w0, u32x *w1, u32x *w2 break; case 26: - c6[2] = hc_bytealign_be (w7[3], 0, offset); - c6[1] = hc_bytealign_be (w7[2], w7[3], offset); - c6[0] = hc_bytealign_be (w7[1], w7[2], offset); - c5[3] = hc_bytealign_be (w7[0], w7[1], offset); - c5[2] = hc_bytealign_be (w6[3], w7[0], offset); - c5[1] = hc_bytealign_be (w6[2], w6[3], offset); - c5[0] = hc_bytealign_be (w6[1], w6[2], offset); - c4[3] = hc_bytealign_be (w6[0], w6[1], offset); - c4[2] = hc_bytealign_be (w5[3], w6[0], offset); - c4[1] = hc_bytealign_be (w5[2], w5[3], offset); - c4[0] = hc_bytealign_be (w5[1], w5[2], offset); - c3[3] = hc_bytealign_be (w5[0], w5[1], offset); - c3[2] = hc_bytealign_be (w4[3], w5[0], offset); - c3[1] = hc_bytealign_be (w4[2], w4[3], offset); - c3[0] = hc_bytealign_be (w4[1], w4[2], offset); - c2[3] = hc_bytealign_be (w4[0], w4[1], offset); - c2[2] = hc_bytealign_be (w3[3], w4[0], offset); - c2[1] = hc_bytealign_be (w3[2], w3[3], offset); - c2[0] = hc_bytealign_be (w3[1], w3[2], offset); - c1[3] = hc_bytealign_be (w3[0], w3[1], offset); - c1[2] = hc_bytealign_be (w2[3], w3[0], 
offset); - c1[1] = hc_bytealign_be (w2[2], w2[3], offset); - c1[0] = hc_bytealign_be (w2[1], w2[2], offset); - c0[3] = hc_bytealign_be (w2[0], w2[1], offset); - c0[2] = hc_bytealign_be (w1[3], w2[0], offset); - c0[1] = hc_bytealign_be (w1[2], w1[3], offset); - c0[0] = hc_bytealign_be (w1[1], w1[2], offset); w7[3] = hc_bytealign_be (w1[0], w1[1], offset); w7[2] = hc_bytealign_be (w0[3], w1[0], offset); w7[1] = hc_bytealign_be (w0[2], w0[3], offset); @@ -11185,34 +11867,6 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be (u32x *w0, u32x *w1, u32x *w2 break; case 27: - c6[3] = hc_bytealign_be (w7[3], 0, offset); - c6[2] = hc_bytealign_be (w7[2], w7[3], offset); - c6[1] = hc_bytealign_be (w7[1], w7[2], offset); - c6[0] = hc_bytealign_be (w7[0], w7[1], offset); - c5[3] = hc_bytealign_be (w6[3], w7[0], offset); - c5[2] = hc_bytealign_be (w6[2], w6[3], offset); - c5[1] = hc_bytealign_be (w6[1], w6[2], offset); - c5[0] = hc_bytealign_be (w6[0], w6[1], offset); - c4[3] = hc_bytealign_be (w5[3], w6[0], offset); - c4[2] = hc_bytealign_be (w5[2], w5[3], offset); - c4[1] = hc_bytealign_be (w5[1], w5[2], offset); - c4[0] = hc_bytealign_be (w5[0], w5[1], offset); - c3[3] = hc_bytealign_be (w4[3], w5[0], offset); - c3[2] = hc_bytealign_be (w4[2], w4[3], offset); - c3[1] = hc_bytealign_be (w4[1], w4[2], offset); - c3[0] = hc_bytealign_be (w4[0], w4[1], offset); - c2[3] = hc_bytealign_be (w3[3], w4[0], offset); - c2[2] = hc_bytealign_be (w3[2], w3[3], offset); - c2[1] = hc_bytealign_be (w3[1], w3[2], offset); - c2[0] = hc_bytealign_be (w3[0], w3[1], offset); - c1[3] = hc_bytealign_be (w2[3], w3[0], offset); - c1[2] = hc_bytealign_be (w2[2], w2[3], offset); - c1[1] = hc_bytealign_be (w2[1], w2[2], offset); - c1[0] = hc_bytealign_be (w2[0], w2[1], offset); - c0[3] = hc_bytealign_be (w1[3], w2[0], offset); - c0[2] = hc_bytealign_be (w1[2], w1[3], offset); - c0[1] = hc_bytealign_be (w1[1], w1[2], offset); - c0[0] = hc_bytealign_be (w1[0], w1[1], offset); w7[3] = hc_bytealign_be 
(w0[3], w1[0], offset); w7[2] = hc_bytealign_be (w0[2], w0[3], offset); w7[1] = hc_bytealign_be (w0[1], w0[2], offset); @@ -11249,35 +11903,6 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be (u32x *w0, u32x *w1, u32x *w2 break; case 28: - c7[0] = hc_bytealign_be (w7[3], 0, offset); - c6[3] = hc_bytealign_be (w7[2], w7[3], offset); - c6[2] = hc_bytealign_be (w7[1], w7[2], offset); - c6[1] = hc_bytealign_be (w7[0], w7[1], offset); - c6[0] = hc_bytealign_be (w6[3], w7[0], offset); - c5[3] = hc_bytealign_be (w6[2], w6[3], offset); - c5[2] = hc_bytealign_be (w6[1], w6[2], offset); - c5[1] = hc_bytealign_be (w6[0], w6[1], offset); - c5[0] = hc_bytealign_be (w5[3], w6[0], offset); - c4[3] = hc_bytealign_be (w5[2], w5[3], offset); - c4[2] = hc_bytealign_be (w5[1], w5[2], offset); - c4[1] = hc_bytealign_be (w5[0], w5[1], offset); - c4[0] = hc_bytealign_be (w4[3], w5[0], offset); - c3[3] = hc_bytealign_be (w4[2], w4[3], offset); - c3[2] = hc_bytealign_be (w4[1], w4[2], offset); - c3[1] = hc_bytealign_be (w4[0], w4[1], offset); - c3[0] = hc_bytealign_be (w3[3], w4[0], offset); - c2[3] = hc_bytealign_be (w3[2], w3[3], offset); - c2[2] = hc_bytealign_be (w3[1], w3[2], offset); - c2[1] = hc_bytealign_be (w3[0], w3[1], offset); - c2[0] = hc_bytealign_be (w2[3], w3[0], offset); - c1[3] = hc_bytealign_be (w2[2], w2[3], offset); - c1[2] = hc_bytealign_be (w2[1], w2[2], offset); - c1[1] = hc_bytealign_be (w2[0], w2[1], offset); - c1[0] = hc_bytealign_be (w1[3], w2[0], offset); - c0[3] = hc_bytealign_be (w1[2], w1[3], offset); - c0[2] = hc_bytealign_be (w1[1], w1[2], offset); - c0[1] = hc_bytealign_be (w1[0], w1[1], offset); - c0[0] = hc_bytealign_be (w0[3], w1[0], offset); w7[3] = hc_bytealign_be (w0[2], w0[3], offset); w7[2] = hc_bytealign_be (w0[1], w0[2], offset); w7[1] = hc_bytealign_be (w0[0], w0[1], offset); @@ -11314,36 +11939,6 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be (u32x *w0, u32x *w1, u32x *w2 break; case 29: - c7[1] = hc_bytealign_be (w7[3], 0, offset); 
- c7[0] = hc_bytealign_be (w7[2], w7[3], offset); - c6[3] = hc_bytealign_be (w7[1], w7[2], offset); - c6[2] = hc_bytealign_be (w7[0], w7[1], offset); - c6[1] = hc_bytealign_be (w6[3], w7[0], offset); - c6[0] = hc_bytealign_be (w6[2], w6[3], offset); - c5[3] = hc_bytealign_be (w6[1], w6[2], offset); - c5[2] = hc_bytealign_be (w6[0], w6[1], offset); - c5[1] = hc_bytealign_be (w5[3], w6[0], offset); - c5[0] = hc_bytealign_be (w5[2], w5[3], offset); - c4[3] = hc_bytealign_be (w5[1], w5[2], offset); - c4[2] = hc_bytealign_be (w5[0], w5[1], offset); - c4[1] = hc_bytealign_be (w4[3], w5[0], offset); - c4[0] = hc_bytealign_be (w4[2], w4[3], offset); - c3[3] = hc_bytealign_be (w4[1], w4[2], offset); - c3[2] = hc_bytealign_be (w4[0], w4[1], offset); - c3[1] = hc_bytealign_be (w3[3], w4[0], offset); - c3[0] = hc_bytealign_be (w3[2], w3[3], offset); - c2[3] = hc_bytealign_be (w3[1], w3[2], offset); - c2[2] = hc_bytealign_be (w3[0], w3[1], offset); - c2[1] = hc_bytealign_be (w2[3], w3[0], offset); - c2[0] = hc_bytealign_be (w2[2], w2[3], offset); - c1[3] = hc_bytealign_be (w2[1], w2[2], offset); - c1[2] = hc_bytealign_be (w2[0], w2[1], offset); - c1[1] = hc_bytealign_be (w1[3], w2[0], offset); - c1[0] = hc_bytealign_be (w1[2], w1[3], offset); - c0[3] = hc_bytealign_be (w1[1], w1[2], offset); - c0[2] = hc_bytealign_be (w1[0], w1[1], offset); - c0[1] = hc_bytealign_be (w0[3], w1[0], offset); - c0[0] = hc_bytealign_be (w0[2], w0[3], offset); w7[3] = hc_bytealign_be (w0[1], w0[2], offset); w7[2] = hc_bytealign_be (w0[0], w0[1], offset); w7[1] = hc_bytealign_be ( 0, w0[0], offset); @@ -11380,37 +11975,6 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be (u32x *w0, u32x *w1, u32x *w2 break; case 30: - c7[2] = hc_bytealign_be (w7[3], 0, offset); - c7[1] = hc_bytealign_be (w7[2], w7[3], offset); - c7[0] = hc_bytealign_be (w7[1], w7[2], offset); - c6[3] = hc_bytealign_be (w7[0], w7[1], offset); - c6[2] = hc_bytealign_be (w6[3], w7[0], offset); - c6[1] = hc_bytealign_be (w6[2], w6[3], 
offset); - c6[0] = hc_bytealign_be (w6[1], w6[2], offset); - c5[3] = hc_bytealign_be (w6[0], w6[1], offset); - c5[2] = hc_bytealign_be (w5[3], w6[0], offset); - c5[1] = hc_bytealign_be (w5[2], w5[3], offset); - c5[0] = hc_bytealign_be (w5[1], w5[2], offset); - c4[3] = hc_bytealign_be (w5[0], w5[1], offset); - c4[2] = hc_bytealign_be (w4[3], w5[0], offset); - c4[1] = hc_bytealign_be (w4[2], w4[3], offset); - c4[0] = hc_bytealign_be (w4[1], w4[2], offset); - c3[3] = hc_bytealign_be (w4[0], w4[1], offset); - c3[2] = hc_bytealign_be (w3[3], w4[0], offset); - c3[1] = hc_bytealign_be (w3[2], w3[3], offset); - c3[0] = hc_bytealign_be (w3[1], w3[2], offset); - c2[3] = hc_bytealign_be (w3[0], w3[1], offset); - c2[2] = hc_bytealign_be (w2[3], w3[0], offset); - c2[1] = hc_bytealign_be (w2[2], w2[3], offset); - c2[0] = hc_bytealign_be (w2[1], w2[2], offset); - c1[3] = hc_bytealign_be (w2[0], w2[1], offset); - c1[2] = hc_bytealign_be (w1[3], w2[0], offset); - c1[1] = hc_bytealign_be (w1[2], w1[3], offset); - c1[0] = hc_bytealign_be (w1[1], w1[2], offset); - c0[3] = hc_bytealign_be (w1[0], w1[1], offset); - c0[2] = hc_bytealign_be (w0[3], w1[0], offset); - c0[1] = hc_bytealign_be (w0[2], w0[3], offset); - c0[0] = hc_bytealign_be (w0[1], w0[2], offset); w7[3] = hc_bytealign_be (w0[0], w0[1], offset); w7[2] = hc_bytealign_be ( 0, w0[0], offset); w7[1] = 0; @@ -11447,38 +12011,6 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be (u32x *w0, u32x *w1, u32x *w2 break; case 31: - c7[3] = hc_bytealign_be (w7[3], 0, offset); - c7[2] = hc_bytealign_be (w7[2], w7[3], offset); - c7[1] = hc_bytealign_be (w7[1], w7[2], offset); - c7[0] = hc_bytealign_be (w7[0], w7[1], offset); - c6[3] = hc_bytealign_be (w6[3], w7[0], offset); - c6[2] = hc_bytealign_be (w6[2], w6[3], offset); - c6[1] = hc_bytealign_be (w6[1], w6[2], offset); - c6[0] = hc_bytealign_be (w6[0], w6[1], offset); - c5[3] = hc_bytealign_be (w5[3], w6[0], offset); - c5[2] = hc_bytealign_be (w5[2], w5[3], offset); - c5[1] = 
hc_bytealign_be (w5[1], w5[2], offset); - c5[0] = hc_bytealign_be (w5[0], w5[1], offset); - c4[3] = hc_bytealign_be (w4[3], w5[0], offset); - c4[2] = hc_bytealign_be (w4[2], w4[3], offset); - c4[1] = hc_bytealign_be (w4[1], w4[2], offset); - c4[0] = hc_bytealign_be (w4[0], w4[1], offset); - c3[3] = hc_bytealign_be (w3[3], w4[0], offset); - c3[2] = hc_bytealign_be (w3[2], w3[3], offset); - c3[1] = hc_bytealign_be (w3[1], w3[2], offset); - c3[0] = hc_bytealign_be (w3[0], w3[1], offset); - c2[3] = hc_bytealign_be (w2[3], w3[0], offset); - c2[2] = hc_bytealign_be (w2[2], w2[3], offset); - c2[1] = hc_bytealign_be (w2[1], w2[2], offset); - c2[0] = hc_bytealign_be (w2[0], w2[1], offset); - c1[3] = hc_bytealign_be (w1[3], w2[0], offset); - c1[2] = hc_bytealign_be (w1[2], w1[3], offset); - c1[1] = hc_bytealign_be (w1[1], w1[2], offset); - c1[0] = hc_bytealign_be (w1[0], w1[1], offset); - c0[3] = hc_bytealign_be (w0[3], w1[0], offset); - c0[2] = hc_bytealign_be (w0[2], w0[3], offset); - c0[1] = hc_bytealign_be (w0[1], w0[2], offset); - c0[0] = hc_bytealign_be (w0[0], w0[1], offset); w7[3] = hc_bytealign_be ( 0, w0[0], offset); w7[2] = 0; w7[1] = 0; @@ -11529,7 +12061,6 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be (u32x *w0, u32x *w1, u32x *w2 switch (offset_switch) { case 0: - c0[0] = hc_byte_perm ( 0, w7[3], selector); w7[3] = hc_byte_perm (w7[3], w7[2], selector); w7[2] = hc_byte_perm (w7[2], w7[1], selector); w7[1] = hc_byte_perm (w7[1], w7[0], selector); @@ -11566,8 +12097,6 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be (u32x *w0, u32x *w1, u32x *w2 break; case 1: - c0[1] = hc_byte_perm ( 0, w7[3], selector); - c0[0] = hc_byte_perm (w7[3], w7[2], selector); w7[3] = hc_byte_perm (w7[2], w7[1], selector); w7[2] = hc_byte_perm (w7[1], w7[0], selector); w7[1] = hc_byte_perm (w7[0], w6[3], selector); @@ -11604,9 +12133,6 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be (u32x *w0, u32x *w1, u32x *w2 break; case 2: - c0[2] = hc_byte_perm ( 0, w7[3], 
selector); - c0[1] = hc_byte_perm (w7[3], w7[2], selector); - c0[0] = hc_byte_perm (w7[2], w7[1], selector); w7[3] = hc_byte_perm (w7[1], w7[0], selector); w7[2] = hc_byte_perm (w7[0], w6[3], selector); w7[1] = hc_byte_perm (w6[3], w6[2], selector); @@ -11643,10 +12169,6 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be (u32x *w0, u32x *w1, u32x *w2 break; case 3: - c0[3] = hc_byte_perm ( 0, w7[3], selector); - c0[2] = hc_byte_perm (w7[3], w7[2], selector); - c0[1] = hc_byte_perm (w7[2], w7[1], selector); - c0[0] = hc_byte_perm (w7[1], w7[0], selector); w7[3] = hc_byte_perm (w7[0], w6[3], selector); w7[2] = hc_byte_perm (w6[3], w6[2], selector); w7[1] = hc_byte_perm (w6[2], w6[1], selector); @@ -11683,11 +12205,6 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be (u32x *w0, u32x *w1, u32x *w2 break; case 4: - c1[0] = hc_byte_perm ( 0, w7[3], selector); - c0[3] = hc_byte_perm (w7[3], w7[2], selector); - c0[2] = hc_byte_perm (w7[2], w7[1], selector); - c0[1] = hc_byte_perm (w7[1], w7[0], selector); - c0[0] = hc_byte_perm (w7[0], w6[3], selector); w7[3] = hc_byte_perm (w6[3], w6[2], selector); w7[2] = hc_byte_perm (w6[2], w6[1], selector); w7[1] = hc_byte_perm (w6[1], w6[0], selector); @@ -11724,12 +12241,6 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be (u32x *w0, u32x *w1, u32x *w2 break; case 5: - c1[1] = hc_byte_perm ( 0, w7[3], selector); - c1[0] = hc_byte_perm (w7[3], w7[2], selector); - c0[3] = hc_byte_perm (w7[2], w7[1], selector); - c0[2] = hc_byte_perm (w7[1], w7[0], selector); - c0[1] = hc_byte_perm (w7[0], w6[3], selector); - c0[0] = hc_byte_perm (w6[3], w6[2], selector); w7[3] = hc_byte_perm (w6[2], w6[1], selector); w7[2] = hc_byte_perm (w6[1], w6[0], selector); w7[1] = hc_byte_perm (w6[0], w5[3], selector); @@ -11766,13 +12277,6 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be (u32x *w0, u32x *w1, u32x *w2 break; case 6: - c1[2] = hc_byte_perm ( 0, w7[3], selector); - c1[1] = hc_byte_perm (w7[3], w7[2], selector); - c1[0] = 
hc_byte_perm (w7[2], w7[1], selector); - c0[3] = hc_byte_perm (w7[1], w7[0], selector); - c0[2] = hc_byte_perm (w7[0], w6[3], selector); - c0[1] = hc_byte_perm (w6[3], w6[2], selector); - c0[0] = hc_byte_perm (w6[2], w6[1], selector); w7[3] = hc_byte_perm (w6[1], w6[0], selector); w7[2] = hc_byte_perm (w6[0], w5[3], selector); w7[1] = hc_byte_perm (w5[3], w5[2], selector); @@ -11809,14 +12313,6 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be (u32x *w0, u32x *w1, u32x *w2 break; case 7: - c1[3] = hc_byte_perm ( 0, w7[3], selector); - c1[2] = hc_byte_perm (w7[3], w7[2], selector); - c1[1] = hc_byte_perm (w7[2], w7[1], selector); - c1[0] = hc_byte_perm (w7[1], w7[0], selector); - c0[3] = hc_byte_perm (w7[0], w6[3], selector); - c0[2] = hc_byte_perm (w6[3], w6[2], selector); - c0[1] = hc_byte_perm (w6[2], w6[1], selector); - c0[0] = hc_byte_perm (w6[1], w6[0], selector); w7[3] = hc_byte_perm (w6[0], w5[3], selector); w7[2] = hc_byte_perm (w5[3], w5[2], selector); w7[1] = hc_byte_perm (w5[2], w5[1], selector); @@ -11853,15 +12349,6 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be (u32x *w0, u32x *w1, u32x *w2 break; case 8: - c2[0] = hc_byte_perm ( 0, w7[3], selector); - c1[3] = hc_byte_perm (w7[3], w7[2], selector); - c1[2] = hc_byte_perm (w7[2], w7[1], selector); - c1[1] = hc_byte_perm (w7[1], w7[0], selector); - c1[0] = hc_byte_perm (w7[0], w6[3], selector); - c0[3] = hc_byte_perm (w6[3], w6[2], selector); - c0[2] = hc_byte_perm (w6[2], w6[1], selector); - c0[1] = hc_byte_perm (w6[1], w6[0], selector); - c0[0] = hc_byte_perm (w6[0], w5[3], selector); w7[3] = hc_byte_perm (w5[3], w5[2], selector); w7[2] = hc_byte_perm (w5[2], w5[1], selector); w7[1] = hc_byte_perm (w5[1], w5[0], selector); @@ -11898,16 +12385,6 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be (u32x *w0, u32x *w1, u32x *w2 break; case 9: - c2[1] = hc_byte_perm ( 0, w7[3], selector); - c2[0] = hc_byte_perm (w7[3], w7[2], selector); - c1[3] = hc_byte_perm (w7[2], w7[1], selector); - c1[2] 
= hc_byte_perm (w7[1], w7[0], selector); - c1[1] = hc_byte_perm (w7[0], w6[3], selector); - c1[0] = hc_byte_perm (w6[3], w6[2], selector); - c0[3] = hc_byte_perm (w6[2], w6[1], selector); - c0[2] = hc_byte_perm (w6[1], w6[0], selector); - c0[1] = hc_byte_perm (w6[0], w5[3], selector); - c0[0] = hc_byte_perm (w5[3], w5[2], selector); w7[3] = hc_byte_perm (w5[2], w5[1], selector); w7[2] = hc_byte_perm (w5[1], w5[0], selector); w7[1] = hc_byte_perm (w5[0], w4[3], selector); @@ -11944,17 +12421,6 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be (u32x *w0, u32x *w1, u32x *w2 break; case 10: - c2[2] = hc_byte_perm ( 0, w7[3], selector); - c2[1] = hc_byte_perm (w7[3], w7[2], selector); - c2[0] = hc_byte_perm (w7[2], w7[1], selector); - c1[3] = hc_byte_perm (w7[1], w7[0], selector); - c1[2] = hc_byte_perm (w7[0], w6[3], selector); - c1[1] = hc_byte_perm (w6[3], w6[2], selector); - c1[0] = hc_byte_perm (w6[2], w6[1], selector); - c0[3] = hc_byte_perm (w6[1], w6[0], selector); - c0[2] = hc_byte_perm (w6[0], w5[3], selector); - c0[1] = hc_byte_perm (w5[3], w5[2], selector); - c0[0] = hc_byte_perm (w5[2], w5[1], selector); w7[3] = hc_byte_perm (w5[1], w5[0], selector); w7[2] = hc_byte_perm (w5[0], w4[3], selector); w7[1] = hc_byte_perm (w4[3], w4[2], selector); @@ -11991,18 +12457,6 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be (u32x *w0, u32x *w1, u32x *w2 break; case 11: - c2[3] = hc_byte_perm ( 0, w7[3], selector); - c2[2] = hc_byte_perm (w7[3], w7[2], selector); - c2[1] = hc_byte_perm (w7[2], w7[1], selector); - c2[0] = hc_byte_perm (w7[1], w7[0], selector); - c1[3] = hc_byte_perm (w7[0], w6[3], selector); - c1[2] = hc_byte_perm (w6[3], w6[2], selector); - c1[1] = hc_byte_perm (w6[2], w6[1], selector); - c1[0] = hc_byte_perm (w6[1], w6[0], selector); - c0[3] = hc_byte_perm (w6[0], w5[3], selector); - c0[2] = hc_byte_perm (w5[3], w5[2], selector); - c0[1] = hc_byte_perm (w5[2], w5[1], selector); - c0[0] = hc_byte_perm (w5[1], w5[0], selector); w7[3] = 
hc_byte_perm (w5[0], w4[3], selector); w7[2] = hc_byte_perm (w4[3], w4[2], selector); w7[1] = hc_byte_perm (w4[2], w4[1], selector); @@ -12039,19 +12493,6 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be (u32x *w0, u32x *w1, u32x *w2 break; case 12: - c3[0] = hc_byte_perm ( 0, w7[3], selector); - c2[3] = hc_byte_perm (w7[3], w7[2], selector); - c2[2] = hc_byte_perm (w7[2], w7[1], selector); - c2[1] = hc_byte_perm (w7[1], w7[0], selector); - c2[0] = hc_byte_perm (w7[0], w6[3], selector); - c1[3] = hc_byte_perm (w6[3], w6[2], selector); - c1[2] = hc_byte_perm (w6[2], w6[1], selector); - c1[1] = hc_byte_perm (w6[1], w6[0], selector); - c1[0] = hc_byte_perm (w6[0], w5[3], selector); - c0[3] = hc_byte_perm (w5[3], w5[2], selector); - c0[2] = hc_byte_perm (w5[2], w5[1], selector); - c0[1] = hc_byte_perm (w5[1], w5[0], selector); - c0[0] = hc_byte_perm (w5[0], w4[3], selector); w7[3] = hc_byte_perm (w4[3], w4[2], selector); w7[2] = hc_byte_perm (w4[2], w4[1], selector); w7[1] = hc_byte_perm (w4[1], w4[0], selector); @@ -12088,20 +12529,6 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be (u32x *w0, u32x *w1, u32x *w2 break; case 13: - c3[1] = hc_byte_perm ( 0, w7[3], selector); - c3[0] = hc_byte_perm (w7[3], w7[2], selector); - c2[3] = hc_byte_perm (w7[2], w7[1], selector); - c2[2] = hc_byte_perm (w7[1], w7[0], selector); - c2[1] = hc_byte_perm (w7[0], w6[3], selector); - c2[0] = hc_byte_perm (w6[3], w6[2], selector); - c1[3] = hc_byte_perm (w6[2], w6[1], selector); - c1[2] = hc_byte_perm (w6[1], w6[0], selector); - c1[1] = hc_byte_perm (w6[0], w5[3], selector); - c1[0] = hc_byte_perm (w5[3], w5[2], selector); - c0[3] = hc_byte_perm (w5[2], w5[1], selector); - c0[2] = hc_byte_perm (w5[1], w5[0], selector); - c0[1] = hc_byte_perm (w5[0], w4[3], selector); - c0[0] = hc_byte_perm (w4[3], w4[2], selector); w7[3] = hc_byte_perm (w4[2], w4[1], selector); w7[2] = hc_byte_perm (w4[1], w4[0], selector); w7[1] = hc_byte_perm (w4[0], w3[3], selector); @@ -12138,21 +12565,6 
@@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be (u32x *w0, u32x *w1, u32x *w2 break; case 14: - c3[2] = hc_byte_perm ( 0, w7[3], selector); - c3[1] = hc_byte_perm (w7[3], w7[2], selector); - c3[0] = hc_byte_perm (w7[2], w7[1], selector); - c2[3] = hc_byte_perm (w7[1], w7[0], selector); - c2[2] = hc_byte_perm (w7[0], w6[3], selector); - c2[1] = hc_byte_perm (w6[3], w6[2], selector); - c2[0] = hc_byte_perm (w6[2], w6[1], selector); - c1[3] = hc_byte_perm (w6[1], w6[0], selector); - c1[2] = hc_byte_perm (w6[0], w5[3], selector); - c1[1] = hc_byte_perm (w5[3], w5[2], selector); - c1[0] = hc_byte_perm (w5[2], w5[1], selector); - c0[3] = hc_byte_perm (w5[1], w5[0], selector); - c0[2] = hc_byte_perm (w5[0], w4[3], selector); - c0[1] = hc_byte_perm (w4[3], w4[2], selector); - c0[0] = hc_byte_perm (w4[2], w4[1], selector); w7[3] = hc_byte_perm (w4[1], w4[0], selector); w7[2] = hc_byte_perm (w4[0], w3[3], selector); w7[1] = hc_byte_perm (w3[3], w3[2], selector); @@ -12189,22 +12601,6 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be (u32x *w0, u32x *w1, u32x *w2 break; case 15: - c3[3] = hc_byte_perm ( 0, w7[3], selector); - c3[2] = hc_byte_perm (w7[3], w7[2], selector); - c3[1] = hc_byte_perm (w7[2], w7[1], selector); - c3[0] = hc_byte_perm (w7[1], w7[0], selector); - c2[3] = hc_byte_perm (w7[0], w6[3], selector); - c2[2] = hc_byte_perm (w6[3], w6[2], selector); - c2[1] = hc_byte_perm (w6[2], w6[1], selector); - c2[0] = hc_byte_perm (w6[1], w6[0], selector); - c1[3] = hc_byte_perm (w6[0], w5[3], selector); - c1[2] = hc_byte_perm (w5[3], w5[2], selector); - c1[1] = hc_byte_perm (w5[2], w5[1], selector); - c1[0] = hc_byte_perm (w5[1], w5[0], selector); - c0[3] = hc_byte_perm (w5[0], w4[3], selector); - c0[2] = hc_byte_perm (w4[3], w4[2], selector); - c0[1] = hc_byte_perm (w4[2], w4[1], selector); - c0[0] = hc_byte_perm (w4[1], w4[0], selector); w7[3] = hc_byte_perm (w4[0], w3[3], selector); w7[2] = hc_byte_perm (w3[3], w3[2], selector); w7[1] = hc_byte_perm (w3[2], 
w3[1], selector); @@ -12241,23 +12637,6 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be (u32x *w0, u32x *w1, u32x *w2 break; case 16: - c4[0] = hc_byte_perm ( 0, w7[3], selector); - c3[3] = hc_byte_perm (w7[3], w7[2], selector); - c3[2] = hc_byte_perm (w7[2], w7[1], selector); - c3[1] = hc_byte_perm (w7[1], w7[0], selector); - c3[0] = hc_byte_perm (w7[0], w6[3], selector); - c2[3] = hc_byte_perm (w6[3], w6[2], selector); - c2[2] = hc_byte_perm (w6[2], w6[1], selector); - c2[1] = hc_byte_perm (w6[1], w6[0], selector); - c2[0] = hc_byte_perm (w6[0], w5[3], selector); - c1[3] = hc_byte_perm (w5[3], w5[2], selector); - c1[2] = hc_byte_perm (w5[2], w5[1], selector); - c1[1] = hc_byte_perm (w5[1], w5[0], selector); - c1[0] = hc_byte_perm (w5[0], w4[3], selector); - c0[3] = hc_byte_perm (w4[3], w4[2], selector); - c0[2] = hc_byte_perm (w4[2], w4[1], selector); - c0[1] = hc_byte_perm (w4[1], w4[0], selector); - c0[0] = hc_byte_perm (w4[0], w3[3], selector); w7[3] = hc_byte_perm (w3[3], w3[2], selector); w7[2] = hc_byte_perm (w3[2], w3[1], selector); w7[1] = hc_byte_perm (w3[1], w3[0], selector); @@ -12294,24 +12673,6 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be (u32x *w0, u32x *w1, u32x *w2 break; case 17: - c4[1] = hc_byte_perm ( 0, w7[3], selector); - c4[0] = hc_byte_perm (w7[3], w7[2], selector); - c3[3] = hc_byte_perm (w7[2], w7[1], selector); - c3[2] = hc_byte_perm (w7[1], w7[0], selector); - c3[1] = hc_byte_perm (w7[0], w6[3], selector); - c3[0] = hc_byte_perm (w6[3], w6[2], selector); - c2[3] = hc_byte_perm (w6[2], w6[1], selector); - c2[2] = hc_byte_perm (w6[1], w6[0], selector); - c2[1] = hc_byte_perm (w6[0], w5[3], selector); - c2[0] = hc_byte_perm (w5[3], w5[2], selector); - c1[3] = hc_byte_perm (w5[2], w5[1], selector); - c1[2] = hc_byte_perm (w5[1], w5[0], selector); - c1[1] = hc_byte_perm (w5[0], w4[3], selector); - c1[0] = hc_byte_perm (w4[3], w4[2], selector); - c0[3] = hc_byte_perm (w4[2], w4[1], selector); - c0[2] = hc_byte_perm (w4[1], 
w4[0], selector); - c0[1] = hc_byte_perm (w4[0], w3[3], selector); - c0[0] = hc_byte_perm (w3[3], w3[2], selector); w7[3] = hc_byte_perm (w3[2], w3[1], selector); w7[2] = hc_byte_perm (w3[1], w3[0], selector); w7[1] = hc_byte_perm (w3[0], w2[3], selector); @@ -12348,25 +12709,6 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be (u32x *w0, u32x *w1, u32x *w2 break; case 18: - c4[2] = hc_byte_perm ( 0, w7[3], selector); - c4[1] = hc_byte_perm (w7[3], w7[2], selector); - c4[0] = hc_byte_perm (w7[2], w7[1], selector); - c3[3] = hc_byte_perm (w7[1], w7[0], selector); - c3[2] = hc_byte_perm (w7[0], w6[3], selector); - c3[1] = hc_byte_perm (w6[3], w6[2], selector); - c3[0] = hc_byte_perm (w6[2], w6[1], selector); - c2[3] = hc_byte_perm (w6[1], w6[0], selector); - c2[2] = hc_byte_perm (w6[0], w5[3], selector); - c2[1] = hc_byte_perm (w5[3], w5[2], selector); - c2[0] = hc_byte_perm (w5[2], w5[1], selector); - c1[3] = hc_byte_perm (w5[1], w5[0], selector); - c1[2] = hc_byte_perm (w5[0], w4[3], selector); - c1[1] = hc_byte_perm (w4[3], w4[2], selector); - c1[0] = hc_byte_perm (w4[2], w4[1], selector); - c0[3] = hc_byte_perm (w4[1], w4[0], selector); - c0[2] = hc_byte_perm (w4[0], w3[3], selector); - c0[1] = hc_byte_perm (w3[3], w3[2], selector); - c0[0] = hc_byte_perm (w3[2], w3[1], selector); w7[3] = hc_byte_perm (w3[1], w3[0], selector); w7[2] = hc_byte_perm (w3[0], w2[3], selector); w7[1] = hc_byte_perm (w2[3], w2[2], selector); @@ -12403,26 +12745,6 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be (u32x *w0, u32x *w1, u32x *w2 break; case 19: - c4[3] = hc_byte_perm ( 0, w7[3], selector); - c4[2] = hc_byte_perm (w7[3], w7[2], selector); - c4[1] = hc_byte_perm (w7[2], w7[1], selector); - c4[0] = hc_byte_perm (w7[1], w7[0], selector); - c3[3] = hc_byte_perm (w7[0], w6[3], selector); - c3[2] = hc_byte_perm (w6[3], w6[2], selector); - c3[1] = hc_byte_perm (w6[2], w6[1], selector); - c3[0] = hc_byte_perm (w6[1], w6[0], selector); - c2[3] = hc_byte_perm (w6[0], w5[3], 
selector); - c2[2] = hc_byte_perm (w5[3], w5[2], selector); - c2[1] = hc_byte_perm (w5[2], w5[1], selector); - c2[0] = hc_byte_perm (w5[1], w5[0], selector); - c1[3] = hc_byte_perm (w5[0], w4[3], selector); - c1[2] = hc_byte_perm (w4[3], w4[2], selector); - c1[1] = hc_byte_perm (w4[2], w4[1], selector); - c1[0] = hc_byte_perm (w4[1], w4[0], selector); - c0[3] = hc_byte_perm (w4[0], w3[3], selector); - c0[2] = hc_byte_perm (w3[3], w3[2], selector); - c0[1] = hc_byte_perm (w3[2], w3[1], selector); - c0[0] = hc_byte_perm (w3[1], w3[0], selector); w7[3] = hc_byte_perm (w3[0], w2[3], selector); w7[2] = hc_byte_perm (w2[3], w2[2], selector); w7[1] = hc_byte_perm (w2[2], w2[1], selector); @@ -12459,27 +12781,6 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be (u32x *w0, u32x *w1, u32x *w2 break; case 20: - c5[0] = hc_byte_perm ( 0, w7[3], selector); - c4[3] = hc_byte_perm (w7[3], w7[2], selector); - c4[2] = hc_byte_perm (w7[2], w7[1], selector); - c4[1] = hc_byte_perm (w7[1], w7[0], selector); - c4[0] = hc_byte_perm (w7[0], w6[3], selector); - c3[3] = hc_byte_perm (w6[3], w6[2], selector); - c3[2] = hc_byte_perm (w6[2], w6[1], selector); - c3[1] = hc_byte_perm (w6[1], w6[0], selector); - c3[0] = hc_byte_perm (w6[0], w5[3], selector); - c2[3] = hc_byte_perm (w5[3], w5[2], selector); - c2[2] = hc_byte_perm (w5[2], w5[1], selector); - c2[1] = hc_byte_perm (w5[1], w5[0], selector); - c2[0] = hc_byte_perm (w5[0], w4[3], selector); - c1[3] = hc_byte_perm (w4[3], w4[2], selector); - c1[2] = hc_byte_perm (w4[2], w4[1], selector); - c1[1] = hc_byte_perm (w4[1], w4[0], selector); - c1[0] = hc_byte_perm (w4[0], w3[3], selector); - c0[3] = hc_byte_perm (w3[3], w3[2], selector); - c0[2] = hc_byte_perm (w3[2], w3[1], selector); - c0[1] = hc_byte_perm (w3[1], w3[0], selector); - c0[0] = hc_byte_perm (w3[0], w2[3], selector); w7[3] = hc_byte_perm (w2[3], w2[2], selector); w7[2] = hc_byte_perm (w2[2], w2[1], selector); w7[1] = hc_byte_perm (w2[1], w2[0], selector); @@ -12516,28 
+12817,6 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be (u32x *w0, u32x *w1, u32x *w2 break; case 21: - c5[1] = hc_byte_perm ( 0, w7[3], selector); - c5[0] = hc_byte_perm (w7[3], w7[2], selector); - c4[3] = hc_byte_perm (w7[2], w7[1], selector); - c4[2] = hc_byte_perm (w7[1], w7[0], selector); - c4[1] = hc_byte_perm (w7[0], w6[3], selector); - c4[0] = hc_byte_perm (w6[3], w6[2], selector); - c3[3] = hc_byte_perm (w6[2], w6[1], selector); - c3[2] = hc_byte_perm (w6[1], w6[0], selector); - c3[1] = hc_byte_perm (w6[0], w5[3], selector); - c3[0] = hc_byte_perm (w5[3], w5[2], selector); - c2[3] = hc_byte_perm (w5[2], w5[1], selector); - c2[2] = hc_byte_perm (w5[1], w5[0], selector); - c2[1] = hc_byte_perm (w5[0], w4[3], selector); - c2[0] = hc_byte_perm (w4[3], w4[2], selector); - c1[3] = hc_byte_perm (w4[2], w4[1], selector); - c1[2] = hc_byte_perm (w4[1], w4[0], selector); - c1[1] = hc_byte_perm (w4[0], w3[3], selector); - c1[0] = hc_byte_perm (w3[3], w3[2], selector); - c0[3] = hc_byte_perm (w3[2], w3[1], selector); - c0[2] = hc_byte_perm (w3[1], w3[0], selector); - c0[1] = hc_byte_perm (w3[0], w2[3], selector); - c0[0] = hc_byte_perm (w2[3], w2[2], selector); w7[3] = hc_byte_perm (w2[2], w2[1], selector); w7[2] = hc_byte_perm (w2[1], w2[0], selector); w7[1] = hc_byte_perm (w2[0], w1[3], selector); @@ -12574,29 +12853,6 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be (u32x *w0, u32x *w1, u32x *w2 break; case 22: - c5[2] = hc_byte_perm ( 0, w7[3], selector); - c5[1] = hc_byte_perm (w7[3], w7[2], selector); - c5[0] = hc_byte_perm (w7[2], w7[1], selector); - c4[3] = hc_byte_perm (w7[1], w7[0], selector); - c4[2] = hc_byte_perm (w7[0], w6[3], selector); - c4[1] = hc_byte_perm (w6[3], w6[2], selector); - c4[0] = hc_byte_perm (w6[2], w6[1], selector); - c3[3] = hc_byte_perm (w6[1], w6[0], selector); - c3[2] = hc_byte_perm (w6[0], w5[3], selector); - c3[1] = hc_byte_perm (w5[3], w5[2], selector); - c3[0] = hc_byte_perm (w5[2], w5[1], selector); - c2[3] = 
hc_byte_perm (w5[1], w5[0], selector); - c2[2] = hc_byte_perm (w5[0], w4[3], selector); - c2[1] = hc_byte_perm (w4[3], w4[2], selector); - c2[0] = hc_byte_perm (w4[2], w4[1], selector); - c1[3] = hc_byte_perm (w4[1], w4[0], selector); - c1[2] = hc_byte_perm (w4[0], w3[3], selector); - c1[1] = hc_byte_perm (w3[3], w3[2], selector); - c1[0] = hc_byte_perm (w3[2], w3[1], selector); - c0[3] = hc_byte_perm (w3[1], w3[0], selector); - c0[2] = hc_byte_perm (w3[0], w2[3], selector); - c0[1] = hc_byte_perm (w2[3], w2[2], selector); - c0[0] = hc_byte_perm (w2[2], w2[1], selector); w7[3] = hc_byte_perm (w2[1], w2[0], selector); w7[2] = hc_byte_perm (w2[0], w1[3], selector); w7[1] = hc_byte_perm (w1[3], w1[2], selector); @@ -12633,30 +12889,6 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be (u32x *w0, u32x *w1, u32x *w2 break; case 23: - c5[3] = hc_byte_perm ( 0, w7[3], selector); - c5[2] = hc_byte_perm (w7[3], w7[2], selector); - c5[1] = hc_byte_perm (w7[2], w7[1], selector); - c5[0] = hc_byte_perm (w7[1], w7[0], selector); - c4[3] = hc_byte_perm (w7[0], w6[3], selector); - c4[2] = hc_byte_perm (w6[3], w6[2], selector); - c4[1] = hc_byte_perm (w6[2], w6[1], selector); - c4[0] = hc_byte_perm (w6[1], w6[0], selector); - c3[3] = hc_byte_perm (w6[0], w5[3], selector); - c3[2] = hc_byte_perm (w5[3], w5[2], selector); - c3[1] = hc_byte_perm (w5[2], w5[1], selector); - c3[0] = hc_byte_perm (w5[1], w5[0], selector); - c2[3] = hc_byte_perm (w5[0], w4[3], selector); - c2[2] = hc_byte_perm (w4[3], w4[2], selector); - c2[1] = hc_byte_perm (w4[2], w4[1], selector); - c2[0] = hc_byte_perm (w4[1], w4[0], selector); - c1[3] = hc_byte_perm (w4[0], w3[3], selector); - c1[2] = hc_byte_perm (w3[3], w3[2], selector); - c1[1] = hc_byte_perm (w3[2], w3[1], selector); - c1[0] = hc_byte_perm (w3[1], w3[0], selector); - c0[3] = hc_byte_perm (w3[0], w2[3], selector); - c0[2] = hc_byte_perm (w2[3], w2[2], selector); - c0[1] = hc_byte_perm (w2[2], w2[1], selector); - c0[0] = hc_byte_perm (w2[1], 
w2[0], selector); w7[3] = hc_byte_perm (w2[0], w1[3], selector); w7[2] = hc_byte_perm (w1[3], w1[2], selector); w7[1] = hc_byte_perm (w1[2], w1[1], selector); @@ -12693,31 +12925,6 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be (u32x *w0, u32x *w1, u32x *w2 break; case 24: - c6[0] = hc_byte_perm ( 0, w7[3], selector); - c5[3] = hc_byte_perm (w7[3], w7[2], selector); - c5[2] = hc_byte_perm (w7[2], w7[1], selector); - c5[1] = hc_byte_perm (w7[1], w7[0], selector); - c5[0] = hc_byte_perm (w7[0], w6[3], selector); - c4[3] = hc_byte_perm (w6[3], w6[2], selector); - c4[2] = hc_byte_perm (w6[2], w6[1], selector); - c4[1] = hc_byte_perm (w6[1], w6[0], selector); - c4[0] = hc_byte_perm (w6[0], w5[3], selector); - c3[3] = hc_byte_perm (w5[3], w5[2], selector); - c3[2] = hc_byte_perm (w5[2], w5[1], selector); - c3[1] = hc_byte_perm (w5[1], w5[0], selector); - c3[0] = hc_byte_perm (w5[0], w4[3], selector); - c2[3] = hc_byte_perm (w4[3], w4[2], selector); - c2[2] = hc_byte_perm (w4[2], w4[1], selector); - c2[1] = hc_byte_perm (w4[1], w4[0], selector); - c2[0] = hc_byte_perm (w4[0], w3[3], selector); - c1[3] = hc_byte_perm (w3[3], w3[2], selector); - c1[2] = hc_byte_perm (w3[2], w3[1], selector); - c1[1] = hc_byte_perm (w3[1], w3[0], selector); - c1[0] = hc_byte_perm (w3[0], w2[3], selector); - c0[3] = hc_byte_perm (w2[3], w2[2], selector); - c0[2] = hc_byte_perm (w2[2], w2[1], selector); - c0[1] = hc_byte_perm (w2[1], w2[0], selector); - c0[0] = hc_byte_perm (w2[0], w1[3], selector); w7[3] = hc_byte_perm (w1[3], w1[2], selector); w7[2] = hc_byte_perm (w1[2], w1[1], selector); w7[1] = hc_byte_perm (w1[1], w1[0], selector); @@ -12754,32 +12961,6 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be (u32x *w0, u32x *w1, u32x *w2 break; case 25: - c6[1] = hc_byte_perm ( 0, w7[3], selector); - c6[0] = hc_byte_perm (w7[3], w7[2], selector); - c5[3] = hc_byte_perm (w7[2], w7[1], selector); - c5[2] = hc_byte_perm (w7[1], w7[0], selector); - c5[1] = hc_byte_perm (w7[0], w6[3], 
selector); - c5[0] = hc_byte_perm (w6[3], w6[2], selector); - c4[3] = hc_byte_perm (w6[2], w6[1], selector); - c4[2] = hc_byte_perm (w6[1], w6[0], selector); - c4[1] = hc_byte_perm (w6[0], w5[3], selector); - c4[0] = hc_byte_perm (w5[3], w5[2], selector); - c3[3] = hc_byte_perm (w5[2], w5[1], selector); - c3[2] = hc_byte_perm (w5[1], w5[0], selector); - c3[1] = hc_byte_perm (w5[0], w4[3], selector); - c3[0] = hc_byte_perm (w4[3], w4[2], selector); - c2[3] = hc_byte_perm (w4[2], w4[1], selector); - c2[2] = hc_byte_perm (w4[1], w4[0], selector); - c2[1] = hc_byte_perm (w4[0], w3[3], selector); - c2[0] = hc_byte_perm (w3[3], w3[2], selector); - c1[3] = hc_byte_perm (w3[2], w3[1], selector); - c1[2] = hc_byte_perm (w3[1], w3[0], selector); - c1[1] = hc_byte_perm (w3[0], w2[3], selector); - c1[0] = hc_byte_perm (w2[3], w2[2], selector); - c0[3] = hc_byte_perm (w2[2], w2[1], selector); - c0[2] = hc_byte_perm (w2[1], w2[0], selector); - c0[1] = hc_byte_perm (w2[0], w1[3], selector); - c0[0] = hc_byte_perm (w1[3], w1[2], selector); w7[3] = hc_byte_perm (w1[2], w1[1], selector); w7[2] = hc_byte_perm (w1[1], w1[0], selector); w7[1] = hc_byte_perm (w1[0], w0[3], selector); @@ -12816,33 +12997,6 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be (u32x *w0, u32x *w1, u32x *w2 break; case 26: - c6[2] = hc_byte_perm ( 0, w7[3], selector); - c6[1] = hc_byte_perm (w7[3], w7[2], selector); - c6[0] = hc_byte_perm (w7[2], w7[1], selector); - c5[3] = hc_byte_perm (w7[1], w7[0], selector); - c5[2] = hc_byte_perm (w7[0], w6[3], selector); - c5[1] = hc_byte_perm (w6[3], w6[2], selector); - c5[0] = hc_byte_perm (w6[2], w6[1], selector); - c4[3] = hc_byte_perm (w6[1], w6[0], selector); - c4[2] = hc_byte_perm (w6[0], w5[3], selector); - c4[1] = hc_byte_perm (w5[3], w5[2], selector); - c4[0] = hc_byte_perm (w5[2], w5[1], selector); - c3[3] = hc_byte_perm (w5[1], w5[0], selector); - c3[2] = hc_byte_perm (w5[0], w4[3], selector); - c3[1] = hc_byte_perm (w4[3], w4[2], selector); - c3[0] = 
hc_byte_perm (w4[2], w4[1], selector); - c2[3] = hc_byte_perm (w4[1], w4[0], selector); - c2[2] = hc_byte_perm (w4[0], w3[3], selector); - c2[1] = hc_byte_perm (w3[3], w3[2], selector); - c2[0] = hc_byte_perm (w3[2], w3[1], selector); - c1[3] = hc_byte_perm (w3[1], w3[0], selector); - c1[2] = hc_byte_perm (w3[0], w2[3], selector); - c1[1] = hc_byte_perm (w2[3], w2[2], selector); - c1[0] = hc_byte_perm (w2[2], w2[1], selector); - c0[3] = hc_byte_perm (w2[1], w2[0], selector); - c0[2] = hc_byte_perm (w2[0], w1[3], selector); - c0[1] = hc_byte_perm (w1[3], w1[2], selector); - c0[0] = hc_byte_perm (w1[2], w1[1], selector); w7[3] = hc_byte_perm (w1[1], w1[0], selector); w7[2] = hc_byte_perm (w1[0], w0[3], selector); w7[1] = hc_byte_perm (w0[3], w0[2], selector); @@ -12879,34 +13033,6 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be (u32x *w0, u32x *w1, u32x *w2 break; case 27: - c6[3] = hc_byte_perm ( 0, w7[3], selector); - c6[2] = hc_byte_perm (w7[3], w7[2], selector); - c6[1] = hc_byte_perm (w7[2], w7[1], selector); - c6[0] = hc_byte_perm (w7[1], w7[0], selector); - c5[3] = hc_byte_perm (w7[0], w6[3], selector); - c5[2] = hc_byte_perm (w6[3], w6[2], selector); - c5[1] = hc_byte_perm (w6[2], w6[1], selector); - c5[0] = hc_byte_perm (w6[1], w6[0], selector); - c4[3] = hc_byte_perm (w6[0], w5[3], selector); - c4[2] = hc_byte_perm (w5[3], w5[2], selector); - c4[1] = hc_byte_perm (w5[2], w5[1], selector); - c4[0] = hc_byte_perm (w5[1], w5[0], selector); - c3[3] = hc_byte_perm (w5[0], w4[3], selector); - c3[2] = hc_byte_perm (w4[3], w4[2], selector); - c3[1] = hc_byte_perm (w4[2], w4[1], selector); - c3[0] = hc_byte_perm (w4[1], w4[0], selector); - c2[3] = hc_byte_perm (w4[0], w3[3], selector); - c2[2] = hc_byte_perm (w3[3], w3[2], selector); - c2[1] = hc_byte_perm (w3[2], w3[1], selector); - c2[0] = hc_byte_perm (w3[1], w3[0], selector); - c1[3] = hc_byte_perm (w3[0], w2[3], selector); - c1[2] = hc_byte_perm (w2[3], w2[2], selector); - c1[1] = hc_byte_perm (w2[2], 
w2[1], selector); - c1[0] = hc_byte_perm (w2[1], w2[0], selector); - c0[3] = hc_byte_perm (w2[0], w1[3], selector); - c0[2] = hc_byte_perm (w1[3], w1[2], selector); - c0[1] = hc_byte_perm (w1[2], w1[1], selector); - c0[0] = hc_byte_perm (w1[1], w1[0], selector); w7[3] = hc_byte_perm (w1[0], w0[3], selector); w7[2] = hc_byte_perm (w0[3], w0[2], selector); w7[1] = hc_byte_perm (w0[2], w0[1], selector); @@ -12943,35 +13069,6 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be (u32x *w0, u32x *w1, u32x *w2 break; case 28: - c7[0] = hc_byte_perm ( 0, w7[3], selector); - c6[3] = hc_byte_perm (w7[3], w7[2], selector); - c6[2] = hc_byte_perm (w7[2], w7[1], selector); - c6[1] = hc_byte_perm (w7[1], w7[0], selector); - c6[0] = hc_byte_perm (w7[0], w6[3], selector); - c5[3] = hc_byte_perm (w6[3], w6[2], selector); - c5[2] = hc_byte_perm (w6[2], w6[1], selector); - c5[1] = hc_byte_perm (w6[1], w6[0], selector); - c5[0] = hc_byte_perm (w6[0], w5[3], selector); - c4[3] = hc_byte_perm (w5[3], w5[2], selector); - c4[2] = hc_byte_perm (w5[2], w5[1], selector); - c4[1] = hc_byte_perm (w5[1], w5[0], selector); - c4[0] = hc_byte_perm (w5[0], w4[3], selector); - c3[3] = hc_byte_perm (w4[3], w4[2], selector); - c3[2] = hc_byte_perm (w4[2], w4[1], selector); - c3[1] = hc_byte_perm (w4[1], w4[0], selector); - c3[0] = hc_byte_perm (w4[0], w3[3], selector); - c2[3] = hc_byte_perm (w3[3], w3[2], selector); - c2[2] = hc_byte_perm (w3[2], w3[1], selector); - c2[1] = hc_byte_perm (w3[1], w3[0], selector); - c2[0] = hc_byte_perm (w3[0], w2[3], selector); - c1[3] = hc_byte_perm (w2[3], w2[2], selector); - c1[2] = hc_byte_perm (w2[2], w2[1], selector); - c1[1] = hc_byte_perm (w2[1], w2[0], selector); - c1[0] = hc_byte_perm (w2[0], w1[3], selector); - c0[3] = hc_byte_perm (w1[3], w1[2], selector); - c0[2] = hc_byte_perm (w1[2], w1[1], selector); - c0[1] = hc_byte_perm (w1[1], w1[0], selector); - c0[0] = hc_byte_perm (w1[0], w0[3], selector); w7[3] = hc_byte_perm (w0[3], w0[2], selector); w7[2] = 
hc_byte_perm (w0[2], w0[1], selector); w7[1] = hc_byte_perm (w0[1], w0[0], selector); @@ -13008,36 +13105,6 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be (u32x *w0, u32x *w1, u32x *w2 break; case 29: - c7[1] = hc_byte_perm ( 0, w7[3], selector); - c7[0] = hc_byte_perm (w7[3], w7[2], selector); - c6[3] = hc_byte_perm (w7[2], w7[1], selector); - c6[2] = hc_byte_perm (w7[1], w7[0], selector); - c6[1] = hc_byte_perm (w7[0], w6[3], selector); - c6[0] = hc_byte_perm (w6[3], w6[2], selector); - c5[3] = hc_byte_perm (w6[2], w6[1], selector); - c5[2] = hc_byte_perm (w6[1], w6[0], selector); - c5[1] = hc_byte_perm (w6[0], w5[3], selector); - c5[0] = hc_byte_perm (w5[3], w5[2], selector); - c4[3] = hc_byte_perm (w5[2], w5[1], selector); - c4[2] = hc_byte_perm (w5[1], w5[0], selector); - c4[1] = hc_byte_perm (w5[0], w4[3], selector); - c4[0] = hc_byte_perm (w4[3], w4[2], selector); - c3[3] = hc_byte_perm (w4[2], w4[1], selector); - c3[2] = hc_byte_perm (w4[1], w4[0], selector); - c3[1] = hc_byte_perm (w4[0], w3[3], selector); - c3[0] = hc_byte_perm (w3[3], w3[2], selector); - c2[3] = hc_byte_perm (w3[2], w3[1], selector); - c2[2] = hc_byte_perm (w3[1], w3[0], selector); - c2[1] = hc_byte_perm (w3[0], w2[3], selector); - c2[0] = hc_byte_perm (w2[3], w2[2], selector); - c1[3] = hc_byte_perm (w2[2], w2[1], selector); - c1[2] = hc_byte_perm (w2[1], w2[0], selector); - c1[1] = hc_byte_perm (w2[0], w1[3], selector); - c1[0] = hc_byte_perm (w1[3], w1[2], selector); - c0[3] = hc_byte_perm (w1[2], w1[1], selector); - c0[2] = hc_byte_perm (w1[1], w1[0], selector); - c0[1] = hc_byte_perm (w1[0], w0[3], selector); - c0[0] = hc_byte_perm (w0[3], w0[2], selector); w7[3] = hc_byte_perm (w0[2], w0[1], selector); w7[2] = hc_byte_perm (w0[1], w0[0], selector); w7[1] = hc_byte_perm (w0[0], 0, selector); @@ -13074,37 +13141,6 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be (u32x *w0, u32x *w1, u32x *w2 break; case 30: - c7[2] = hc_byte_perm ( 0, w7[3], selector); - c7[1] = 
hc_byte_perm (w7[3], w7[2], selector); - c7[0] = hc_byte_perm (w7[2], w7[1], selector); - c6[3] = hc_byte_perm (w7[1], w7[0], selector); - c6[2] = hc_byte_perm (w7[0], w6[3], selector); - c6[1] = hc_byte_perm (w6[3], w6[2], selector); - c6[0] = hc_byte_perm (w6[2], w6[1], selector); - c5[3] = hc_byte_perm (w6[1], w6[0], selector); - c5[2] = hc_byte_perm (w6[0], w5[3], selector); - c5[1] = hc_byte_perm (w5[3], w5[2], selector); - c5[0] = hc_byte_perm (w5[2], w5[1], selector); - c4[3] = hc_byte_perm (w5[1], w5[0], selector); - c4[2] = hc_byte_perm (w5[0], w4[3], selector); - c4[1] = hc_byte_perm (w4[3], w4[2], selector); - c4[0] = hc_byte_perm (w4[2], w4[1], selector); - c3[3] = hc_byte_perm (w4[1], w4[0], selector); - c3[2] = hc_byte_perm (w4[0], w3[3], selector); - c3[1] = hc_byte_perm (w3[3], w3[2], selector); - c3[0] = hc_byte_perm (w3[2], w3[1], selector); - c2[3] = hc_byte_perm (w3[1], w3[0], selector); - c2[2] = hc_byte_perm (w3[0], w2[3], selector); - c2[1] = hc_byte_perm (w2[3], w2[2], selector); - c2[0] = hc_byte_perm (w2[2], w2[1], selector); - c1[3] = hc_byte_perm (w2[1], w2[0], selector); - c1[2] = hc_byte_perm (w2[0], w1[3], selector); - c1[1] = hc_byte_perm (w1[3], w1[2], selector); - c1[0] = hc_byte_perm (w1[2], w1[1], selector); - c0[3] = hc_byte_perm (w1[1], w1[0], selector); - c0[2] = hc_byte_perm (w1[0], w0[3], selector); - c0[1] = hc_byte_perm (w0[3], w0[2], selector); - c0[0] = hc_byte_perm (w0[2], w0[1], selector); w7[3] = hc_byte_perm (w0[1], w0[0], selector); w7[2] = hc_byte_perm (w0[0], 0, selector); w7[1] = 0; @@ -13141,38 +13177,6 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be (u32x *w0, u32x *w1, u32x *w2 break; case 31: - c7[3] = hc_byte_perm ( 0, w7[3], selector); - c7[2] = hc_byte_perm (w7[3], w7[2], selector); - c7[1] = hc_byte_perm (w7[2], w7[1], selector); - c7[0] = hc_byte_perm (w7[1], w7[0], selector); - c6[3] = hc_byte_perm (w7[0], w6[3], selector); - c6[2] = hc_byte_perm (w6[3], w6[2], selector); - c6[1] = hc_byte_perm 
(w6[2], w6[1], selector); - c6[0] = hc_byte_perm (w6[1], w6[0], selector); - c5[3] = hc_byte_perm (w6[0], w5[3], selector); - c5[2] = hc_byte_perm (w5[3], w5[2], selector); - c5[1] = hc_byte_perm (w5[2], w5[1], selector); - c5[0] = hc_byte_perm (w5[1], w5[0], selector); - c4[3] = hc_byte_perm (w5[0], w4[3], selector); - c4[2] = hc_byte_perm (w4[3], w4[2], selector); - c4[1] = hc_byte_perm (w4[2], w4[1], selector); - c4[0] = hc_byte_perm (w4[1], w4[0], selector); - c3[3] = hc_byte_perm (w4[0], w3[3], selector); - c3[2] = hc_byte_perm (w3[3], w3[2], selector); - c3[1] = hc_byte_perm (w3[2], w3[1], selector); - c3[0] = hc_byte_perm (w3[1], w3[0], selector); - c2[3] = hc_byte_perm (w3[0], w2[3], selector); - c2[2] = hc_byte_perm (w2[3], w2[2], selector); - c2[1] = hc_byte_perm (w2[2], w2[1], selector); - c2[0] = hc_byte_perm (w2[1], w2[0], selector); - c1[3] = hc_byte_perm (w2[0], w1[3], selector); - c1[2] = hc_byte_perm (w1[3], w1[2], selector); - c1[1] = hc_byte_perm (w1[2], w1[1], selector); - c1[0] = hc_byte_perm (w1[1], w1[0], selector); - c0[3] = hc_byte_perm (w1[0], w0[3], selector); - c0[2] = hc_byte_perm (w0[3], w0[2], selector); - c0[1] = hc_byte_perm (w0[2], w0[1], selector); - c0[0] = hc_byte_perm (w0[1], w0[0], selector); w7[3] = hc_byte_perm (w0[0], 0, selector); w7[2] = 0; w7[1] = 0; @@ -13211,7 +13215,7 @@ DECLSPEC void switch_buffer_by_offset_8x4_carry_be (u32x *w0, u32x *w1, u32x *w2 #endif } -DECLSPEC void switch_buffer_by_offset_1x64_le (u32x *w, const u32 offset) +DECLSPEC void switch_buffer_by_offset_8x4_carry_be (u32x *w0, u32x *w1, u32x *w2, u32x *w3, u32x *w4, u32x *w5, u32x *w6, u32x *w7, u32x *c0, u32x *c1, u32x *c2, u32x *c3, u32x *c4, u32x *c5, u32x *c6, u32x *c7, const u32 offset) { const int offset_switch = offset / 4; @@ -13219,96 +13223,3480 @@ DECLSPEC void switch_buffer_by_offset_1x64_le (u32x *w, const u32 offset) switch (offset_switch) { case 0: - w[63] = hc_bytealign (w[62], w[63], offset); - w[62] = hc_bytealign (w[61], w[62], 
offset); - w[61] = hc_bytealign (w[60], w[61], offset); - w[60] = hc_bytealign (w[59], w[60], offset); - w[59] = hc_bytealign (w[58], w[59], offset); - w[58] = hc_bytealign (w[57], w[58], offset); - w[57] = hc_bytealign (w[56], w[57], offset); - w[56] = hc_bytealign (w[55], w[56], offset); - w[55] = hc_bytealign (w[54], w[55], offset); - w[54] = hc_bytealign (w[53], w[54], offset); - w[53] = hc_bytealign (w[52], w[53], offset); - w[52] = hc_bytealign (w[51], w[52], offset); - w[51] = hc_bytealign (w[50], w[51], offset); - w[50] = hc_bytealign (w[49], w[50], offset); - w[49] = hc_bytealign (w[48], w[49], offset); - w[48] = hc_bytealign (w[47], w[48], offset); - w[47] = hc_bytealign (w[46], w[47], offset); - w[46] = hc_bytealign (w[45], w[46], offset); - w[45] = hc_bytealign (w[44], w[45], offset); - w[44] = hc_bytealign (w[43], w[44], offset); - w[43] = hc_bytealign (w[42], w[43], offset); - w[42] = hc_bytealign (w[41], w[42], offset); - w[41] = hc_bytealign (w[40], w[41], offset); - w[40] = hc_bytealign (w[39], w[40], offset); - w[39] = hc_bytealign (w[38], w[39], offset); - w[38] = hc_bytealign (w[37], w[38], offset); - w[37] = hc_bytealign (w[36], w[37], offset); - w[36] = hc_bytealign (w[35], w[36], offset); - w[35] = hc_bytealign (w[34], w[35], offset); - w[34] = hc_bytealign (w[33], w[34], offset); - w[33] = hc_bytealign (w[32], w[33], offset); - w[32] = hc_bytealign (w[31], w[32], offset); - w[31] = hc_bytealign (w[30], w[31], offset); - w[30] = hc_bytealign (w[29], w[30], offset); - w[29] = hc_bytealign (w[28], w[29], offset); - w[28] = hc_bytealign (w[27], w[28], offset); - w[27] = hc_bytealign (w[26], w[27], offset); - w[26] = hc_bytealign (w[25], w[26], offset); - w[25] = hc_bytealign (w[24], w[25], offset); - w[24] = hc_bytealign (w[23], w[24], offset); - w[23] = hc_bytealign (w[22], w[23], offset); - w[22] = hc_bytealign (w[21], w[22], offset); - w[21] = hc_bytealign (w[20], w[21], offset); - w[20] = hc_bytealign (w[19], w[20], offset); - w[19] = 
hc_bytealign (w[18], w[19], offset); - w[18] = hc_bytealign (w[17], w[18], offset); - w[17] = hc_bytealign (w[16], w[17], offset); - w[16] = hc_bytealign (w[15], w[16], offset); - w[15] = hc_bytealign (w[14], w[15], offset); - w[14] = hc_bytealign (w[13], w[14], offset); - w[13] = hc_bytealign (w[12], w[13], offset); - w[12] = hc_bytealign (w[11], w[12], offset); - w[11] = hc_bytealign (w[10], w[11], offset); - w[10] = hc_bytealign (w[ 9], w[10], offset); - w[ 9] = hc_bytealign (w[ 8], w[ 9], offset); - w[ 8] = hc_bytealign (w[ 7], w[ 8], offset); - w[ 7] = hc_bytealign (w[ 6], w[ 7], offset); - w[ 6] = hc_bytealign (w[ 5], w[ 6], offset); - w[ 5] = hc_bytealign (w[ 4], w[ 5], offset); - w[ 4] = hc_bytealign (w[ 3], w[ 4], offset); - w[ 3] = hc_bytealign (w[ 2], w[ 3], offset); - w[ 2] = hc_bytealign (w[ 1], w[ 2], offset); - w[ 1] = hc_bytealign (w[ 0], w[ 1], offset); - w[ 0] = hc_bytealign ( 0, w[ 0], offset); + c0[0] = hc_bytealign_be (w7[3], 0, offset); + w7[3] = hc_bytealign_be (w7[2], w7[3], offset); + w7[2] = hc_bytealign_be (w7[1], w7[2], offset); + w7[1] = hc_bytealign_be (w7[0], w7[1], offset); + w7[0] = hc_bytealign_be (w6[3], w7[0], offset); + w6[3] = hc_bytealign_be (w6[2], w6[3], offset); + w6[2] = hc_bytealign_be (w6[1], w6[2], offset); + w6[1] = hc_bytealign_be (w6[0], w6[1], offset); + w6[0] = hc_bytealign_be (w5[3], w6[0], offset); + w5[3] = hc_bytealign_be (w5[2], w5[3], offset); + w5[2] = hc_bytealign_be (w5[1], w5[2], offset); + w5[1] = hc_bytealign_be (w5[0], w5[1], offset); + w5[0] = hc_bytealign_be (w4[3], w5[0], offset); + w4[3] = hc_bytealign_be (w4[2], w4[3], offset); + w4[2] = hc_bytealign_be (w4[1], w4[2], offset); + w4[1] = hc_bytealign_be (w4[0], w4[1], offset); + w4[0] = hc_bytealign_be (w3[3], w4[0], offset); + w3[3] = hc_bytealign_be (w3[2], w3[3], offset); + w3[2] = hc_bytealign_be (w3[1], w3[2], offset); + w3[1] = hc_bytealign_be (w3[0], w3[1], offset); + w3[0] = hc_bytealign_be (w2[3], w3[0], offset); + w2[3] = hc_bytealign_be 
(w2[2], w2[3], offset); + w2[2] = hc_bytealign_be (w2[1], w2[2], offset); + w2[1] = hc_bytealign_be (w2[0], w2[1], offset); + w2[0] = hc_bytealign_be (w1[3], w2[0], offset); + w1[3] = hc_bytealign_be (w1[2], w1[3], offset); + w1[2] = hc_bytealign_be (w1[1], w1[2], offset); + w1[1] = hc_bytealign_be (w1[0], w1[1], offset); + w1[0] = hc_bytealign_be (w0[3], w1[0], offset); + w0[3] = hc_bytealign_be (w0[2], w0[3], offset); + w0[2] = hc_bytealign_be (w0[1], w0[2], offset); + w0[1] = hc_bytealign_be (w0[0], w0[1], offset); + w0[0] = hc_bytealign_be ( 0, w0[0], offset); break; case 1: - w[63] = hc_bytealign (w[61], w[62], offset); - w[62] = hc_bytealign (w[60], w[61], offset); - w[61] = hc_bytealign (w[59], w[60], offset); - w[60] = hc_bytealign (w[58], w[59], offset); - w[59] = hc_bytealign (w[57], w[58], offset); - w[58] = hc_bytealign (w[56], w[57], offset); - w[57] = hc_bytealign (w[55], w[56], offset); - w[56] = hc_bytealign (w[54], w[55], offset); - w[55] = hc_bytealign (w[53], w[54], offset); - w[54] = hc_bytealign (w[52], w[53], offset); - w[53] = hc_bytealign (w[51], w[52], offset); - w[52] = hc_bytealign (w[50], w[51], offset); - w[51] = hc_bytealign (w[49], w[50], offset); - w[50] = hc_bytealign (w[48], w[49], offset); - w[49] = hc_bytealign (w[47], w[48], offset); - w[48] = hc_bytealign (w[46], w[47], offset); - w[47] = hc_bytealign (w[45], w[46], offset); - w[46] = hc_bytealign (w[44], w[45], offset); - w[45] = hc_bytealign (w[43], w[44], offset); - w[44] = hc_bytealign (w[42], w[43], offset); - w[43] = hc_bytealign (w[41], w[42], offset); - w[42] = hc_bytealign (w[40], w[41], offset); + c0[1] = hc_bytealign_be (w7[3], 0, offset); + c0[0] = hc_bytealign_be (w7[2], w7[3], offset); + w7[3] = hc_bytealign_be (w7[1], w7[2], offset); + w7[2] = hc_bytealign_be (w7[0], w7[1], offset); + w7[1] = hc_bytealign_be (w6[3], w7[0], offset); + w7[0] = hc_bytealign_be (w6[2], w6[3], offset); + w6[3] = hc_bytealign_be (w6[1], w6[2], offset); + w6[2] = hc_bytealign_be (w6[0], 
w6[1], offset); + w6[1] = hc_bytealign_be (w5[3], w6[0], offset); + w6[0] = hc_bytealign_be (w5[2], w5[3], offset); + w5[3] = hc_bytealign_be (w5[1], w5[2], offset); + w5[2] = hc_bytealign_be (w5[0], w5[1], offset); + w5[1] = hc_bytealign_be (w4[3], w5[0], offset); + w5[0] = hc_bytealign_be (w4[2], w4[3], offset); + w4[3] = hc_bytealign_be (w4[1], w4[2], offset); + w4[2] = hc_bytealign_be (w4[0], w4[1], offset); + w4[1] = hc_bytealign_be (w3[3], w4[0], offset); + w4[0] = hc_bytealign_be (w3[2], w3[3], offset); + w3[3] = hc_bytealign_be (w3[1], w3[2], offset); + w3[2] = hc_bytealign_be (w3[0], w3[1], offset); + w3[1] = hc_bytealign_be (w2[3], w3[0], offset); + w3[0] = hc_bytealign_be (w2[2], w2[3], offset); + w2[3] = hc_bytealign_be (w2[1], w2[2], offset); + w2[2] = hc_bytealign_be (w2[0], w2[1], offset); + w2[1] = hc_bytealign_be (w1[3], w2[0], offset); + w2[0] = hc_bytealign_be (w1[2], w1[3], offset); + w1[3] = hc_bytealign_be (w1[1], w1[2], offset); + w1[2] = hc_bytealign_be (w1[0], w1[1], offset); + w1[1] = hc_bytealign_be (w0[3], w1[0], offset); + w1[0] = hc_bytealign_be (w0[2], w0[3], offset); + w0[3] = hc_bytealign_be (w0[1], w0[2], offset); + w0[2] = hc_bytealign_be (w0[0], w0[1], offset); + w0[1] = hc_bytealign_be ( 0, w0[0], offset); + w0[0] = 0; + + break; + + case 2: + c0[2] = hc_bytealign_be (w7[3], 0, offset); + c0[1] = hc_bytealign_be (w7[2], w7[3], offset); + c0[0] = hc_bytealign_be (w7[1], w7[2], offset); + w7[3] = hc_bytealign_be (w7[0], w7[1], offset); + w7[2] = hc_bytealign_be (w6[3], w7[0], offset); + w7[1] = hc_bytealign_be (w6[2], w6[3], offset); + w7[0] = hc_bytealign_be (w6[1], w6[2], offset); + w6[3] = hc_bytealign_be (w6[0], w6[1], offset); + w6[2] = hc_bytealign_be (w5[3], w6[0], offset); + w6[1] = hc_bytealign_be (w5[2], w5[3], offset); + w6[0] = hc_bytealign_be (w5[1], w5[2], offset); + w5[3] = hc_bytealign_be (w5[0], w5[1], offset); + w5[2] = hc_bytealign_be (w4[3], w5[0], offset); + w5[1] = hc_bytealign_be (w4[2], w4[3], offset); + 
w5[0] = hc_bytealign_be (w4[1], w4[2], offset); + w4[3] = hc_bytealign_be (w4[0], w4[1], offset); + w4[2] = hc_bytealign_be (w3[3], w4[0], offset); + w4[1] = hc_bytealign_be (w3[2], w3[3], offset); + w4[0] = hc_bytealign_be (w3[1], w3[2], offset); + w3[3] = hc_bytealign_be (w3[0], w3[1], offset); + w3[2] = hc_bytealign_be (w2[3], w3[0], offset); + w3[1] = hc_bytealign_be (w2[2], w2[3], offset); + w3[0] = hc_bytealign_be (w2[1], w2[2], offset); + w2[3] = hc_bytealign_be (w2[0], w2[1], offset); + w2[2] = hc_bytealign_be (w1[3], w2[0], offset); + w2[1] = hc_bytealign_be (w1[2], w1[3], offset); + w2[0] = hc_bytealign_be (w1[1], w1[2], offset); + w1[3] = hc_bytealign_be (w1[0], w1[1], offset); + w1[2] = hc_bytealign_be (w0[3], w1[0], offset); + w1[1] = hc_bytealign_be (w0[2], w0[3], offset); + w1[0] = hc_bytealign_be (w0[1], w0[2], offset); + w0[3] = hc_bytealign_be (w0[0], w0[1], offset); + w0[2] = hc_bytealign_be ( 0, w0[0], offset); + w0[1] = 0; + w0[0] = 0; + + break; + + case 3: + c0[3] = hc_bytealign_be (w7[3], 0, offset); + c0[2] = hc_bytealign_be (w7[2], w7[3], offset); + c0[1] = hc_bytealign_be (w7[1], w7[2], offset); + c0[0] = hc_bytealign_be (w7[0], w7[1], offset); + w7[3] = hc_bytealign_be (w6[3], w7[0], offset); + w7[2] = hc_bytealign_be (w6[2], w6[3], offset); + w7[1] = hc_bytealign_be (w6[1], w6[2], offset); + w7[0] = hc_bytealign_be (w6[0], w6[1], offset); + w6[3] = hc_bytealign_be (w5[3], w6[0], offset); + w6[2] = hc_bytealign_be (w5[2], w5[3], offset); + w6[1] = hc_bytealign_be (w5[1], w5[2], offset); + w6[0] = hc_bytealign_be (w5[0], w5[1], offset); + w5[3] = hc_bytealign_be (w4[3], w5[0], offset); + w5[2] = hc_bytealign_be (w4[2], w4[3], offset); + w5[1] = hc_bytealign_be (w4[1], w4[2], offset); + w5[0] = hc_bytealign_be (w4[0], w4[1], offset); + w4[3] = hc_bytealign_be (w3[3], w4[0], offset); + w4[2] = hc_bytealign_be (w3[2], w3[3], offset); + w4[1] = hc_bytealign_be (w3[1], w3[2], offset); + w4[0] = hc_bytealign_be (w3[0], w3[1], offset); + w3[3] = 
hc_bytealign_be (w2[3], w3[0], offset); + w3[2] = hc_bytealign_be (w2[2], w2[3], offset); + w3[1] = hc_bytealign_be (w2[1], w2[2], offset); + w3[0] = hc_bytealign_be (w2[0], w2[1], offset); + w2[3] = hc_bytealign_be (w1[3], w2[0], offset); + w2[2] = hc_bytealign_be (w1[2], w1[3], offset); + w2[1] = hc_bytealign_be (w1[1], w1[2], offset); + w2[0] = hc_bytealign_be (w1[0], w1[1], offset); + w1[3] = hc_bytealign_be (w0[3], w1[0], offset); + w1[2] = hc_bytealign_be (w0[2], w0[3], offset); + w1[1] = hc_bytealign_be (w0[1], w0[2], offset); + w1[0] = hc_bytealign_be (w0[0], w0[1], offset); + w0[3] = hc_bytealign_be ( 0, w0[0], offset); + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 4: + c1[0] = hc_bytealign_be (w7[3], 0, offset); + c0[3] = hc_bytealign_be (w7[2], w7[3], offset); + c0[2] = hc_bytealign_be (w7[1], w7[2], offset); + c0[1] = hc_bytealign_be (w7[0], w7[1], offset); + c0[0] = hc_bytealign_be (w6[3], w7[0], offset); + w7[3] = hc_bytealign_be (w6[2], w6[3], offset); + w7[2] = hc_bytealign_be (w6[1], w6[2], offset); + w7[1] = hc_bytealign_be (w6[0], w6[1], offset); + w7[0] = hc_bytealign_be (w5[3], w6[0], offset); + w6[3] = hc_bytealign_be (w5[2], w5[3], offset); + w6[2] = hc_bytealign_be (w5[1], w5[2], offset); + w6[1] = hc_bytealign_be (w5[0], w5[1], offset); + w6[0] = hc_bytealign_be (w4[3], w5[0], offset); + w5[3] = hc_bytealign_be (w4[2], w4[3], offset); + w5[2] = hc_bytealign_be (w4[1], w4[2], offset); + w5[1] = hc_bytealign_be (w4[0], w4[1], offset); + w5[0] = hc_bytealign_be (w3[3], w4[0], offset); + w4[3] = hc_bytealign_be (w3[2], w3[3], offset); + w4[2] = hc_bytealign_be (w3[1], w3[2], offset); + w4[1] = hc_bytealign_be (w3[0], w3[1], offset); + w4[0] = hc_bytealign_be (w2[3], w3[0], offset); + w3[3] = hc_bytealign_be (w2[2], w2[3], offset); + w3[2] = hc_bytealign_be (w2[1], w2[2], offset); + w3[1] = hc_bytealign_be (w2[0], w2[1], offset); + w3[0] = hc_bytealign_be (w1[3], w2[0], offset); + w2[3] = hc_bytealign_be (w1[2], w1[3], offset); + 
w2[2] = hc_bytealign_be (w1[1], w1[2], offset); + w2[1] = hc_bytealign_be (w1[0], w1[1], offset); + w2[0] = hc_bytealign_be (w0[3], w1[0], offset); + w1[3] = hc_bytealign_be (w0[2], w0[3], offset); + w1[2] = hc_bytealign_be (w0[1], w0[2], offset); + w1[1] = hc_bytealign_be (w0[0], w0[1], offset); + w1[0] = hc_bytealign_be ( 0, w0[0], offset); + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 5: + c1[1] = hc_bytealign_be (w7[3], 0, offset); + c1[0] = hc_bytealign_be (w7[2], w7[3], offset); + c0[3] = hc_bytealign_be (w7[1], w7[2], offset); + c0[2] = hc_bytealign_be (w7[0], w7[1], offset); + c0[1] = hc_bytealign_be (w6[3], w7[0], offset); + c0[0] = hc_bytealign_be (w6[2], w6[3], offset); + w7[3] = hc_bytealign_be (w6[1], w6[2], offset); + w7[2] = hc_bytealign_be (w6[0], w6[1], offset); + w7[1] = hc_bytealign_be (w5[3], w6[0], offset); + w7[0] = hc_bytealign_be (w5[2], w5[3], offset); + w6[3] = hc_bytealign_be (w5[1], w5[2], offset); + w6[2] = hc_bytealign_be (w5[0], w5[1], offset); + w6[1] = hc_bytealign_be (w4[3], w5[0], offset); + w6[0] = hc_bytealign_be (w4[2], w4[3], offset); + w5[3] = hc_bytealign_be (w4[1], w4[2], offset); + w5[2] = hc_bytealign_be (w4[0], w4[1], offset); + w5[1] = hc_bytealign_be (w3[3], w4[0], offset); + w5[0] = hc_bytealign_be (w3[2], w3[3], offset); + w4[3] = hc_bytealign_be (w3[1], w3[2], offset); + w4[2] = hc_bytealign_be (w3[0], w3[1], offset); + w4[1] = hc_bytealign_be (w2[3], w3[0], offset); + w4[0] = hc_bytealign_be (w2[2], w2[3], offset); + w3[3] = hc_bytealign_be (w2[1], w2[2], offset); + w3[2] = hc_bytealign_be (w2[0], w2[1], offset); + w3[1] = hc_bytealign_be (w1[3], w2[0], offset); + w3[0] = hc_bytealign_be (w1[2], w1[3], offset); + w2[3] = hc_bytealign_be (w1[1], w1[2], offset); + w2[2] = hc_bytealign_be (w1[0], w1[1], offset); + w2[1] = hc_bytealign_be (w0[3], w1[0], offset); + w2[0] = hc_bytealign_be (w0[2], w0[3], offset); + w1[3] = hc_bytealign_be (w0[1], w0[2], offset); + w1[2] = hc_bytealign_be (w0[0], 
w0[1], offset); + w1[1] = hc_bytealign_be ( 0, w0[0], offset); + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 6: + c1[2] = hc_bytealign_be (w7[3], 0, offset); + c1[1] = hc_bytealign_be (w7[2], w7[3], offset); + c1[0] = hc_bytealign_be (w7[1], w7[2], offset); + c0[3] = hc_bytealign_be (w7[0], w7[1], offset); + c0[2] = hc_bytealign_be (w6[3], w7[0], offset); + c0[1] = hc_bytealign_be (w6[2], w6[3], offset); + c0[0] = hc_bytealign_be (w6[1], w6[2], offset); + w7[3] = hc_bytealign_be (w6[0], w6[1], offset); + w7[2] = hc_bytealign_be (w5[3], w6[0], offset); + w7[1] = hc_bytealign_be (w5[2], w5[3], offset); + w7[0] = hc_bytealign_be (w5[1], w5[2], offset); + w6[3] = hc_bytealign_be (w5[0], w5[1], offset); + w6[2] = hc_bytealign_be (w4[3], w5[0], offset); + w6[1] = hc_bytealign_be (w4[2], w4[3], offset); + w6[0] = hc_bytealign_be (w4[1], w4[2], offset); + w5[3] = hc_bytealign_be (w4[0], w4[1], offset); + w5[2] = hc_bytealign_be (w3[3], w4[0], offset); + w5[1] = hc_bytealign_be (w3[2], w3[3], offset); + w5[0] = hc_bytealign_be (w3[1], w3[2], offset); + w4[3] = hc_bytealign_be (w3[0], w3[1], offset); + w4[2] = hc_bytealign_be (w2[3], w3[0], offset); + w4[1] = hc_bytealign_be (w2[2], w2[3], offset); + w4[0] = hc_bytealign_be (w2[1], w2[2], offset); + w3[3] = hc_bytealign_be (w2[0], w2[1], offset); + w3[2] = hc_bytealign_be (w1[3], w2[0], offset); + w3[1] = hc_bytealign_be (w1[2], w1[3], offset); + w3[0] = hc_bytealign_be (w1[1], w1[2], offset); + w2[3] = hc_bytealign_be (w1[0], w1[1], offset); + w2[2] = hc_bytealign_be (w0[3], w1[0], offset); + w2[1] = hc_bytealign_be (w0[2], w0[3], offset); + w2[0] = hc_bytealign_be (w0[1], w0[2], offset); + w1[3] = hc_bytealign_be (w0[0], w0[1], offset); + w1[2] = hc_bytealign_be ( 0, w0[0], offset); + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 7: + c1[3] = hc_bytealign_be (w7[3], 0, offset); + c1[2] = hc_bytealign_be (w7[2], w7[3], offset); + c1[1] 
= hc_bytealign_be (w7[1], w7[2], offset); + c1[0] = hc_bytealign_be (w7[0], w7[1], offset); + c0[3] = hc_bytealign_be (w6[3], w7[0], offset); + c0[2] = hc_bytealign_be (w6[2], w6[3], offset); + c0[1] = hc_bytealign_be (w6[1], w6[2], offset); + c0[0] = hc_bytealign_be (w6[0], w6[1], offset); + w7[3] = hc_bytealign_be (w5[3], w6[0], offset); + w7[2] = hc_bytealign_be (w5[2], w5[3], offset); + w7[1] = hc_bytealign_be (w5[1], w5[2], offset); + w7[0] = hc_bytealign_be (w5[0], w5[1], offset); + w6[3] = hc_bytealign_be (w4[3], w5[0], offset); + w6[2] = hc_bytealign_be (w4[2], w4[3], offset); + w6[1] = hc_bytealign_be (w4[1], w4[2], offset); + w6[0] = hc_bytealign_be (w4[0], w4[1], offset); + w5[3] = hc_bytealign_be (w3[3], w4[0], offset); + w5[2] = hc_bytealign_be (w3[2], w3[3], offset); + w5[1] = hc_bytealign_be (w3[1], w3[2], offset); + w5[0] = hc_bytealign_be (w3[0], w3[1], offset); + w4[3] = hc_bytealign_be (w2[3], w3[0], offset); + w4[2] = hc_bytealign_be (w2[2], w2[3], offset); + w4[1] = hc_bytealign_be (w2[1], w2[2], offset); + w4[0] = hc_bytealign_be (w2[0], w2[1], offset); + w3[3] = hc_bytealign_be (w1[3], w2[0], offset); + w3[2] = hc_bytealign_be (w1[2], w1[3], offset); + w3[1] = hc_bytealign_be (w1[1], w1[2], offset); + w3[0] = hc_bytealign_be (w1[0], w1[1], offset); + w2[3] = hc_bytealign_be (w0[3], w1[0], offset); + w2[2] = hc_bytealign_be (w0[2], w0[3], offset); + w2[1] = hc_bytealign_be (w0[1], w0[2], offset); + w2[0] = hc_bytealign_be (w0[0], w0[1], offset); + w1[3] = hc_bytealign_be ( 0, w0[0], offset); + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 8: + c2[0] = hc_bytealign_be (w7[3], 0, offset); + c1[3] = hc_bytealign_be (w7[2], w7[3], offset); + c1[2] = hc_bytealign_be (w7[1], w7[2], offset); + c1[1] = hc_bytealign_be (w7[0], w7[1], offset); + c1[0] = hc_bytealign_be (w6[3], w7[0], offset); + c0[3] = hc_bytealign_be (w6[2], w6[3], offset); + c0[2] = hc_bytealign_be (w6[1], w6[2], offset); 
+ c0[1] = hc_bytealign_be (w6[0], w6[1], offset); + c0[0] = hc_bytealign_be (w5[3], w6[0], offset); + w7[3] = hc_bytealign_be (w5[2], w5[3], offset); + w7[2] = hc_bytealign_be (w5[1], w5[2], offset); + w7[1] = hc_bytealign_be (w5[0], w5[1], offset); + w7[0] = hc_bytealign_be (w4[3], w5[0], offset); + w6[3] = hc_bytealign_be (w4[2], w4[3], offset); + w6[2] = hc_bytealign_be (w4[1], w4[2], offset); + w6[1] = hc_bytealign_be (w4[0], w4[1], offset); + w6[0] = hc_bytealign_be (w3[3], w4[0], offset); + w5[3] = hc_bytealign_be (w3[2], w3[3], offset); + w5[2] = hc_bytealign_be (w3[1], w3[2], offset); + w5[1] = hc_bytealign_be (w3[0], w3[1], offset); + w5[0] = hc_bytealign_be (w2[3], w3[0], offset); + w4[3] = hc_bytealign_be (w2[2], w2[3], offset); + w4[2] = hc_bytealign_be (w2[1], w2[2], offset); + w4[1] = hc_bytealign_be (w2[0], w2[1], offset); + w4[0] = hc_bytealign_be (w1[3], w2[0], offset); + w3[3] = hc_bytealign_be (w1[2], w1[3], offset); + w3[2] = hc_bytealign_be (w1[1], w1[2], offset); + w3[1] = hc_bytealign_be (w1[0], w1[1], offset); + w3[0] = hc_bytealign_be (w0[3], w1[0], offset); + w2[3] = hc_bytealign_be (w0[2], w0[3], offset); + w2[2] = hc_bytealign_be (w0[1], w0[2], offset); + w2[1] = hc_bytealign_be (w0[0], w0[1], offset); + w2[0] = hc_bytealign_be ( 0, w0[0], offset); + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 9: + c2[1] = hc_bytealign_be (w7[3], 0, offset); + c2[0] = hc_bytealign_be (w7[2], w7[3], offset); + c1[3] = hc_bytealign_be (w7[1], w7[2], offset); + c1[2] = hc_bytealign_be (w7[0], w7[1], offset); + c1[1] = hc_bytealign_be (w6[3], w7[0], offset); + c1[0] = hc_bytealign_be (w6[2], w6[3], offset); + c0[3] = hc_bytealign_be (w6[1], w6[2], offset); + c0[2] = hc_bytealign_be (w6[0], w6[1], offset); + c0[1] = hc_bytealign_be (w5[3], w6[0], offset); + c0[0] = hc_bytealign_be (w5[2], w5[3], offset); + w7[3] = hc_bytealign_be (w5[1], w5[2], offset); + w7[2] = hc_bytealign_be 
(w5[0], w5[1], offset); + w7[1] = hc_bytealign_be (w4[3], w5[0], offset); + w7[0] = hc_bytealign_be (w4[2], w4[3], offset); + w6[3] = hc_bytealign_be (w4[1], w4[2], offset); + w6[2] = hc_bytealign_be (w4[0], w4[1], offset); + w6[1] = hc_bytealign_be (w3[3], w4[0], offset); + w6[0] = hc_bytealign_be (w3[2], w3[3], offset); + w5[3] = hc_bytealign_be (w3[1], w3[2], offset); + w5[2] = hc_bytealign_be (w3[0], w3[1], offset); + w5[1] = hc_bytealign_be (w2[3], w3[0], offset); + w5[0] = hc_bytealign_be (w2[2], w2[3], offset); + w4[3] = hc_bytealign_be (w2[1], w2[2], offset); + w4[2] = hc_bytealign_be (w2[0], w2[1], offset); + w4[1] = hc_bytealign_be (w1[3], w2[0], offset); + w4[0] = hc_bytealign_be (w1[2], w1[3], offset); + w3[3] = hc_bytealign_be (w1[1], w1[2], offset); + w3[2] = hc_bytealign_be (w1[0], w1[1], offset); + w3[1] = hc_bytealign_be (w0[3], w1[0], offset); + w3[0] = hc_bytealign_be (w0[2], w0[3], offset); + w2[3] = hc_bytealign_be (w0[1], w0[2], offset); + w2[2] = hc_bytealign_be (w0[0], w0[1], offset); + w2[1] = hc_bytealign_be ( 0, w0[0], offset); + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 10: + c2[2] = hc_bytealign_be (w7[3], 0, offset); + c2[1] = hc_bytealign_be (w7[2], w7[3], offset); + c2[0] = hc_bytealign_be (w7[1], w7[2], offset); + c1[3] = hc_bytealign_be (w7[0], w7[1], offset); + c1[2] = hc_bytealign_be (w6[3], w7[0], offset); + c1[1] = hc_bytealign_be (w6[2], w6[3], offset); + c1[0] = hc_bytealign_be (w6[1], w6[2], offset); + c0[3] = hc_bytealign_be (w6[0], w6[1], offset); + c0[2] = hc_bytealign_be (w5[3], w6[0], offset); + c0[1] = hc_bytealign_be (w5[2], w5[3], offset); + c0[0] = hc_bytealign_be (w5[1], w5[2], offset); + w7[3] = hc_bytealign_be (w5[0], w5[1], offset); + w7[2] = hc_bytealign_be (w4[3], w5[0], offset); + w7[1] = hc_bytealign_be (w4[2], w4[3], offset); + w7[0] = hc_bytealign_be (w4[1], w4[2], offset); + w6[3] = hc_bytealign_be (w4[0], w4[1], 
offset); + w6[2] = hc_bytealign_be (w3[3], w4[0], offset); + w6[1] = hc_bytealign_be (w3[2], w3[3], offset); + w6[0] = hc_bytealign_be (w3[1], w3[2], offset); + w5[3] = hc_bytealign_be (w3[0], w3[1], offset); + w5[2] = hc_bytealign_be (w2[3], w3[0], offset); + w5[1] = hc_bytealign_be (w2[2], w2[3], offset); + w5[0] = hc_bytealign_be (w2[1], w2[2], offset); + w4[3] = hc_bytealign_be (w2[0], w2[1], offset); + w4[2] = hc_bytealign_be (w1[3], w2[0], offset); + w4[1] = hc_bytealign_be (w1[2], w1[3], offset); + w4[0] = hc_bytealign_be (w1[1], w1[2], offset); + w3[3] = hc_bytealign_be (w1[0], w1[1], offset); + w3[2] = hc_bytealign_be (w0[3], w1[0], offset); + w3[1] = hc_bytealign_be (w0[2], w0[3], offset); + w3[0] = hc_bytealign_be (w0[1], w0[2], offset); + w2[3] = hc_bytealign_be (w0[0], w0[1], offset); + w2[2] = hc_bytealign_be ( 0, w0[0], offset); + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 11: + c2[3] = hc_bytealign_be (w7[3], 0, offset); + c2[2] = hc_bytealign_be (w7[2], w7[3], offset); + c2[1] = hc_bytealign_be (w7[1], w7[2], offset); + c2[0] = hc_bytealign_be (w7[0], w7[1], offset); + c1[3] = hc_bytealign_be (w6[3], w7[0], offset); + c1[2] = hc_bytealign_be (w6[2], w6[3], offset); + c1[1] = hc_bytealign_be (w6[1], w6[2], offset); + c1[0] = hc_bytealign_be (w6[0], w6[1], offset); + c0[3] = hc_bytealign_be (w5[3], w6[0], offset); + c0[2] = hc_bytealign_be (w5[2], w5[3], offset); + c0[1] = hc_bytealign_be (w5[1], w5[2], offset); + c0[0] = hc_bytealign_be (w5[0], w5[1], offset); + w7[3] = hc_bytealign_be (w4[3], w5[0], offset); + w7[2] = hc_bytealign_be (w4[2], w4[3], offset); + w7[1] = hc_bytealign_be (w4[1], w4[2], offset); + w7[0] = hc_bytealign_be (w4[0], w4[1], offset); + w6[3] = hc_bytealign_be (w3[3], w4[0], offset); + w6[2] = hc_bytealign_be (w3[2], w3[3], offset); + w6[1] = hc_bytealign_be (w3[1], w3[2], offset); + w6[0] = hc_bytealign_be (w3[0], w3[1], 
offset); + w5[3] = hc_bytealign_be (w2[3], w3[0], offset); + w5[2] = hc_bytealign_be (w2[2], w2[3], offset); + w5[1] = hc_bytealign_be (w2[1], w2[2], offset); + w5[0] = hc_bytealign_be (w2[0], w2[1], offset); + w4[3] = hc_bytealign_be (w1[3], w2[0], offset); + w4[2] = hc_bytealign_be (w1[2], w1[3], offset); + w4[1] = hc_bytealign_be (w1[1], w1[2], offset); + w4[0] = hc_bytealign_be (w1[0], w1[1], offset); + w3[3] = hc_bytealign_be (w0[3], w1[0], offset); + w3[2] = hc_bytealign_be (w0[2], w0[3], offset); + w3[1] = hc_bytealign_be (w0[1], w0[2], offset); + w3[0] = hc_bytealign_be (w0[0], w0[1], offset); + w2[3] = hc_bytealign_be ( 0, w0[0], offset); + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 12: + c3[0] = hc_bytealign_be (w7[3], 0, offset); + c2[3] = hc_bytealign_be (w7[2], w7[3], offset); + c2[2] = hc_bytealign_be (w7[1], w7[2], offset); + c2[1] = hc_bytealign_be (w7[0], w7[1], offset); + c2[0] = hc_bytealign_be (w6[3], w7[0], offset); + c1[3] = hc_bytealign_be (w6[2], w6[3], offset); + c1[2] = hc_bytealign_be (w6[1], w6[2], offset); + c1[1] = hc_bytealign_be (w6[0], w6[1], offset); + c1[0] = hc_bytealign_be (w5[3], w6[0], offset); + c0[3] = hc_bytealign_be (w5[2], w5[3], offset); + c0[2] = hc_bytealign_be (w5[1], w5[2], offset); + c0[1] = hc_bytealign_be (w5[0], w5[1], offset); + c0[0] = hc_bytealign_be (w4[3], w5[0], offset); + w7[3] = hc_bytealign_be (w4[2], w4[3], offset); + w7[2] = hc_bytealign_be (w4[1], w4[2], offset); + w7[1] = hc_bytealign_be (w4[0], w4[1], offset); + w7[0] = hc_bytealign_be (w3[3], w4[0], offset); + w6[3] = hc_bytealign_be (w3[2], w3[3], offset); + w6[2] = hc_bytealign_be (w3[1], w3[2], offset); + w6[1] = hc_bytealign_be (w3[0], w3[1], offset); + w6[0] = hc_bytealign_be (w2[3], w3[0], offset); + w5[3] = hc_bytealign_be (w2[2], w2[3], offset); + w5[2] = hc_bytealign_be (w2[1], w2[2], offset); + w5[1] = hc_bytealign_be 
(w2[0], w2[1], offset); + w5[0] = hc_bytealign_be (w1[3], w2[0], offset); + w4[3] = hc_bytealign_be (w1[2], w1[3], offset); + w4[2] = hc_bytealign_be (w1[1], w1[2], offset); + w4[1] = hc_bytealign_be (w1[0], w1[1], offset); + w4[0] = hc_bytealign_be (w0[3], w1[0], offset); + w3[3] = hc_bytealign_be (w0[2], w0[3], offset); + w3[2] = hc_bytealign_be (w0[1], w0[2], offset); + w3[1] = hc_bytealign_be (w0[0], w0[1], offset); + w3[0] = hc_bytealign_be ( 0, w0[0], offset); + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 13: + c3[1] = hc_bytealign_be (w7[3], 0, offset); + c3[0] = hc_bytealign_be (w7[2], w7[3], offset); + c2[3] = hc_bytealign_be (w7[1], w7[2], offset); + c2[2] = hc_bytealign_be (w7[0], w7[1], offset); + c2[1] = hc_bytealign_be (w6[3], w7[0], offset); + c2[0] = hc_bytealign_be (w6[2], w6[3], offset); + c1[3] = hc_bytealign_be (w6[1], w6[2], offset); + c1[2] = hc_bytealign_be (w6[0], w6[1], offset); + c1[1] = hc_bytealign_be (w5[3], w6[0], offset); + c1[0] = hc_bytealign_be (w5[2], w5[3], offset); + c0[3] = hc_bytealign_be (w5[1], w5[2], offset); + c0[2] = hc_bytealign_be (w5[0], w5[1], offset); + c0[1] = hc_bytealign_be (w4[3], w5[0], offset); + c0[0] = hc_bytealign_be (w4[2], w4[3], offset); + w7[3] = hc_bytealign_be (w4[1], w4[2], offset); + w7[2] = hc_bytealign_be (w4[0], w4[1], offset); + w7[1] = hc_bytealign_be (w3[3], w4[0], offset); + w7[0] = hc_bytealign_be (w3[2], w3[3], offset); + w6[3] = hc_bytealign_be (w3[1], w3[2], offset); + w6[2] = hc_bytealign_be (w3[0], w3[1], offset); + w6[1] = hc_bytealign_be (w2[3], w3[0], offset); + w6[0] = hc_bytealign_be (w2[2], w2[3], offset); + w5[3] = hc_bytealign_be (w2[1], w2[2], offset); + w5[2] = hc_bytealign_be (w2[0], w2[1], offset); + w5[1] = hc_bytealign_be (w1[3], w2[0], offset); + w5[0] = hc_bytealign_be (w1[2], w1[3], offset); + w4[3] = hc_bytealign_be (w1[1], w1[2], offset); + 
w4[2] = hc_bytealign_be (w1[0], w1[1], offset); + w4[1] = hc_bytealign_be (w0[3], w1[0], offset); + w4[0] = hc_bytealign_be (w0[2], w0[3], offset); + w3[3] = hc_bytealign_be (w0[1], w0[2], offset); + w3[2] = hc_bytealign_be (w0[0], w0[1], offset); + w3[1] = hc_bytealign_be ( 0, w0[0], offset); + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 14: + c3[2] = hc_bytealign_be (w7[3], 0, offset); + c3[1] = hc_bytealign_be (w7[2], w7[3], offset); + c3[0] = hc_bytealign_be (w7[1], w7[2], offset); + c2[3] = hc_bytealign_be (w7[0], w7[1], offset); + c2[2] = hc_bytealign_be (w6[3], w7[0], offset); + c2[1] = hc_bytealign_be (w6[2], w6[3], offset); + c2[0] = hc_bytealign_be (w6[1], w6[2], offset); + c1[3] = hc_bytealign_be (w6[0], w6[1], offset); + c1[2] = hc_bytealign_be (w5[3], w6[0], offset); + c1[1] = hc_bytealign_be (w5[2], w5[3], offset); + c1[0] = hc_bytealign_be (w5[1], w5[2], offset); + c0[3] = hc_bytealign_be (w5[0], w5[1], offset); + c0[2] = hc_bytealign_be (w4[3], w5[0], offset); + c0[1] = hc_bytealign_be (w4[2], w4[3], offset); + c0[0] = hc_bytealign_be (w4[1], w4[2], offset); + w7[3] = hc_bytealign_be (w4[0], w4[1], offset); + w7[2] = hc_bytealign_be (w3[3], w4[0], offset); + w7[1] = hc_bytealign_be (w3[2], w3[3], offset); + w7[0] = hc_bytealign_be (w3[1], w3[2], offset); + w6[3] = hc_bytealign_be (w3[0], w3[1], offset); + w6[2] = hc_bytealign_be (w2[3], w3[0], offset); + w6[1] = hc_bytealign_be (w2[2], w2[3], offset); + w6[0] = hc_bytealign_be (w2[1], w2[2], offset); + w5[3] = hc_bytealign_be (w2[0], w2[1], offset); + w5[2] = hc_bytealign_be (w1[3], w2[0], offset); + w5[1] = hc_bytealign_be (w1[2], w1[3], offset); + w5[0] = hc_bytealign_be (w1[1], w1[2], offset); + w4[3] = hc_bytealign_be (w1[0], w1[1], offset); + w4[2] = hc_bytealign_be (w0[3], w1[0], offset); + w4[1] = hc_bytealign_be (w0[2], w0[3], offset); + w4[0] = 
hc_bytealign_be (w0[1], w0[2], offset); + w3[3] = hc_bytealign_be (w0[0], w0[1], offset); + w3[2] = hc_bytealign_be ( 0, w0[0], offset); + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 15: + c3[3] = hc_bytealign_be (w7[3], 0, offset); + c3[2] = hc_bytealign_be (w7[2], w7[3], offset); + c3[1] = hc_bytealign_be (w7[1], w7[2], offset); + c3[0] = hc_bytealign_be (w7[0], w7[1], offset); + c2[3] = hc_bytealign_be (w6[3], w7[0], offset); + c2[2] = hc_bytealign_be (w6[2], w6[3], offset); + c2[1] = hc_bytealign_be (w6[1], w6[2], offset); + c2[0] = hc_bytealign_be (w6[0], w6[1], offset); + c1[3] = hc_bytealign_be (w5[3], w6[0], offset); + c1[2] = hc_bytealign_be (w5[2], w5[3], offset); + c1[1] = hc_bytealign_be (w5[1], w5[2], offset); + c1[0] = hc_bytealign_be (w5[0], w5[1], offset); + c0[3] = hc_bytealign_be (w4[3], w5[0], offset); + c0[2] = hc_bytealign_be (w4[2], w4[3], offset); + c0[1] = hc_bytealign_be (w4[1], w4[2], offset); + c0[0] = hc_bytealign_be (w4[0], w4[1], offset); + w7[3] = hc_bytealign_be (w3[3], w4[0], offset); + w7[2] = hc_bytealign_be (w3[2], w3[3], offset); + w7[1] = hc_bytealign_be (w3[1], w3[2], offset); + w7[0] = hc_bytealign_be (w3[0], w3[1], offset); + w6[3] = hc_bytealign_be (w2[3], w3[0], offset); + w6[2] = hc_bytealign_be (w2[2], w2[3], offset); + w6[1] = hc_bytealign_be (w2[1], w2[2], offset); + w6[0] = hc_bytealign_be (w2[0], w2[1], offset); + w5[3] = hc_bytealign_be (w1[3], w2[0], offset); + w5[2] = hc_bytealign_be (w1[2], w1[3], offset); + w5[1] = hc_bytealign_be (w1[1], w1[2], offset); + w5[0] = hc_bytealign_be (w1[0], w1[1], offset); + w4[3] = hc_bytealign_be (w0[3], w1[0], offset); + w4[2] = hc_bytealign_be (w0[2], w0[3], offset); + w4[1] = hc_bytealign_be (w0[1], w0[2], offset); + w4[0] = hc_bytealign_be (w0[0], w0[1], offset); + w3[3] = hc_bytealign_be ( 0, w0[0], offset); + w3[2] = 0; 
+ w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 16: + c4[0] = hc_bytealign_be (w7[3], 0, offset); + c3[3] = hc_bytealign_be (w7[2], w7[3], offset); + c3[2] = hc_bytealign_be (w7[1], w7[2], offset); + c3[1] = hc_bytealign_be (w7[0], w7[1], offset); + c3[0] = hc_bytealign_be (w6[3], w7[0], offset); + c2[3] = hc_bytealign_be (w6[2], w6[3], offset); + c2[2] = hc_bytealign_be (w6[1], w6[2], offset); + c2[1] = hc_bytealign_be (w6[0], w6[1], offset); + c2[0] = hc_bytealign_be (w5[3], w6[0], offset); + c1[3] = hc_bytealign_be (w5[2], w5[3], offset); + c1[2] = hc_bytealign_be (w5[1], w5[2], offset); + c1[1] = hc_bytealign_be (w5[0], w5[1], offset); + c1[0] = hc_bytealign_be (w4[3], w5[0], offset); + c0[3] = hc_bytealign_be (w4[2], w4[3], offset); + c0[2] = hc_bytealign_be (w4[1], w4[2], offset); + c0[1] = hc_bytealign_be (w4[0], w4[1], offset); + c0[0] = hc_bytealign_be (w3[3], w4[0], offset); + w7[3] = hc_bytealign_be (w3[2], w3[3], offset); + w7[2] = hc_bytealign_be (w3[1], w3[2], offset); + w7[1] = hc_bytealign_be (w3[0], w3[1], offset); + w7[0] = hc_bytealign_be (w2[3], w3[0], offset); + w6[3] = hc_bytealign_be (w2[2], w2[3], offset); + w6[2] = hc_bytealign_be (w2[1], w2[2], offset); + w6[1] = hc_bytealign_be (w2[0], w2[1], offset); + w6[0] = hc_bytealign_be (w1[3], w2[0], offset); + w5[3] = hc_bytealign_be (w1[2], w1[3], offset); + w5[2] = hc_bytealign_be (w1[1], w1[2], offset); + w5[1] = hc_bytealign_be (w1[0], w1[1], offset); + w5[0] = hc_bytealign_be (w0[3], w1[0], offset); + w4[3] = hc_bytealign_be (w0[2], w0[3], offset); + w4[2] = hc_bytealign_be (w0[1], w0[2], offset); + w4[1] = hc_bytealign_be (w0[0], w0[1], offset); + w4[0] = hc_bytealign_be ( 0, w0[0], offset); + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] 
= 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 17: + c4[1] = hc_bytealign_be (w7[3], 0, offset); + c4[0] = hc_bytealign_be (w7[2], w7[3], offset); + c3[3] = hc_bytealign_be (w7[1], w7[2], offset); + c3[2] = hc_bytealign_be (w7[0], w7[1], offset); + c3[1] = hc_bytealign_be (w6[3], w7[0], offset); + c3[0] = hc_bytealign_be (w6[2], w6[3], offset); + c2[3] = hc_bytealign_be (w6[1], w6[2], offset); + c2[2] = hc_bytealign_be (w6[0], w6[1], offset); + c2[1] = hc_bytealign_be (w5[3], w6[0], offset); + c2[0] = hc_bytealign_be (w5[2], w5[3], offset); + c1[3] = hc_bytealign_be (w5[1], w5[2], offset); + c1[2] = hc_bytealign_be (w5[0], w5[1], offset); + c1[1] = hc_bytealign_be (w4[3], w5[0], offset); + c1[0] = hc_bytealign_be (w4[2], w4[3], offset); + c0[3] = hc_bytealign_be (w4[1], w4[2], offset); + c0[2] = hc_bytealign_be (w4[0], w4[1], offset); + c0[1] = hc_bytealign_be (w3[3], w4[0], offset); + c0[0] = hc_bytealign_be (w3[2], w3[3], offset); + w7[3] = hc_bytealign_be (w3[1], w3[2], offset); + w7[2] = hc_bytealign_be (w3[0], w3[1], offset); + w7[1] = hc_bytealign_be (w2[3], w3[0], offset); + w7[0] = hc_bytealign_be (w2[2], w2[3], offset); + w6[3] = hc_bytealign_be (w2[1], w2[2], offset); + w6[2] = hc_bytealign_be (w2[0], w2[1], offset); + w6[1] = hc_bytealign_be (w1[3], w2[0], offset); + w6[0] = hc_bytealign_be (w1[2], w1[3], offset); + w5[3] = hc_bytealign_be (w1[1], w1[2], offset); + w5[2] = hc_bytealign_be (w1[0], w1[1], offset); + w5[1] = hc_bytealign_be (w0[3], w1[0], offset); + w5[0] = hc_bytealign_be (w0[2], w0[3], offset); + w4[3] = hc_bytealign_be (w0[1], w0[2], offset); + w4[2] = hc_bytealign_be (w0[0], w0[1], offset); + w4[1] = hc_bytealign_be ( 0, w0[0], offset); + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 18: + c4[2] = hc_bytealign_be 
(w7[3], 0, offset); + c4[1] = hc_bytealign_be (w7[2], w7[3], offset); + c4[0] = hc_bytealign_be (w7[1], w7[2], offset); + c3[3] = hc_bytealign_be (w7[0], w7[1], offset); + c3[2] = hc_bytealign_be (w6[3], w7[0], offset); + c3[1] = hc_bytealign_be (w6[2], w6[3], offset); + c3[0] = hc_bytealign_be (w6[1], w6[2], offset); + c2[3] = hc_bytealign_be (w6[0], w6[1], offset); + c2[2] = hc_bytealign_be (w5[3], w6[0], offset); + c2[1] = hc_bytealign_be (w5[2], w5[3], offset); + c2[0] = hc_bytealign_be (w5[1], w5[2], offset); + c1[3] = hc_bytealign_be (w5[0], w5[1], offset); + c1[2] = hc_bytealign_be (w4[3], w5[0], offset); + c1[1] = hc_bytealign_be (w4[2], w4[3], offset); + c1[0] = hc_bytealign_be (w4[1], w4[2], offset); + c0[3] = hc_bytealign_be (w4[0], w4[1], offset); + c0[2] = hc_bytealign_be (w3[3], w4[0], offset); + c0[1] = hc_bytealign_be (w3[2], w3[3], offset); + c0[0] = hc_bytealign_be (w3[1], w3[2], offset); + w7[3] = hc_bytealign_be (w3[0], w3[1], offset); + w7[2] = hc_bytealign_be (w2[3], w3[0], offset); + w7[1] = hc_bytealign_be (w2[2], w2[3], offset); + w7[0] = hc_bytealign_be (w2[1], w2[2], offset); + w6[3] = hc_bytealign_be (w2[0], w2[1], offset); + w6[2] = hc_bytealign_be (w1[3], w2[0], offset); + w6[1] = hc_bytealign_be (w1[2], w1[3], offset); + w6[0] = hc_bytealign_be (w1[1], w1[2], offset); + w5[3] = hc_bytealign_be (w1[0], w1[1], offset); + w5[2] = hc_bytealign_be (w0[3], w1[0], offset); + w5[1] = hc_bytealign_be (w0[2], w0[3], offset); + w5[0] = hc_bytealign_be (w0[1], w0[2], offset); + w4[3] = hc_bytealign_be (w0[0], w0[1], offset); + w4[2] = hc_bytealign_be ( 0, w0[0], offset); + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 19: + c4[3] = hc_bytealign_be (w7[3], 0, offset); + c4[2] = hc_bytealign_be (w7[2], w7[3], offset); + c4[1] = hc_bytealign_be 
(w7[1], w7[2], offset); + c4[0] = hc_bytealign_be (w7[0], w7[1], offset); + c3[3] = hc_bytealign_be (w6[3], w7[0], offset); + c3[2] = hc_bytealign_be (w6[2], w6[3], offset); + c3[1] = hc_bytealign_be (w6[1], w6[2], offset); + c3[0] = hc_bytealign_be (w6[0], w6[1], offset); + c2[3] = hc_bytealign_be (w5[3], w6[0], offset); + c2[2] = hc_bytealign_be (w5[2], w5[3], offset); + c2[1] = hc_bytealign_be (w5[1], w5[2], offset); + c2[0] = hc_bytealign_be (w5[0], w5[1], offset); + c1[3] = hc_bytealign_be (w4[3], w5[0], offset); + c1[2] = hc_bytealign_be (w4[2], w4[3], offset); + c1[1] = hc_bytealign_be (w4[1], w4[2], offset); + c1[0] = hc_bytealign_be (w4[0], w4[1], offset); + c0[3] = hc_bytealign_be (w3[3], w4[0], offset); + c0[2] = hc_bytealign_be (w3[2], w3[3], offset); + c0[1] = hc_bytealign_be (w3[1], w3[2], offset); + c0[0] = hc_bytealign_be (w3[0], w3[1], offset); + w7[3] = hc_bytealign_be (w2[3], w3[0], offset); + w7[2] = hc_bytealign_be (w2[2], w2[3], offset); + w7[1] = hc_bytealign_be (w2[1], w2[2], offset); + w7[0] = hc_bytealign_be (w2[0], w2[1], offset); + w6[3] = hc_bytealign_be (w1[3], w2[0], offset); + w6[2] = hc_bytealign_be (w1[2], w1[3], offset); + w6[1] = hc_bytealign_be (w1[1], w1[2], offset); + w6[0] = hc_bytealign_be (w1[0], w1[1], offset); + w5[3] = hc_bytealign_be (w0[3], w1[0], offset); + w5[2] = hc_bytealign_be (w0[2], w0[3], offset); + w5[1] = hc_bytealign_be (w0[1], w0[2], offset); + w5[0] = hc_bytealign_be (w0[0], w0[1], offset); + w4[3] = hc_bytealign_be ( 0, w0[0], offset); + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 20: + c5[0] = hc_bytealign_be (w7[3], 0, offset); + c4[3] = hc_bytealign_be (w7[2], w7[3], offset); + c4[2] = hc_bytealign_be (w7[1], w7[2], offset); + c4[1] = hc_bytealign_be (w7[0], w7[1], offset); + c4[0] = 
hc_bytealign_be (w6[3], w7[0], offset); + c3[3] = hc_bytealign_be (w6[2], w6[3], offset); + c3[2] = hc_bytealign_be (w6[1], w6[2], offset); + c3[1] = hc_bytealign_be (w6[0], w6[1], offset); + c3[0] = hc_bytealign_be (w5[3], w6[0], offset); + c2[3] = hc_bytealign_be (w5[2], w5[3], offset); + c2[2] = hc_bytealign_be (w5[1], w5[2], offset); + c2[1] = hc_bytealign_be (w5[0], w5[1], offset); + c2[0] = hc_bytealign_be (w4[3], w5[0], offset); + c1[3] = hc_bytealign_be (w4[2], w4[3], offset); + c1[2] = hc_bytealign_be (w4[1], w4[2], offset); + c1[1] = hc_bytealign_be (w4[0], w4[1], offset); + c1[0] = hc_bytealign_be (w3[3], w4[0], offset); + c0[3] = hc_bytealign_be (w3[2], w3[3], offset); + c0[2] = hc_bytealign_be (w3[1], w3[2], offset); + c0[1] = hc_bytealign_be (w3[0], w3[1], offset); + c0[0] = hc_bytealign_be (w2[3], w3[0], offset); + w7[3] = hc_bytealign_be (w2[2], w2[3], offset); + w7[2] = hc_bytealign_be (w2[1], w2[2], offset); + w7[1] = hc_bytealign_be (w2[0], w2[1], offset); + w7[0] = hc_bytealign_be (w1[3], w2[0], offset); + w6[3] = hc_bytealign_be (w1[2], w1[3], offset); + w6[2] = hc_bytealign_be (w1[1], w1[2], offset); + w6[1] = hc_bytealign_be (w1[0], w1[1], offset); + w6[0] = hc_bytealign_be (w0[3], w1[0], offset); + w5[3] = hc_bytealign_be (w0[2], w0[3], offset); + w5[2] = hc_bytealign_be (w0[1], w0[2], offset); + w5[1] = hc_bytealign_be (w0[0], w0[1], offset); + w5[0] = hc_bytealign_be ( 0, w0[0], offset); + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 21: + c5[1] = hc_bytealign_be (w7[3], 0, offset); + c5[0] = hc_bytealign_be (w7[2], w7[3], offset); + c4[3] = hc_bytealign_be (w7[1], w7[2], offset); + c4[2] = hc_bytealign_be (w7[0], w7[1], offset); + c4[1] = hc_bytealign_be (w6[3], w7[0], offset); + c4[0] = hc_bytealign_be 
(w6[2], w6[3], offset); + c3[3] = hc_bytealign_be (w6[1], w6[2], offset); + c3[2] = hc_bytealign_be (w6[0], w6[1], offset); + c3[1] = hc_bytealign_be (w5[3], w6[0], offset); + c3[0] = hc_bytealign_be (w5[2], w5[3], offset); + c2[3] = hc_bytealign_be (w5[1], w5[2], offset); + c2[2] = hc_bytealign_be (w5[0], w5[1], offset); + c2[1] = hc_bytealign_be (w4[3], w5[0], offset); + c2[0] = hc_bytealign_be (w4[2], w4[3], offset); + c1[3] = hc_bytealign_be (w4[1], w4[2], offset); + c1[2] = hc_bytealign_be (w4[0], w4[1], offset); + c1[1] = hc_bytealign_be (w3[3], w4[0], offset); + c1[0] = hc_bytealign_be (w3[2], w3[3], offset); + c0[3] = hc_bytealign_be (w3[1], w3[2], offset); + c0[2] = hc_bytealign_be (w3[0], w3[1], offset); + c0[1] = hc_bytealign_be (w2[3], w3[0], offset); + c0[0] = hc_bytealign_be (w2[2], w2[3], offset); + w7[3] = hc_bytealign_be (w2[1], w2[2], offset); + w7[2] = hc_bytealign_be (w2[0], w2[1], offset); + w7[1] = hc_bytealign_be (w1[3], w2[0], offset); + w7[0] = hc_bytealign_be (w1[2], w1[3], offset); + w6[3] = hc_bytealign_be (w1[1], w1[2], offset); + w6[2] = hc_bytealign_be (w1[0], w1[1], offset); + w6[1] = hc_bytealign_be (w0[3], w1[0], offset); + w6[0] = hc_bytealign_be (w0[2], w0[3], offset); + w5[3] = hc_bytealign_be (w0[1], w0[2], offset); + w5[2] = hc_bytealign_be (w0[0], w0[1], offset); + w5[1] = hc_bytealign_be ( 0, w0[0], offset); + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 22: + c5[2] = hc_bytealign_be (w7[3], 0, offset); + c5[1] = hc_bytealign_be (w7[2], w7[3], offset); + c5[0] = hc_bytealign_be (w7[1], w7[2], offset); + c4[3] = hc_bytealign_be (w7[0], w7[1], offset); + c4[2] = hc_bytealign_be (w6[3], w7[0], offset); + c4[1] = hc_bytealign_be (w6[2], w6[3], offset); + c4[0] = hc_bytealign_be (w6[1], 
w6[2], offset); + c3[3] = hc_bytealign_be (w6[0], w6[1], offset); + c3[2] = hc_bytealign_be (w5[3], w6[0], offset); + c3[1] = hc_bytealign_be (w5[2], w5[3], offset); + c3[0] = hc_bytealign_be (w5[1], w5[2], offset); + c2[3] = hc_bytealign_be (w5[0], w5[1], offset); + c2[2] = hc_bytealign_be (w4[3], w5[0], offset); + c2[1] = hc_bytealign_be (w4[2], w4[3], offset); + c2[0] = hc_bytealign_be (w4[1], w4[2], offset); + c1[3] = hc_bytealign_be (w4[0], w4[1], offset); + c1[2] = hc_bytealign_be (w3[3], w4[0], offset); + c1[1] = hc_bytealign_be (w3[2], w3[3], offset); + c1[0] = hc_bytealign_be (w3[1], w3[2], offset); + c0[3] = hc_bytealign_be (w3[0], w3[1], offset); + c0[2] = hc_bytealign_be (w2[3], w3[0], offset); + c0[1] = hc_bytealign_be (w2[2], w2[3], offset); + c0[0] = hc_bytealign_be (w2[1], w2[2], offset); + w7[3] = hc_bytealign_be (w2[0], w2[1], offset); + w7[2] = hc_bytealign_be (w1[3], w2[0], offset); + w7[1] = hc_bytealign_be (w1[2], w1[3], offset); + w7[0] = hc_bytealign_be (w1[1], w1[2], offset); + w6[3] = hc_bytealign_be (w1[0], w1[1], offset); + w6[2] = hc_bytealign_be (w0[3], w1[0], offset); + w6[1] = hc_bytealign_be (w0[2], w0[3], offset); + w6[0] = hc_bytealign_be (w0[1], w0[2], offset); + w5[3] = hc_bytealign_be (w0[0], w0[1], offset); + w5[2] = hc_bytealign_be ( 0, w0[0], offset); + w5[1] = 0; + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 23: + c5[3] = hc_bytealign_be (w7[3], 0, offset); + c5[2] = hc_bytealign_be (w7[2], w7[3], offset); + c5[1] = hc_bytealign_be (w7[1], w7[2], offset); + c5[0] = hc_bytealign_be (w7[0], w7[1], offset); + c4[3] = hc_bytealign_be (w6[3], w7[0], offset); + c4[2] = hc_bytealign_be (w6[2], w6[3], offset); + c4[1] = hc_bytealign_be (w6[1], w6[2], offset); + c4[0] = hc_bytealign_be 
(w6[0], w6[1], offset); + c3[3] = hc_bytealign_be (w5[3], w6[0], offset); + c3[2] = hc_bytealign_be (w5[2], w5[3], offset); + c3[1] = hc_bytealign_be (w5[1], w5[2], offset); + c3[0] = hc_bytealign_be (w5[0], w5[1], offset); + c2[3] = hc_bytealign_be (w4[3], w5[0], offset); + c2[2] = hc_bytealign_be (w4[2], w4[3], offset); + c2[1] = hc_bytealign_be (w4[1], w4[2], offset); + c2[0] = hc_bytealign_be (w4[0], w4[1], offset); + c1[3] = hc_bytealign_be (w3[3], w4[0], offset); + c1[2] = hc_bytealign_be (w3[2], w3[3], offset); + c1[1] = hc_bytealign_be (w3[1], w3[2], offset); + c1[0] = hc_bytealign_be (w3[0], w3[1], offset); + c0[3] = hc_bytealign_be (w2[3], w3[0], offset); + c0[2] = hc_bytealign_be (w2[2], w2[3], offset); + c0[1] = hc_bytealign_be (w2[1], w2[2], offset); + c0[0] = hc_bytealign_be (w2[0], w2[1], offset); + w7[3] = hc_bytealign_be (w1[3], w2[0], offset); + w7[2] = hc_bytealign_be (w1[2], w1[3], offset); + w7[1] = hc_bytealign_be (w1[1], w1[2], offset); + w7[0] = hc_bytealign_be (w1[0], w1[1], offset); + w6[3] = hc_bytealign_be (w0[3], w1[0], offset); + w6[2] = hc_bytealign_be (w0[2], w0[3], offset); + w6[1] = hc_bytealign_be (w0[1], w0[2], offset); + w6[0] = hc_bytealign_be (w0[0], w0[1], offset); + w5[3] = hc_bytealign_be ( 0, w0[0], offset); + w5[2] = 0; + w5[1] = 0; + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 24: + c6[0] = hc_bytealign_be (w7[3], 0, offset); + c5[3] = hc_bytealign_be (w7[2], w7[3], offset); + c5[2] = hc_bytealign_be (w7[1], w7[2], offset); + c5[1] = hc_bytealign_be (w7[0], w7[1], offset); + c5[0] = hc_bytealign_be (w6[3], w7[0], offset); + c4[3] = hc_bytealign_be (w6[2], w6[3], offset); + c4[2] = hc_bytealign_be (w6[1], w6[2], offset); + c4[1] = hc_bytealign_be (w6[0], w6[1], offset); + c4[0] = 
hc_bytealign_be (w5[3], w6[0], offset); + c3[3] = hc_bytealign_be (w5[2], w5[3], offset); + c3[2] = hc_bytealign_be (w5[1], w5[2], offset); + c3[1] = hc_bytealign_be (w5[0], w5[1], offset); + c3[0] = hc_bytealign_be (w4[3], w5[0], offset); + c2[3] = hc_bytealign_be (w4[2], w4[3], offset); + c2[2] = hc_bytealign_be (w4[1], w4[2], offset); + c2[1] = hc_bytealign_be (w4[0], w4[1], offset); + c2[0] = hc_bytealign_be (w3[3], w4[0], offset); + c1[3] = hc_bytealign_be (w3[2], w3[3], offset); + c1[2] = hc_bytealign_be (w3[1], w3[2], offset); + c1[1] = hc_bytealign_be (w3[0], w3[1], offset); + c1[0] = hc_bytealign_be (w2[3], w3[0], offset); + c0[3] = hc_bytealign_be (w2[2], w2[3], offset); + c0[2] = hc_bytealign_be (w2[1], w2[2], offset); + c0[1] = hc_bytealign_be (w2[0], w2[1], offset); + c0[0] = hc_bytealign_be (w1[3], w2[0], offset); + w7[3] = hc_bytealign_be (w1[2], w1[3], offset); + w7[2] = hc_bytealign_be (w1[1], w1[2], offset); + w7[1] = hc_bytealign_be (w1[0], w1[1], offset); + w7[0] = hc_bytealign_be (w0[3], w1[0], offset); + w6[3] = hc_bytealign_be (w0[2], w0[3], offset); + w6[2] = hc_bytealign_be (w0[1], w0[2], offset); + w6[1] = hc_bytealign_be (w0[0], w0[1], offset); + w6[0] = hc_bytealign_be ( 0, w0[0], offset); + w5[3] = 0; + w5[2] = 0; + w5[1] = 0; + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 25: + c6[1] = hc_bytealign_be (w7[3], 0, offset); + c6[0] = hc_bytealign_be (w7[2], w7[3], offset); + c5[3] = hc_bytealign_be (w7[1], w7[2], offset); + c5[2] = hc_bytealign_be (w7[0], w7[1], offset); + c5[1] = hc_bytealign_be (w6[3], w7[0], offset); + c5[0] = hc_bytealign_be (w6[2], w6[3], offset); + c4[3] = hc_bytealign_be (w6[1], w6[2], offset); + c4[2] = hc_bytealign_be (w6[0], w6[1], offset); + c4[1] = hc_bytealign_be 
(w5[3], w6[0], offset); + c4[0] = hc_bytealign_be (w5[2], w5[3], offset); + c3[3] = hc_bytealign_be (w5[1], w5[2], offset); + c3[2] = hc_bytealign_be (w5[0], w5[1], offset); + c3[1] = hc_bytealign_be (w4[3], w5[0], offset); + c3[0] = hc_bytealign_be (w4[2], w4[3], offset); + c2[3] = hc_bytealign_be (w4[1], w4[2], offset); + c2[2] = hc_bytealign_be (w4[0], w4[1], offset); + c2[1] = hc_bytealign_be (w3[3], w4[0], offset); + c2[0] = hc_bytealign_be (w3[2], w3[3], offset); + c1[3] = hc_bytealign_be (w3[1], w3[2], offset); + c1[2] = hc_bytealign_be (w3[0], w3[1], offset); + c1[1] = hc_bytealign_be (w2[3], w3[0], offset); + c1[0] = hc_bytealign_be (w2[2], w2[3], offset); + c0[3] = hc_bytealign_be (w2[1], w2[2], offset); + c0[2] = hc_bytealign_be (w2[0], w2[1], offset); + c0[1] = hc_bytealign_be (w1[3], w2[0], offset); + c0[0] = hc_bytealign_be (w1[2], w1[3], offset); + w7[3] = hc_bytealign_be (w1[1], w1[2], offset); + w7[2] = hc_bytealign_be (w1[0], w1[1], offset); + w7[1] = hc_bytealign_be (w0[3], w1[0], offset); + w7[0] = hc_bytealign_be (w0[2], w0[3], offset); + w6[3] = hc_bytealign_be (w0[1], w0[2], offset); + w6[2] = hc_bytealign_be (w0[0], w0[1], offset); + w6[1] = hc_bytealign_be ( 0, w0[0], offset); + w6[0] = 0; + w5[3] = 0; + w5[2] = 0; + w5[1] = 0; + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 26: + c6[2] = hc_bytealign_be (w7[3], 0, offset); + c6[1] = hc_bytealign_be (w7[2], w7[3], offset); + c6[0] = hc_bytealign_be (w7[1], w7[2], offset); + c5[3] = hc_bytealign_be (w7[0], w7[1], offset); + c5[2] = hc_bytealign_be (w6[3], w7[0], offset); + c5[1] = hc_bytealign_be (w6[2], w6[3], offset); + c5[0] = hc_bytealign_be (w6[1], w6[2], offset); + c4[3] = hc_bytealign_be (w6[0], w6[1], offset); + c4[2] = hc_bytealign_be (w5[3], 
w6[0], offset); + c4[1] = hc_bytealign_be (w5[2], w5[3], offset); + c4[0] = hc_bytealign_be (w5[1], w5[2], offset); + c3[3] = hc_bytealign_be (w5[0], w5[1], offset); + c3[2] = hc_bytealign_be (w4[3], w5[0], offset); + c3[1] = hc_bytealign_be (w4[2], w4[3], offset); + c3[0] = hc_bytealign_be (w4[1], w4[2], offset); + c2[3] = hc_bytealign_be (w4[0], w4[1], offset); + c2[2] = hc_bytealign_be (w3[3], w4[0], offset); + c2[1] = hc_bytealign_be (w3[2], w3[3], offset); + c2[0] = hc_bytealign_be (w3[1], w3[2], offset); + c1[3] = hc_bytealign_be (w3[0], w3[1], offset); + c1[2] = hc_bytealign_be (w2[3], w3[0], offset); + c1[1] = hc_bytealign_be (w2[2], w2[3], offset); + c1[0] = hc_bytealign_be (w2[1], w2[2], offset); + c0[3] = hc_bytealign_be (w2[0], w2[1], offset); + c0[2] = hc_bytealign_be (w1[3], w2[0], offset); + c0[1] = hc_bytealign_be (w1[2], w1[3], offset); + c0[0] = hc_bytealign_be (w1[1], w1[2], offset); + w7[3] = hc_bytealign_be (w1[0], w1[1], offset); + w7[2] = hc_bytealign_be (w0[3], w1[0], offset); + w7[1] = hc_bytealign_be (w0[2], w0[3], offset); + w7[0] = hc_bytealign_be (w0[1], w0[2], offset); + w6[3] = hc_bytealign_be (w0[0], w0[1], offset); + w6[2] = hc_bytealign_be ( 0, w0[0], offset); + w6[1] = 0; + w6[0] = 0; + w5[3] = 0; + w5[2] = 0; + w5[1] = 0; + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 27: + c6[3] = hc_bytealign_be (w7[3], 0, offset); + c6[2] = hc_bytealign_be (w7[2], w7[3], offset); + c6[1] = hc_bytealign_be (w7[1], w7[2], offset); + c6[0] = hc_bytealign_be (w7[0], w7[1], offset); + c5[3] = hc_bytealign_be (w6[3], w7[0], offset); + c5[2] = hc_bytealign_be (w6[2], w6[3], offset); + c5[1] = hc_bytealign_be (w6[1], w6[2], offset); + c5[0] = hc_bytealign_be (w6[0], w6[1], offset); + c4[3] = hc_bytealign_be 
(w5[3], w6[0], offset); + c4[2] = hc_bytealign_be (w5[2], w5[3], offset); + c4[1] = hc_bytealign_be (w5[1], w5[2], offset); + c4[0] = hc_bytealign_be (w5[0], w5[1], offset); + c3[3] = hc_bytealign_be (w4[3], w5[0], offset); + c3[2] = hc_bytealign_be (w4[2], w4[3], offset); + c3[1] = hc_bytealign_be (w4[1], w4[2], offset); + c3[0] = hc_bytealign_be (w4[0], w4[1], offset); + c2[3] = hc_bytealign_be (w3[3], w4[0], offset); + c2[2] = hc_bytealign_be (w3[2], w3[3], offset); + c2[1] = hc_bytealign_be (w3[1], w3[2], offset); + c2[0] = hc_bytealign_be (w3[0], w3[1], offset); + c1[3] = hc_bytealign_be (w2[3], w3[0], offset); + c1[2] = hc_bytealign_be (w2[2], w2[3], offset); + c1[1] = hc_bytealign_be (w2[1], w2[2], offset); + c1[0] = hc_bytealign_be (w2[0], w2[1], offset); + c0[3] = hc_bytealign_be (w1[3], w2[0], offset); + c0[2] = hc_bytealign_be (w1[2], w1[3], offset); + c0[1] = hc_bytealign_be (w1[1], w1[2], offset); + c0[0] = hc_bytealign_be (w1[0], w1[1], offset); + w7[3] = hc_bytealign_be (w0[3], w1[0], offset); + w7[2] = hc_bytealign_be (w0[2], w0[3], offset); + w7[1] = hc_bytealign_be (w0[1], w0[2], offset); + w7[0] = hc_bytealign_be (w0[0], w0[1], offset); + w6[3] = hc_bytealign_be ( 0, w0[0], offset); + w6[2] = 0; + w6[1] = 0; + w6[0] = 0; + w5[3] = 0; + w5[2] = 0; + w5[1] = 0; + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 28: + c7[0] = hc_bytealign_be (w7[3], 0, offset); + c6[3] = hc_bytealign_be (w7[2], w7[3], offset); + c6[2] = hc_bytealign_be (w7[1], w7[2], offset); + c6[1] = hc_bytealign_be (w7[0], w7[1], offset); + c6[0] = hc_bytealign_be (w6[3], w7[0], offset); + c5[3] = hc_bytealign_be (w6[2], w6[3], offset); + c5[2] = hc_bytealign_be (w6[1], w6[2], offset); + c5[1] = hc_bytealign_be (w6[0], w6[1], offset); + c5[0] 
= hc_bytealign_be (w5[3], w6[0], offset); + c4[3] = hc_bytealign_be (w5[2], w5[3], offset); + c4[2] = hc_bytealign_be (w5[1], w5[2], offset); + c4[1] = hc_bytealign_be (w5[0], w5[1], offset); + c4[0] = hc_bytealign_be (w4[3], w5[0], offset); + c3[3] = hc_bytealign_be (w4[2], w4[3], offset); + c3[2] = hc_bytealign_be (w4[1], w4[2], offset); + c3[1] = hc_bytealign_be (w4[0], w4[1], offset); + c3[0] = hc_bytealign_be (w3[3], w4[0], offset); + c2[3] = hc_bytealign_be (w3[2], w3[3], offset); + c2[2] = hc_bytealign_be (w3[1], w3[2], offset); + c2[1] = hc_bytealign_be (w3[0], w3[1], offset); + c2[0] = hc_bytealign_be (w2[3], w3[0], offset); + c1[3] = hc_bytealign_be (w2[2], w2[3], offset); + c1[2] = hc_bytealign_be (w2[1], w2[2], offset); + c1[1] = hc_bytealign_be (w2[0], w2[1], offset); + c1[0] = hc_bytealign_be (w1[3], w2[0], offset); + c0[3] = hc_bytealign_be (w1[2], w1[3], offset); + c0[2] = hc_bytealign_be (w1[1], w1[2], offset); + c0[1] = hc_bytealign_be (w1[0], w1[1], offset); + c0[0] = hc_bytealign_be (w0[3], w1[0], offset); + w7[3] = hc_bytealign_be (w0[2], w0[3], offset); + w7[2] = hc_bytealign_be (w0[1], w0[2], offset); + w7[1] = hc_bytealign_be (w0[0], w0[1], offset); + w7[0] = hc_bytealign_be ( 0, w0[0], offset); + w6[3] = 0; + w6[2] = 0; + w6[1] = 0; + w6[0] = 0; + w5[3] = 0; + w5[2] = 0; + w5[1] = 0; + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 29: + c7[1] = hc_bytealign_be (w7[3], 0, offset); + c7[0] = hc_bytealign_be (w7[2], w7[3], offset); + c6[3] = hc_bytealign_be (w7[1], w7[2], offset); + c6[2] = hc_bytealign_be (w7[0], w7[1], offset); + c6[1] = hc_bytealign_be (w6[3], w7[0], offset); + c6[0] = hc_bytealign_be (w6[2], w6[3], offset); + c5[3] = hc_bytealign_be (w6[1], w6[2], offset); + c5[2] = hc_bytealign_be 
(w6[0], w6[1], offset); + c5[1] = hc_bytealign_be (w5[3], w6[0], offset); + c5[0] = hc_bytealign_be (w5[2], w5[3], offset); + c4[3] = hc_bytealign_be (w5[1], w5[2], offset); + c4[2] = hc_bytealign_be (w5[0], w5[1], offset); + c4[1] = hc_bytealign_be (w4[3], w5[0], offset); + c4[0] = hc_bytealign_be (w4[2], w4[3], offset); + c3[3] = hc_bytealign_be (w4[1], w4[2], offset); + c3[2] = hc_bytealign_be (w4[0], w4[1], offset); + c3[1] = hc_bytealign_be (w3[3], w4[0], offset); + c3[0] = hc_bytealign_be (w3[2], w3[3], offset); + c2[3] = hc_bytealign_be (w3[1], w3[2], offset); + c2[2] = hc_bytealign_be (w3[0], w3[1], offset); + c2[1] = hc_bytealign_be (w2[3], w3[0], offset); + c2[0] = hc_bytealign_be (w2[2], w2[3], offset); + c1[3] = hc_bytealign_be (w2[1], w2[2], offset); + c1[2] = hc_bytealign_be (w2[0], w2[1], offset); + c1[1] = hc_bytealign_be (w1[3], w2[0], offset); + c1[0] = hc_bytealign_be (w1[2], w1[3], offset); + c0[3] = hc_bytealign_be (w1[1], w1[2], offset); + c0[2] = hc_bytealign_be (w1[0], w1[1], offset); + c0[1] = hc_bytealign_be (w0[3], w1[0], offset); + c0[0] = hc_bytealign_be (w0[2], w0[3], offset); + w7[3] = hc_bytealign_be (w0[1], w0[2], offset); + w7[2] = hc_bytealign_be (w0[0], w0[1], offset); + w7[1] = hc_bytealign_be ( 0, w0[0], offset); + w7[0] = 0; + w6[3] = 0; + w6[2] = 0; + w6[1] = 0; + w6[0] = 0; + w5[3] = 0; + w5[2] = 0; + w5[1] = 0; + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 30: + c7[2] = hc_bytealign_be (w7[3], 0, offset); + c7[1] = hc_bytealign_be (w7[2], w7[3], offset); + c7[0] = hc_bytealign_be (w7[1], w7[2], offset); + c6[3] = hc_bytealign_be (w7[0], w7[1], offset); + c6[2] = hc_bytealign_be (w6[3], w7[0], offset); + c6[1] = hc_bytealign_be (w6[2], w6[3], offset); + c6[0] = hc_bytealign_be 
(w6[1], w6[2], offset); + c5[3] = hc_bytealign_be (w6[0], w6[1], offset); + c5[2] = hc_bytealign_be (w5[3], w6[0], offset); + c5[1] = hc_bytealign_be (w5[2], w5[3], offset); + c5[0] = hc_bytealign_be (w5[1], w5[2], offset); + c4[3] = hc_bytealign_be (w5[0], w5[1], offset); + c4[2] = hc_bytealign_be (w4[3], w5[0], offset); + c4[1] = hc_bytealign_be (w4[2], w4[3], offset); + c4[0] = hc_bytealign_be (w4[1], w4[2], offset); + c3[3] = hc_bytealign_be (w4[0], w4[1], offset); + c3[2] = hc_bytealign_be (w3[3], w4[0], offset); + c3[1] = hc_bytealign_be (w3[2], w3[3], offset); + c3[0] = hc_bytealign_be (w3[1], w3[2], offset); + c2[3] = hc_bytealign_be (w3[0], w3[1], offset); + c2[2] = hc_bytealign_be (w2[3], w3[0], offset); + c2[1] = hc_bytealign_be (w2[2], w2[3], offset); + c2[0] = hc_bytealign_be (w2[1], w2[2], offset); + c1[3] = hc_bytealign_be (w2[0], w2[1], offset); + c1[2] = hc_bytealign_be (w1[3], w2[0], offset); + c1[1] = hc_bytealign_be (w1[2], w1[3], offset); + c1[0] = hc_bytealign_be (w1[1], w1[2], offset); + c0[3] = hc_bytealign_be (w1[0], w1[1], offset); + c0[2] = hc_bytealign_be (w0[3], w1[0], offset); + c0[1] = hc_bytealign_be (w0[2], w0[3], offset); + c0[0] = hc_bytealign_be (w0[1], w0[2], offset); + w7[3] = hc_bytealign_be (w0[0], w0[1], offset); + w7[2] = hc_bytealign_be ( 0, w0[0], offset); + w7[1] = 0; + w7[0] = 0; + w6[3] = 0; + w6[2] = 0; + w6[1] = 0; + w6[0] = 0; + w5[3] = 0; + w5[2] = 0; + w5[1] = 0; + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 31: + c7[3] = hc_bytealign_be (w7[3], 0, offset); + c7[2] = hc_bytealign_be (w7[2], w7[3], offset); + c7[1] = hc_bytealign_be (w7[1], w7[2], offset); + c7[0] = hc_bytealign_be (w7[0], w7[1], offset); + c6[3] = hc_bytealign_be (w6[3], w7[0], offset); + c6[2] = 
hc_bytealign_be (w6[2], w6[3], offset); + c6[1] = hc_bytealign_be (w6[1], w6[2], offset); + c6[0] = hc_bytealign_be (w6[0], w6[1], offset); + c5[3] = hc_bytealign_be (w5[3], w6[0], offset); + c5[2] = hc_bytealign_be (w5[2], w5[3], offset); + c5[1] = hc_bytealign_be (w5[1], w5[2], offset); + c5[0] = hc_bytealign_be (w5[0], w5[1], offset); + c4[3] = hc_bytealign_be (w4[3], w5[0], offset); + c4[2] = hc_bytealign_be (w4[2], w4[3], offset); + c4[1] = hc_bytealign_be (w4[1], w4[2], offset); + c4[0] = hc_bytealign_be (w4[0], w4[1], offset); + c3[3] = hc_bytealign_be (w3[3], w4[0], offset); + c3[2] = hc_bytealign_be (w3[2], w3[3], offset); + c3[1] = hc_bytealign_be (w3[1], w3[2], offset); + c3[0] = hc_bytealign_be (w3[0], w3[1], offset); + c2[3] = hc_bytealign_be (w2[3], w3[0], offset); + c2[2] = hc_bytealign_be (w2[2], w2[3], offset); + c2[1] = hc_bytealign_be (w2[1], w2[2], offset); + c2[0] = hc_bytealign_be (w2[0], w2[1], offset); + c1[3] = hc_bytealign_be (w1[3], w2[0], offset); + c1[2] = hc_bytealign_be (w1[2], w1[3], offset); + c1[1] = hc_bytealign_be (w1[1], w1[2], offset); + c1[0] = hc_bytealign_be (w1[0], w1[1], offset); + c0[3] = hc_bytealign_be (w0[3], w1[0], offset); + c0[2] = hc_bytealign_be (w0[2], w0[3], offset); + c0[1] = hc_bytealign_be (w0[1], w0[2], offset); + c0[0] = hc_bytealign_be (w0[0], w0[1], offset); + w7[3] = hc_bytealign_be ( 0, w0[0], offset); + w7[2] = 0; + w7[1] = 0; + w7[0] = 0; + w6[3] = 0; + w6[2] = 0; + w6[1] = 0; + w6[0] = 0; + w5[3] = 0; + w5[2] = 0; + w5[1] = 0; + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + } + #endif + + #if (defined IS_AMD && HAS_VPERM == 1) || defined IS_NV + + #if defined IS_NV + const int selector = (0x76543210 >> ((offset & 3) * 4)) & 0xffff; + #endif + + #if defined IS_AMD + 
const int selector = 0x0706050403020100 >> ((offset & 3) * 8); + #endif + + switch (offset_switch) + { + case 0: + c0[0] = hc_byte_perm ( 0, w7[3], selector); + w7[3] = hc_byte_perm (w7[3], w7[2], selector); + w7[2] = hc_byte_perm (w7[2], w7[1], selector); + w7[1] = hc_byte_perm (w7[1], w7[0], selector); + w7[0] = hc_byte_perm (w7[0], w6[3], selector); + w6[3] = hc_byte_perm (w6[3], w6[2], selector); + w6[2] = hc_byte_perm (w6[2], w6[1], selector); + w6[1] = hc_byte_perm (w6[1], w6[0], selector); + w6[0] = hc_byte_perm (w6[0], w5[3], selector); + w5[3] = hc_byte_perm (w5[3], w5[2], selector); + w5[2] = hc_byte_perm (w5[2], w5[1], selector); + w5[1] = hc_byte_perm (w5[1], w5[0], selector); + w5[0] = hc_byte_perm (w5[0], w4[3], selector); + w4[3] = hc_byte_perm (w4[3], w4[2], selector); + w4[2] = hc_byte_perm (w4[2], w4[1], selector); + w4[1] = hc_byte_perm (w4[1], w4[0], selector); + w4[0] = hc_byte_perm (w4[0], w3[3], selector); + w3[3] = hc_byte_perm (w3[3], w3[2], selector); + w3[2] = hc_byte_perm (w3[2], w3[1], selector); + w3[1] = hc_byte_perm (w3[1], w3[0], selector); + w3[0] = hc_byte_perm (w3[0], w2[3], selector); + w2[3] = hc_byte_perm (w2[3], w2[2], selector); + w2[2] = hc_byte_perm (w2[2], w2[1], selector); + w2[1] = hc_byte_perm (w2[1], w2[0], selector); + w2[0] = hc_byte_perm (w2[0], w1[3], selector); + w1[3] = hc_byte_perm (w1[3], w1[2], selector); + w1[2] = hc_byte_perm (w1[2], w1[1], selector); + w1[1] = hc_byte_perm (w1[1], w1[0], selector); + w1[0] = hc_byte_perm (w1[0], w0[3], selector); + w0[3] = hc_byte_perm (w0[3], w0[2], selector); + w0[2] = hc_byte_perm (w0[2], w0[1], selector); + w0[1] = hc_byte_perm (w0[1], w0[0], selector); + w0[0] = hc_byte_perm (w0[0], 0, selector); + + break; + + case 1: + c0[1] = hc_byte_perm ( 0, w7[3], selector); + c0[0] = hc_byte_perm (w7[3], w7[2], selector); + w7[3] = hc_byte_perm (w7[2], w7[1], selector); + w7[2] = hc_byte_perm (w7[1], w7[0], selector); + w7[1] = hc_byte_perm (w7[0], w6[3], selector); + w7[0] = 
hc_byte_perm (w6[3], w6[2], selector); + w6[3] = hc_byte_perm (w6[2], w6[1], selector); + w6[2] = hc_byte_perm (w6[1], w6[0], selector); + w6[1] = hc_byte_perm (w6[0], w5[3], selector); + w6[0] = hc_byte_perm (w5[3], w5[2], selector); + w5[3] = hc_byte_perm (w5[2], w5[1], selector); + w5[2] = hc_byte_perm (w5[1], w5[0], selector); + w5[1] = hc_byte_perm (w5[0], w4[3], selector); + w5[0] = hc_byte_perm (w4[3], w4[2], selector); + w4[3] = hc_byte_perm (w4[2], w4[1], selector); + w4[2] = hc_byte_perm (w4[1], w4[0], selector); + w4[1] = hc_byte_perm (w4[0], w3[3], selector); + w4[0] = hc_byte_perm (w3[3], w3[2], selector); + w3[3] = hc_byte_perm (w3[2], w3[1], selector); + w3[2] = hc_byte_perm (w3[1], w3[0], selector); + w3[1] = hc_byte_perm (w3[0], w2[3], selector); + w3[0] = hc_byte_perm (w2[3], w2[2], selector); + w2[3] = hc_byte_perm (w2[2], w2[1], selector); + w2[2] = hc_byte_perm (w2[1], w2[0], selector); + w2[1] = hc_byte_perm (w2[0], w1[3], selector); + w2[0] = hc_byte_perm (w1[3], w1[2], selector); + w1[3] = hc_byte_perm (w1[2], w1[1], selector); + w1[2] = hc_byte_perm (w1[1], w1[0], selector); + w1[1] = hc_byte_perm (w1[0], w0[3], selector); + w1[0] = hc_byte_perm (w0[3], w0[2], selector); + w0[3] = hc_byte_perm (w0[2], w0[1], selector); + w0[2] = hc_byte_perm (w0[1], w0[0], selector); + w0[1] = hc_byte_perm (w0[0], 0, selector); + w0[0] = 0; + + break; + + case 2: + c0[2] = hc_byte_perm ( 0, w7[3], selector); + c0[1] = hc_byte_perm (w7[3], w7[2], selector); + c0[0] = hc_byte_perm (w7[2], w7[1], selector); + w7[3] = hc_byte_perm (w7[1], w7[0], selector); + w7[2] = hc_byte_perm (w7[0], w6[3], selector); + w7[1] = hc_byte_perm (w6[3], w6[2], selector); + w7[0] = hc_byte_perm (w6[2], w6[1], selector); + w6[3] = hc_byte_perm (w6[1], w6[0], selector); + w6[2] = hc_byte_perm (w6[0], w5[3], selector); + w6[1] = hc_byte_perm (w5[3], w5[2], selector); + w6[0] = hc_byte_perm (w5[2], w5[1], selector); + w5[3] = hc_byte_perm (w5[1], w5[0], selector); + w5[2] = 
hc_byte_perm (w5[0], w4[3], selector); + w5[1] = hc_byte_perm (w4[3], w4[2], selector); + w5[0] = hc_byte_perm (w4[2], w4[1], selector); + w4[3] = hc_byte_perm (w4[1], w4[0], selector); + w4[2] = hc_byte_perm (w4[0], w3[3], selector); + w4[1] = hc_byte_perm (w3[3], w3[2], selector); + w4[0] = hc_byte_perm (w3[2], w3[1], selector); + w3[3] = hc_byte_perm (w3[1], w3[0], selector); + w3[2] = hc_byte_perm (w3[0], w2[3], selector); + w3[1] = hc_byte_perm (w2[3], w2[2], selector); + w3[0] = hc_byte_perm (w2[2], w2[1], selector); + w2[3] = hc_byte_perm (w2[1], w2[0], selector); + w2[2] = hc_byte_perm (w2[0], w1[3], selector); + w2[1] = hc_byte_perm (w1[3], w1[2], selector); + w2[0] = hc_byte_perm (w1[2], w1[1], selector); + w1[3] = hc_byte_perm (w1[1], w1[0], selector); + w1[2] = hc_byte_perm (w1[0], w0[3], selector); + w1[1] = hc_byte_perm (w0[3], w0[2], selector); + w1[0] = hc_byte_perm (w0[2], w0[1], selector); + w0[3] = hc_byte_perm (w0[1], w0[0], selector); + w0[2] = hc_byte_perm (w0[0], 0, selector); + w0[1] = 0; + w0[0] = 0; + + break; + + case 3: + c0[3] = hc_byte_perm ( 0, w7[3], selector); + c0[2] = hc_byte_perm (w7[3], w7[2], selector); + c0[1] = hc_byte_perm (w7[2], w7[1], selector); + c0[0] = hc_byte_perm (w7[1], w7[0], selector); + w7[3] = hc_byte_perm (w7[0], w6[3], selector); + w7[2] = hc_byte_perm (w6[3], w6[2], selector); + w7[1] = hc_byte_perm (w6[2], w6[1], selector); + w7[0] = hc_byte_perm (w6[1], w6[0], selector); + w6[3] = hc_byte_perm (w6[0], w5[3], selector); + w6[2] = hc_byte_perm (w5[3], w5[2], selector); + w6[1] = hc_byte_perm (w5[2], w5[1], selector); + w6[0] = hc_byte_perm (w5[1], w5[0], selector); + w5[3] = hc_byte_perm (w5[0], w4[3], selector); + w5[2] = hc_byte_perm (w4[3], w4[2], selector); + w5[1] = hc_byte_perm (w4[2], w4[1], selector); + w5[0] = hc_byte_perm (w4[1], w4[0], selector); + w4[3] = hc_byte_perm (w4[0], w3[3], selector); + w4[2] = hc_byte_perm (w3[3], w3[2], selector); + w4[1] = hc_byte_perm (w3[2], w3[1], selector); + w4[0] 
= hc_byte_perm (w3[1], w3[0], selector); + w3[3] = hc_byte_perm (w3[0], w2[3], selector); + w3[2] = hc_byte_perm (w2[3], w2[2], selector); + w3[1] = hc_byte_perm (w2[2], w2[1], selector); + w3[0] = hc_byte_perm (w2[1], w2[0], selector); + w2[3] = hc_byte_perm (w2[0], w1[3], selector); + w2[2] = hc_byte_perm (w1[3], w1[2], selector); + w2[1] = hc_byte_perm (w1[2], w1[1], selector); + w2[0] = hc_byte_perm (w1[1], w1[0], selector); + w1[3] = hc_byte_perm (w1[0], w0[3], selector); + w1[2] = hc_byte_perm (w0[3], w0[2], selector); + w1[1] = hc_byte_perm (w0[2], w0[1], selector); + w1[0] = hc_byte_perm (w0[1], w0[0], selector); + w0[3] = hc_byte_perm (w0[0], 0, selector); + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 4: + c1[0] = hc_byte_perm ( 0, w7[3], selector); + c0[3] = hc_byte_perm (w7[3], w7[2], selector); + c0[2] = hc_byte_perm (w7[2], w7[1], selector); + c0[1] = hc_byte_perm (w7[1], w7[0], selector); + c0[0] = hc_byte_perm (w7[0], w6[3], selector); + w7[3] = hc_byte_perm (w6[3], w6[2], selector); + w7[2] = hc_byte_perm (w6[2], w6[1], selector); + w7[1] = hc_byte_perm (w6[1], w6[0], selector); + w7[0] = hc_byte_perm (w6[0], w5[3], selector); + w6[3] = hc_byte_perm (w5[3], w5[2], selector); + w6[2] = hc_byte_perm (w5[2], w5[1], selector); + w6[1] = hc_byte_perm (w5[1], w5[0], selector); + w6[0] = hc_byte_perm (w5[0], w4[3], selector); + w5[3] = hc_byte_perm (w4[3], w4[2], selector); + w5[2] = hc_byte_perm (w4[2], w4[1], selector); + w5[1] = hc_byte_perm (w4[1], w4[0], selector); + w5[0] = hc_byte_perm (w4[0], w3[3], selector); + w4[3] = hc_byte_perm (w3[3], w3[2], selector); + w4[2] = hc_byte_perm (w3[2], w3[1], selector); + w4[1] = hc_byte_perm (w3[1], w3[0], selector); + w4[0] = hc_byte_perm (w3[0], w2[3], selector); + w3[3] = hc_byte_perm (w2[3], w2[2], selector); + w3[2] = hc_byte_perm (w2[2], w2[1], selector); + w3[1] = hc_byte_perm (w2[1], w2[0], selector); + w3[0] = hc_byte_perm (w2[0], w1[3], selector); + w2[3] = hc_byte_perm (w1[3], w1[2], 
selector); + w2[2] = hc_byte_perm (w1[2], w1[1], selector); + w2[1] = hc_byte_perm (w1[1], w1[0], selector); + w2[0] = hc_byte_perm (w1[0], w0[3], selector); + w1[3] = hc_byte_perm (w0[3], w0[2], selector); + w1[2] = hc_byte_perm (w0[2], w0[1], selector); + w1[1] = hc_byte_perm (w0[1], w0[0], selector); + w1[0] = hc_byte_perm (w0[0], 0, selector); + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 5: + c1[1] = hc_byte_perm ( 0, w7[3], selector); + c1[0] = hc_byte_perm (w7[3], w7[2], selector); + c0[3] = hc_byte_perm (w7[2], w7[1], selector); + c0[2] = hc_byte_perm (w7[1], w7[0], selector); + c0[1] = hc_byte_perm (w7[0], w6[3], selector); + c0[0] = hc_byte_perm (w6[3], w6[2], selector); + w7[3] = hc_byte_perm (w6[2], w6[1], selector); + w7[2] = hc_byte_perm (w6[1], w6[0], selector); + w7[1] = hc_byte_perm (w6[0], w5[3], selector); + w7[0] = hc_byte_perm (w5[3], w5[2], selector); + w6[3] = hc_byte_perm (w5[2], w5[1], selector); + w6[2] = hc_byte_perm (w5[1], w5[0], selector); + w6[1] = hc_byte_perm (w5[0], w4[3], selector); + w6[0] = hc_byte_perm (w4[3], w4[2], selector); + w5[3] = hc_byte_perm (w4[2], w4[1], selector); + w5[2] = hc_byte_perm (w4[1], w4[0], selector); + w5[1] = hc_byte_perm (w4[0], w3[3], selector); + w5[0] = hc_byte_perm (w3[3], w3[2], selector); + w4[3] = hc_byte_perm (w3[2], w3[1], selector); + w4[2] = hc_byte_perm (w3[1], w3[0], selector); + w4[1] = hc_byte_perm (w3[0], w2[3], selector); + w4[0] = hc_byte_perm (w2[3], w2[2], selector); + w3[3] = hc_byte_perm (w2[2], w2[1], selector); + w3[2] = hc_byte_perm (w2[1], w2[0], selector); + w3[1] = hc_byte_perm (w2[0], w1[3], selector); + w3[0] = hc_byte_perm (w1[3], w1[2], selector); + w2[3] = hc_byte_perm (w1[2], w1[1], selector); + w2[2] = hc_byte_perm (w1[1], w1[0], selector); + w2[1] = hc_byte_perm (w1[0], w0[3], selector); + w2[0] = hc_byte_perm (w0[3], w0[2], selector); + w1[3] = hc_byte_perm (w0[2], w0[1], selector); + w1[2] = hc_byte_perm (w0[1], w0[0], selector); + w1[1] = 
hc_byte_perm (w0[0], 0, selector); + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 6: + c1[2] = hc_byte_perm ( 0, w7[3], selector); + c1[1] = hc_byte_perm (w7[3], w7[2], selector); + c1[0] = hc_byte_perm (w7[2], w7[1], selector); + c0[3] = hc_byte_perm (w7[1], w7[0], selector); + c0[2] = hc_byte_perm (w7[0], w6[3], selector); + c0[1] = hc_byte_perm (w6[3], w6[2], selector); + c0[0] = hc_byte_perm (w6[2], w6[1], selector); + w7[3] = hc_byte_perm (w6[1], w6[0], selector); + w7[2] = hc_byte_perm (w6[0], w5[3], selector); + w7[1] = hc_byte_perm (w5[3], w5[2], selector); + w7[0] = hc_byte_perm (w5[2], w5[1], selector); + w6[3] = hc_byte_perm (w5[1], w5[0], selector); + w6[2] = hc_byte_perm (w5[0], w4[3], selector); + w6[1] = hc_byte_perm (w4[3], w4[2], selector); + w6[0] = hc_byte_perm (w4[2], w4[1], selector); + w5[3] = hc_byte_perm (w4[1], w4[0], selector); + w5[2] = hc_byte_perm (w4[0], w3[3], selector); + w5[1] = hc_byte_perm (w3[3], w3[2], selector); + w5[0] = hc_byte_perm (w3[2], w3[1], selector); + w4[3] = hc_byte_perm (w3[1], w3[0], selector); + w4[2] = hc_byte_perm (w3[0], w2[3], selector); + w4[1] = hc_byte_perm (w2[3], w2[2], selector); + w4[0] = hc_byte_perm (w2[2], w2[1], selector); + w3[3] = hc_byte_perm (w2[1], w2[0], selector); + w3[2] = hc_byte_perm (w2[0], w1[3], selector); + w3[1] = hc_byte_perm (w1[3], w1[2], selector); + w3[0] = hc_byte_perm (w1[2], w1[1], selector); + w2[3] = hc_byte_perm (w1[1], w1[0], selector); + w2[2] = hc_byte_perm (w1[0], w0[3], selector); + w2[1] = hc_byte_perm (w0[3], w0[2], selector); + w2[0] = hc_byte_perm (w0[2], w0[1], selector); + w1[3] = hc_byte_perm (w0[1], w0[0], selector); + w1[2] = hc_byte_perm (w0[0], 0, selector); + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 7: + c1[3] = hc_byte_perm ( 0, w7[3], selector); + c1[2] = hc_byte_perm (w7[3], w7[2], selector); + c1[1] = hc_byte_perm (w7[2], w7[1], selector); + c1[0] = 
hc_byte_perm (w7[1], w7[0], selector); + c0[3] = hc_byte_perm (w7[0], w6[3], selector); + c0[2] = hc_byte_perm (w6[3], w6[2], selector); + c0[1] = hc_byte_perm (w6[2], w6[1], selector); + c0[0] = hc_byte_perm (w6[1], w6[0], selector); + w7[3] = hc_byte_perm (w6[0], w5[3], selector); + w7[2] = hc_byte_perm (w5[3], w5[2], selector); + w7[1] = hc_byte_perm (w5[2], w5[1], selector); + w7[0] = hc_byte_perm (w5[1], w5[0], selector); + w6[3] = hc_byte_perm (w5[0], w4[3], selector); + w6[2] = hc_byte_perm (w4[3], w4[2], selector); + w6[1] = hc_byte_perm (w4[2], w4[1], selector); + w6[0] = hc_byte_perm (w4[1], w4[0], selector); + w5[3] = hc_byte_perm (w4[0], w3[3], selector); + w5[2] = hc_byte_perm (w3[3], w3[2], selector); + w5[1] = hc_byte_perm (w3[2], w3[1], selector); + w5[0] = hc_byte_perm (w3[1], w3[0], selector); + w4[3] = hc_byte_perm (w3[0], w2[3], selector); + w4[2] = hc_byte_perm (w2[3], w2[2], selector); + w4[1] = hc_byte_perm (w2[2], w2[1], selector); + w4[0] = hc_byte_perm (w2[1], w2[0], selector); + w3[3] = hc_byte_perm (w2[0], w1[3], selector); + w3[2] = hc_byte_perm (w1[3], w1[2], selector); + w3[1] = hc_byte_perm (w1[2], w1[1], selector); + w3[0] = hc_byte_perm (w1[1], w1[0], selector); + w2[3] = hc_byte_perm (w1[0], w0[3], selector); + w2[2] = hc_byte_perm (w0[3], w0[2], selector); + w2[1] = hc_byte_perm (w0[2], w0[1], selector); + w2[0] = hc_byte_perm (w0[1], w0[0], selector); + w1[3] = hc_byte_perm (w0[0], 0, selector); + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 8: + c2[0] = hc_byte_perm ( 0, w7[3], selector); + c1[3] = hc_byte_perm (w7[3], w7[2], selector); + c1[2] = hc_byte_perm (w7[2], w7[1], selector); + c1[1] = hc_byte_perm (w7[1], w7[0], selector); + c1[0] = hc_byte_perm (w7[0], w6[3], selector); + c0[3] = hc_byte_perm (w6[3], w6[2], selector); + c0[2] = hc_byte_perm (w6[2], w6[1], selector); + c0[1] = hc_byte_perm (w6[1], w6[0], selector); + c0[0] = hc_byte_perm (w6[0], w5[3], 
selector); + w7[3] = hc_byte_perm (w5[3], w5[2], selector); + w7[2] = hc_byte_perm (w5[2], w5[1], selector); + w7[1] = hc_byte_perm (w5[1], w5[0], selector); + w7[0] = hc_byte_perm (w5[0], w4[3], selector); + w6[3] = hc_byte_perm (w4[3], w4[2], selector); + w6[2] = hc_byte_perm (w4[2], w4[1], selector); + w6[1] = hc_byte_perm (w4[1], w4[0], selector); + w6[0] = hc_byte_perm (w4[0], w3[3], selector); + w5[3] = hc_byte_perm (w3[3], w3[2], selector); + w5[2] = hc_byte_perm (w3[2], w3[1], selector); + w5[1] = hc_byte_perm (w3[1], w3[0], selector); + w5[0] = hc_byte_perm (w3[0], w2[3], selector); + w4[3] = hc_byte_perm (w2[3], w2[2], selector); + w4[2] = hc_byte_perm (w2[2], w2[1], selector); + w4[1] = hc_byte_perm (w2[1], w2[0], selector); + w4[0] = hc_byte_perm (w2[0], w1[3], selector); + w3[3] = hc_byte_perm (w1[3], w1[2], selector); + w3[2] = hc_byte_perm (w1[2], w1[1], selector); + w3[1] = hc_byte_perm (w1[1], w1[0], selector); + w3[0] = hc_byte_perm (w1[0], w0[3], selector); + w2[3] = hc_byte_perm (w0[3], w0[2], selector); + w2[2] = hc_byte_perm (w0[2], w0[1], selector); + w2[1] = hc_byte_perm (w0[1], w0[0], selector); + w2[0] = hc_byte_perm (w0[0], 0, selector); + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 9: + c2[1] = hc_byte_perm ( 0, w7[3], selector); + c2[0] = hc_byte_perm (w7[3], w7[2], selector); + c1[3] = hc_byte_perm (w7[2], w7[1], selector); + c1[2] = hc_byte_perm (w7[1], w7[0], selector); + c1[1] = hc_byte_perm (w7[0], w6[3], selector); + c1[0] = hc_byte_perm (w6[3], w6[2], selector); + c0[3] = hc_byte_perm (w6[2], w6[1], selector); + c0[2] = hc_byte_perm (w6[1], w6[0], selector); + c0[1] = hc_byte_perm (w6[0], w5[3], selector); + c0[0] = hc_byte_perm (w5[3], w5[2], selector); + w7[3] = hc_byte_perm (w5[2], w5[1], selector); + w7[2] = hc_byte_perm (w5[1], w5[0], selector); + w7[1] = hc_byte_perm (w5[0], w4[3], selector); + w7[0] = hc_byte_perm (w4[3], w4[2], selector); + 
w6[3] = hc_byte_perm (w4[2], w4[1], selector); + w6[2] = hc_byte_perm (w4[1], w4[0], selector); + w6[1] = hc_byte_perm (w4[0], w3[3], selector); + w6[0] = hc_byte_perm (w3[3], w3[2], selector); + w5[3] = hc_byte_perm (w3[2], w3[1], selector); + w5[2] = hc_byte_perm (w3[1], w3[0], selector); + w5[1] = hc_byte_perm (w3[0], w2[3], selector); + w5[0] = hc_byte_perm (w2[3], w2[2], selector); + w4[3] = hc_byte_perm (w2[2], w2[1], selector); + w4[2] = hc_byte_perm (w2[1], w2[0], selector); + w4[1] = hc_byte_perm (w2[0], w1[3], selector); + w4[0] = hc_byte_perm (w1[3], w1[2], selector); + w3[3] = hc_byte_perm (w1[2], w1[1], selector); + w3[2] = hc_byte_perm (w1[1], w1[0], selector); + w3[1] = hc_byte_perm (w1[0], w0[3], selector); + w3[0] = hc_byte_perm (w0[3], w0[2], selector); + w2[3] = hc_byte_perm (w0[2], w0[1], selector); + w2[2] = hc_byte_perm (w0[1], w0[0], selector); + w2[1] = hc_byte_perm (w0[0], 0, selector); + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 10: + c2[2] = hc_byte_perm ( 0, w7[3], selector); + c2[1] = hc_byte_perm (w7[3], w7[2], selector); + c2[0] = hc_byte_perm (w7[2], w7[1], selector); + c1[3] = hc_byte_perm (w7[1], w7[0], selector); + c1[2] = hc_byte_perm (w7[0], w6[3], selector); + c1[1] = hc_byte_perm (w6[3], w6[2], selector); + c1[0] = hc_byte_perm (w6[2], w6[1], selector); + c0[3] = hc_byte_perm (w6[1], w6[0], selector); + c0[2] = hc_byte_perm (w6[0], w5[3], selector); + c0[1] = hc_byte_perm (w5[3], w5[2], selector); + c0[0] = hc_byte_perm (w5[2], w5[1], selector); + w7[3] = hc_byte_perm (w5[1], w5[0], selector); + w7[2] = hc_byte_perm (w5[0], w4[3], selector); + w7[1] = hc_byte_perm (w4[3], w4[2], selector); + w7[0] = hc_byte_perm (w4[2], w4[1], selector); + w6[3] = hc_byte_perm (w4[1], w4[0], selector); + w6[2] = hc_byte_perm (w4[0], w3[3], selector); + w6[1] = hc_byte_perm (w3[3], w3[2], selector); + w6[0] = hc_byte_perm (w3[2], w3[1], selector); + 
w5[3] = hc_byte_perm (w3[1], w3[0], selector); + w5[2] = hc_byte_perm (w3[0], w2[3], selector); + w5[1] = hc_byte_perm (w2[3], w2[2], selector); + w5[0] = hc_byte_perm (w2[2], w2[1], selector); + w4[3] = hc_byte_perm (w2[1], w2[0], selector); + w4[2] = hc_byte_perm (w2[0], w1[3], selector); + w4[1] = hc_byte_perm (w1[3], w1[2], selector); + w4[0] = hc_byte_perm (w1[2], w1[1], selector); + w3[3] = hc_byte_perm (w1[1], w1[0], selector); + w3[2] = hc_byte_perm (w1[0], w0[3], selector); + w3[1] = hc_byte_perm (w0[3], w0[2], selector); + w3[0] = hc_byte_perm (w0[2], w0[1], selector); + w2[3] = hc_byte_perm (w0[1], w0[0], selector); + w2[2] = hc_byte_perm (w0[0], 0, selector); + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 11: + c2[3] = hc_byte_perm ( 0, w7[3], selector); + c2[2] = hc_byte_perm (w7[3], w7[2], selector); + c2[1] = hc_byte_perm (w7[2], w7[1], selector); + c2[0] = hc_byte_perm (w7[1], w7[0], selector); + c1[3] = hc_byte_perm (w7[0], w6[3], selector); + c1[2] = hc_byte_perm (w6[3], w6[2], selector); + c1[1] = hc_byte_perm (w6[2], w6[1], selector); + c1[0] = hc_byte_perm (w6[1], w6[0], selector); + c0[3] = hc_byte_perm (w6[0], w5[3], selector); + c0[2] = hc_byte_perm (w5[3], w5[2], selector); + c0[1] = hc_byte_perm (w5[2], w5[1], selector); + c0[0] = hc_byte_perm (w5[1], w5[0], selector); + w7[3] = hc_byte_perm (w5[0], w4[3], selector); + w7[2] = hc_byte_perm (w4[3], w4[2], selector); + w7[1] = hc_byte_perm (w4[2], w4[1], selector); + w7[0] = hc_byte_perm (w4[1], w4[0], selector); + w6[3] = hc_byte_perm (w4[0], w3[3], selector); + w6[2] = hc_byte_perm (w3[3], w3[2], selector); + w6[1] = hc_byte_perm (w3[2], w3[1], selector); + w6[0] = hc_byte_perm (w3[1], w3[0], selector); + w5[3] = hc_byte_perm (w3[0], w2[3], selector); + w5[2] = hc_byte_perm (w2[3], w2[2], selector); + w5[1] = hc_byte_perm (w2[2], w2[1], selector); + w5[0] = hc_byte_perm (w2[1], w2[0], 
selector); + w4[3] = hc_byte_perm (w2[0], w1[3], selector); + w4[2] = hc_byte_perm (w1[3], w1[2], selector); + w4[1] = hc_byte_perm (w1[2], w1[1], selector); + w4[0] = hc_byte_perm (w1[1], w1[0], selector); + w3[3] = hc_byte_perm (w1[0], w0[3], selector); + w3[2] = hc_byte_perm (w0[3], w0[2], selector); + w3[1] = hc_byte_perm (w0[2], w0[1], selector); + w3[0] = hc_byte_perm (w0[1], w0[0], selector); + w2[3] = hc_byte_perm (w0[0], 0, selector); + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 12: + c3[0] = hc_byte_perm ( 0, w7[3], selector); + c2[3] = hc_byte_perm (w7[3], w7[2], selector); + c2[2] = hc_byte_perm (w7[2], w7[1], selector); + c2[1] = hc_byte_perm (w7[1], w7[0], selector); + c2[0] = hc_byte_perm (w7[0], w6[3], selector); + c1[3] = hc_byte_perm (w6[3], w6[2], selector); + c1[2] = hc_byte_perm (w6[2], w6[1], selector); + c1[1] = hc_byte_perm (w6[1], w6[0], selector); + c1[0] = hc_byte_perm (w6[0], w5[3], selector); + c0[3] = hc_byte_perm (w5[3], w5[2], selector); + c0[2] = hc_byte_perm (w5[2], w5[1], selector); + c0[1] = hc_byte_perm (w5[1], w5[0], selector); + c0[0] = hc_byte_perm (w5[0], w4[3], selector); + w7[3] = hc_byte_perm (w4[3], w4[2], selector); + w7[2] = hc_byte_perm (w4[2], w4[1], selector); + w7[1] = hc_byte_perm (w4[1], w4[0], selector); + w7[0] = hc_byte_perm (w4[0], w3[3], selector); + w6[3] = hc_byte_perm (w3[3], w3[2], selector); + w6[2] = hc_byte_perm (w3[2], w3[1], selector); + w6[1] = hc_byte_perm (w3[1], w3[0], selector); + w6[0] = hc_byte_perm (w3[0], w2[3], selector); + w5[3] = hc_byte_perm (w2[3], w2[2], selector); + w5[2] = hc_byte_perm (w2[2], w2[1], selector); + w5[1] = hc_byte_perm (w2[1], w2[0], selector); + w5[0] = hc_byte_perm (w2[0], w1[3], selector); + w4[3] = hc_byte_perm (w1[3], w1[2], selector); + w4[2] = hc_byte_perm (w1[2], w1[1], selector); + w4[1] = hc_byte_perm (w1[1], w1[0], selector); + w4[0] = 
hc_byte_perm (w1[0], w0[3], selector); + w3[3] = hc_byte_perm (w0[3], w0[2], selector); + w3[2] = hc_byte_perm (w0[2], w0[1], selector); + w3[1] = hc_byte_perm (w0[1], w0[0], selector); + w3[0] = hc_byte_perm (w0[0], 0, selector); + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 13: + c3[1] = hc_byte_perm ( 0, w7[3], selector); + c3[0] = hc_byte_perm (w7[3], w7[2], selector); + c2[3] = hc_byte_perm (w7[2], w7[1], selector); + c2[2] = hc_byte_perm (w7[1], w7[0], selector); + c2[1] = hc_byte_perm (w7[0], w6[3], selector); + c2[0] = hc_byte_perm (w6[3], w6[2], selector); + c1[3] = hc_byte_perm (w6[2], w6[1], selector); + c1[2] = hc_byte_perm (w6[1], w6[0], selector); + c1[1] = hc_byte_perm (w6[0], w5[3], selector); + c1[0] = hc_byte_perm (w5[3], w5[2], selector); + c0[3] = hc_byte_perm (w5[2], w5[1], selector); + c0[2] = hc_byte_perm (w5[1], w5[0], selector); + c0[1] = hc_byte_perm (w5[0], w4[3], selector); + c0[0] = hc_byte_perm (w4[3], w4[2], selector); + w7[3] = hc_byte_perm (w4[2], w4[1], selector); + w7[2] = hc_byte_perm (w4[1], w4[0], selector); + w7[1] = hc_byte_perm (w4[0], w3[3], selector); + w7[0] = hc_byte_perm (w3[3], w3[2], selector); + w6[3] = hc_byte_perm (w3[2], w3[1], selector); + w6[2] = hc_byte_perm (w3[1], w3[0], selector); + w6[1] = hc_byte_perm (w3[0], w2[3], selector); + w6[0] = hc_byte_perm (w2[3], w2[2], selector); + w5[3] = hc_byte_perm (w2[2], w2[1], selector); + w5[2] = hc_byte_perm (w2[1], w2[0], selector); + w5[1] = hc_byte_perm (w2[0], w1[3], selector); + w5[0] = hc_byte_perm (w1[3], w1[2], selector); + w4[3] = hc_byte_perm (w1[2], w1[1], selector); + w4[2] = hc_byte_perm (w1[1], w1[0], selector); + w4[1] = hc_byte_perm (w1[0], w0[3], selector); + w4[0] = hc_byte_perm (w0[3], w0[2], selector); + w3[3] = hc_byte_perm (w0[2], w0[1], selector); + w3[2] = hc_byte_perm (w0[1], w0[0], selector); + w3[1] = hc_byte_perm 
(w0[0], 0, selector); + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 14: + c3[2] = hc_byte_perm ( 0, w7[3], selector); + c3[1] = hc_byte_perm (w7[3], w7[2], selector); + c3[0] = hc_byte_perm (w7[2], w7[1], selector); + c2[3] = hc_byte_perm (w7[1], w7[0], selector); + c2[2] = hc_byte_perm (w7[0], w6[3], selector); + c2[1] = hc_byte_perm (w6[3], w6[2], selector); + c2[0] = hc_byte_perm (w6[2], w6[1], selector); + c1[3] = hc_byte_perm (w6[1], w6[0], selector); + c1[2] = hc_byte_perm (w6[0], w5[3], selector); + c1[1] = hc_byte_perm (w5[3], w5[2], selector); + c1[0] = hc_byte_perm (w5[2], w5[1], selector); + c0[3] = hc_byte_perm (w5[1], w5[0], selector); + c0[2] = hc_byte_perm (w5[0], w4[3], selector); + c0[1] = hc_byte_perm (w4[3], w4[2], selector); + c0[0] = hc_byte_perm (w4[2], w4[1], selector); + w7[3] = hc_byte_perm (w4[1], w4[0], selector); + w7[2] = hc_byte_perm (w4[0], w3[3], selector); + w7[1] = hc_byte_perm (w3[3], w3[2], selector); + w7[0] = hc_byte_perm (w3[2], w3[1], selector); + w6[3] = hc_byte_perm (w3[1], w3[0], selector); + w6[2] = hc_byte_perm (w3[0], w2[3], selector); + w6[1] = hc_byte_perm (w2[3], w2[2], selector); + w6[0] = hc_byte_perm (w2[2], w2[1], selector); + w5[3] = hc_byte_perm (w2[1], w2[0], selector); + w5[2] = hc_byte_perm (w2[0], w1[3], selector); + w5[1] = hc_byte_perm (w1[3], w1[2], selector); + w5[0] = hc_byte_perm (w1[2], w1[1], selector); + w4[3] = hc_byte_perm (w1[1], w1[0], selector); + w4[2] = hc_byte_perm (w1[0], w0[3], selector); + w4[1] = hc_byte_perm (w0[3], w0[2], selector); + w4[0] = hc_byte_perm (w0[2], w0[1], selector); + w3[3] = hc_byte_perm (w0[1], w0[0], selector); + w3[2] = hc_byte_perm (w0[0], 0, selector); + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + 
w0[0] = 0; + + break; + + case 15: + c3[3] = hc_byte_perm ( 0, w7[3], selector); + c3[2] = hc_byte_perm (w7[3], w7[2], selector); + c3[1] = hc_byte_perm (w7[2], w7[1], selector); + c3[0] = hc_byte_perm (w7[1], w7[0], selector); + c2[3] = hc_byte_perm (w7[0], w6[3], selector); + c2[2] = hc_byte_perm (w6[3], w6[2], selector); + c2[1] = hc_byte_perm (w6[2], w6[1], selector); + c2[0] = hc_byte_perm (w6[1], w6[0], selector); + c1[3] = hc_byte_perm (w6[0], w5[3], selector); + c1[2] = hc_byte_perm (w5[3], w5[2], selector); + c1[1] = hc_byte_perm (w5[2], w5[1], selector); + c1[0] = hc_byte_perm (w5[1], w5[0], selector); + c0[3] = hc_byte_perm (w5[0], w4[3], selector); + c0[2] = hc_byte_perm (w4[3], w4[2], selector); + c0[1] = hc_byte_perm (w4[2], w4[1], selector); + c0[0] = hc_byte_perm (w4[1], w4[0], selector); + w7[3] = hc_byte_perm (w4[0], w3[3], selector); + w7[2] = hc_byte_perm (w3[3], w3[2], selector); + w7[1] = hc_byte_perm (w3[2], w3[1], selector); + w7[0] = hc_byte_perm (w3[1], w3[0], selector); + w6[3] = hc_byte_perm (w3[0], w2[3], selector); + w6[2] = hc_byte_perm (w2[3], w2[2], selector); + w6[1] = hc_byte_perm (w2[2], w2[1], selector); + w6[0] = hc_byte_perm (w2[1], w2[0], selector); + w5[3] = hc_byte_perm (w2[0], w1[3], selector); + w5[2] = hc_byte_perm (w1[3], w1[2], selector); + w5[1] = hc_byte_perm (w1[2], w1[1], selector); + w5[0] = hc_byte_perm (w1[1], w1[0], selector); + w4[3] = hc_byte_perm (w1[0], w0[3], selector); + w4[2] = hc_byte_perm (w0[3], w0[2], selector); + w4[1] = hc_byte_perm (w0[2], w0[1], selector); + w4[0] = hc_byte_perm (w0[1], w0[0], selector); + w3[3] = hc_byte_perm (w0[0], 0, selector); + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 16: + c4[0] = hc_byte_perm ( 0, w7[3], selector); + c3[3] = hc_byte_perm (w7[3], w7[2], selector); + c3[2] = hc_byte_perm (w7[2], w7[1], 
selector); + c3[1] = hc_byte_perm (w7[1], w7[0], selector); + c3[0] = hc_byte_perm (w7[0], w6[3], selector); + c2[3] = hc_byte_perm (w6[3], w6[2], selector); + c2[2] = hc_byte_perm (w6[2], w6[1], selector); + c2[1] = hc_byte_perm (w6[1], w6[0], selector); + c2[0] = hc_byte_perm (w6[0], w5[3], selector); + c1[3] = hc_byte_perm (w5[3], w5[2], selector); + c1[2] = hc_byte_perm (w5[2], w5[1], selector); + c1[1] = hc_byte_perm (w5[1], w5[0], selector); + c1[0] = hc_byte_perm (w5[0], w4[3], selector); + c0[3] = hc_byte_perm (w4[3], w4[2], selector); + c0[2] = hc_byte_perm (w4[2], w4[1], selector); + c0[1] = hc_byte_perm (w4[1], w4[0], selector); + c0[0] = hc_byte_perm (w4[0], w3[3], selector); + w7[3] = hc_byte_perm (w3[3], w3[2], selector); + w7[2] = hc_byte_perm (w3[2], w3[1], selector); + w7[1] = hc_byte_perm (w3[1], w3[0], selector); + w7[0] = hc_byte_perm (w3[0], w2[3], selector); + w6[3] = hc_byte_perm (w2[3], w2[2], selector); + w6[2] = hc_byte_perm (w2[2], w2[1], selector); + w6[1] = hc_byte_perm (w2[1], w2[0], selector); + w6[0] = hc_byte_perm (w2[0], w1[3], selector); + w5[3] = hc_byte_perm (w1[3], w1[2], selector); + w5[2] = hc_byte_perm (w1[2], w1[1], selector); + w5[1] = hc_byte_perm (w1[1], w1[0], selector); + w5[0] = hc_byte_perm (w1[0], w0[3], selector); + w4[3] = hc_byte_perm (w0[3], w0[2], selector); + w4[2] = hc_byte_perm (w0[2], w0[1], selector); + w4[1] = hc_byte_perm (w0[1], w0[0], selector); + w4[0] = hc_byte_perm (w0[0], 0, selector); + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 17: + c4[1] = hc_byte_perm ( 0, w7[3], selector); + c4[0] = hc_byte_perm (w7[3], w7[2], selector); + c3[3] = hc_byte_perm (w7[2], w7[1], selector); + c3[2] = hc_byte_perm (w7[1], w7[0], selector); + c3[1] = hc_byte_perm (w7[0], w6[3], selector); + c3[0] = hc_byte_perm (w6[3], w6[2], selector); 
+ c2[3] = hc_byte_perm (w6[2], w6[1], selector); + c2[2] = hc_byte_perm (w6[1], w6[0], selector); + c2[1] = hc_byte_perm (w6[0], w5[3], selector); + c2[0] = hc_byte_perm (w5[3], w5[2], selector); + c1[3] = hc_byte_perm (w5[2], w5[1], selector); + c1[2] = hc_byte_perm (w5[1], w5[0], selector); + c1[1] = hc_byte_perm (w5[0], w4[3], selector); + c1[0] = hc_byte_perm (w4[3], w4[2], selector); + c0[3] = hc_byte_perm (w4[2], w4[1], selector); + c0[2] = hc_byte_perm (w4[1], w4[0], selector); + c0[1] = hc_byte_perm (w4[0], w3[3], selector); + c0[0] = hc_byte_perm (w3[3], w3[2], selector); + w7[3] = hc_byte_perm (w3[2], w3[1], selector); + w7[2] = hc_byte_perm (w3[1], w3[0], selector); + w7[1] = hc_byte_perm (w3[0], w2[3], selector); + w7[0] = hc_byte_perm (w2[3], w2[2], selector); + w6[3] = hc_byte_perm (w2[2], w2[1], selector); + w6[2] = hc_byte_perm (w2[1], w2[0], selector); + w6[1] = hc_byte_perm (w2[0], w1[3], selector); + w6[0] = hc_byte_perm (w1[3], w1[2], selector); + w5[3] = hc_byte_perm (w1[2], w1[1], selector); + w5[2] = hc_byte_perm (w1[1], w1[0], selector); + w5[1] = hc_byte_perm (w1[0], w0[3], selector); + w5[0] = hc_byte_perm (w0[3], w0[2], selector); + w4[3] = hc_byte_perm (w0[2], w0[1], selector); + w4[2] = hc_byte_perm (w0[1], w0[0], selector); + w4[1] = hc_byte_perm (w0[0], 0, selector); + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 18: + c4[2] = hc_byte_perm ( 0, w7[3], selector); + c4[1] = hc_byte_perm (w7[3], w7[2], selector); + c4[0] = hc_byte_perm (w7[2], w7[1], selector); + c3[3] = hc_byte_perm (w7[1], w7[0], selector); + c3[2] = hc_byte_perm (w7[0], w6[3], selector); + c3[1] = hc_byte_perm (w6[3], w6[2], selector); + c3[0] = hc_byte_perm (w6[2], w6[1], selector); + c2[3] = hc_byte_perm (w6[1], w6[0], selector); + c2[2] = hc_byte_perm (w6[0], w5[3], 
selector); + c2[1] = hc_byte_perm (w5[3], w5[2], selector); + c2[0] = hc_byte_perm (w5[2], w5[1], selector); + c1[3] = hc_byte_perm (w5[1], w5[0], selector); + c1[2] = hc_byte_perm (w5[0], w4[3], selector); + c1[1] = hc_byte_perm (w4[3], w4[2], selector); + c1[0] = hc_byte_perm (w4[2], w4[1], selector); + c0[3] = hc_byte_perm (w4[1], w4[0], selector); + c0[2] = hc_byte_perm (w4[0], w3[3], selector); + c0[1] = hc_byte_perm (w3[3], w3[2], selector); + c0[0] = hc_byte_perm (w3[2], w3[1], selector); + w7[3] = hc_byte_perm (w3[1], w3[0], selector); + w7[2] = hc_byte_perm (w3[0], w2[3], selector); + w7[1] = hc_byte_perm (w2[3], w2[2], selector); + w7[0] = hc_byte_perm (w2[2], w2[1], selector); + w6[3] = hc_byte_perm (w2[1], w2[0], selector); + w6[2] = hc_byte_perm (w2[0], w1[3], selector); + w6[1] = hc_byte_perm (w1[3], w1[2], selector); + w6[0] = hc_byte_perm (w1[2], w1[1], selector); + w5[3] = hc_byte_perm (w1[1], w1[0], selector); + w5[2] = hc_byte_perm (w1[0], w0[3], selector); + w5[1] = hc_byte_perm (w0[3], w0[2], selector); + w5[0] = hc_byte_perm (w0[2], w0[1], selector); + w4[3] = hc_byte_perm (w0[1], w0[0], selector); + w4[2] = hc_byte_perm (w0[0], 0, selector); + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 19: + c4[3] = hc_byte_perm ( 0, w7[3], selector); + c4[2] = hc_byte_perm (w7[3], w7[2], selector); + c4[1] = hc_byte_perm (w7[2], w7[1], selector); + c4[0] = hc_byte_perm (w7[1], w7[0], selector); + c3[3] = hc_byte_perm (w7[0], w6[3], selector); + c3[2] = hc_byte_perm (w6[3], w6[2], selector); + c3[1] = hc_byte_perm (w6[2], w6[1], selector); + c3[0] = hc_byte_perm (w6[1], w6[0], selector); + c2[3] = hc_byte_perm (w6[0], w5[3], selector); + c2[2] = hc_byte_perm (w5[3], w5[2], selector); + c2[1] = hc_byte_perm (w5[2], w5[1], selector); + c2[0] = hc_byte_perm 
(w5[1], w5[0], selector); + c1[3] = hc_byte_perm (w5[0], w4[3], selector); + c1[2] = hc_byte_perm (w4[3], w4[2], selector); + c1[1] = hc_byte_perm (w4[2], w4[1], selector); + c1[0] = hc_byte_perm (w4[1], w4[0], selector); + c0[3] = hc_byte_perm (w4[0], w3[3], selector); + c0[2] = hc_byte_perm (w3[3], w3[2], selector); + c0[1] = hc_byte_perm (w3[2], w3[1], selector); + c0[0] = hc_byte_perm (w3[1], w3[0], selector); + w7[3] = hc_byte_perm (w3[0], w2[3], selector); + w7[2] = hc_byte_perm (w2[3], w2[2], selector); + w7[1] = hc_byte_perm (w2[2], w2[1], selector); + w7[0] = hc_byte_perm (w2[1], w2[0], selector); + w6[3] = hc_byte_perm (w2[0], w1[3], selector); + w6[2] = hc_byte_perm (w1[3], w1[2], selector); + w6[1] = hc_byte_perm (w1[2], w1[1], selector); + w6[0] = hc_byte_perm (w1[1], w1[0], selector); + w5[3] = hc_byte_perm (w1[0], w0[3], selector); + w5[2] = hc_byte_perm (w0[3], w0[2], selector); + w5[1] = hc_byte_perm (w0[2], w0[1], selector); + w5[0] = hc_byte_perm (w0[1], w0[0], selector); + w4[3] = hc_byte_perm (w0[0], 0, selector); + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 20: + c5[0] = hc_byte_perm ( 0, w7[3], selector); + c4[3] = hc_byte_perm (w7[3], w7[2], selector); + c4[2] = hc_byte_perm (w7[2], w7[1], selector); + c4[1] = hc_byte_perm (w7[1], w7[0], selector); + c4[0] = hc_byte_perm (w7[0], w6[3], selector); + c3[3] = hc_byte_perm (w6[3], w6[2], selector); + c3[2] = hc_byte_perm (w6[2], w6[1], selector); + c3[1] = hc_byte_perm (w6[1], w6[0], selector); + c3[0] = hc_byte_perm (w6[0], w5[3], selector); + c2[3] = hc_byte_perm (w5[3], w5[2], selector); + c2[2] = hc_byte_perm (w5[2], w5[1], selector); + c2[1] = hc_byte_perm (w5[1], w5[0], selector); + c2[0] = hc_byte_perm (w5[0], w4[3], selector); + c1[3] = hc_byte_perm (w4[3], w4[2], 
selector); + c1[2] = hc_byte_perm (w4[2], w4[1], selector); + c1[1] = hc_byte_perm (w4[1], w4[0], selector); + c1[0] = hc_byte_perm (w4[0], w3[3], selector); + c0[3] = hc_byte_perm (w3[3], w3[2], selector); + c0[2] = hc_byte_perm (w3[2], w3[1], selector); + c0[1] = hc_byte_perm (w3[1], w3[0], selector); + c0[0] = hc_byte_perm (w3[0], w2[3], selector); + w7[3] = hc_byte_perm (w2[3], w2[2], selector); + w7[2] = hc_byte_perm (w2[2], w2[1], selector); + w7[1] = hc_byte_perm (w2[1], w2[0], selector); + w7[0] = hc_byte_perm (w2[0], w1[3], selector); + w6[3] = hc_byte_perm (w1[3], w1[2], selector); + w6[2] = hc_byte_perm (w1[2], w1[1], selector); + w6[1] = hc_byte_perm (w1[1], w1[0], selector); + w6[0] = hc_byte_perm (w1[0], w0[3], selector); + w5[3] = hc_byte_perm (w0[3], w0[2], selector); + w5[2] = hc_byte_perm (w0[2], w0[1], selector); + w5[1] = hc_byte_perm (w0[1], w0[0], selector); + w5[0] = hc_byte_perm (w0[0], 0, selector); + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 21: + c5[1] = hc_byte_perm ( 0, w7[3], selector); + c5[0] = hc_byte_perm (w7[3], w7[2], selector); + c4[3] = hc_byte_perm (w7[2], w7[1], selector); + c4[2] = hc_byte_perm (w7[1], w7[0], selector); + c4[1] = hc_byte_perm (w7[0], w6[3], selector); + c4[0] = hc_byte_perm (w6[3], w6[2], selector); + c3[3] = hc_byte_perm (w6[2], w6[1], selector); + c3[2] = hc_byte_perm (w6[1], w6[0], selector); + c3[1] = hc_byte_perm (w6[0], w5[3], selector); + c3[0] = hc_byte_perm (w5[3], w5[2], selector); + c2[3] = hc_byte_perm (w5[2], w5[1], selector); + c2[2] = hc_byte_perm (w5[1], w5[0], selector); + c2[1] = hc_byte_perm (w5[0], w4[3], selector); + c2[0] = hc_byte_perm (w4[3], w4[2], selector); + c1[3] = hc_byte_perm (w4[2], w4[1], selector); + c1[2] = hc_byte_perm (w4[1], w4[0], 
selector); + c1[1] = hc_byte_perm (w4[0], w3[3], selector); + c1[0] = hc_byte_perm (w3[3], w3[2], selector); + c0[3] = hc_byte_perm (w3[2], w3[1], selector); + c0[2] = hc_byte_perm (w3[1], w3[0], selector); + c0[1] = hc_byte_perm (w3[0], w2[3], selector); + c0[0] = hc_byte_perm (w2[3], w2[2], selector); + w7[3] = hc_byte_perm (w2[2], w2[1], selector); + w7[2] = hc_byte_perm (w2[1], w2[0], selector); + w7[1] = hc_byte_perm (w2[0], w1[3], selector); + w7[0] = hc_byte_perm (w1[3], w1[2], selector); + w6[3] = hc_byte_perm (w1[2], w1[1], selector); + w6[2] = hc_byte_perm (w1[1], w1[0], selector); + w6[1] = hc_byte_perm (w1[0], w0[3], selector); + w6[0] = hc_byte_perm (w0[3], w0[2], selector); + w5[3] = hc_byte_perm (w0[2], w0[1], selector); + w5[2] = hc_byte_perm (w0[1], w0[0], selector); + w5[1] = hc_byte_perm (w0[0], 0, selector); + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 22: + c5[2] = hc_byte_perm ( 0, w7[3], selector); + c5[1] = hc_byte_perm (w7[3], w7[2], selector); + c5[0] = hc_byte_perm (w7[2], w7[1], selector); + c4[3] = hc_byte_perm (w7[1], w7[0], selector); + c4[2] = hc_byte_perm (w7[0], w6[3], selector); + c4[1] = hc_byte_perm (w6[3], w6[2], selector); + c4[0] = hc_byte_perm (w6[2], w6[1], selector); + c3[3] = hc_byte_perm (w6[1], w6[0], selector); + c3[2] = hc_byte_perm (w6[0], w5[3], selector); + c3[1] = hc_byte_perm (w5[3], w5[2], selector); + c3[0] = hc_byte_perm (w5[2], w5[1], selector); + c2[3] = hc_byte_perm (w5[1], w5[0], selector); + c2[2] = hc_byte_perm (w5[0], w4[3], selector); + c2[1] = hc_byte_perm (w4[3], w4[2], selector); + c2[0] = hc_byte_perm (w4[2], w4[1], selector); + c1[3] = hc_byte_perm (w4[1], w4[0], selector); + c1[2] = hc_byte_perm (w4[0], w3[3], selector); + c1[1] = hc_byte_perm (w3[3], 
w3[2], selector); + c1[0] = hc_byte_perm (w3[2], w3[1], selector); + c0[3] = hc_byte_perm (w3[1], w3[0], selector); + c0[2] = hc_byte_perm (w3[0], w2[3], selector); + c0[1] = hc_byte_perm (w2[3], w2[2], selector); + c0[0] = hc_byte_perm (w2[2], w2[1], selector); + w7[3] = hc_byte_perm (w2[1], w2[0], selector); + w7[2] = hc_byte_perm (w2[0], w1[3], selector); + w7[1] = hc_byte_perm (w1[3], w1[2], selector); + w7[0] = hc_byte_perm (w1[2], w1[1], selector); + w6[3] = hc_byte_perm (w1[1], w1[0], selector); + w6[2] = hc_byte_perm (w1[0], w0[3], selector); + w6[1] = hc_byte_perm (w0[3], w0[2], selector); + w6[0] = hc_byte_perm (w0[2], w0[1], selector); + w5[3] = hc_byte_perm (w0[1], w0[0], selector); + w5[2] = hc_byte_perm (w0[0], 0, selector); + w5[1] = 0; + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 23: + c5[3] = hc_byte_perm ( 0, w7[3], selector); + c5[2] = hc_byte_perm (w7[3], w7[2], selector); + c5[1] = hc_byte_perm (w7[2], w7[1], selector); + c5[0] = hc_byte_perm (w7[1], w7[0], selector); + c4[3] = hc_byte_perm (w7[0], w6[3], selector); + c4[2] = hc_byte_perm (w6[3], w6[2], selector); + c4[1] = hc_byte_perm (w6[2], w6[1], selector); + c4[0] = hc_byte_perm (w6[1], w6[0], selector); + c3[3] = hc_byte_perm (w6[0], w5[3], selector); + c3[2] = hc_byte_perm (w5[3], w5[2], selector); + c3[1] = hc_byte_perm (w5[2], w5[1], selector); + c3[0] = hc_byte_perm (w5[1], w5[0], selector); + c2[3] = hc_byte_perm (w5[0], w4[3], selector); + c2[2] = hc_byte_perm (w4[3], w4[2], selector); + c2[1] = hc_byte_perm (w4[2], w4[1], selector); + c2[0] = hc_byte_perm (w4[1], w4[0], selector); + c1[3] = hc_byte_perm (w4[0], w3[3], selector); + c1[2] = hc_byte_perm (w3[3], w3[2], selector); + c1[1] = hc_byte_perm (w3[2], w3[1], selector); + c1[0] = 
hc_byte_perm (w3[1], w3[0], selector); + c0[3] = hc_byte_perm (w3[0], w2[3], selector); + c0[2] = hc_byte_perm (w2[3], w2[2], selector); + c0[1] = hc_byte_perm (w2[2], w2[1], selector); + c0[0] = hc_byte_perm (w2[1], w2[0], selector); + w7[3] = hc_byte_perm (w2[0], w1[3], selector); + w7[2] = hc_byte_perm (w1[3], w1[2], selector); + w7[1] = hc_byte_perm (w1[2], w1[1], selector); + w7[0] = hc_byte_perm (w1[1], w1[0], selector); + w6[3] = hc_byte_perm (w1[0], w0[3], selector); + w6[2] = hc_byte_perm (w0[3], w0[2], selector); + w6[1] = hc_byte_perm (w0[2], w0[1], selector); + w6[0] = hc_byte_perm (w0[1], w0[0], selector); + w5[3] = hc_byte_perm (w0[0], 0, selector); + w5[2] = 0; + w5[1] = 0; + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 24: + c6[0] = hc_byte_perm ( 0, w7[3], selector); + c5[3] = hc_byte_perm (w7[3], w7[2], selector); + c5[2] = hc_byte_perm (w7[2], w7[1], selector); + c5[1] = hc_byte_perm (w7[1], w7[0], selector); + c5[0] = hc_byte_perm (w7[0], w6[3], selector); + c4[3] = hc_byte_perm (w6[3], w6[2], selector); + c4[2] = hc_byte_perm (w6[2], w6[1], selector); + c4[1] = hc_byte_perm (w6[1], w6[0], selector); + c4[0] = hc_byte_perm (w6[0], w5[3], selector); + c3[3] = hc_byte_perm (w5[3], w5[2], selector); + c3[2] = hc_byte_perm (w5[2], w5[1], selector); + c3[1] = hc_byte_perm (w5[1], w5[0], selector); + c3[0] = hc_byte_perm (w5[0], w4[3], selector); + c2[3] = hc_byte_perm (w4[3], w4[2], selector); + c2[2] = hc_byte_perm (w4[2], w4[1], selector); + c2[1] = hc_byte_perm (w4[1], w4[0], selector); + c2[0] = hc_byte_perm (w4[0], w3[3], selector); + c1[3] = hc_byte_perm (w3[3], w3[2], selector); + c1[2] = hc_byte_perm (w3[2], w3[1], selector); + c1[1] = hc_byte_perm (w3[1], w3[0], selector); + c1[0] = hc_byte_perm 
(w3[0], w2[3], selector); + c0[3] = hc_byte_perm (w2[3], w2[2], selector); + c0[2] = hc_byte_perm (w2[2], w2[1], selector); + c0[1] = hc_byte_perm (w2[1], w2[0], selector); + c0[0] = hc_byte_perm (w2[0], w1[3], selector); + w7[3] = hc_byte_perm (w1[3], w1[2], selector); + w7[2] = hc_byte_perm (w1[2], w1[1], selector); + w7[1] = hc_byte_perm (w1[1], w1[0], selector); + w7[0] = hc_byte_perm (w1[0], w0[3], selector); + w6[3] = hc_byte_perm (w0[3], w0[2], selector); + w6[2] = hc_byte_perm (w0[2], w0[1], selector); + w6[1] = hc_byte_perm (w0[1], w0[0], selector); + w6[0] = hc_byte_perm (w0[0], 0, selector); + w5[3] = 0; + w5[2] = 0; + w5[1] = 0; + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 25: + c6[1] = hc_byte_perm ( 0, w7[3], selector); + c6[0] = hc_byte_perm (w7[3], w7[2], selector); + c5[3] = hc_byte_perm (w7[2], w7[1], selector); + c5[2] = hc_byte_perm (w7[1], w7[0], selector); + c5[1] = hc_byte_perm (w7[0], w6[3], selector); + c5[0] = hc_byte_perm (w6[3], w6[2], selector); + c4[3] = hc_byte_perm (w6[2], w6[1], selector); + c4[2] = hc_byte_perm (w6[1], w6[0], selector); + c4[1] = hc_byte_perm (w6[0], w5[3], selector); + c4[0] = hc_byte_perm (w5[3], w5[2], selector); + c3[3] = hc_byte_perm (w5[2], w5[1], selector); + c3[2] = hc_byte_perm (w5[1], w5[0], selector); + c3[1] = hc_byte_perm (w5[0], w4[3], selector); + c3[0] = hc_byte_perm (w4[3], w4[2], selector); + c2[3] = hc_byte_perm (w4[2], w4[1], selector); + c2[2] = hc_byte_perm (w4[1], w4[0], selector); + c2[1] = hc_byte_perm (w4[0], w3[3], selector); + c2[0] = hc_byte_perm (w3[3], w3[2], selector); + c1[3] = hc_byte_perm (w3[2], w3[1], selector); + c1[2] = hc_byte_perm (w3[1], w3[0], selector); + c1[1] = hc_byte_perm (w3[0], w2[3], selector); + c1[0] = hc_byte_perm 
(w2[3], w2[2], selector); + c0[3] = hc_byte_perm (w2[2], w2[1], selector); + c0[2] = hc_byte_perm (w2[1], w2[0], selector); + c0[1] = hc_byte_perm (w2[0], w1[3], selector); + c0[0] = hc_byte_perm (w1[3], w1[2], selector); + w7[3] = hc_byte_perm (w1[2], w1[1], selector); + w7[2] = hc_byte_perm (w1[1], w1[0], selector); + w7[1] = hc_byte_perm (w1[0], w0[3], selector); + w7[0] = hc_byte_perm (w0[3], w0[2], selector); + w6[3] = hc_byte_perm (w0[2], w0[1], selector); + w6[2] = hc_byte_perm (w0[1], w0[0], selector); + w6[1] = hc_byte_perm (w0[0], 0, selector); + w6[0] = 0; + w5[3] = 0; + w5[2] = 0; + w5[1] = 0; + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 26: + c6[2] = hc_byte_perm ( 0, w7[3], selector); + c6[1] = hc_byte_perm (w7[3], w7[2], selector); + c6[0] = hc_byte_perm (w7[2], w7[1], selector); + c5[3] = hc_byte_perm (w7[1], w7[0], selector); + c5[2] = hc_byte_perm (w7[0], w6[3], selector); + c5[1] = hc_byte_perm (w6[3], w6[2], selector); + c5[0] = hc_byte_perm (w6[2], w6[1], selector); + c4[3] = hc_byte_perm (w6[1], w6[0], selector); + c4[2] = hc_byte_perm (w6[0], w5[3], selector); + c4[1] = hc_byte_perm (w5[3], w5[2], selector); + c4[0] = hc_byte_perm (w5[2], w5[1], selector); + c3[3] = hc_byte_perm (w5[1], w5[0], selector); + c3[2] = hc_byte_perm (w5[0], w4[3], selector); + c3[1] = hc_byte_perm (w4[3], w4[2], selector); + c3[0] = hc_byte_perm (w4[2], w4[1], selector); + c2[3] = hc_byte_perm (w4[1], w4[0], selector); + c2[2] = hc_byte_perm (w4[0], w3[3], selector); + c2[1] = hc_byte_perm (w3[3], w3[2], selector); + c2[0] = hc_byte_perm (w3[2], w3[1], selector); + c1[3] = hc_byte_perm (w3[1], w3[0], selector); + c1[2] = hc_byte_perm (w3[0], w2[3], selector); + c1[1] = hc_byte_perm (w2[3], w2[2], selector); + c1[0] = 
hc_byte_perm (w2[2], w2[1], selector); + c0[3] = hc_byte_perm (w2[1], w2[0], selector); + c0[2] = hc_byte_perm (w2[0], w1[3], selector); + c0[1] = hc_byte_perm (w1[3], w1[2], selector); + c0[0] = hc_byte_perm (w1[2], w1[1], selector); + w7[3] = hc_byte_perm (w1[1], w1[0], selector); + w7[2] = hc_byte_perm (w1[0], w0[3], selector); + w7[1] = hc_byte_perm (w0[3], w0[2], selector); + w7[0] = hc_byte_perm (w0[2], w0[1], selector); + w6[3] = hc_byte_perm (w0[1], w0[0], selector); + w6[2] = hc_byte_perm (w0[0], 0, selector); + w6[1] = 0; + w6[0] = 0; + w5[3] = 0; + w5[2] = 0; + w5[1] = 0; + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 27: + c6[3] = hc_byte_perm ( 0, w7[3], selector); + c6[2] = hc_byte_perm (w7[3], w7[2], selector); + c6[1] = hc_byte_perm (w7[2], w7[1], selector); + c6[0] = hc_byte_perm (w7[1], w7[0], selector); + c5[3] = hc_byte_perm (w7[0], w6[3], selector); + c5[2] = hc_byte_perm (w6[3], w6[2], selector); + c5[1] = hc_byte_perm (w6[2], w6[1], selector); + c5[0] = hc_byte_perm (w6[1], w6[0], selector); + c4[3] = hc_byte_perm (w6[0], w5[3], selector); + c4[2] = hc_byte_perm (w5[3], w5[2], selector); + c4[1] = hc_byte_perm (w5[2], w5[1], selector); + c4[0] = hc_byte_perm (w5[1], w5[0], selector); + c3[3] = hc_byte_perm (w5[0], w4[3], selector); + c3[2] = hc_byte_perm (w4[3], w4[2], selector); + c3[1] = hc_byte_perm (w4[2], w4[1], selector); + c3[0] = hc_byte_perm (w4[1], w4[0], selector); + c2[3] = hc_byte_perm (w4[0], w3[3], selector); + c2[2] = hc_byte_perm (w3[3], w3[2], selector); + c2[1] = hc_byte_perm (w3[2], w3[1], selector); + c2[0] = hc_byte_perm (w3[1], w3[0], selector); + c1[3] = hc_byte_perm (w3[0], w2[3], selector); + c1[2] = hc_byte_perm (w2[3], w2[2], selector); + c1[1] = hc_byte_perm (w2[2], w2[1], 
selector); + c1[0] = hc_byte_perm (w2[1], w2[0], selector); + c0[3] = hc_byte_perm (w2[0], w1[3], selector); + c0[2] = hc_byte_perm (w1[3], w1[2], selector); + c0[1] = hc_byte_perm (w1[2], w1[1], selector); + c0[0] = hc_byte_perm (w1[1], w1[0], selector); + w7[3] = hc_byte_perm (w1[0], w0[3], selector); + w7[2] = hc_byte_perm (w0[3], w0[2], selector); + w7[1] = hc_byte_perm (w0[2], w0[1], selector); + w7[0] = hc_byte_perm (w0[1], w0[0], selector); + w6[3] = hc_byte_perm (w0[0], 0, selector); + w6[2] = 0; + w6[1] = 0; + w6[0] = 0; + w5[3] = 0; + w5[2] = 0; + w5[1] = 0; + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 28: + c7[0] = hc_byte_perm ( 0, w7[3], selector); + c6[3] = hc_byte_perm (w7[3], w7[2], selector); + c6[2] = hc_byte_perm (w7[2], w7[1], selector); + c6[1] = hc_byte_perm (w7[1], w7[0], selector); + c6[0] = hc_byte_perm (w7[0], w6[3], selector); + c5[3] = hc_byte_perm (w6[3], w6[2], selector); + c5[2] = hc_byte_perm (w6[2], w6[1], selector); + c5[1] = hc_byte_perm (w6[1], w6[0], selector); + c5[0] = hc_byte_perm (w6[0], w5[3], selector); + c4[3] = hc_byte_perm (w5[3], w5[2], selector); + c4[2] = hc_byte_perm (w5[2], w5[1], selector); + c4[1] = hc_byte_perm (w5[1], w5[0], selector); + c4[0] = hc_byte_perm (w5[0], w4[3], selector); + c3[3] = hc_byte_perm (w4[3], w4[2], selector); + c3[2] = hc_byte_perm (w4[2], w4[1], selector); + c3[1] = hc_byte_perm (w4[1], w4[0], selector); + c3[0] = hc_byte_perm (w4[0], w3[3], selector); + c2[3] = hc_byte_perm (w3[3], w3[2], selector); + c2[2] = hc_byte_perm (w3[2], w3[1], selector); + c2[1] = hc_byte_perm (w3[1], w3[0], selector); + c2[0] = hc_byte_perm (w3[0], w2[3], selector); + c1[3] = hc_byte_perm (w2[3], w2[2], selector); + c1[2] = hc_byte_perm (w2[2], w2[1], selector); + 
c1[1] = hc_byte_perm (w2[1], w2[0], selector); + c1[0] = hc_byte_perm (w2[0], w1[3], selector); + c0[3] = hc_byte_perm (w1[3], w1[2], selector); + c0[2] = hc_byte_perm (w1[2], w1[1], selector); + c0[1] = hc_byte_perm (w1[1], w1[0], selector); + c0[0] = hc_byte_perm (w1[0], w0[3], selector); + w7[3] = hc_byte_perm (w0[3], w0[2], selector); + w7[2] = hc_byte_perm (w0[2], w0[1], selector); + w7[1] = hc_byte_perm (w0[1], w0[0], selector); + w7[0] = hc_byte_perm (w0[0], 0, selector); + w6[3] = 0; + w6[2] = 0; + w6[1] = 0; + w6[0] = 0; + w5[3] = 0; + w5[2] = 0; + w5[1] = 0; + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 29: + c7[1] = hc_byte_perm ( 0, w7[3], selector); + c7[0] = hc_byte_perm (w7[3], w7[2], selector); + c6[3] = hc_byte_perm (w7[2], w7[1], selector); + c6[2] = hc_byte_perm (w7[1], w7[0], selector); + c6[1] = hc_byte_perm (w7[0], w6[3], selector); + c6[0] = hc_byte_perm (w6[3], w6[2], selector); + c5[3] = hc_byte_perm (w6[2], w6[1], selector); + c5[2] = hc_byte_perm (w6[1], w6[0], selector); + c5[1] = hc_byte_perm (w6[0], w5[3], selector); + c5[0] = hc_byte_perm (w5[3], w5[2], selector); + c4[3] = hc_byte_perm (w5[2], w5[1], selector); + c4[2] = hc_byte_perm (w5[1], w5[0], selector); + c4[1] = hc_byte_perm (w5[0], w4[3], selector); + c4[0] = hc_byte_perm (w4[3], w4[2], selector); + c3[3] = hc_byte_perm (w4[2], w4[1], selector); + c3[2] = hc_byte_perm (w4[1], w4[0], selector); + c3[1] = hc_byte_perm (w4[0], w3[3], selector); + c3[0] = hc_byte_perm (w3[3], w3[2], selector); + c2[3] = hc_byte_perm (w3[2], w3[1], selector); + c2[2] = hc_byte_perm (w3[1], w3[0], selector); + c2[1] = hc_byte_perm (w3[0], w2[3], selector); + c2[0] = hc_byte_perm (w2[3], w2[2], selector); + c1[3] = hc_byte_perm (w2[2], w2[1], selector); + 
c1[2] = hc_byte_perm (w2[1], w2[0], selector); + c1[1] = hc_byte_perm (w2[0], w1[3], selector); + c1[0] = hc_byte_perm (w1[3], w1[2], selector); + c0[3] = hc_byte_perm (w1[2], w1[1], selector); + c0[2] = hc_byte_perm (w1[1], w1[0], selector); + c0[1] = hc_byte_perm (w1[0], w0[3], selector); + c0[0] = hc_byte_perm (w0[3], w0[2], selector); + w7[3] = hc_byte_perm (w0[2], w0[1], selector); + w7[2] = hc_byte_perm (w0[1], w0[0], selector); + w7[1] = hc_byte_perm (w0[0], 0, selector); + w7[0] = 0; + w6[3] = 0; + w6[2] = 0; + w6[1] = 0; + w6[0] = 0; + w5[3] = 0; + w5[2] = 0; + w5[1] = 0; + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 30: + c7[2] = hc_byte_perm ( 0, w7[3], selector); + c7[1] = hc_byte_perm (w7[3], w7[2], selector); + c7[0] = hc_byte_perm (w7[2], w7[1], selector); + c6[3] = hc_byte_perm (w7[1], w7[0], selector); + c6[2] = hc_byte_perm (w7[0], w6[3], selector); + c6[1] = hc_byte_perm (w6[3], w6[2], selector); + c6[0] = hc_byte_perm (w6[2], w6[1], selector); + c5[3] = hc_byte_perm (w6[1], w6[0], selector); + c5[2] = hc_byte_perm (w6[0], w5[3], selector); + c5[1] = hc_byte_perm (w5[3], w5[2], selector); + c5[0] = hc_byte_perm (w5[2], w5[1], selector); + c4[3] = hc_byte_perm (w5[1], w5[0], selector); + c4[2] = hc_byte_perm (w5[0], w4[3], selector); + c4[1] = hc_byte_perm (w4[3], w4[2], selector); + c4[0] = hc_byte_perm (w4[2], w4[1], selector); + c3[3] = hc_byte_perm (w4[1], w4[0], selector); + c3[2] = hc_byte_perm (w4[0], w3[3], selector); + c3[1] = hc_byte_perm (w3[3], w3[2], selector); + c3[0] = hc_byte_perm (w3[2], w3[1], selector); + c2[3] = hc_byte_perm (w3[1], w3[0], selector); + c2[2] = hc_byte_perm (w3[0], w2[3], selector); + c2[1] = hc_byte_perm (w2[3], w2[2], selector); + c2[0] = hc_byte_perm (w2[2], w2[1], 
selector); + c1[3] = hc_byte_perm (w2[1], w2[0], selector); + c1[2] = hc_byte_perm (w2[0], w1[3], selector); + c1[1] = hc_byte_perm (w1[3], w1[2], selector); + c1[0] = hc_byte_perm (w1[2], w1[1], selector); + c0[3] = hc_byte_perm (w1[1], w1[0], selector); + c0[2] = hc_byte_perm (w1[0], w0[3], selector); + c0[1] = hc_byte_perm (w0[3], w0[2], selector); + c0[0] = hc_byte_perm (w0[2], w0[1], selector); + w7[3] = hc_byte_perm (w0[1], w0[0], selector); + w7[2] = hc_byte_perm (w0[0], 0, selector); + w7[1] = 0; + w7[0] = 0; + w6[3] = 0; + w6[2] = 0; + w6[1] = 0; + w6[0] = 0; + w5[3] = 0; + w5[2] = 0; + w5[1] = 0; + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 31: + c7[3] = hc_byte_perm ( 0, w7[3], selector); + c7[2] = hc_byte_perm (w7[3], w7[2], selector); + c7[1] = hc_byte_perm (w7[2], w7[1], selector); + c7[0] = hc_byte_perm (w7[1], w7[0], selector); + c6[3] = hc_byte_perm (w7[0], w6[3], selector); + c6[2] = hc_byte_perm (w6[3], w6[2], selector); + c6[1] = hc_byte_perm (w6[2], w6[1], selector); + c6[0] = hc_byte_perm (w6[1], w6[0], selector); + c5[3] = hc_byte_perm (w6[0], w5[3], selector); + c5[2] = hc_byte_perm (w5[3], w5[2], selector); + c5[1] = hc_byte_perm (w5[2], w5[1], selector); + c5[0] = hc_byte_perm (w5[1], w5[0], selector); + c4[3] = hc_byte_perm (w5[0], w4[3], selector); + c4[2] = hc_byte_perm (w4[3], w4[2], selector); + c4[1] = hc_byte_perm (w4[2], w4[1], selector); + c4[0] = hc_byte_perm (w4[1], w4[0], selector); + c3[3] = hc_byte_perm (w4[0], w3[3], selector); + c3[2] = hc_byte_perm (w3[3], w3[2], selector); + c3[1] = hc_byte_perm (w3[2], w3[1], selector); + c3[0] = hc_byte_perm (w3[1], w3[0], selector); + c2[3] = hc_byte_perm (w3[0], w2[3], selector); + c2[2] = hc_byte_perm (w2[3], w2[2], selector); + c2[1] = 
hc_byte_perm (w2[2], w2[1], selector); + c2[0] = hc_byte_perm (w2[1], w2[0], selector); + c1[3] = hc_byte_perm (w2[0], w1[3], selector); + c1[2] = hc_byte_perm (w1[3], w1[2], selector); + c1[1] = hc_byte_perm (w1[2], w1[1], selector); + c1[0] = hc_byte_perm (w1[1], w1[0], selector); + c0[3] = hc_byte_perm (w1[0], w0[3], selector); + c0[2] = hc_byte_perm (w0[3], w0[2], selector); + c0[1] = hc_byte_perm (w0[2], w0[1], selector); + c0[0] = hc_byte_perm (w0[1], w0[0], selector); + w7[3] = hc_byte_perm (w0[0], 0, selector); + w7[2] = 0; + w7[1] = 0; + w7[0] = 0; + w6[3] = 0; + w6[2] = 0; + w6[1] = 0; + w6[0] = 0; + w5[3] = 0; + w5[2] = 0; + w5[1] = 0; + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + } + #endif +} + +DECLSPEC void switch_buffer_by_offset_1x64_le (u32x *w, const u32 offset) +{ + const int offset_switch = offset / 4; + + #if (defined IS_AMD && HAS_VPERM == 0) || defined IS_GENERIC + switch (offset_switch) + { + case 0: + w[63] = hc_bytealign (w[62], w[63], offset); + w[62] = hc_bytealign (w[61], w[62], offset); + w[61] = hc_bytealign (w[60], w[61], offset); + w[60] = hc_bytealign (w[59], w[60], offset); + w[59] = hc_bytealign (w[58], w[59], offset); + w[58] = hc_bytealign (w[57], w[58], offset); + w[57] = hc_bytealign (w[56], w[57], offset); + w[56] = hc_bytealign (w[55], w[56], offset); + w[55] = hc_bytealign (w[54], w[55], offset); + w[54] = hc_bytealign (w[53], w[54], offset); + w[53] = hc_bytealign (w[52], w[53], offset); + w[52] = hc_bytealign (w[51], w[52], offset); + w[51] = hc_bytealign (w[50], w[51], offset); + w[50] = hc_bytealign (w[49], w[50], offset); + w[49] = hc_bytealign (w[48], w[49], offset); + w[48] = hc_bytealign (w[47], w[48], offset); + w[47] = hc_bytealign (w[46], w[47], offset); + w[46] = 
hc_bytealign (w[45], w[46], offset); + w[45] = hc_bytealign (w[44], w[45], offset); + w[44] = hc_bytealign (w[43], w[44], offset); + w[43] = hc_bytealign (w[42], w[43], offset); + w[42] = hc_bytealign (w[41], w[42], offset); + w[41] = hc_bytealign (w[40], w[41], offset); + w[40] = hc_bytealign (w[39], w[40], offset); + w[39] = hc_bytealign (w[38], w[39], offset); + w[38] = hc_bytealign (w[37], w[38], offset); + w[37] = hc_bytealign (w[36], w[37], offset); + w[36] = hc_bytealign (w[35], w[36], offset); + w[35] = hc_bytealign (w[34], w[35], offset); + w[34] = hc_bytealign (w[33], w[34], offset); + w[33] = hc_bytealign (w[32], w[33], offset); + w[32] = hc_bytealign (w[31], w[32], offset); + w[31] = hc_bytealign (w[30], w[31], offset); + w[30] = hc_bytealign (w[29], w[30], offset); + w[29] = hc_bytealign (w[28], w[29], offset); + w[28] = hc_bytealign (w[27], w[28], offset); + w[27] = hc_bytealign (w[26], w[27], offset); + w[26] = hc_bytealign (w[25], w[26], offset); + w[25] = hc_bytealign (w[24], w[25], offset); + w[24] = hc_bytealign (w[23], w[24], offset); + w[23] = hc_bytealign (w[22], w[23], offset); + w[22] = hc_bytealign (w[21], w[22], offset); + w[21] = hc_bytealign (w[20], w[21], offset); + w[20] = hc_bytealign (w[19], w[20], offset); + w[19] = hc_bytealign (w[18], w[19], offset); + w[18] = hc_bytealign (w[17], w[18], offset); + w[17] = hc_bytealign (w[16], w[17], offset); + w[16] = hc_bytealign (w[15], w[16], offset); + w[15] = hc_bytealign (w[14], w[15], offset); + w[14] = hc_bytealign (w[13], w[14], offset); + w[13] = hc_bytealign (w[12], w[13], offset); + w[12] = hc_bytealign (w[11], w[12], offset); + w[11] = hc_bytealign (w[10], w[11], offset); + w[10] = hc_bytealign (w[ 9], w[10], offset); + w[ 9] = hc_bytealign (w[ 8], w[ 9], offset); + w[ 8] = hc_bytealign (w[ 7], w[ 8], offset); + w[ 7] = hc_bytealign (w[ 6], w[ 7], offset); + w[ 6] = hc_bytealign (w[ 5], w[ 6], offset); + w[ 5] = hc_bytealign (w[ 4], w[ 5], offset); + w[ 4] = hc_bytealign (w[ 3], w[ 
4], offset); + w[ 3] = hc_bytealign (w[ 2], w[ 3], offset); + w[ 2] = hc_bytealign (w[ 1], w[ 2], offset); + w[ 1] = hc_bytealign (w[ 0], w[ 1], offset); + w[ 0] = hc_bytealign ( 0, w[ 0], offset); + + break; + + case 1: + w[63] = hc_bytealign (w[61], w[62], offset); + w[62] = hc_bytealign (w[60], w[61], offset); + w[61] = hc_bytealign (w[59], w[60], offset); + w[60] = hc_bytealign (w[58], w[59], offset); + w[59] = hc_bytealign (w[57], w[58], offset); + w[58] = hc_bytealign (w[56], w[57], offset); + w[57] = hc_bytealign (w[55], w[56], offset); + w[56] = hc_bytealign (w[54], w[55], offset); + w[55] = hc_bytealign (w[53], w[54], offset); + w[54] = hc_bytealign (w[52], w[53], offset); + w[53] = hc_bytealign (w[51], w[52], offset); + w[52] = hc_bytealign (w[50], w[51], offset); + w[51] = hc_bytealign (w[49], w[50], offset); + w[50] = hc_bytealign (w[48], w[49], offset); + w[49] = hc_bytealign (w[47], w[48], offset); + w[48] = hc_bytealign (w[46], w[47], offset); + w[47] = hc_bytealign (w[45], w[46], offset); + w[46] = hc_bytealign (w[44], w[45], offset); + w[45] = hc_bytealign (w[43], w[44], offset); + w[44] = hc_bytealign (w[42], w[43], offset); + w[43] = hc_bytealign (w[41], w[42], offset); + w[42] = hc_bytealign (w[40], w[41], offset); w[41] = hc_bytealign (w[39], w[40], offset); w[40] = hc_bytealign (w[38], w[39], offset); w[39] = hc_bytealign (w[37], w[38], offset); @@ -22494,63 +25882,335 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; - case 8: - w[63] = hc_bytealign_be (w[54], w[55], offset); - w[62] = hc_bytealign_be (w[53], w[54], offset); - w[61] = hc_bytealign_be (w[52], w[53], offset); - w[60] = hc_bytealign_be (w[51], w[52], offset); - w[59] = hc_bytealign_be (w[50], w[51], offset); - w[58] = hc_bytealign_be (w[49], w[50], offset); - w[57] = hc_bytealign_be (w[48], w[49], offset); - w[56] = hc_bytealign_be (w[47], w[48], offset); - w[55] = hc_bytealign_be (w[46], w[47], offset); - w[54] = hc_bytealign_be (w[45], w[46], 
offset); - w[53] = hc_bytealign_be (w[44], w[45], offset); - w[52] = hc_bytealign_be (w[43], w[44], offset); - w[51] = hc_bytealign_be (w[42], w[43], offset); - w[50] = hc_bytealign_be (w[41], w[42], offset); - w[49] = hc_bytealign_be (w[40], w[41], offset); - w[48] = hc_bytealign_be (w[39], w[40], offset); - w[47] = hc_bytealign_be (w[38], w[39], offset); - w[46] = hc_bytealign_be (w[37], w[38], offset); - w[45] = hc_bytealign_be (w[36], w[37], offset); - w[44] = hc_bytealign_be (w[35], w[36], offset); - w[43] = hc_bytealign_be (w[34], w[35], offset); - w[42] = hc_bytealign_be (w[33], w[34], offset); - w[41] = hc_bytealign_be (w[32], w[33], offset); - w[40] = hc_bytealign_be (w[31], w[32], offset); - w[39] = hc_bytealign_be (w[30], w[31], offset); - w[38] = hc_bytealign_be (w[29], w[30], offset); - w[37] = hc_bytealign_be (w[28], w[29], offset); - w[36] = hc_bytealign_be (w[27], w[28], offset); - w[35] = hc_bytealign_be (w[26], w[27], offset); - w[34] = hc_bytealign_be (w[25], w[26], offset); - w[33] = hc_bytealign_be (w[24], w[25], offset); - w[32] = hc_bytealign_be (w[23], w[24], offset); - w[31] = hc_bytealign_be (w[22], w[23], offset); - w[30] = hc_bytealign_be (w[21], w[22], offset); - w[29] = hc_bytealign_be (w[20], w[21], offset); - w[28] = hc_bytealign_be (w[19], w[20], offset); - w[27] = hc_bytealign_be (w[18], w[19], offset); - w[26] = hc_bytealign_be (w[17], w[18], offset); - w[25] = hc_bytealign_be (w[16], w[17], offset); - w[24] = hc_bytealign_be (w[15], w[16], offset); - w[23] = hc_bytealign_be (w[14], w[15], offset); - w[22] = hc_bytealign_be (w[13], w[14], offset); - w[21] = hc_bytealign_be (w[12], w[13], offset); - w[20] = hc_bytealign_be (w[11], w[12], offset); - w[19] = hc_bytealign_be (w[10], w[11], offset); - w[18] = hc_bytealign_be (w[ 9], w[10], offset); - w[17] = hc_bytealign_be (w[ 8], w[ 9], offset); - w[16] = hc_bytealign_be (w[ 7], w[ 8], offset); - w[15] = hc_bytealign_be (w[ 6], w[ 7], offset); - w[14] = hc_bytealign_be (w[ 5], w[ 6], 
offset); - w[13] = hc_bytealign_be (w[ 4], w[ 5], offset); - w[12] = hc_bytealign_be (w[ 3], w[ 4], offset); - w[11] = hc_bytealign_be (w[ 2], w[ 3], offset); - w[10] = hc_bytealign_be (w[ 1], w[ 2], offset); - w[ 9] = hc_bytealign_be (w[ 0], w[ 1], offset); - w[ 8] = hc_bytealign_be ( 0, w[ 0], offset); + case 8: + w[63] = hc_bytealign_be (w[54], w[55], offset); + w[62] = hc_bytealign_be (w[53], w[54], offset); + w[61] = hc_bytealign_be (w[52], w[53], offset); + w[60] = hc_bytealign_be (w[51], w[52], offset); + w[59] = hc_bytealign_be (w[50], w[51], offset); + w[58] = hc_bytealign_be (w[49], w[50], offset); + w[57] = hc_bytealign_be (w[48], w[49], offset); + w[56] = hc_bytealign_be (w[47], w[48], offset); + w[55] = hc_bytealign_be (w[46], w[47], offset); + w[54] = hc_bytealign_be (w[45], w[46], offset); + w[53] = hc_bytealign_be (w[44], w[45], offset); + w[52] = hc_bytealign_be (w[43], w[44], offset); + w[51] = hc_bytealign_be (w[42], w[43], offset); + w[50] = hc_bytealign_be (w[41], w[42], offset); + w[49] = hc_bytealign_be (w[40], w[41], offset); + w[48] = hc_bytealign_be (w[39], w[40], offset); + w[47] = hc_bytealign_be (w[38], w[39], offset); + w[46] = hc_bytealign_be (w[37], w[38], offset); + w[45] = hc_bytealign_be (w[36], w[37], offset); + w[44] = hc_bytealign_be (w[35], w[36], offset); + w[43] = hc_bytealign_be (w[34], w[35], offset); + w[42] = hc_bytealign_be (w[33], w[34], offset); + w[41] = hc_bytealign_be (w[32], w[33], offset); + w[40] = hc_bytealign_be (w[31], w[32], offset); + w[39] = hc_bytealign_be (w[30], w[31], offset); + w[38] = hc_bytealign_be (w[29], w[30], offset); + w[37] = hc_bytealign_be (w[28], w[29], offset); + w[36] = hc_bytealign_be (w[27], w[28], offset); + w[35] = hc_bytealign_be (w[26], w[27], offset); + w[34] = hc_bytealign_be (w[25], w[26], offset); + w[33] = hc_bytealign_be (w[24], w[25], offset); + w[32] = hc_bytealign_be (w[23], w[24], offset); + w[31] = hc_bytealign_be (w[22], w[23], offset); + w[30] = hc_bytealign_be (w[21], 
w[22], offset); + w[29] = hc_bytealign_be (w[20], w[21], offset); + w[28] = hc_bytealign_be (w[19], w[20], offset); + w[27] = hc_bytealign_be (w[18], w[19], offset); + w[26] = hc_bytealign_be (w[17], w[18], offset); + w[25] = hc_bytealign_be (w[16], w[17], offset); + w[24] = hc_bytealign_be (w[15], w[16], offset); + w[23] = hc_bytealign_be (w[14], w[15], offset); + w[22] = hc_bytealign_be (w[13], w[14], offset); + w[21] = hc_bytealign_be (w[12], w[13], offset); + w[20] = hc_bytealign_be (w[11], w[12], offset); + w[19] = hc_bytealign_be (w[10], w[11], offset); + w[18] = hc_bytealign_be (w[ 9], w[10], offset); + w[17] = hc_bytealign_be (w[ 8], w[ 9], offset); + w[16] = hc_bytealign_be (w[ 7], w[ 8], offset); + w[15] = hc_bytealign_be (w[ 6], w[ 7], offset); + w[14] = hc_bytealign_be (w[ 5], w[ 6], offset); + w[13] = hc_bytealign_be (w[ 4], w[ 5], offset); + w[12] = hc_bytealign_be (w[ 3], w[ 4], offset); + w[11] = hc_bytealign_be (w[ 2], w[ 3], offset); + w[10] = hc_bytealign_be (w[ 1], w[ 2], offset); + w[ 9] = hc_bytealign_be (w[ 0], w[ 1], offset); + w[ 8] = hc_bytealign_be ( 0, w[ 0], offset); + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 9: + w[63] = hc_bytealign_be (w[53], w[54], offset); + w[62] = hc_bytealign_be (w[52], w[53], offset); + w[61] = hc_bytealign_be (w[51], w[52], offset); + w[60] = hc_bytealign_be (w[50], w[51], offset); + w[59] = hc_bytealign_be (w[49], w[50], offset); + w[58] = hc_bytealign_be (w[48], w[49], offset); + w[57] = hc_bytealign_be (w[47], w[48], offset); + w[56] = hc_bytealign_be (w[46], w[47], offset); + w[55] = hc_bytealign_be (w[45], w[46], offset); + w[54] = hc_bytealign_be (w[44], w[45], offset); + w[53] = hc_bytealign_be (w[43], w[44], offset); + w[52] = hc_bytealign_be (w[42], w[43], offset); + w[51] = hc_bytealign_be (w[41], w[42], offset); + w[50] = hc_bytealign_be (w[40], w[41], offset); + w[49] = hc_bytealign_be (w[39], w[40], offset); + w[48] = 
hc_bytealign_be (w[38], w[39], offset); + w[47] = hc_bytealign_be (w[37], w[38], offset); + w[46] = hc_bytealign_be (w[36], w[37], offset); + w[45] = hc_bytealign_be (w[35], w[36], offset); + w[44] = hc_bytealign_be (w[34], w[35], offset); + w[43] = hc_bytealign_be (w[33], w[34], offset); + w[42] = hc_bytealign_be (w[32], w[33], offset); + w[41] = hc_bytealign_be (w[31], w[32], offset); + w[40] = hc_bytealign_be (w[30], w[31], offset); + w[39] = hc_bytealign_be (w[29], w[30], offset); + w[38] = hc_bytealign_be (w[28], w[29], offset); + w[37] = hc_bytealign_be (w[27], w[28], offset); + w[36] = hc_bytealign_be (w[26], w[27], offset); + w[35] = hc_bytealign_be (w[25], w[26], offset); + w[34] = hc_bytealign_be (w[24], w[25], offset); + w[33] = hc_bytealign_be (w[23], w[24], offset); + w[32] = hc_bytealign_be (w[22], w[23], offset); + w[31] = hc_bytealign_be (w[21], w[22], offset); + w[30] = hc_bytealign_be (w[20], w[21], offset); + w[29] = hc_bytealign_be (w[19], w[20], offset); + w[28] = hc_bytealign_be (w[18], w[19], offset); + w[27] = hc_bytealign_be (w[17], w[18], offset); + w[26] = hc_bytealign_be (w[16], w[17], offset); + w[25] = hc_bytealign_be (w[15], w[16], offset); + w[24] = hc_bytealign_be (w[14], w[15], offset); + w[23] = hc_bytealign_be (w[13], w[14], offset); + w[22] = hc_bytealign_be (w[12], w[13], offset); + w[21] = hc_bytealign_be (w[11], w[12], offset); + w[20] = hc_bytealign_be (w[10], w[11], offset); + w[19] = hc_bytealign_be (w[ 9], w[10], offset); + w[18] = hc_bytealign_be (w[ 8], w[ 9], offset); + w[17] = hc_bytealign_be (w[ 7], w[ 8], offset); + w[16] = hc_bytealign_be (w[ 6], w[ 7], offset); + w[15] = hc_bytealign_be (w[ 5], w[ 6], offset); + w[14] = hc_bytealign_be (w[ 4], w[ 5], offset); + w[13] = hc_bytealign_be (w[ 3], w[ 4], offset); + w[12] = hc_bytealign_be (w[ 2], w[ 3], offset); + w[11] = hc_bytealign_be (w[ 1], w[ 2], offset); + w[10] = hc_bytealign_be (w[ 0], w[ 1], offset); + w[ 9] = hc_bytealign_be ( 0, w[ 0], offset); + w[ 8] = 0; 
+ w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 10: + w[63] = hc_bytealign_be (w[52], w[53], offset); + w[62] = hc_bytealign_be (w[51], w[52], offset); + w[61] = hc_bytealign_be (w[50], w[51], offset); + w[60] = hc_bytealign_be (w[49], w[50], offset); + w[59] = hc_bytealign_be (w[48], w[49], offset); + w[58] = hc_bytealign_be (w[47], w[48], offset); + w[57] = hc_bytealign_be (w[46], w[47], offset); + w[56] = hc_bytealign_be (w[45], w[46], offset); + w[55] = hc_bytealign_be (w[44], w[45], offset); + w[54] = hc_bytealign_be (w[43], w[44], offset); + w[53] = hc_bytealign_be (w[42], w[43], offset); + w[52] = hc_bytealign_be (w[41], w[42], offset); + w[51] = hc_bytealign_be (w[40], w[41], offset); + w[50] = hc_bytealign_be (w[39], w[40], offset); + w[49] = hc_bytealign_be (w[38], w[39], offset); + w[48] = hc_bytealign_be (w[37], w[38], offset); + w[47] = hc_bytealign_be (w[36], w[37], offset); + w[46] = hc_bytealign_be (w[35], w[36], offset); + w[45] = hc_bytealign_be (w[34], w[35], offset); + w[44] = hc_bytealign_be (w[33], w[34], offset); + w[43] = hc_bytealign_be (w[32], w[33], offset); + w[42] = hc_bytealign_be (w[31], w[32], offset); + w[41] = hc_bytealign_be (w[30], w[31], offset); + w[40] = hc_bytealign_be (w[29], w[30], offset); + w[39] = hc_bytealign_be (w[28], w[29], offset); + w[38] = hc_bytealign_be (w[27], w[28], offset); + w[37] = hc_bytealign_be (w[26], w[27], offset); + w[36] = hc_bytealign_be (w[25], w[26], offset); + w[35] = hc_bytealign_be (w[24], w[25], offset); + w[34] = hc_bytealign_be (w[23], w[24], offset); + w[33] = hc_bytealign_be (w[22], w[23], offset); + w[32] = hc_bytealign_be (w[21], w[22], offset); + w[31] = hc_bytealign_be (w[20], w[21], offset); + w[30] = hc_bytealign_be (w[19], w[20], offset); + w[29] = hc_bytealign_be (w[18], w[19], offset); + w[28] = hc_bytealign_be (w[17], w[18], offset); + w[27] = hc_bytealign_be (w[16], w[17], offset); + w[26] = 
hc_bytealign_be (w[15], w[16], offset); + w[25] = hc_bytealign_be (w[14], w[15], offset); + w[24] = hc_bytealign_be (w[13], w[14], offset); + w[23] = hc_bytealign_be (w[12], w[13], offset); + w[22] = hc_bytealign_be (w[11], w[12], offset); + w[21] = hc_bytealign_be (w[10], w[11], offset); + w[20] = hc_bytealign_be (w[ 9], w[10], offset); + w[19] = hc_bytealign_be (w[ 8], w[ 9], offset); + w[18] = hc_bytealign_be (w[ 7], w[ 8], offset); + w[17] = hc_bytealign_be (w[ 6], w[ 7], offset); + w[16] = hc_bytealign_be (w[ 5], w[ 6], offset); + w[15] = hc_bytealign_be (w[ 4], w[ 5], offset); + w[14] = hc_bytealign_be (w[ 3], w[ 4], offset); + w[13] = hc_bytealign_be (w[ 2], w[ 3], offset); + w[12] = hc_bytealign_be (w[ 1], w[ 2], offset); + w[11] = hc_bytealign_be (w[ 0], w[ 1], offset); + w[10] = hc_bytealign_be ( 0, w[ 0], offset); + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 11: + w[63] = hc_bytealign_be (w[51], w[52], offset); + w[62] = hc_bytealign_be (w[50], w[51], offset); + w[61] = hc_bytealign_be (w[49], w[50], offset); + w[60] = hc_bytealign_be (w[48], w[49], offset); + w[59] = hc_bytealign_be (w[47], w[48], offset); + w[58] = hc_bytealign_be (w[46], w[47], offset); + w[57] = hc_bytealign_be (w[45], w[46], offset); + w[56] = hc_bytealign_be (w[44], w[45], offset); + w[55] = hc_bytealign_be (w[43], w[44], offset); + w[54] = hc_bytealign_be (w[42], w[43], offset); + w[53] = hc_bytealign_be (w[41], w[42], offset); + w[52] = hc_bytealign_be (w[40], w[41], offset); + w[51] = hc_bytealign_be (w[39], w[40], offset); + w[50] = hc_bytealign_be (w[38], w[39], offset); + w[49] = hc_bytealign_be (w[37], w[38], offset); + w[48] = hc_bytealign_be (w[36], w[37], offset); + w[47] = hc_bytealign_be (w[35], w[36], offset); + w[46] = hc_bytealign_be (w[34], w[35], offset); + w[45] = hc_bytealign_be (w[33], w[34], offset); + w[44] = hc_bytealign_be (w[32], w[33], offset); + w[43] 
= hc_bytealign_be (w[31], w[32], offset); + w[42] = hc_bytealign_be (w[30], w[31], offset); + w[41] = hc_bytealign_be (w[29], w[30], offset); + w[40] = hc_bytealign_be (w[28], w[29], offset); + w[39] = hc_bytealign_be (w[27], w[28], offset); + w[38] = hc_bytealign_be (w[26], w[27], offset); + w[37] = hc_bytealign_be (w[25], w[26], offset); + w[36] = hc_bytealign_be (w[24], w[25], offset); + w[35] = hc_bytealign_be (w[23], w[24], offset); + w[34] = hc_bytealign_be (w[22], w[23], offset); + w[33] = hc_bytealign_be (w[21], w[22], offset); + w[32] = hc_bytealign_be (w[20], w[21], offset); + w[31] = hc_bytealign_be (w[19], w[20], offset); + w[30] = hc_bytealign_be (w[18], w[19], offset); + w[29] = hc_bytealign_be (w[17], w[18], offset); + w[28] = hc_bytealign_be (w[16], w[17], offset); + w[27] = hc_bytealign_be (w[15], w[16], offset); + w[26] = hc_bytealign_be (w[14], w[15], offset); + w[25] = hc_bytealign_be (w[13], w[14], offset); + w[24] = hc_bytealign_be (w[12], w[13], offset); + w[23] = hc_bytealign_be (w[11], w[12], offset); + w[22] = hc_bytealign_be (w[10], w[11], offset); + w[21] = hc_bytealign_be (w[ 9], w[10], offset); + w[20] = hc_bytealign_be (w[ 8], w[ 9], offset); + w[19] = hc_bytealign_be (w[ 7], w[ 8], offset); + w[18] = hc_bytealign_be (w[ 6], w[ 7], offset); + w[17] = hc_bytealign_be (w[ 5], w[ 6], offset); + w[16] = hc_bytealign_be (w[ 4], w[ 5], offset); + w[15] = hc_bytealign_be (w[ 3], w[ 4], offset); + w[14] = hc_bytealign_be (w[ 2], w[ 3], offset); + w[13] = hc_bytealign_be (w[ 1], w[ 2], offset); + w[12] = hc_bytealign_be (w[ 0], w[ 1], offset); + w[11] = hc_bytealign_be ( 0, w[ 0], offset); + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 12: + w[63] = hc_bytealign_be (w[50], w[51], offset); + w[62] = hc_bytealign_be (w[49], w[50], offset); + w[61] = hc_bytealign_be (w[48], w[49], offset); + w[60] = hc_bytealign_be (w[47], w[48], 
offset); + w[59] = hc_bytealign_be (w[46], w[47], offset); + w[58] = hc_bytealign_be (w[45], w[46], offset); + w[57] = hc_bytealign_be (w[44], w[45], offset); + w[56] = hc_bytealign_be (w[43], w[44], offset); + w[55] = hc_bytealign_be (w[42], w[43], offset); + w[54] = hc_bytealign_be (w[41], w[42], offset); + w[53] = hc_bytealign_be (w[40], w[41], offset); + w[52] = hc_bytealign_be (w[39], w[40], offset); + w[51] = hc_bytealign_be (w[38], w[39], offset); + w[50] = hc_bytealign_be (w[37], w[38], offset); + w[49] = hc_bytealign_be (w[36], w[37], offset); + w[48] = hc_bytealign_be (w[35], w[36], offset); + w[47] = hc_bytealign_be (w[34], w[35], offset); + w[46] = hc_bytealign_be (w[33], w[34], offset); + w[45] = hc_bytealign_be (w[32], w[33], offset); + w[44] = hc_bytealign_be (w[31], w[32], offset); + w[43] = hc_bytealign_be (w[30], w[31], offset); + w[42] = hc_bytealign_be (w[29], w[30], offset); + w[41] = hc_bytealign_be (w[28], w[29], offset); + w[40] = hc_bytealign_be (w[27], w[28], offset); + w[39] = hc_bytealign_be (w[26], w[27], offset); + w[38] = hc_bytealign_be (w[25], w[26], offset); + w[37] = hc_bytealign_be (w[24], w[25], offset); + w[36] = hc_bytealign_be (w[23], w[24], offset); + w[35] = hc_bytealign_be (w[22], w[23], offset); + w[34] = hc_bytealign_be (w[21], w[22], offset); + w[33] = hc_bytealign_be (w[20], w[21], offset); + w[32] = hc_bytealign_be (w[19], w[20], offset); + w[31] = hc_bytealign_be (w[18], w[19], offset); + w[30] = hc_bytealign_be (w[17], w[18], offset); + w[29] = hc_bytealign_be (w[16], w[17], offset); + w[28] = hc_bytealign_be (w[15], w[16], offset); + w[27] = hc_bytealign_be (w[14], w[15], offset); + w[26] = hc_bytealign_be (w[13], w[14], offset); + w[25] = hc_bytealign_be (w[12], w[13], offset); + w[24] = hc_bytealign_be (w[11], w[12], offset); + w[23] = hc_bytealign_be (w[10], w[11], offset); + w[22] = hc_bytealign_be (w[ 9], w[10], offset); + w[21] = hc_bytealign_be (w[ 8], w[ 9], offset); + w[20] = hc_bytealign_be (w[ 7], w[ 8], 
offset); + w[19] = hc_bytealign_be (w[ 6], w[ 7], offset); + w[18] = hc_bytealign_be (w[ 5], w[ 6], offset); + w[17] = hc_bytealign_be (w[ 4], w[ 5], offset); + w[16] = hc_bytealign_be (w[ 3], w[ 4], offset); + w[15] = hc_bytealign_be (w[ 2], w[ 3], offset); + w[14] = hc_bytealign_be (w[ 1], w[ 2], offset); + w[13] = hc_bytealign_be (w[ 0], w[ 1], offset); + w[12] = hc_bytealign_be ( 0, w[ 0], offset); + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; @@ -22562,62 +26222,62 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; - case 9: - w[63] = hc_bytealign_be (w[53], w[54], offset); - w[62] = hc_bytealign_be (w[52], w[53], offset); - w[61] = hc_bytealign_be (w[51], w[52], offset); - w[60] = hc_bytealign_be (w[50], w[51], offset); - w[59] = hc_bytealign_be (w[49], w[50], offset); - w[58] = hc_bytealign_be (w[48], w[49], offset); - w[57] = hc_bytealign_be (w[47], w[48], offset); - w[56] = hc_bytealign_be (w[46], w[47], offset); - w[55] = hc_bytealign_be (w[45], w[46], offset); - w[54] = hc_bytealign_be (w[44], w[45], offset); - w[53] = hc_bytealign_be (w[43], w[44], offset); - w[52] = hc_bytealign_be (w[42], w[43], offset); - w[51] = hc_bytealign_be (w[41], w[42], offset); - w[50] = hc_bytealign_be (w[40], w[41], offset); - w[49] = hc_bytealign_be (w[39], w[40], offset); - w[48] = hc_bytealign_be (w[38], w[39], offset); - w[47] = hc_bytealign_be (w[37], w[38], offset); - w[46] = hc_bytealign_be (w[36], w[37], offset); - w[45] = hc_bytealign_be (w[35], w[36], offset); - w[44] = hc_bytealign_be (w[34], w[35], offset); - w[43] = hc_bytealign_be (w[33], w[34], offset); - w[42] = hc_bytealign_be (w[32], w[33], offset); - w[41] = hc_bytealign_be (w[31], w[32], offset); - w[40] = hc_bytealign_be (w[30], w[31], offset); - w[39] = hc_bytealign_be (w[29], w[30], offset); - w[38] = hc_bytealign_be (w[28], w[29], offset); - w[37] = hc_bytealign_be (w[27], w[28], offset); - w[36] = hc_bytealign_be (w[26], w[27], 
offset); - w[35] = hc_bytealign_be (w[25], w[26], offset); - w[34] = hc_bytealign_be (w[24], w[25], offset); - w[33] = hc_bytealign_be (w[23], w[24], offset); - w[32] = hc_bytealign_be (w[22], w[23], offset); - w[31] = hc_bytealign_be (w[21], w[22], offset); - w[30] = hc_bytealign_be (w[20], w[21], offset); - w[29] = hc_bytealign_be (w[19], w[20], offset); - w[28] = hc_bytealign_be (w[18], w[19], offset); - w[27] = hc_bytealign_be (w[17], w[18], offset); - w[26] = hc_bytealign_be (w[16], w[17], offset); - w[25] = hc_bytealign_be (w[15], w[16], offset); - w[24] = hc_bytealign_be (w[14], w[15], offset); - w[23] = hc_bytealign_be (w[13], w[14], offset); - w[22] = hc_bytealign_be (w[12], w[13], offset); - w[21] = hc_bytealign_be (w[11], w[12], offset); - w[20] = hc_bytealign_be (w[10], w[11], offset); - w[19] = hc_bytealign_be (w[ 9], w[10], offset); - w[18] = hc_bytealign_be (w[ 8], w[ 9], offset); - w[17] = hc_bytealign_be (w[ 7], w[ 8], offset); - w[16] = hc_bytealign_be (w[ 6], w[ 7], offset); - w[15] = hc_bytealign_be (w[ 5], w[ 6], offset); - w[14] = hc_bytealign_be (w[ 4], w[ 5], offset); - w[13] = hc_bytealign_be (w[ 3], w[ 4], offset); - w[12] = hc_bytealign_be (w[ 2], w[ 3], offset); - w[11] = hc_bytealign_be (w[ 1], w[ 2], offset); - w[10] = hc_bytealign_be (w[ 0], w[ 1], offset); - w[ 9] = hc_bytealign_be ( 0, w[ 0], offset); + case 13: + w[63] = hc_bytealign_be (w[49], w[50], offset); + w[62] = hc_bytealign_be (w[48], w[49], offset); + w[61] = hc_bytealign_be (w[47], w[48], offset); + w[60] = hc_bytealign_be (w[46], w[47], offset); + w[59] = hc_bytealign_be (w[45], w[46], offset); + w[58] = hc_bytealign_be (w[44], w[45], offset); + w[57] = hc_bytealign_be (w[43], w[44], offset); + w[56] = hc_bytealign_be (w[42], w[43], offset); + w[55] = hc_bytealign_be (w[41], w[42], offset); + w[54] = hc_bytealign_be (w[40], w[41], offset); + w[53] = hc_bytealign_be (w[39], w[40], offset); + w[52] = hc_bytealign_be (w[38], w[39], offset); + w[51] = hc_bytealign_be 
(w[37], w[38], offset); + w[50] = hc_bytealign_be (w[36], w[37], offset); + w[49] = hc_bytealign_be (w[35], w[36], offset); + w[48] = hc_bytealign_be (w[34], w[35], offset); + w[47] = hc_bytealign_be (w[33], w[34], offset); + w[46] = hc_bytealign_be (w[32], w[33], offset); + w[45] = hc_bytealign_be (w[31], w[32], offset); + w[44] = hc_bytealign_be (w[30], w[31], offset); + w[43] = hc_bytealign_be (w[29], w[30], offset); + w[42] = hc_bytealign_be (w[28], w[29], offset); + w[41] = hc_bytealign_be (w[27], w[28], offset); + w[40] = hc_bytealign_be (w[26], w[27], offset); + w[39] = hc_bytealign_be (w[25], w[26], offset); + w[38] = hc_bytealign_be (w[24], w[25], offset); + w[37] = hc_bytealign_be (w[23], w[24], offset); + w[36] = hc_bytealign_be (w[22], w[23], offset); + w[35] = hc_bytealign_be (w[21], w[22], offset); + w[34] = hc_bytealign_be (w[20], w[21], offset); + w[33] = hc_bytealign_be (w[19], w[20], offset); + w[32] = hc_bytealign_be (w[18], w[19], offset); + w[31] = hc_bytealign_be (w[17], w[18], offset); + w[30] = hc_bytealign_be (w[16], w[17], offset); + w[29] = hc_bytealign_be (w[15], w[16], offset); + w[28] = hc_bytealign_be (w[14], w[15], offset); + w[27] = hc_bytealign_be (w[13], w[14], offset); + w[26] = hc_bytealign_be (w[12], w[13], offset); + w[25] = hc_bytealign_be (w[11], w[12], offset); + w[24] = hc_bytealign_be (w[10], w[11], offset); + w[23] = hc_bytealign_be (w[ 9], w[10], offset); + w[22] = hc_bytealign_be (w[ 8], w[ 9], offset); + w[21] = hc_bytealign_be (w[ 7], w[ 8], offset); + w[20] = hc_bytealign_be (w[ 6], w[ 7], offset); + w[19] = hc_bytealign_be (w[ 5], w[ 6], offset); + w[18] = hc_bytealign_be (w[ 4], w[ 5], offset); + w[17] = hc_bytealign_be (w[ 3], w[ 4], offset); + w[16] = hc_bytealign_be (w[ 2], w[ 3], offset); + w[15] = hc_bytealign_be (w[ 1], w[ 2], offset); + w[14] = hc_bytealign_be (w[ 0], w[ 1], offset); + w[13] = hc_bytealign_be ( 0, w[ 0], offset); + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 
6] = 0; @@ -22630,61 +26290,61 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; - case 10: - w[63] = hc_bytealign_be (w[52], w[53], offset); - w[62] = hc_bytealign_be (w[51], w[52], offset); - w[61] = hc_bytealign_be (w[50], w[51], offset); - w[60] = hc_bytealign_be (w[49], w[50], offset); - w[59] = hc_bytealign_be (w[48], w[49], offset); - w[58] = hc_bytealign_be (w[47], w[48], offset); - w[57] = hc_bytealign_be (w[46], w[47], offset); - w[56] = hc_bytealign_be (w[45], w[46], offset); - w[55] = hc_bytealign_be (w[44], w[45], offset); - w[54] = hc_bytealign_be (w[43], w[44], offset); - w[53] = hc_bytealign_be (w[42], w[43], offset); - w[52] = hc_bytealign_be (w[41], w[42], offset); - w[51] = hc_bytealign_be (w[40], w[41], offset); - w[50] = hc_bytealign_be (w[39], w[40], offset); - w[49] = hc_bytealign_be (w[38], w[39], offset); - w[48] = hc_bytealign_be (w[37], w[38], offset); - w[47] = hc_bytealign_be (w[36], w[37], offset); - w[46] = hc_bytealign_be (w[35], w[36], offset); - w[45] = hc_bytealign_be (w[34], w[35], offset); - w[44] = hc_bytealign_be (w[33], w[34], offset); - w[43] = hc_bytealign_be (w[32], w[33], offset); - w[42] = hc_bytealign_be (w[31], w[32], offset); - w[41] = hc_bytealign_be (w[30], w[31], offset); - w[40] = hc_bytealign_be (w[29], w[30], offset); - w[39] = hc_bytealign_be (w[28], w[29], offset); - w[38] = hc_bytealign_be (w[27], w[28], offset); - w[37] = hc_bytealign_be (w[26], w[27], offset); - w[36] = hc_bytealign_be (w[25], w[26], offset); - w[35] = hc_bytealign_be (w[24], w[25], offset); - w[34] = hc_bytealign_be (w[23], w[24], offset); - w[33] = hc_bytealign_be (w[22], w[23], offset); - w[32] = hc_bytealign_be (w[21], w[22], offset); - w[31] = hc_bytealign_be (w[20], w[21], offset); - w[30] = hc_bytealign_be (w[19], w[20], offset); - w[29] = hc_bytealign_be (w[18], w[19], offset); - w[28] = hc_bytealign_be (w[17], w[18], offset); - w[27] = hc_bytealign_be (w[16], w[17], offset); - w[26] = 
hc_bytealign_be (w[15], w[16], offset); - w[25] = hc_bytealign_be (w[14], w[15], offset); - w[24] = hc_bytealign_be (w[13], w[14], offset); - w[23] = hc_bytealign_be (w[12], w[13], offset); - w[22] = hc_bytealign_be (w[11], w[12], offset); - w[21] = hc_bytealign_be (w[10], w[11], offset); - w[20] = hc_bytealign_be (w[ 9], w[10], offset); - w[19] = hc_bytealign_be (w[ 8], w[ 9], offset); - w[18] = hc_bytealign_be (w[ 7], w[ 8], offset); - w[17] = hc_bytealign_be (w[ 6], w[ 7], offset); - w[16] = hc_bytealign_be (w[ 5], w[ 6], offset); - w[15] = hc_bytealign_be (w[ 4], w[ 5], offset); - w[14] = hc_bytealign_be (w[ 3], w[ 4], offset); - w[13] = hc_bytealign_be (w[ 2], w[ 3], offset); - w[12] = hc_bytealign_be (w[ 1], w[ 2], offset); - w[11] = hc_bytealign_be (w[ 0], w[ 1], offset); - w[10] = hc_bytealign_be ( 0, w[ 0], offset); + case 14: + w[63] = hc_bytealign_be (w[48], w[49], offset); + w[62] = hc_bytealign_be (w[47], w[48], offset); + w[61] = hc_bytealign_be (w[46], w[47], offset); + w[60] = hc_bytealign_be (w[45], w[46], offset); + w[59] = hc_bytealign_be (w[44], w[45], offset); + w[58] = hc_bytealign_be (w[43], w[44], offset); + w[57] = hc_bytealign_be (w[42], w[43], offset); + w[56] = hc_bytealign_be (w[41], w[42], offset); + w[55] = hc_bytealign_be (w[40], w[41], offset); + w[54] = hc_bytealign_be (w[39], w[40], offset); + w[53] = hc_bytealign_be (w[38], w[39], offset); + w[52] = hc_bytealign_be (w[37], w[38], offset); + w[51] = hc_bytealign_be (w[36], w[37], offset); + w[50] = hc_bytealign_be (w[35], w[36], offset); + w[49] = hc_bytealign_be (w[34], w[35], offset); + w[48] = hc_bytealign_be (w[33], w[34], offset); + w[47] = hc_bytealign_be (w[32], w[33], offset); + w[46] = hc_bytealign_be (w[31], w[32], offset); + w[45] = hc_bytealign_be (w[30], w[31], offset); + w[44] = hc_bytealign_be (w[29], w[30], offset); + w[43] = hc_bytealign_be (w[28], w[29], offset); + w[42] = hc_bytealign_be (w[27], w[28], offset); + w[41] = hc_bytealign_be (w[26], w[27], offset); + 
w[40] = hc_bytealign_be (w[25], w[26], offset); + w[39] = hc_bytealign_be (w[24], w[25], offset); + w[38] = hc_bytealign_be (w[23], w[24], offset); + w[37] = hc_bytealign_be (w[22], w[23], offset); + w[36] = hc_bytealign_be (w[21], w[22], offset); + w[35] = hc_bytealign_be (w[20], w[21], offset); + w[34] = hc_bytealign_be (w[19], w[20], offset); + w[33] = hc_bytealign_be (w[18], w[19], offset); + w[32] = hc_bytealign_be (w[17], w[18], offset); + w[31] = hc_bytealign_be (w[16], w[17], offset); + w[30] = hc_bytealign_be (w[15], w[16], offset); + w[29] = hc_bytealign_be (w[14], w[15], offset); + w[28] = hc_bytealign_be (w[13], w[14], offset); + w[27] = hc_bytealign_be (w[12], w[13], offset); + w[26] = hc_bytealign_be (w[11], w[12], offset); + w[25] = hc_bytealign_be (w[10], w[11], offset); + w[24] = hc_bytealign_be (w[ 9], w[10], offset); + w[23] = hc_bytealign_be (w[ 8], w[ 9], offset); + w[22] = hc_bytealign_be (w[ 7], w[ 8], offset); + w[21] = hc_bytealign_be (w[ 6], w[ 7], offset); + w[20] = hc_bytealign_be (w[ 5], w[ 6], offset); + w[19] = hc_bytealign_be (w[ 4], w[ 5], offset); + w[18] = hc_bytealign_be (w[ 3], w[ 4], offset); + w[17] = hc_bytealign_be (w[ 2], w[ 3], offset); + w[16] = hc_bytealign_be (w[ 1], w[ 2], offset); + w[15] = hc_bytealign_be (w[ 0], w[ 1], offset); + w[14] = hc_bytealign_be ( 0, w[ 0], offset); + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; @@ -22698,60 +26358,128 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; - case 11: - w[63] = hc_bytealign_be (w[51], w[52], offset); - w[62] = hc_bytealign_be (w[50], w[51], offset); - w[61] = hc_bytealign_be (w[49], w[50], offset); - w[60] = hc_bytealign_be (w[48], w[49], offset); - w[59] = hc_bytealign_be (w[47], w[48], offset); - w[58] = hc_bytealign_be (w[46], w[47], offset); - w[57] = hc_bytealign_be (w[45], w[46], offset); - w[56] = hc_bytealign_be (w[44], w[45], offset); - w[55] = hc_bytealign_be (w[43], w[44], offset); 
- w[54] = hc_bytealign_be (w[42], w[43], offset); - w[53] = hc_bytealign_be (w[41], w[42], offset); - w[52] = hc_bytealign_be (w[40], w[41], offset); - w[51] = hc_bytealign_be (w[39], w[40], offset); - w[50] = hc_bytealign_be (w[38], w[39], offset); - w[49] = hc_bytealign_be (w[37], w[38], offset); - w[48] = hc_bytealign_be (w[36], w[37], offset); - w[47] = hc_bytealign_be (w[35], w[36], offset); - w[46] = hc_bytealign_be (w[34], w[35], offset); - w[45] = hc_bytealign_be (w[33], w[34], offset); - w[44] = hc_bytealign_be (w[32], w[33], offset); - w[43] = hc_bytealign_be (w[31], w[32], offset); - w[42] = hc_bytealign_be (w[30], w[31], offset); - w[41] = hc_bytealign_be (w[29], w[30], offset); - w[40] = hc_bytealign_be (w[28], w[29], offset); - w[39] = hc_bytealign_be (w[27], w[28], offset); - w[38] = hc_bytealign_be (w[26], w[27], offset); - w[37] = hc_bytealign_be (w[25], w[26], offset); - w[36] = hc_bytealign_be (w[24], w[25], offset); - w[35] = hc_bytealign_be (w[23], w[24], offset); - w[34] = hc_bytealign_be (w[22], w[23], offset); - w[33] = hc_bytealign_be (w[21], w[22], offset); - w[32] = hc_bytealign_be (w[20], w[21], offset); - w[31] = hc_bytealign_be (w[19], w[20], offset); - w[30] = hc_bytealign_be (w[18], w[19], offset); - w[29] = hc_bytealign_be (w[17], w[18], offset); - w[28] = hc_bytealign_be (w[16], w[17], offset); - w[27] = hc_bytealign_be (w[15], w[16], offset); - w[26] = hc_bytealign_be (w[14], w[15], offset); - w[25] = hc_bytealign_be (w[13], w[14], offset); - w[24] = hc_bytealign_be (w[12], w[13], offset); - w[23] = hc_bytealign_be (w[11], w[12], offset); - w[22] = hc_bytealign_be (w[10], w[11], offset); - w[21] = hc_bytealign_be (w[ 9], w[10], offset); - w[20] = hc_bytealign_be (w[ 8], w[ 9], offset); - w[19] = hc_bytealign_be (w[ 7], w[ 8], offset); - w[18] = hc_bytealign_be (w[ 6], w[ 7], offset); - w[17] = hc_bytealign_be (w[ 5], w[ 6], offset); - w[16] = hc_bytealign_be (w[ 4], w[ 5], offset); - w[15] = hc_bytealign_be (w[ 3], w[ 4], offset); 
- w[14] = hc_bytealign_be (w[ 2], w[ 3], offset); - w[13] = hc_bytealign_be (w[ 1], w[ 2], offset); - w[12] = hc_bytealign_be (w[ 0], w[ 1], offset); - w[11] = hc_bytealign_be ( 0, w[ 0], offset); + case 15: + w[63] = hc_bytealign_be (w[47], w[48], offset); + w[62] = hc_bytealign_be (w[46], w[47], offset); + w[61] = hc_bytealign_be (w[45], w[46], offset); + w[60] = hc_bytealign_be (w[44], w[45], offset); + w[59] = hc_bytealign_be (w[43], w[44], offset); + w[58] = hc_bytealign_be (w[42], w[43], offset); + w[57] = hc_bytealign_be (w[41], w[42], offset); + w[56] = hc_bytealign_be (w[40], w[41], offset); + w[55] = hc_bytealign_be (w[39], w[40], offset); + w[54] = hc_bytealign_be (w[38], w[39], offset); + w[53] = hc_bytealign_be (w[37], w[38], offset); + w[52] = hc_bytealign_be (w[36], w[37], offset); + w[51] = hc_bytealign_be (w[35], w[36], offset); + w[50] = hc_bytealign_be (w[34], w[35], offset); + w[49] = hc_bytealign_be (w[33], w[34], offset); + w[48] = hc_bytealign_be (w[32], w[33], offset); + w[47] = hc_bytealign_be (w[31], w[32], offset); + w[46] = hc_bytealign_be (w[30], w[31], offset); + w[45] = hc_bytealign_be (w[29], w[30], offset); + w[44] = hc_bytealign_be (w[28], w[29], offset); + w[43] = hc_bytealign_be (w[27], w[28], offset); + w[42] = hc_bytealign_be (w[26], w[27], offset); + w[41] = hc_bytealign_be (w[25], w[26], offset); + w[40] = hc_bytealign_be (w[24], w[25], offset); + w[39] = hc_bytealign_be (w[23], w[24], offset); + w[38] = hc_bytealign_be (w[22], w[23], offset); + w[37] = hc_bytealign_be (w[21], w[22], offset); + w[36] = hc_bytealign_be (w[20], w[21], offset); + w[35] = hc_bytealign_be (w[19], w[20], offset); + w[34] = hc_bytealign_be (w[18], w[19], offset); + w[33] = hc_bytealign_be (w[17], w[18], offset); + w[32] = hc_bytealign_be (w[16], w[17], offset); + w[31] = hc_bytealign_be (w[15], w[16], offset); + w[30] = hc_bytealign_be (w[14], w[15], offset); + w[29] = hc_bytealign_be (w[13], w[14], offset); + w[28] = hc_bytealign_be (w[12], w[13], 
offset); + w[27] = hc_bytealign_be (w[11], w[12], offset); + w[26] = hc_bytealign_be (w[10], w[11], offset); + w[25] = hc_bytealign_be (w[ 9], w[10], offset); + w[24] = hc_bytealign_be (w[ 8], w[ 9], offset); + w[23] = hc_bytealign_be (w[ 7], w[ 8], offset); + w[22] = hc_bytealign_be (w[ 6], w[ 7], offset); + w[21] = hc_bytealign_be (w[ 5], w[ 6], offset); + w[20] = hc_bytealign_be (w[ 4], w[ 5], offset); + w[19] = hc_bytealign_be (w[ 3], w[ 4], offset); + w[18] = hc_bytealign_be (w[ 2], w[ 3], offset); + w[17] = hc_bytealign_be (w[ 1], w[ 2], offset); + w[16] = hc_bytealign_be (w[ 0], w[ 1], offset); + w[15] = hc_bytealign_be ( 0, w[ 0], offset); + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 16: + w[63] = hc_bytealign_be (w[46], w[47], offset); + w[62] = hc_bytealign_be (w[45], w[46], offset); + w[61] = hc_bytealign_be (w[44], w[45], offset); + w[60] = hc_bytealign_be (w[43], w[44], offset); + w[59] = hc_bytealign_be (w[42], w[43], offset); + w[58] = hc_bytealign_be (w[41], w[42], offset); + w[57] = hc_bytealign_be (w[40], w[41], offset); + w[56] = hc_bytealign_be (w[39], w[40], offset); + w[55] = hc_bytealign_be (w[38], w[39], offset); + w[54] = hc_bytealign_be (w[37], w[38], offset); + w[53] = hc_bytealign_be (w[36], w[37], offset); + w[52] = hc_bytealign_be (w[35], w[36], offset); + w[51] = hc_bytealign_be (w[34], w[35], offset); + w[50] = hc_bytealign_be (w[33], w[34], offset); + w[49] = hc_bytealign_be (w[32], w[33], offset); + w[48] = hc_bytealign_be (w[31], w[32], offset); + w[47] = hc_bytealign_be (w[30], w[31], offset); + w[46] = hc_bytealign_be (w[29], w[30], offset); + w[45] = hc_bytealign_be (w[28], w[29], offset); + w[44] = hc_bytealign_be (w[27], w[28], offset); + w[43] = hc_bytealign_be (w[26], w[27], offset); + w[42] = hc_bytealign_be (w[25], w[26], offset); + w[41] = 
hc_bytealign_be (w[24], w[25], offset); + w[40] = hc_bytealign_be (w[23], w[24], offset); + w[39] = hc_bytealign_be (w[22], w[23], offset); + w[38] = hc_bytealign_be (w[21], w[22], offset); + w[37] = hc_bytealign_be (w[20], w[21], offset); + w[36] = hc_bytealign_be (w[19], w[20], offset); + w[35] = hc_bytealign_be (w[18], w[19], offset); + w[34] = hc_bytealign_be (w[17], w[18], offset); + w[33] = hc_bytealign_be (w[16], w[17], offset); + w[32] = hc_bytealign_be (w[15], w[16], offset); + w[31] = hc_bytealign_be (w[14], w[15], offset); + w[30] = hc_bytealign_be (w[13], w[14], offset); + w[29] = hc_bytealign_be (w[12], w[13], offset); + w[28] = hc_bytealign_be (w[11], w[12], offset); + w[27] = hc_bytealign_be (w[10], w[11], offset); + w[26] = hc_bytealign_be (w[ 9], w[10], offset); + w[25] = hc_bytealign_be (w[ 8], w[ 9], offset); + w[24] = hc_bytealign_be (w[ 7], w[ 8], offset); + w[23] = hc_bytealign_be (w[ 6], w[ 7], offset); + w[22] = hc_bytealign_be (w[ 5], w[ 6], offset); + w[21] = hc_bytealign_be (w[ 4], w[ 5], offset); + w[20] = hc_bytealign_be (w[ 3], w[ 4], offset); + w[19] = hc_bytealign_be (w[ 2], w[ 3], offset); + w[18] = hc_bytealign_be (w[ 1], w[ 2], offset); + w[17] = hc_bytealign_be (w[ 0], w[ 1], offset); + w[16] = hc_bytealign_be ( 0, w[ 0], offset); + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; @@ -22766,59 +26494,59 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; - case 12: - w[63] = hc_bytealign_be (w[50], w[51], offset); - w[62] = hc_bytealign_be (w[49], w[50], offset); - w[61] = hc_bytealign_be (w[48], w[49], offset); - w[60] = hc_bytealign_be (w[47], w[48], offset); - w[59] = hc_bytealign_be (w[46], w[47], offset); - w[58] = hc_bytealign_be (w[45], w[46], offset); - w[57] = hc_bytealign_be (w[44], w[45], offset); - w[56] = hc_bytealign_be (w[43], w[44], offset); - w[55] = hc_bytealign_be (w[42], w[43], offset); - w[54] = hc_bytealign_be (w[41], w[42], 
offset); - w[53] = hc_bytealign_be (w[40], w[41], offset); - w[52] = hc_bytealign_be (w[39], w[40], offset); - w[51] = hc_bytealign_be (w[38], w[39], offset); - w[50] = hc_bytealign_be (w[37], w[38], offset); - w[49] = hc_bytealign_be (w[36], w[37], offset); - w[48] = hc_bytealign_be (w[35], w[36], offset); - w[47] = hc_bytealign_be (w[34], w[35], offset); - w[46] = hc_bytealign_be (w[33], w[34], offset); - w[45] = hc_bytealign_be (w[32], w[33], offset); - w[44] = hc_bytealign_be (w[31], w[32], offset); - w[43] = hc_bytealign_be (w[30], w[31], offset); - w[42] = hc_bytealign_be (w[29], w[30], offset); - w[41] = hc_bytealign_be (w[28], w[29], offset); - w[40] = hc_bytealign_be (w[27], w[28], offset); - w[39] = hc_bytealign_be (w[26], w[27], offset); - w[38] = hc_bytealign_be (w[25], w[26], offset); - w[37] = hc_bytealign_be (w[24], w[25], offset); - w[36] = hc_bytealign_be (w[23], w[24], offset); - w[35] = hc_bytealign_be (w[22], w[23], offset); - w[34] = hc_bytealign_be (w[21], w[22], offset); - w[33] = hc_bytealign_be (w[20], w[21], offset); - w[32] = hc_bytealign_be (w[19], w[20], offset); - w[31] = hc_bytealign_be (w[18], w[19], offset); - w[30] = hc_bytealign_be (w[17], w[18], offset); - w[29] = hc_bytealign_be (w[16], w[17], offset); - w[28] = hc_bytealign_be (w[15], w[16], offset); - w[27] = hc_bytealign_be (w[14], w[15], offset); - w[26] = hc_bytealign_be (w[13], w[14], offset); - w[25] = hc_bytealign_be (w[12], w[13], offset); - w[24] = hc_bytealign_be (w[11], w[12], offset); - w[23] = hc_bytealign_be (w[10], w[11], offset); - w[22] = hc_bytealign_be (w[ 9], w[10], offset); - w[21] = hc_bytealign_be (w[ 8], w[ 9], offset); - w[20] = hc_bytealign_be (w[ 7], w[ 8], offset); - w[19] = hc_bytealign_be (w[ 6], w[ 7], offset); - w[18] = hc_bytealign_be (w[ 5], w[ 6], offset); - w[17] = hc_bytealign_be (w[ 4], w[ 5], offset); - w[16] = hc_bytealign_be (w[ 3], w[ 4], offset); - w[15] = hc_bytealign_be (w[ 2], w[ 3], offset); - w[14] = hc_bytealign_be (w[ 1], w[ 2], 
offset); - w[13] = hc_bytealign_be (w[ 0], w[ 1], offset); - w[12] = hc_bytealign_be ( 0, w[ 0], offset); + case 17: + w[63] = hc_bytealign_be (w[45], w[46], offset); + w[62] = hc_bytealign_be (w[44], w[45], offset); + w[61] = hc_bytealign_be (w[43], w[44], offset); + w[60] = hc_bytealign_be (w[42], w[43], offset); + w[59] = hc_bytealign_be (w[41], w[42], offset); + w[58] = hc_bytealign_be (w[40], w[41], offset); + w[57] = hc_bytealign_be (w[39], w[40], offset); + w[56] = hc_bytealign_be (w[38], w[39], offset); + w[55] = hc_bytealign_be (w[37], w[38], offset); + w[54] = hc_bytealign_be (w[36], w[37], offset); + w[53] = hc_bytealign_be (w[35], w[36], offset); + w[52] = hc_bytealign_be (w[34], w[35], offset); + w[51] = hc_bytealign_be (w[33], w[34], offset); + w[50] = hc_bytealign_be (w[32], w[33], offset); + w[49] = hc_bytealign_be (w[31], w[32], offset); + w[48] = hc_bytealign_be (w[30], w[31], offset); + w[47] = hc_bytealign_be (w[29], w[30], offset); + w[46] = hc_bytealign_be (w[28], w[29], offset); + w[45] = hc_bytealign_be (w[27], w[28], offset); + w[44] = hc_bytealign_be (w[26], w[27], offset); + w[43] = hc_bytealign_be (w[25], w[26], offset); + w[42] = hc_bytealign_be (w[24], w[25], offset); + w[41] = hc_bytealign_be (w[23], w[24], offset); + w[40] = hc_bytealign_be (w[22], w[23], offset); + w[39] = hc_bytealign_be (w[21], w[22], offset); + w[38] = hc_bytealign_be (w[20], w[21], offset); + w[37] = hc_bytealign_be (w[19], w[20], offset); + w[36] = hc_bytealign_be (w[18], w[19], offset); + w[35] = hc_bytealign_be (w[17], w[18], offset); + w[34] = hc_bytealign_be (w[16], w[17], offset); + w[33] = hc_bytealign_be (w[15], w[16], offset); + w[32] = hc_bytealign_be (w[14], w[15], offset); + w[31] = hc_bytealign_be (w[13], w[14], offset); + w[30] = hc_bytealign_be (w[12], w[13], offset); + w[29] = hc_bytealign_be (w[11], w[12], offset); + w[28] = hc_bytealign_be (w[10], w[11], offset); + w[27] = hc_bytealign_be (w[ 9], w[10], offset); + w[26] = hc_bytealign_be (w[ 
8], w[ 9], offset); + w[25] = hc_bytealign_be (w[ 7], w[ 8], offset); + w[24] = hc_bytealign_be (w[ 6], w[ 7], offset); + w[23] = hc_bytealign_be (w[ 5], w[ 6], offset); + w[22] = hc_bytealign_be (w[ 4], w[ 5], offset); + w[21] = hc_bytealign_be (w[ 3], w[ 4], offset); + w[20] = hc_bytealign_be (w[ 2], w[ 3], offset); + w[19] = hc_bytealign_be (w[ 1], w[ 2], offset); + w[18] = hc_bytealign_be (w[ 0], w[ 1], offset); + w[17] = hc_bytealign_be ( 0, w[ 0], offset); + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; @@ -22834,58 +26562,126 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; - case 13: - w[63] = hc_bytealign_be (w[49], w[50], offset); - w[62] = hc_bytealign_be (w[48], w[49], offset); - w[61] = hc_bytealign_be (w[47], w[48], offset); - w[60] = hc_bytealign_be (w[46], w[47], offset); - w[59] = hc_bytealign_be (w[45], w[46], offset); - w[58] = hc_bytealign_be (w[44], w[45], offset); - w[57] = hc_bytealign_be (w[43], w[44], offset); - w[56] = hc_bytealign_be (w[42], w[43], offset); - w[55] = hc_bytealign_be (w[41], w[42], offset); - w[54] = hc_bytealign_be (w[40], w[41], offset); - w[53] = hc_bytealign_be (w[39], w[40], offset); - w[52] = hc_bytealign_be (w[38], w[39], offset); - w[51] = hc_bytealign_be (w[37], w[38], offset); - w[50] = hc_bytealign_be (w[36], w[37], offset); - w[49] = hc_bytealign_be (w[35], w[36], offset); - w[48] = hc_bytealign_be (w[34], w[35], offset); - w[47] = hc_bytealign_be (w[33], w[34], offset); - w[46] = hc_bytealign_be (w[32], w[33], offset); - w[45] = hc_bytealign_be (w[31], w[32], offset); - w[44] = hc_bytealign_be (w[30], w[31], offset); - w[43] = hc_bytealign_be (w[29], w[30], offset); - w[42] = hc_bytealign_be (w[28], w[29], offset); - w[41] = hc_bytealign_be (w[27], w[28], offset); - w[40] = hc_bytealign_be (w[26], w[27], offset); - w[39] = hc_bytealign_be (w[25], w[26], offset); - w[38] = hc_bytealign_be (w[24], w[25], offset); - w[37] = 
hc_bytealign_be (w[23], w[24], offset); - w[36] = hc_bytealign_be (w[22], w[23], offset); - w[35] = hc_bytealign_be (w[21], w[22], offset); - w[34] = hc_bytealign_be (w[20], w[21], offset); - w[33] = hc_bytealign_be (w[19], w[20], offset); - w[32] = hc_bytealign_be (w[18], w[19], offset); - w[31] = hc_bytealign_be (w[17], w[18], offset); - w[30] = hc_bytealign_be (w[16], w[17], offset); - w[29] = hc_bytealign_be (w[15], w[16], offset); - w[28] = hc_bytealign_be (w[14], w[15], offset); - w[27] = hc_bytealign_be (w[13], w[14], offset); - w[26] = hc_bytealign_be (w[12], w[13], offset); - w[25] = hc_bytealign_be (w[11], w[12], offset); - w[24] = hc_bytealign_be (w[10], w[11], offset); - w[23] = hc_bytealign_be (w[ 9], w[10], offset); - w[22] = hc_bytealign_be (w[ 8], w[ 9], offset); - w[21] = hc_bytealign_be (w[ 7], w[ 8], offset); - w[20] = hc_bytealign_be (w[ 6], w[ 7], offset); - w[19] = hc_bytealign_be (w[ 5], w[ 6], offset); - w[18] = hc_bytealign_be (w[ 4], w[ 5], offset); - w[17] = hc_bytealign_be (w[ 3], w[ 4], offset); - w[16] = hc_bytealign_be (w[ 2], w[ 3], offset); - w[15] = hc_bytealign_be (w[ 1], w[ 2], offset); - w[14] = hc_bytealign_be (w[ 0], w[ 1], offset); - w[13] = hc_bytealign_be ( 0, w[ 0], offset); + case 18: + w[63] = hc_bytealign_be (w[44], w[45], offset); + w[62] = hc_bytealign_be (w[43], w[44], offset); + w[61] = hc_bytealign_be (w[42], w[43], offset); + w[60] = hc_bytealign_be (w[41], w[42], offset); + w[59] = hc_bytealign_be (w[40], w[41], offset); + w[58] = hc_bytealign_be (w[39], w[40], offset); + w[57] = hc_bytealign_be (w[38], w[39], offset); + w[56] = hc_bytealign_be (w[37], w[38], offset); + w[55] = hc_bytealign_be (w[36], w[37], offset); + w[54] = hc_bytealign_be (w[35], w[36], offset); + w[53] = hc_bytealign_be (w[34], w[35], offset); + w[52] = hc_bytealign_be (w[33], w[34], offset); + w[51] = hc_bytealign_be (w[32], w[33], offset); + w[50] = hc_bytealign_be (w[31], w[32], offset); + w[49] = hc_bytealign_be (w[30], w[31], offset); + 
w[48] = hc_bytealign_be (w[29], w[30], offset); + w[47] = hc_bytealign_be (w[28], w[29], offset); + w[46] = hc_bytealign_be (w[27], w[28], offset); + w[45] = hc_bytealign_be (w[26], w[27], offset); + w[44] = hc_bytealign_be (w[25], w[26], offset); + w[43] = hc_bytealign_be (w[24], w[25], offset); + w[42] = hc_bytealign_be (w[23], w[24], offset); + w[41] = hc_bytealign_be (w[22], w[23], offset); + w[40] = hc_bytealign_be (w[21], w[22], offset); + w[39] = hc_bytealign_be (w[20], w[21], offset); + w[38] = hc_bytealign_be (w[19], w[20], offset); + w[37] = hc_bytealign_be (w[18], w[19], offset); + w[36] = hc_bytealign_be (w[17], w[18], offset); + w[35] = hc_bytealign_be (w[16], w[17], offset); + w[34] = hc_bytealign_be (w[15], w[16], offset); + w[33] = hc_bytealign_be (w[14], w[15], offset); + w[32] = hc_bytealign_be (w[13], w[14], offset); + w[31] = hc_bytealign_be (w[12], w[13], offset); + w[30] = hc_bytealign_be (w[11], w[12], offset); + w[29] = hc_bytealign_be (w[10], w[11], offset); + w[28] = hc_bytealign_be (w[ 9], w[10], offset); + w[27] = hc_bytealign_be (w[ 8], w[ 9], offset); + w[26] = hc_bytealign_be (w[ 7], w[ 8], offset); + w[25] = hc_bytealign_be (w[ 6], w[ 7], offset); + w[24] = hc_bytealign_be (w[ 5], w[ 6], offset); + w[23] = hc_bytealign_be (w[ 4], w[ 5], offset); + w[22] = hc_bytealign_be (w[ 3], w[ 4], offset); + w[21] = hc_bytealign_be (w[ 2], w[ 3], offset); + w[20] = hc_bytealign_be (w[ 1], w[ 2], offset); + w[19] = hc_bytealign_be (w[ 0], w[ 1], offset); + w[18] = hc_bytealign_be ( 0, w[ 0], offset); + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 19: + w[63] = hc_bytealign_be (w[43], w[44], offset); + w[62] = hc_bytealign_be (w[42], w[43], offset); + w[61] = hc_bytealign_be (w[41], w[42], offset); + w[60] = hc_bytealign_be (w[40], w[41], 
offset); + w[59] = hc_bytealign_be (w[39], w[40], offset); + w[58] = hc_bytealign_be (w[38], w[39], offset); + w[57] = hc_bytealign_be (w[37], w[38], offset); + w[56] = hc_bytealign_be (w[36], w[37], offset); + w[55] = hc_bytealign_be (w[35], w[36], offset); + w[54] = hc_bytealign_be (w[34], w[35], offset); + w[53] = hc_bytealign_be (w[33], w[34], offset); + w[52] = hc_bytealign_be (w[32], w[33], offset); + w[51] = hc_bytealign_be (w[31], w[32], offset); + w[50] = hc_bytealign_be (w[30], w[31], offset); + w[49] = hc_bytealign_be (w[29], w[30], offset); + w[48] = hc_bytealign_be (w[28], w[29], offset); + w[47] = hc_bytealign_be (w[27], w[28], offset); + w[46] = hc_bytealign_be (w[26], w[27], offset); + w[45] = hc_bytealign_be (w[25], w[26], offset); + w[44] = hc_bytealign_be (w[24], w[25], offset); + w[43] = hc_bytealign_be (w[23], w[24], offset); + w[42] = hc_bytealign_be (w[22], w[23], offset); + w[41] = hc_bytealign_be (w[21], w[22], offset); + w[40] = hc_bytealign_be (w[20], w[21], offset); + w[39] = hc_bytealign_be (w[19], w[20], offset); + w[38] = hc_bytealign_be (w[18], w[19], offset); + w[37] = hc_bytealign_be (w[17], w[18], offset); + w[36] = hc_bytealign_be (w[16], w[17], offset); + w[35] = hc_bytealign_be (w[15], w[16], offset); + w[34] = hc_bytealign_be (w[14], w[15], offset); + w[33] = hc_bytealign_be (w[13], w[14], offset); + w[32] = hc_bytealign_be (w[12], w[13], offset); + w[31] = hc_bytealign_be (w[11], w[12], offset); + w[30] = hc_bytealign_be (w[10], w[11], offset); + w[29] = hc_bytealign_be (w[ 9], w[10], offset); + w[28] = hc_bytealign_be (w[ 8], w[ 9], offset); + w[27] = hc_bytealign_be (w[ 7], w[ 8], offset); + w[26] = hc_bytealign_be (w[ 6], w[ 7], offset); + w[25] = hc_bytealign_be (w[ 5], w[ 6], offset); + w[24] = hc_bytealign_be (w[ 4], w[ 5], offset); + w[23] = hc_bytealign_be (w[ 3], w[ 4], offset); + w[22] = hc_bytealign_be (w[ 2], w[ 3], offset); + w[21] = hc_bytealign_be (w[ 1], w[ 2], offset); + w[20] = hc_bytealign_be (w[ 0], w[ 1], 
offset); + w[19] = hc_bytealign_be ( 0, w[ 0], offset); + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; @@ -22902,57 +26698,57 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; - case 14: - w[63] = hc_bytealign_be (w[48], w[49], offset); - w[62] = hc_bytealign_be (w[47], w[48], offset); - w[61] = hc_bytealign_be (w[46], w[47], offset); - w[60] = hc_bytealign_be (w[45], w[46], offset); - w[59] = hc_bytealign_be (w[44], w[45], offset); - w[58] = hc_bytealign_be (w[43], w[44], offset); - w[57] = hc_bytealign_be (w[42], w[43], offset); - w[56] = hc_bytealign_be (w[41], w[42], offset); - w[55] = hc_bytealign_be (w[40], w[41], offset); - w[54] = hc_bytealign_be (w[39], w[40], offset); - w[53] = hc_bytealign_be (w[38], w[39], offset); - w[52] = hc_bytealign_be (w[37], w[38], offset); - w[51] = hc_bytealign_be (w[36], w[37], offset); - w[50] = hc_bytealign_be (w[35], w[36], offset); - w[49] = hc_bytealign_be (w[34], w[35], offset); - w[48] = hc_bytealign_be (w[33], w[34], offset); - w[47] = hc_bytealign_be (w[32], w[33], offset); - w[46] = hc_bytealign_be (w[31], w[32], offset); - w[45] = hc_bytealign_be (w[30], w[31], offset); - w[44] = hc_bytealign_be (w[29], w[30], offset); - w[43] = hc_bytealign_be (w[28], w[29], offset); - w[42] = hc_bytealign_be (w[27], w[28], offset); - w[41] = hc_bytealign_be (w[26], w[27], offset); - w[40] = hc_bytealign_be (w[25], w[26], offset); - w[39] = hc_bytealign_be (w[24], w[25], offset); - w[38] = hc_bytealign_be (w[23], w[24], offset); - w[37] = hc_bytealign_be (w[22], w[23], offset); - w[36] = hc_bytealign_be (w[21], w[22], offset); - w[35] = hc_bytealign_be (w[20], w[21], offset); - w[34] = hc_bytealign_be (w[19], w[20], offset); - w[33] = hc_bytealign_be (w[18], w[19], offset); - w[32] = hc_bytealign_be (w[17], w[18], offset); - w[31] = hc_bytealign_be (w[16], w[17], offset); - w[30] = hc_bytealign_be (w[15], w[16], offset); - w[29] = 
hc_bytealign_be (w[14], w[15], offset); - w[28] = hc_bytealign_be (w[13], w[14], offset); - w[27] = hc_bytealign_be (w[12], w[13], offset); - w[26] = hc_bytealign_be (w[11], w[12], offset); - w[25] = hc_bytealign_be (w[10], w[11], offset); - w[24] = hc_bytealign_be (w[ 9], w[10], offset); - w[23] = hc_bytealign_be (w[ 8], w[ 9], offset); - w[22] = hc_bytealign_be (w[ 7], w[ 8], offset); - w[21] = hc_bytealign_be (w[ 6], w[ 7], offset); - w[20] = hc_bytealign_be (w[ 5], w[ 6], offset); - w[19] = hc_bytealign_be (w[ 4], w[ 5], offset); - w[18] = hc_bytealign_be (w[ 3], w[ 4], offset); - w[17] = hc_bytealign_be (w[ 2], w[ 3], offset); - w[16] = hc_bytealign_be (w[ 1], w[ 2], offset); - w[15] = hc_bytealign_be (w[ 0], w[ 1], offset); - w[14] = hc_bytealign_be ( 0, w[ 0], offset); + case 20: + w[63] = hc_bytealign_be (w[42], w[43], offset); + w[62] = hc_bytealign_be (w[41], w[42], offset); + w[61] = hc_bytealign_be (w[40], w[41], offset); + w[60] = hc_bytealign_be (w[39], w[40], offset); + w[59] = hc_bytealign_be (w[38], w[39], offset); + w[58] = hc_bytealign_be (w[37], w[38], offset); + w[57] = hc_bytealign_be (w[36], w[37], offset); + w[56] = hc_bytealign_be (w[35], w[36], offset); + w[55] = hc_bytealign_be (w[34], w[35], offset); + w[54] = hc_bytealign_be (w[33], w[34], offset); + w[53] = hc_bytealign_be (w[32], w[33], offset); + w[52] = hc_bytealign_be (w[31], w[32], offset); + w[51] = hc_bytealign_be (w[30], w[31], offset); + w[50] = hc_bytealign_be (w[29], w[30], offset); + w[49] = hc_bytealign_be (w[28], w[29], offset); + w[48] = hc_bytealign_be (w[27], w[28], offset); + w[47] = hc_bytealign_be (w[26], w[27], offset); + w[46] = hc_bytealign_be (w[25], w[26], offset); + w[45] = hc_bytealign_be (w[24], w[25], offset); + w[44] = hc_bytealign_be (w[23], w[24], offset); + w[43] = hc_bytealign_be (w[22], w[23], offset); + w[42] = hc_bytealign_be (w[21], w[22], offset); + w[41] = hc_bytealign_be (w[20], w[21], offset); + w[40] = hc_bytealign_be (w[19], w[20], offset); + 
w[39] = hc_bytealign_be (w[18], w[19], offset); + w[38] = hc_bytealign_be (w[17], w[18], offset); + w[37] = hc_bytealign_be (w[16], w[17], offset); + w[36] = hc_bytealign_be (w[15], w[16], offset); + w[35] = hc_bytealign_be (w[14], w[15], offset); + w[34] = hc_bytealign_be (w[13], w[14], offset); + w[33] = hc_bytealign_be (w[12], w[13], offset); + w[32] = hc_bytealign_be (w[11], w[12], offset); + w[31] = hc_bytealign_be (w[10], w[11], offset); + w[30] = hc_bytealign_be (w[ 9], w[10], offset); + w[29] = hc_bytealign_be (w[ 8], w[ 9], offset); + w[28] = hc_bytealign_be (w[ 7], w[ 8], offset); + w[27] = hc_bytealign_be (w[ 6], w[ 7], offset); + w[26] = hc_bytealign_be (w[ 5], w[ 6], offset); + w[25] = hc_bytealign_be (w[ 4], w[ 5], offset); + w[24] = hc_bytealign_be (w[ 3], w[ 4], offset); + w[23] = hc_bytealign_be (w[ 2], w[ 3], offset); + w[22] = hc_bytealign_be (w[ 1], w[ 2], offset); + w[21] = hc_bytealign_be (w[ 0], w[ 1], offset); + w[20] = hc_bytealign_be ( 0, w[ 0], offset); + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; @@ -22970,56 +26766,56 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; - case 15: - w[63] = hc_bytealign_be (w[47], w[48], offset); - w[62] = hc_bytealign_be (w[46], w[47], offset); - w[61] = hc_bytealign_be (w[45], w[46], offset); - w[60] = hc_bytealign_be (w[44], w[45], offset); - w[59] = hc_bytealign_be (w[43], w[44], offset); - w[58] = hc_bytealign_be (w[42], w[43], offset); - w[57] = hc_bytealign_be (w[41], w[42], offset); - w[56] = hc_bytealign_be (w[40], w[41], offset); - w[55] = hc_bytealign_be (w[39], w[40], offset); - w[54] = hc_bytealign_be (w[38], w[39], offset); - w[53] = hc_bytealign_be (w[37], w[38], offset); - w[52] = hc_bytealign_be (w[36], w[37], offset); - w[51] = hc_bytealign_be (w[35], w[36], offset); - w[50] = hc_bytealign_be (w[34], w[35], offset); - w[49] = hc_bytealign_be (w[33], w[34], offset); - w[48] = hc_bytealign_be 
(w[32], w[33], offset); - w[47] = hc_bytealign_be (w[31], w[32], offset); - w[46] = hc_bytealign_be (w[30], w[31], offset); - w[45] = hc_bytealign_be (w[29], w[30], offset); - w[44] = hc_bytealign_be (w[28], w[29], offset); - w[43] = hc_bytealign_be (w[27], w[28], offset); - w[42] = hc_bytealign_be (w[26], w[27], offset); - w[41] = hc_bytealign_be (w[25], w[26], offset); - w[40] = hc_bytealign_be (w[24], w[25], offset); - w[39] = hc_bytealign_be (w[23], w[24], offset); - w[38] = hc_bytealign_be (w[22], w[23], offset); - w[37] = hc_bytealign_be (w[21], w[22], offset); - w[36] = hc_bytealign_be (w[20], w[21], offset); - w[35] = hc_bytealign_be (w[19], w[20], offset); - w[34] = hc_bytealign_be (w[18], w[19], offset); - w[33] = hc_bytealign_be (w[17], w[18], offset); - w[32] = hc_bytealign_be (w[16], w[17], offset); - w[31] = hc_bytealign_be (w[15], w[16], offset); - w[30] = hc_bytealign_be (w[14], w[15], offset); - w[29] = hc_bytealign_be (w[13], w[14], offset); - w[28] = hc_bytealign_be (w[12], w[13], offset); - w[27] = hc_bytealign_be (w[11], w[12], offset); - w[26] = hc_bytealign_be (w[10], w[11], offset); - w[25] = hc_bytealign_be (w[ 9], w[10], offset); - w[24] = hc_bytealign_be (w[ 8], w[ 9], offset); - w[23] = hc_bytealign_be (w[ 7], w[ 8], offset); - w[22] = hc_bytealign_be (w[ 6], w[ 7], offset); - w[21] = hc_bytealign_be (w[ 5], w[ 6], offset); - w[20] = hc_bytealign_be (w[ 4], w[ 5], offset); - w[19] = hc_bytealign_be (w[ 3], w[ 4], offset); - w[18] = hc_bytealign_be (w[ 2], w[ 3], offset); - w[17] = hc_bytealign_be (w[ 1], w[ 2], offset); - w[16] = hc_bytealign_be (w[ 0], w[ 1], offset); - w[15] = hc_bytealign_be ( 0, w[ 0], offset); + case 21: + w[63] = hc_bytealign_be (w[41], w[42], offset); + w[62] = hc_bytealign_be (w[40], w[41], offset); + w[61] = hc_bytealign_be (w[39], w[40], offset); + w[60] = hc_bytealign_be (w[38], w[39], offset); + w[59] = hc_bytealign_be (w[37], w[38], offset); + w[58] = hc_bytealign_be (w[36], w[37], offset); + w[57] = 
hc_bytealign_be (w[35], w[36], offset); + w[56] = hc_bytealign_be (w[34], w[35], offset); + w[55] = hc_bytealign_be (w[33], w[34], offset); + w[54] = hc_bytealign_be (w[32], w[33], offset); + w[53] = hc_bytealign_be (w[31], w[32], offset); + w[52] = hc_bytealign_be (w[30], w[31], offset); + w[51] = hc_bytealign_be (w[29], w[30], offset); + w[50] = hc_bytealign_be (w[28], w[29], offset); + w[49] = hc_bytealign_be (w[27], w[28], offset); + w[48] = hc_bytealign_be (w[26], w[27], offset); + w[47] = hc_bytealign_be (w[25], w[26], offset); + w[46] = hc_bytealign_be (w[24], w[25], offset); + w[45] = hc_bytealign_be (w[23], w[24], offset); + w[44] = hc_bytealign_be (w[22], w[23], offset); + w[43] = hc_bytealign_be (w[21], w[22], offset); + w[42] = hc_bytealign_be (w[20], w[21], offset); + w[41] = hc_bytealign_be (w[19], w[20], offset); + w[40] = hc_bytealign_be (w[18], w[19], offset); + w[39] = hc_bytealign_be (w[17], w[18], offset); + w[38] = hc_bytealign_be (w[16], w[17], offset); + w[37] = hc_bytealign_be (w[15], w[16], offset); + w[36] = hc_bytealign_be (w[14], w[15], offset); + w[35] = hc_bytealign_be (w[13], w[14], offset); + w[34] = hc_bytealign_be (w[12], w[13], offset); + w[33] = hc_bytealign_be (w[11], w[12], offset); + w[32] = hc_bytealign_be (w[10], w[11], offset); + w[31] = hc_bytealign_be (w[ 9], w[10], offset); + w[30] = hc_bytealign_be (w[ 8], w[ 9], offset); + w[29] = hc_bytealign_be (w[ 7], w[ 8], offset); + w[28] = hc_bytealign_be (w[ 6], w[ 7], offset); + w[27] = hc_bytealign_be (w[ 5], w[ 6], offset); + w[26] = hc_bytealign_be (w[ 4], w[ 5], offset); + w[25] = hc_bytealign_be (w[ 3], w[ 4], offset); + w[24] = hc_bytealign_be (w[ 2], w[ 3], offset); + w[23] = hc_bytealign_be (w[ 1], w[ 2], offset); + w[22] = hc_bytealign_be (w[ 0], w[ 1], offset); + w[21] = hc_bytealign_be ( 0, w[ 0], offset); + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; @@ -23038,55 +26834,55 @@ DECLSPEC void 
switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; - case 16: - w[63] = hc_bytealign_be (w[46], w[47], offset); - w[62] = hc_bytealign_be (w[45], w[46], offset); - w[61] = hc_bytealign_be (w[44], w[45], offset); - w[60] = hc_bytealign_be (w[43], w[44], offset); - w[59] = hc_bytealign_be (w[42], w[43], offset); - w[58] = hc_bytealign_be (w[41], w[42], offset); - w[57] = hc_bytealign_be (w[40], w[41], offset); - w[56] = hc_bytealign_be (w[39], w[40], offset); - w[55] = hc_bytealign_be (w[38], w[39], offset); - w[54] = hc_bytealign_be (w[37], w[38], offset); - w[53] = hc_bytealign_be (w[36], w[37], offset); - w[52] = hc_bytealign_be (w[35], w[36], offset); - w[51] = hc_bytealign_be (w[34], w[35], offset); - w[50] = hc_bytealign_be (w[33], w[34], offset); - w[49] = hc_bytealign_be (w[32], w[33], offset); - w[48] = hc_bytealign_be (w[31], w[32], offset); - w[47] = hc_bytealign_be (w[30], w[31], offset); - w[46] = hc_bytealign_be (w[29], w[30], offset); - w[45] = hc_bytealign_be (w[28], w[29], offset); - w[44] = hc_bytealign_be (w[27], w[28], offset); - w[43] = hc_bytealign_be (w[26], w[27], offset); - w[42] = hc_bytealign_be (w[25], w[26], offset); - w[41] = hc_bytealign_be (w[24], w[25], offset); - w[40] = hc_bytealign_be (w[23], w[24], offset); - w[39] = hc_bytealign_be (w[22], w[23], offset); - w[38] = hc_bytealign_be (w[21], w[22], offset); - w[37] = hc_bytealign_be (w[20], w[21], offset); - w[36] = hc_bytealign_be (w[19], w[20], offset); - w[35] = hc_bytealign_be (w[18], w[19], offset); - w[34] = hc_bytealign_be (w[17], w[18], offset); - w[33] = hc_bytealign_be (w[16], w[17], offset); - w[32] = hc_bytealign_be (w[15], w[16], offset); - w[31] = hc_bytealign_be (w[14], w[15], offset); - w[30] = hc_bytealign_be (w[13], w[14], offset); - w[29] = hc_bytealign_be (w[12], w[13], offset); - w[28] = hc_bytealign_be (w[11], w[12], offset); - w[27] = hc_bytealign_be (w[10], w[11], offset); - w[26] = hc_bytealign_be (w[ 9], w[10], offset); - w[25] = 
hc_bytealign_be (w[ 8], w[ 9], offset); - w[24] = hc_bytealign_be (w[ 7], w[ 8], offset); - w[23] = hc_bytealign_be (w[ 6], w[ 7], offset); - w[22] = hc_bytealign_be (w[ 5], w[ 6], offset); - w[21] = hc_bytealign_be (w[ 4], w[ 5], offset); - w[20] = hc_bytealign_be (w[ 3], w[ 4], offset); - w[19] = hc_bytealign_be (w[ 2], w[ 3], offset); - w[18] = hc_bytealign_be (w[ 1], w[ 2], offset); - w[17] = hc_bytealign_be (w[ 0], w[ 1], offset); - w[16] = hc_bytealign_be ( 0, w[ 0], offset); + case 22: + w[63] = hc_bytealign_be (w[40], w[41], offset); + w[62] = hc_bytealign_be (w[39], w[40], offset); + w[61] = hc_bytealign_be (w[38], w[39], offset); + w[60] = hc_bytealign_be (w[37], w[38], offset); + w[59] = hc_bytealign_be (w[36], w[37], offset); + w[58] = hc_bytealign_be (w[35], w[36], offset); + w[57] = hc_bytealign_be (w[34], w[35], offset); + w[56] = hc_bytealign_be (w[33], w[34], offset); + w[55] = hc_bytealign_be (w[32], w[33], offset); + w[54] = hc_bytealign_be (w[31], w[32], offset); + w[53] = hc_bytealign_be (w[30], w[31], offset); + w[52] = hc_bytealign_be (w[29], w[30], offset); + w[51] = hc_bytealign_be (w[28], w[29], offset); + w[50] = hc_bytealign_be (w[27], w[28], offset); + w[49] = hc_bytealign_be (w[26], w[27], offset); + w[48] = hc_bytealign_be (w[25], w[26], offset); + w[47] = hc_bytealign_be (w[24], w[25], offset); + w[46] = hc_bytealign_be (w[23], w[24], offset); + w[45] = hc_bytealign_be (w[22], w[23], offset); + w[44] = hc_bytealign_be (w[21], w[22], offset); + w[43] = hc_bytealign_be (w[20], w[21], offset); + w[42] = hc_bytealign_be (w[19], w[20], offset); + w[41] = hc_bytealign_be (w[18], w[19], offset); + w[40] = hc_bytealign_be (w[17], w[18], offset); + w[39] = hc_bytealign_be (w[16], w[17], offset); + w[38] = hc_bytealign_be (w[15], w[16], offset); + w[37] = hc_bytealign_be (w[14], w[15], offset); + w[36] = hc_bytealign_be (w[13], w[14], offset); + w[35] = hc_bytealign_be (w[12], w[13], offset); + w[34] = hc_bytealign_be (w[11], w[12], offset); + 
w[33] = hc_bytealign_be (w[10], w[11], offset); + w[32] = hc_bytealign_be (w[ 9], w[10], offset); + w[31] = hc_bytealign_be (w[ 8], w[ 9], offset); + w[30] = hc_bytealign_be (w[ 7], w[ 8], offset); + w[29] = hc_bytealign_be (w[ 6], w[ 7], offset); + w[28] = hc_bytealign_be (w[ 5], w[ 6], offset); + w[27] = hc_bytealign_be (w[ 4], w[ 5], offset); + w[26] = hc_bytealign_be (w[ 3], w[ 4], offset); + w[25] = hc_bytealign_be (w[ 2], w[ 3], offset); + w[24] = hc_bytealign_be (w[ 1], w[ 2], offset); + w[23] = hc_bytealign_be (w[ 0], w[ 1], offset); + w[22] = hc_bytealign_be ( 0, w[ 0], offset); + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; @@ -23106,54 +26902,122 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; - case 17: - w[63] = hc_bytealign_be (w[45], w[46], offset); - w[62] = hc_bytealign_be (w[44], w[45], offset); - w[61] = hc_bytealign_be (w[43], w[44], offset); - w[60] = hc_bytealign_be (w[42], w[43], offset); - w[59] = hc_bytealign_be (w[41], w[42], offset); - w[58] = hc_bytealign_be (w[40], w[41], offset); - w[57] = hc_bytealign_be (w[39], w[40], offset); - w[56] = hc_bytealign_be (w[38], w[39], offset); - w[55] = hc_bytealign_be (w[37], w[38], offset); - w[54] = hc_bytealign_be (w[36], w[37], offset); - w[53] = hc_bytealign_be (w[35], w[36], offset); - w[52] = hc_bytealign_be (w[34], w[35], offset); - w[51] = hc_bytealign_be (w[33], w[34], offset); - w[50] = hc_bytealign_be (w[32], w[33], offset); - w[49] = hc_bytealign_be (w[31], w[32], offset); - w[48] = hc_bytealign_be (w[30], w[31], offset); - w[47] = hc_bytealign_be (w[29], w[30], offset); - w[46] = hc_bytealign_be (w[28], w[29], offset); - w[45] = hc_bytealign_be (w[27], w[28], offset); - w[44] = hc_bytealign_be (w[26], w[27], offset); - w[43] = hc_bytealign_be (w[25], w[26], offset); - w[42] = hc_bytealign_be (w[24], w[25], offset); - w[41] = hc_bytealign_be (w[23], w[24], offset); - w[40] = 
hc_bytealign_be (w[22], w[23], offset); - w[39] = hc_bytealign_be (w[21], w[22], offset); - w[38] = hc_bytealign_be (w[20], w[21], offset); - w[37] = hc_bytealign_be (w[19], w[20], offset); - w[36] = hc_bytealign_be (w[18], w[19], offset); - w[35] = hc_bytealign_be (w[17], w[18], offset); - w[34] = hc_bytealign_be (w[16], w[17], offset); - w[33] = hc_bytealign_be (w[15], w[16], offset); - w[32] = hc_bytealign_be (w[14], w[15], offset); - w[31] = hc_bytealign_be (w[13], w[14], offset); - w[30] = hc_bytealign_be (w[12], w[13], offset); - w[29] = hc_bytealign_be (w[11], w[12], offset); - w[28] = hc_bytealign_be (w[10], w[11], offset); - w[27] = hc_bytealign_be (w[ 9], w[10], offset); - w[26] = hc_bytealign_be (w[ 8], w[ 9], offset); - w[25] = hc_bytealign_be (w[ 7], w[ 8], offset); - w[24] = hc_bytealign_be (w[ 6], w[ 7], offset); - w[23] = hc_bytealign_be (w[ 5], w[ 6], offset); - w[22] = hc_bytealign_be (w[ 4], w[ 5], offset); - w[21] = hc_bytealign_be (w[ 3], w[ 4], offset); - w[20] = hc_bytealign_be (w[ 2], w[ 3], offset); - w[19] = hc_bytealign_be (w[ 1], w[ 2], offset); - w[18] = hc_bytealign_be (w[ 0], w[ 1], offset); - w[17] = hc_bytealign_be ( 0, w[ 0], offset); + case 23: + w[63] = hc_bytealign_be (w[39], w[40], offset); + w[62] = hc_bytealign_be (w[38], w[39], offset); + w[61] = hc_bytealign_be (w[37], w[38], offset); + w[60] = hc_bytealign_be (w[36], w[37], offset); + w[59] = hc_bytealign_be (w[35], w[36], offset); + w[58] = hc_bytealign_be (w[34], w[35], offset); + w[57] = hc_bytealign_be (w[33], w[34], offset); + w[56] = hc_bytealign_be (w[32], w[33], offset); + w[55] = hc_bytealign_be (w[31], w[32], offset); + w[54] = hc_bytealign_be (w[30], w[31], offset); + w[53] = hc_bytealign_be (w[29], w[30], offset); + w[52] = hc_bytealign_be (w[28], w[29], offset); + w[51] = hc_bytealign_be (w[27], w[28], offset); + w[50] = hc_bytealign_be (w[26], w[27], offset); + w[49] = hc_bytealign_be (w[25], w[26], offset); + w[48] = hc_bytealign_be (w[24], w[25], offset); + 
w[47] = hc_bytealign_be (w[23], w[24], offset); + w[46] = hc_bytealign_be (w[22], w[23], offset); + w[45] = hc_bytealign_be (w[21], w[22], offset); + w[44] = hc_bytealign_be (w[20], w[21], offset); + w[43] = hc_bytealign_be (w[19], w[20], offset); + w[42] = hc_bytealign_be (w[18], w[19], offset); + w[41] = hc_bytealign_be (w[17], w[18], offset); + w[40] = hc_bytealign_be (w[16], w[17], offset); + w[39] = hc_bytealign_be (w[15], w[16], offset); + w[38] = hc_bytealign_be (w[14], w[15], offset); + w[37] = hc_bytealign_be (w[13], w[14], offset); + w[36] = hc_bytealign_be (w[12], w[13], offset); + w[35] = hc_bytealign_be (w[11], w[12], offset); + w[34] = hc_bytealign_be (w[10], w[11], offset); + w[33] = hc_bytealign_be (w[ 9], w[10], offset); + w[32] = hc_bytealign_be (w[ 8], w[ 9], offset); + w[31] = hc_bytealign_be (w[ 7], w[ 8], offset); + w[30] = hc_bytealign_be (w[ 6], w[ 7], offset); + w[29] = hc_bytealign_be (w[ 5], w[ 6], offset); + w[28] = hc_bytealign_be (w[ 4], w[ 5], offset); + w[27] = hc_bytealign_be (w[ 3], w[ 4], offset); + w[26] = hc_bytealign_be (w[ 2], w[ 3], offset); + w[25] = hc_bytealign_be (w[ 1], w[ 2], offset); + w[24] = hc_bytealign_be (w[ 0], w[ 1], offset); + w[23] = hc_bytealign_be ( 0, w[ 0], offset); + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 24: + w[63] = hc_bytealign_be (w[38], w[39], offset); + w[62] = hc_bytealign_be (w[37], w[38], offset); + w[61] = hc_bytealign_be (w[36], w[37], offset); + w[60] = hc_bytealign_be (w[35], w[36], offset); + w[59] = hc_bytealign_be (w[34], w[35], offset); + w[58] = hc_bytealign_be (w[33], w[34], offset); + w[57] = hc_bytealign_be (w[32], w[33], offset); + w[56] = hc_bytealign_be (w[31], w[32], offset); + w[55] = hc_bytealign_be 
(w[30], w[31], offset); + w[54] = hc_bytealign_be (w[29], w[30], offset); + w[53] = hc_bytealign_be (w[28], w[29], offset); + w[52] = hc_bytealign_be (w[27], w[28], offset); + w[51] = hc_bytealign_be (w[26], w[27], offset); + w[50] = hc_bytealign_be (w[25], w[26], offset); + w[49] = hc_bytealign_be (w[24], w[25], offset); + w[48] = hc_bytealign_be (w[23], w[24], offset); + w[47] = hc_bytealign_be (w[22], w[23], offset); + w[46] = hc_bytealign_be (w[21], w[22], offset); + w[45] = hc_bytealign_be (w[20], w[21], offset); + w[44] = hc_bytealign_be (w[19], w[20], offset); + w[43] = hc_bytealign_be (w[18], w[19], offset); + w[42] = hc_bytealign_be (w[17], w[18], offset); + w[41] = hc_bytealign_be (w[16], w[17], offset); + w[40] = hc_bytealign_be (w[15], w[16], offset); + w[39] = hc_bytealign_be (w[14], w[15], offset); + w[38] = hc_bytealign_be (w[13], w[14], offset); + w[37] = hc_bytealign_be (w[12], w[13], offset); + w[36] = hc_bytealign_be (w[11], w[12], offset); + w[35] = hc_bytealign_be (w[10], w[11], offset); + w[34] = hc_bytealign_be (w[ 9], w[10], offset); + w[33] = hc_bytealign_be (w[ 8], w[ 9], offset); + w[32] = hc_bytealign_be (w[ 7], w[ 8], offset); + w[31] = hc_bytealign_be (w[ 6], w[ 7], offset); + w[30] = hc_bytealign_be (w[ 5], w[ 6], offset); + w[29] = hc_bytealign_be (w[ 4], w[ 5], offset); + w[28] = hc_bytealign_be (w[ 3], w[ 4], offset); + w[27] = hc_bytealign_be (w[ 2], w[ 3], offset); + w[26] = hc_bytealign_be (w[ 1], w[ 2], offset); + w[25] = hc_bytealign_be (w[ 0], w[ 1], offset); + w[24] = hc_bytealign_be ( 0, w[ 0], offset); + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; @@ -23174,53 +27038,53 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; - case 18: - w[63] = hc_bytealign_be (w[44], w[45], offset); - w[62] = hc_bytealign_be (w[43], w[44], offset); - w[61] = hc_bytealign_be (w[42], w[43], offset); - w[60] = hc_bytealign_be (w[41], 
w[42], offset); - w[59] = hc_bytealign_be (w[40], w[41], offset); - w[58] = hc_bytealign_be (w[39], w[40], offset); - w[57] = hc_bytealign_be (w[38], w[39], offset); - w[56] = hc_bytealign_be (w[37], w[38], offset); - w[55] = hc_bytealign_be (w[36], w[37], offset); - w[54] = hc_bytealign_be (w[35], w[36], offset); - w[53] = hc_bytealign_be (w[34], w[35], offset); - w[52] = hc_bytealign_be (w[33], w[34], offset); - w[51] = hc_bytealign_be (w[32], w[33], offset); - w[50] = hc_bytealign_be (w[31], w[32], offset); - w[49] = hc_bytealign_be (w[30], w[31], offset); - w[48] = hc_bytealign_be (w[29], w[30], offset); - w[47] = hc_bytealign_be (w[28], w[29], offset); - w[46] = hc_bytealign_be (w[27], w[28], offset); - w[45] = hc_bytealign_be (w[26], w[27], offset); - w[44] = hc_bytealign_be (w[25], w[26], offset); - w[43] = hc_bytealign_be (w[24], w[25], offset); - w[42] = hc_bytealign_be (w[23], w[24], offset); - w[41] = hc_bytealign_be (w[22], w[23], offset); - w[40] = hc_bytealign_be (w[21], w[22], offset); - w[39] = hc_bytealign_be (w[20], w[21], offset); - w[38] = hc_bytealign_be (w[19], w[20], offset); - w[37] = hc_bytealign_be (w[18], w[19], offset); - w[36] = hc_bytealign_be (w[17], w[18], offset); - w[35] = hc_bytealign_be (w[16], w[17], offset); - w[34] = hc_bytealign_be (w[15], w[16], offset); - w[33] = hc_bytealign_be (w[14], w[15], offset); - w[32] = hc_bytealign_be (w[13], w[14], offset); - w[31] = hc_bytealign_be (w[12], w[13], offset); - w[30] = hc_bytealign_be (w[11], w[12], offset); - w[29] = hc_bytealign_be (w[10], w[11], offset); - w[28] = hc_bytealign_be (w[ 9], w[10], offset); - w[27] = hc_bytealign_be (w[ 8], w[ 9], offset); - w[26] = hc_bytealign_be (w[ 7], w[ 8], offset); - w[25] = hc_bytealign_be (w[ 6], w[ 7], offset); - w[24] = hc_bytealign_be (w[ 5], w[ 6], offset); - w[23] = hc_bytealign_be (w[ 4], w[ 5], offset); - w[22] = hc_bytealign_be (w[ 3], w[ 4], offset); - w[21] = hc_bytealign_be (w[ 2], w[ 3], offset); - w[20] = hc_bytealign_be (w[ 1], 
w[ 2], offset); - w[19] = hc_bytealign_be (w[ 0], w[ 1], offset); - w[18] = hc_bytealign_be ( 0, w[ 0], offset); + case 25: + w[63] = hc_bytealign_be (w[37], w[38], offset); + w[62] = hc_bytealign_be (w[36], w[37], offset); + w[61] = hc_bytealign_be (w[35], w[36], offset); + w[60] = hc_bytealign_be (w[34], w[35], offset); + w[59] = hc_bytealign_be (w[33], w[34], offset); + w[58] = hc_bytealign_be (w[32], w[33], offset); + w[57] = hc_bytealign_be (w[31], w[32], offset); + w[56] = hc_bytealign_be (w[30], w[31], offset); + w[55] = hc_bytealign_be (w[29], w[30], offset); + w[54] = hc_bytealign_be (w[28], w[29], offset); + w[53] = hc_bytealign_be (w[27], w[28], offset); + w[52] = hc_bytealign_be (w[26], w[27], offset); + w[51] = hc_bytealign_be (w[25], w[26], offset); + w[50] = hc_bytealign_be (w[24], w[25], offset); + w[49] = hc_bytealign_be (w[23], w[24], offset); + w[48] = hc_bytealign_be (w[22], w[23], offset); + w[47] = hc_bytealign_be (w[21], w[22], offset); + w[46] = hc_bytealign_be (w[20], w[21], offset); + w[45] = hc_bytealign_be (w[19], w[20], offset); + w[44] = hc_bytealign_be (w[18], w[19], offset); + w[43] = hc_bytealign_be (w[17], w[18], offset); + w[42] = hc_bytealign_be (w[16], w[17], offset); + w[41] = hc_bytealign_be (w[15], w[16], offset); + w[40] = hc_bytealign_be (w[14], w[15], offset); + w[39] = hc_bytealign_be (w[13], w[14], offset); + w[38] = hc_bytealign_be (w[12], w[13], offset); + w[37] = hc_bytealign_be (w[11], w[12], offset); + w[36] = hc_bytealign_be (w[10], w[11], offset); + w[35] = hc_bytealign_be (w[ 9], w[10], offset); + w[34] = hc_bytealign_be (w[ 8], w[ 9], offset); + w[33] = hc_bytealign_be (w[ 7], w[ 8], offset); + w[32] = hc_bytealign_be (w[ 6], w[ 7], offset); + w[31] = hc_bytealign_be (w[ 5], w[ 6], offset); + w[30] = hc_bytealign_be (w[ 4], w[ 5], offset); + w[29] = hc_bytealign_be (w[ 3], w[ 4], offset); + w[28] = hc_bytealign_be (w[ 2], w[ 3], offset); + w[27] = hc_bytealign_be (w[ 1], w[ 2], offset); + w[26] = hc_bytealign_be 
(w[ 0], w[ 1], offset); + w[25] = hc_bytealign_be ( 0, w[ 0], offset); + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; @@ -23242,52 +27106,52 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; - case 19: - w[63] = hc_bytealign_be (w[43], w[44], offset); - w[62] = hc_bytealign_be (w[42], w[43], offset); - w[61] = hc_bytealign_be (w[41], w[42], offset); - w[60] = hc_bytealign_be (w[40], w[41], offset); - w[59] = hc_bytealign_be (w[39], w[40], offset); - w[58] = hc_bytealign_be (w[38], w[39], offset); - w[57] = hc_bytealign_be (w[37], w[38], offset); - w[56] = hc_bytealign_be (w[36], w[37], offset); - w[55] = hc_bytealign_be (w[35], w[36], offset); - w[54] = hc_bytealign_be (w[34], w[35], offset); - w[53] = hc_bytealign_be (w[33], w[34], offset); - w[52] = hc_bytealign_be (w[32], w[33], offset); - w[51] = hc_bytealign_be (w[31], w[32], offset); - w[50] = hc_bytealign_be (w[30], w[31], offset); - w[49] = hc_bytealign_be (w[29], w[30], offset); - w[48] = hc_bytealign_be (w[28], w[29], offset); - w[47] = hc_bytealign_be (w[27], w[28], offset); - w[46] = hc_bytealign_be (w[26], w[27], offset); - w[45] = hc_bytealign_be (w[25], w[26], offset); - w[44] = hc_bytealign_be (w[24], w[25], offset); - w[43] = hc_bytealign_be (w[23], w[24], offset); - w[42] = hc_bytealign_be (w[22], w[23], offset); - w[41] = hc_bytealign_be (w[21], w[22], offset); - w[40] = hc_bytealign_be (w[20], w[21], offset); - w[39] = hc_bytealign_be (w[19], w[20], offset); - w[38] = hc_bytealign_be (w[18], w[19], offset); - w[37] = hc_bytealign_be (w[17], w[18], offset); - w[36] = hc_bytealign_be (w[16], w[17], offset); - w[35] = hc_bytealign_be (w[15], w[16], offset); - w[34] = hc_bytealign_be (w[14], w[15], offset); - w[33] = hc_bytealign_be (w[13], w[14], offset); - w[32] = hc_bytealign_be (w[12], w[13], offset); - w[31] = hc_bytealign_be (w[11], w[12], offset); - w[30] = hc_bytealign_be (w[10], 
w[11], offset); - w[29] = hc_bytealign_be (w[ 9], w[10], offset); - w[28] = hc_bytealign_be (w[ 8], w[ 9], offset); - w[27] = hc_bytealign_be (w[ 7], w[ 8], offset); - w[26] = hc_bytealign_be (w[ 6], w[ 7], offset); - w[25] = hc_bytealign_be (w[ 5], w[ 6], offset); - w[24] = hc_bytealign_be (w[ 4], w[ 5], offset); - w[23] = hc_bytealign_be (w[ 3], w[ 4], offset); - w[22] = hc_bytealign_be (w[ 2], w[ 3], offset); - w[21] = hc_bytealign_be (w[ 1], w[ 2], offset); - w[20] = hc_bytealign_be (w[ 0], w[ 1], offset); - w[19] = hc_bytealign_be ( 0, w[ 0], offset); + case 26: + w[63] = hc_bytealign_be (w[36], w[37], offset); + w[62] = hc_bytealign_be (w[35], w[36], offset); + w[61] = hc_bytealign_be (w[34], w[35], offset); + w[60] = hc_bytealign_be (w[33], w[34], offset); + w[59] = hc_bytealign_be (w[32], w[33], offset); + w[58] = hc_bytealign_be (w[31], w[32], offset); + w[57] = hc_bytealign_be (w[30], w[31], offset); + w[56] = hc_bytealign_be (w[29], w[30], offset); + w[55] = hc_bytealign_be (w[28], w[29], offset); + w[54] = hc_bytealign_be (w[27], w[28], offset); + w[53] = hc_bytealign_be (w[26], w[27], offset); + w[52] = hc_bytealign_be (w[25], w[26], offset); + w[51] = hc_bytealign_be (w[24], w[25], offset); + w[50] = hc_bytealign_be (w[23], w[24], offset); + w[49] = hc_bytealign_be (w[22], w[23], offset); + w[48] = hc_bytealign_be (w[21], w[22], offset); + w[47] = hc_bytealign_be (w[20], w[21], offset); + w[46] = hc_bytealign_be (w[19], w[20], offset); + w[45] = hc_bytealign_be (w[18], w[19], offset); + w[44] = hc_bytealign_be (w[17], w[18], offset); + w[43] = hc_bytealign_be (w[16], w[17], offset); + w[42] = hc_bytealign_be (w[15], w[16], offset); + w[41] = hc_bytealign_be (w[14], w[15], offset); + w[40] = hc_bytealign_be (w[13], w[14], offset); + w[39] = hc_bytealign_be (w[12], w[13], offset); + w[38] = hc_bytealign_be (w[11], w[12], offset); + w[37] = hc_bytealign_be (w[10], w[11], offset); + w[36] = hc_bytealign_be (w[ 9], w[10], offset); + w[35] = hc_bytealign_be 
(w[ 8], w[ 9], offset); + w[34] = hc_bytealign_be (w[ 7], w[ 8], offset); + w[33] = hc_bytealign_be (w[ 6], w[ 7], offset); + w[32] = hc_bytealign_be (w[ 5], w[ 6], offset); + w[31] = hc_bytealign_be (w[ 4], w[ 5], offset); + w[30] = hc_bytealign_be (w[ 3], w[ 4], offset); + w[29] = hc_bytealign_be (w[ 2], w[ 3], offset); + w[28] = hc_bytealign_be (w[ 1], w[ 2], offset); + w[27] = hc_bytealign_be (w[ 0], w[ 1], offset); + w[26] = hc_bytealign_be ( 0, w[ 0], offset); + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; @@ -23310,51 +27174,51 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; - case 20: - w[63] = hc_bytealign_be (w[42], w[43], offset); - w[62] = hc_bytealign_be (w[41], w[42], offset); - w[61] = hc_bytealign_be (w[40], w[41], offset); - w[60] = hc_bytealign_be (w[39], w[40], offset); - w[59] = hc_bytealign_be (w[38], w[39], offset); - w[58] = hc_bytealign_be (w[37], w[38], offset); - w[57] = hc_bytealign_be (w[36], w[37], offset); - w[56] = hc_bytealign_be (w[35], w[36], offset); - w[55] = hc_bytealign_be (w[34], w[35], offset); - w[54] = hc_bytealign_be (w[33], w[34], offset); - w[53] = hc_bytealign_be (w[32], w[33], offset); - w[52] = hc_bytealign_be (w[31], w[32], offset); - w[51] = hc_bytealign_be (w[30], w[31], offset); - w[50] = hc_bytealign_be (w[29], w[30], offset); - w[49] = hc_bytealign_be (w[28], w[29], offset); - w[48] = hc_bytealign_be (w[27], w[28], offset); - w[47] = hc_bytealign_be (w[26], w[27], offset); - w[46] = hc_bytealign_be (w[25], w[26], offset); - w[45] = hc_bytealign_be (w[24], w[25], offset); - w[44] = hc_bytealign_be (w[23], w[24], offset); - w[43] = hc_bytealign_be (w[22], w[23], offset); - w[42] = hc_bytealign_be (w[21], w[22], offset); - w[41] = hc_bytealign_be (w[20], w[21], offset); - w[40] = hc_bytealign_be (w[19], w[20], offset); - w[39] = hc_bytealign_be (w[18], w[19], offset); - w[38] = hc_bytealign_be (w[17], 
w[18], offset); - w[37] = hc_bytealign_be (w[16], w[17], offset); - w[36] = hc_bytealign_be (w[15], w[16], offset); - w[35] = hc_bytealign_be (w[14], w[15], offset); - w[34] = hc_bytealign_be (w[13], w[14], offset); - w[33] = hc_bytealign_be (w[12], w[13], offset); - w[32] = hc_bytealign_be (w[11], w[12], offset); - w[31] = hc_bytealign_be (w[10], w[11], offset); - w[30] = hc_bytealign_be (w[ 9], w[10], offset); - w[29] = hc_bytealign_be (w[ 8], w[ 9], offset); - w[28] = hc_bytealign_be (w[ 7], w[ 8], offset); - w[27] = hc_bytealign_be (w[ 6], w[ 7], offset); - w[26] = hc_bytealign_be (w[ 5], w[ 6], offset); - w[25] = hc_bytealign_be (w[ 4], w[ 5], offset); - w[24] = hc_bytealign_be (w[ 3], w[ 4], offset); - w[23] = hc_bytealign_be (w[ 2], w[ 3], offset); - w[22] = hc_bytealign_be (w[ 1], w[ 2], offset); - w[21] = hc_bytealign_be (w[ 0], w[ 1], offset); - w[20] = hc_bytealign_be ( 0, w[ 0], offset); + case 27: + w[63] = hc_bytealign_be (w[35], w[36], offset); + w[62] = hc_bytealign_be (w[34], w[35], offset); + w[61] = hc_bytealign_be (w[33], w[34], offset); + w[60] = hc_bytealign_be (w[32], w[33], offset); + w[59] = hc_bytealign_be (w[31], w[32], offset); + w[58] = hc_bytealign_be (w[30], w[31], offset); + w[57] = hc_bytealign_be (w[29], w[30], offset); + w[56] = hc_bytealign_be (w[28], w[29], offset); + w[55] = hc_bytealign_be (w[27], w[28], offset); + w[54] = hc_bytealign_be (w[26], w[27], offset); + w[53] = hc_bytealign_be (w[25], w[26], offset); + w[52] = hc_bytealign_be (w[24], w[25], offset); + w[51] = hc_bytealign_be (w[23], w[24], offset); + w[50] = hc_bytealign_be (w[22], w[23], offset); + w[49] = hc_bytealign_be (w[21], w[22], offset); + w[48] = hc_bytealign_be (w[20], w[21], offset); + w[47] = hc_bytealign_be (w[19], w[20], offset); + w[46] = hc_bytealign_be (w[18], w[19], offset); + w[45] = hc_bytealign_be (w[17], w[18], offset); + w[44] = hc_bytealign_be (w[16], w[17], offset); + w[43] = hc_bytealign_be (w[15], w[16], offset); + w[42] = hc_bytealign_be 
(w[14], w[15], offset); + w[41] = hc_bytealign_be (w[13], w[14], offset); + w[40] = hc_bytealign_be (w[12], w[13], offset); + w[39] = hc_bytealign_be (w[11], w[12], offset); + w[38] = hc_bytealign_be (w[10], w[11], offset); + w[37] = hc_bytealign_be (w[ 9], w[10], offset); + w[36] = hc_bytealign_be (w[ 8], w[ 9], offset); + w[35] = hc_bytealign_be (w[ 7], w[ 8], offset); + w[34] = hc_bytealign_be (w[ 6], w[ 7], offset); + w[33] = hc_bytealign_be (w[ 5], w[ 6], offset); + w[32] = hc_bytealign_be (w[ 4], w[ 5], offset); + w[31] = hc_bytealign_be (w[ 3], w[ 4], offset); + w[30] = hc_bytealign_be (w[ 2], w[ 3], offset); + w[29] = hc_bytealign_be (w[ 1], w[ 2], offset); + w[28] = hc_bytealign_be (w[ 0], w[ 1], offset); + w[27] = hc_bytealign_be ( 0, w[ 0], offset); + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; @@ -23378,50 +27242,50 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; - case 21: - w[63] = hc_bytealign_be (w[41], w[42], offset); - w[62] = hc_bytealign_be (w[40], w[41], offset); - w[61] = hc_bytealign_be (w[39], w[40], offset); - w[60] = hc_bytealign_be (w[38], w[39], offset); - w[59] = hc_bytealign_be (w[37], w[38], offset); - w[58] = hc_bytealign_be (w[36], w[37], offset); - w[57] = hc_bytealign_be (w[35], w[36], offset); - w[56] = hc_bytealign_be (w[34], w[35], offset); - w[55] = hc_bytealign_be (w[33], w[34], offset); - w[54] = hc_bytealign_be (w[32], w[33], offset); - w[53] = hc_bytealign_be (w[31], w[32], offset); - w[52] = hc_bytealign_be (w[30], w[31], offset); - w[51] = hc_bytealign_be (w[29], w[30], offset); - w[50] = hc_bytealign_be (w[28], w[29], offset); - w[49] = hc_bytealign_be (w[27], w[28], offset); - w[48] = hc_bytealign_be (w[26], w[27], offset); - w[47] = hc_bytealign_be (w[25], w[26], offset); - w[46] = hc_bytealign_be (w[24], w[25], offset); - w[45] = hc_bytealign_be (w[23], w[24], offset); - w[44] = hc_bytealign_be (w[22], 
w[23], offset); - w[43] = hc_bytealign_be (w[21], w[22], offset); - w[42] = hc_bytealign_be (w[20], w[21], offset); - w[41] = hc_bytealign_be (w[19], w[20], offset); - w[40] = hc_bytealign_be (w[18], w[19], offset); - w[39] = hc_bytealign_be (w[17], w[18], offset); - w[38] = hc_bytealign_be (w[16], w[17], offset); - w[37] = hc_bytealign_be (w[15], w[16], offset); - w[36] = hc_bytealign_be (w[14], w[15], offset); - w[35] = hc_bytealign_be (w[13], w[14], offset); - w[34] = hc_bytealign_be (w[12], w[13], offset); - w[33] = hc_bytealign_be (w[11], w[12], offset); - w[32] = hc_bytealign_be (w[10], w[11], offset); - w[31] = hc_bytealign_be (w[ 9], w[10], offset); - w[30] = hc_bytealign_be (w[ 8], w[ 9], offset); - w[29] = hc_bytealign_be (w[ 7], w[ 8], offset); - w[28] = hc_bytealign_be (w[ 6], w[ 7], offset); - w[27] = hc_bytealign_be (w[ 5], w[ 6], offset); - w[26] = hc_bytealign_be (w[ 4], w[ 5], offset); - w[25] = hc_bytealign_be (w[ 3], w[ 4], offset); - w[24] = hc_bytealign_be (w[ 2], w[ 3], offset); - w[23] = hc_bytealign_be (w[ 1], w[ 2], offset); - w[22] = hc_bytealign_be (w[ 0], w[ 1], offset); - w[21] = hc_bytealign_be ( 0, w[ 0], offset); + case 28: + w[63] = hc_bytealign_be (w[34], w[35], offset); + w[62] = hc_bytealign_be (w[33], w[34], offset); + w[61] = hc_bytealign_be (w[32], w[33], offset); + w[60] = hc_bytealign_be (w[31], w[32], offset); + w[59] = hc_bytealign_be (w[30], w[31], offset); + w[58] = hc_bytealign_be (w[29], w[30], offset); + w[57] = hc_bytealign_be (w[28], w[29], offset); + w[56] = hc_bytealign_be (w[27], w[28], offset); + w[55] = hc_bytealign_be (w[26], w[27], offset); + w[54] = hc_bytealign_be (w[25], w[26], offset); + w[53] = hc_bytealign_be (w[24], w[25], offset); + w[52] = hc_bytealign_be (w[23], w[24], offset); + w[51] = hc_bytealign_be (w[22], w[23], offset); + w[50] = hc_bytealign_be (w[21], w[22], offset); + w[49] = hc_bytealign_be (w[20], w[21], offset); + w[48] = hc_bytealign_be (w[19], w[20], offset); + w[47] = hc_bytealign_be 
(w[18], w[19], offset); + w[46] = hc_bytealign_be (w[17], w[18], offset); + w[45] = hc_bytealign_be (w[16], w[17], offset); + w[44] = hc_bytealign_be (w[15], w[16], offset); + w[43] = hc_bytealign_be (w[14], w[15], offset); + w[42] = hc_bytealign_be (w[13], w[14], offset); + w[41] = hc_bytealign_be (w[12], w[13], offset); + w[40] = hc_bytealign_be (w[11], w[12], offset); + w[39] = hc_bytealign_be (w[10], w[11], offset); + w[38] = hc_bytealign_be (w[ 9], w[10], offset); + w[37] = hc_bytealign_be (w[ 8], w[ 9], offset); + w[36] = hc_bytealign_be (w[ 7], w[ 8], offset); + w[35] = hc_bytealign_be (w[ 6], w[ 7], offset); + w[34] = hc_bytealign_be (w[ 5], w[ 6], offset); + w[33] = hc_bytealign_be (w[ 4], w[ 5], offset); + w[32] = hc_bytealign_be (w[ 3], w[ 4], offset); + w[31] = hc_bytealign_be (w[ 2], w[ 3], offset); + w[30] = hc_bytealign_be (w[ 1], w[ 2], offset); + w[29] = hc_bytealign_be (w[ 0], w[ 1], offset); + w[28] = hc_bytealign_be ( 0, w[ 0], offset); + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; @@ -23446,49 +27310,49 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; - case 22: - w[63] = hc_bytealign_be (w[40], w[41], offset); - w[62] = hc_bytealign_be (w[39], w[40], offset); - w[61] = hc_bytealign_be (w[38], w[39], offset); - w[60] = hc_bytealign_be (w[37], w[38], offset); - w[59] = hc_bytealign_be (w[36], w[37], offset); - w[58] = hc_bytealign_be (w[35], w[36], offset); - w[57] = hc_bytealign_be (w[34], w[35], offset); - w[56] = hc_bytealign_be (w[33], w[34], offset); - w[55] = hc_bytealign_be (w[32], w[33], offset); - w[54] = hc_bytealign_be (w[31], w[32], offset); - w[53] = hc_bytealign_be (w[30], w[31], offset); - w[52] = hc_bytealign_be (w[29], w[30], offset); - w[51] = hc_bytealign_be (w[28], w[29], offset); - w[50] = hc_bytealign_be (w[27], w[28], offset); - w[49] = hc_bytealign_be (w[26], w[27], offset); - w[48] = hc_bytealign_be (w[25], 
w[26], offset); - w[47] = hc_bytealign_be (w[24], w[25], offset); - w[46] = hc_bytealign_be (w[23], w[24], offset); - w[45] = hc_bytealign_be (w[22], w[23], offset); - w[44] = hc_bytealign_be (w[21], w[22], offset); - w[43] = hc_bytealign_be (w[20], w[21], offset); - w[42] = hc_bytealign_be (w[19], w[20], offset); - w[41] = hc_bytealign_be (w[18], w[19], offset); - w[40] = hc_bytealign_be (w[17], w[18], offset); - w[39] = hc_bytealign_be (w[16], w[17], offset); - w[38] = hc_bytealign_be (w[15], w[16], offset); - w[37] = hc_bytealign_be (w[14], w[15], offset); - w[36] = hc_bytealign_be (w[13], w[14], offset); - w[35] = hc_bytealign_be (w[12], w[13], offset); - w[34] = hc_bytealign_be (w[11], w[12], offset); - w[33] = hc_bytealign_be (w[10], w[11], offset); - w[32] = hc_bytealign_be (w[ 9], w[10], offset); - w[31] = hc_bytealign_be (w[ 8], w[ 9], offset); - w[30] = hc_bytealign_be (w[ 7], w[ 8], offset); - w[29] = hc_bytealign_be (w[ 6], w[ 7], offset); - w[28] = hc_bytealign_be (w[ 5], w[ 6], offset); - w[27] = hc_bytealign_be (w[ 4], w[ 5], offset); - w[26] = hc_bytealign_be (w[ 3], w[ 4], offset); - w[25] = hc_bytealign_be (w[ 2], w[ 3], offset); - w[24] = hc_bytealign_be (w[ 1], w[ 2], offset); - w[23] = hc_bytealign_be (w[ 0], w[ 1], offset); - w[22] = hc_bytealign_be ( 0, w[ 0], offset); + case 29: + w[63] = hc_bytealign_be (w[33], w[34], offset); + w[62] = hc_bytealign_be (w[32], w[33], offset); + w[61] = hc_bytealign_be (w[31], w[32], offset); + w[60] = hc_bytealign_be (w[30], w[31], offset); + w[59] = hc_bytealign_be (w[29], w[30], offset); + w[58] = hc_bytealign_be (w[28], w[29], offset); + w[57] = hc_bytealign_be (w[27], w[28], offset); + w[56] = hc_bytealign_be (w[26], w[27], offset); + w[55] = hc_bytealign_be (w[25], w[26], offset); + w[54] = hc_bytealign_be (w[24], w[25], offset); + w[53] = hc_bytealign_be (w[23], w[24], offset); + w[52] = hc_bytealign_be (w[22], w[23], offset); + w[51] = hc_bytealign_be (w[21], w[22], offset); + w[50] = hc_bytealign_be 
(w[20], w[21], offset); + w[49] = hc_bytealign_be (w[19], w[20], offset); + w[48] = hc_bytealign_be (w[18], w[19], offset); + w[47] = hc_bytealign_be (w[17], w[18], offset); + w[46] = hc_bytealign_be (w[16], w[17], offset); + w[45] = hc_bytealign_be (w[15], w[16], offset); + w[44] = hc_bytealign_be (w[14], w[15], offset); + w[43] = hc_bytealign_be (w[13], w[14], offset); + w[42] = hc_bytealign_be (w[12], w[13], offset); + w[41] = hc_bytealign_be (w[11], w[12], offset); + w[40] = hc_bytealign_be (w[10], w[11], offset); + w[39] = hc_bytealign_be (w[ 9], w[10], offset); + w[38] = hc_bytealign_be (w[ 8], w[ 9], offset); + w[37] = hc_bytealign_be (w[ 7], w[ 8], offset); + w[36] = hc_bytealign_be (w[ 6], w[ 7], offset); + w[35] = hc_bytealign_be (w[ 5], w[ 6], offset); + w[34] = hc_bytealign_be (w[ 4], w[ 5], offset); + w[33] = hc_bytealign_be (w[ 3], w[ 4], offset); + w[32] = hc_bytealign_be (w[ 2], w[ 3], offset); + w[31] = hc_bytealign_be (w[ 1], w[ 2], offset); + w[30] = hc_bytealign_be (w[ 0], w[ 1], offset); + w[29] = hc_bytealign_be ( 0, w[ 0], offset); + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; @@ -23505,57 +27369,57 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; - w[ 5] = 0; - w[ 4] = 0; - w[ 3] = 0; - w[ 2] = 0; - w[ 1] = 0; - w[ 0] = 0; - - break; - - case 23: - w[63] = hc_bytealign_be (w[39], w[40], offset); - w[62] = hc_bytealign_be (w[38], w[39], offset); - w[61] = hc_bytealign_be (w[37], w[38], offset); - w[60] = hc_bytealign_be (w[36], w[37], offset); - w[59] = hc_bytealign_be (w[35], w[36], offset); - w[58] = hc_bytealign_be (w[34], w[35], offset); - w[57] = hc_bytealign_be (w[33], w[34], offset); - w[56] = hc_bytealign_be (w[32], w[33], offset); - w[55] = hc_bytealign_be (w[31], w[32], offset); - w[54] = hc_bytealign_be (w[30], w[31], offset); - w[53] = hc_bytealign_be (w[29], w[30], offset); - w[52] = 
hc_bytealign_be (w[28], w[29], offset); - w[51] = hc_bytealign_be (w[27], w[28], offset); - w[50] = hc_bytealign_be (w[26], w[27], offset); - w[49] = hc_bytealign_be (w[25], w[26], offset); - w[48] = hc_bytealign_be (w[24], w[25], offset); - w[47] = hc_bytealign_be (w[23], w[24], offset); - w[46] = hc_bytealign_be (w[22], w[23], offset); - w[45] = hc_bytealign_be (w[21], w[22], offset); - w[44] = hc_bytealign_be (w[20], w[21], offset); - w[43] = hc_bytealign_be (w[19], w[20], offset); - w[42] = hc_bytealign_be (w[18], w[19], offset); - w[41] = hc_bytealign_be (w[17], w[18], offset); - w[40] = hc_bytealign_be (w[16], w[17], offset); - w[39] = hc_bytealign_be (w[15], w[16], offset); - w[38] = hc_bytealign_be (w[14], w[15], offset); - w[37] = hc_bytealign_be (w[13], w[14], offset); - w[36] = hc_bytealign_be (w[12], w[13], offset); - w[35] = hc_bytealign_be (w[11], w[12], offset); - w[34] = hc_bytealign_be (w[10], w[11], offset); - w[33] = hc_bytealign_be (w[ 9], w[10], offset); - w[32] = hc_bytealign_be (w[ 8], w[ 9], offset); - w[31] = hc_bytealign_be (w[ 7], w[ 8], offset); - w[30] = hc_bytealign_be (w[ 6], w[ 7], offset); - w[29] = hc_bytealign_be (w[ 5], w[ 6], offset); - w[28] = hc_bytealign_be (w[ 4], w[ 5], offset); - w[27] = hc_bytealign_be (w[ 3], w[ 4], offset); - w[26] = hc_bytealign_be (w[ 2], w[ 3], offset); - w[25] = hc_bytealign_be (w[ 1], w[ 2], offset); - w[24] = hc_bytealign_be (w[ 0], w[ 1], offset); - w[23] = hc_bytealign_be ( 0, w[ 0], offset); + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 30: + w[63] = hc_bytealign_be (w[32], w[33], offset); + w[62] = hc_bytealign_be (w[31], w[32], offset); + w[61] = hc_bytealign_be (w[30], w[31], offset); + w[60] = hc_bytealign_be (w[29], w[30], offset); + w[59] = hc_bytealign_be (w[28], w[29], offset); + w[58] = hc_bytealign_be (w[27], w[28], offset); + w[57] = hc_bytealign_be (w[26], w[27], offset); + w[56] = hc_bytealign_be (w[25], w[26], offset); + w[55] = 
hc_bytealign_be (w[24], w[25], offset); + w[54] = hc_bytealign_be (w[23], w[24], offset); + w[53] = hc_bytealign_be (w[22], w[23], offset); + w[52] = hc_bytealign_be (w[21], w[22], offset); + w[51] = hc_bytealign_be (w[20], w[21], offset); + w[50] = hc_bytealign_be (w[19], w[20], offset); + w[49] = hc_bytealign_be (w[18], w[19], offset); + w[48] = hc_bytealign_be (w[17], w[18], offset); + w[47] = hc_bytealign_be (w[16], w[17], offset); + w[46] = hc_bytealign_be (w[15], w[16], offset); + w[45] = hc_bytealign_be (w[14], w[15], offset); + w[44] = hc_bytealign_be (w[13], w[14], offset); + w[43] = hc_bytealign_be (w[12], w[13], offset); + w[42] = hc_bytealign_be (w[11], w[12], offset); + w[41] = hc_bytealign_be (w[10], w[11], offset); + w[40] = hc_bytealign_be (w[ 9], w[10], offset); + w[39] = hc_bytealign_be (w[ 8], w[ 9], offset); + w[38] = hc_bytealign_be (w[ 7], w[ 8], offset); + w[37] = hc_bytealign_be (w[ 6], w[ 7], offset); + w[36] = hc_bytealign_be (w[ 5], w[ 6], offset); + w[35] = hc_bytealign_be (w[ 4], w[ 5], offset); + w[34] = hc_bytealign_be (w[ 3], w[ 4], offset); + w[33] = hc_bytealign_be (w[ 2], w[ 3], offset); + w[32] = hc_bytealign_be (w[ 1], w[ 2], offset); + w[31] = hc_bytealign_be (w[ 0], w[ 1], offset); + w[30] = hc_bytealign_be ( 0, w[ 0], offset); + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; @@ -23582,47 +27446,47 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; - case 24: - w[63] = hc_bytealign_be (w[38], w[39], offset); - w[62] = hc_bytealign_be (w[37], w[38], offset); - w[61] = hc_bytealign_be (w[36], w[37], offset); - w[60] = hc_bytealign_be (w[35], w[36], offset); - w[59] = hc_bytealign_be (w[34], w[35], offset); - w[58] = hc_bytealign_be (w[33], w[34], offset); - w[57] = hc_bytealign_be (w[32], w[33], offset); - w[56] = hc_bytealign_be (w[31], w[32], offset); - w[55] = hc_bytealign_be (w[30], w[31], offset); - w[54] = 
hc_bytealign_be (w[29], w[30], offset); - w[53] = hc_bytealign_be (w[28], w[29], offset); - w[52] = hc_bytealign_be (w[27], w[28], offset); - w[51] = hc_bytealign_be (w[26], w[27], offset); - w[50] = hc_bytealign_be (w[25], w[26], offset); - w[49] = hc_bytealign_be (w[24], w[25], offset); - w[48] = hc_bytealign_be (w[23], w[24], offset); - w[47] = hc_bytealign_be (w[22], w[23], offset); - w[46] = hc_bytealign_be (w[21], w[22], offset); - w[45] = hc_bytealign_be (w[20], w[21], offset); - w[44] = hc_bytealign_be (w[19], w[20], offset); - w[43] = hc_bytealign_be (w[18], w[19], offset); - w[42] = hc_bytealign_be (w[17], w[18], offset); - w[41] = hc_bytealign_be (w[16], w[17], offset); - w[40] = hc_bytealign_be (w[15], w[16], offset); - w[39] = hc_bytealign_be (w[14], w[15], offset); - w[38] = hc_bytealign_be (w[13], w[14], offset); - w[37] = hc_bytealign_be (w[12], w[13], offset); - w[36] = hc_bytealign_be (w[11], w[12], offset); - w[35] = hc_bytealign_be (w[10], w[11], offset); - w[34] = hc_bytealign_be (w[ 9], w[10], offset); - w[33] = hc_bytealign_be (w[ 8], w[ 9], offset); - w[32] = hc_bytealign_be (w[ 7], w[ 8], offset); - w[31] = hc_bytealign_be (w[ 6], w[ 7], offset); - w[30] = hc_bytealign_be (w[ 5], w[ 6], offset); - w[29] = hc_bytealign_be (w[ 4], w[ 5], offset); - w[28] = hc_bytealign_be (w[ 3], w[ 4], offset); - w[27] = hc_bytealign_be (w[ 2], w[ 3], offset); - w[26] = hc_bytealign_be (w[ 1], w[ 2], offset); - w[25] = hc_bytealign_be (w[ 0], w[ 1], offset); - w[24] = hc_bytealign_be ( 0, w[ 0], offset); + case 31: + w[63] = hc_bytealign_be (w[31], w[32], offset); + w[62] = hc_bytealign_be (w[30], w[31], offset); + w[61] = hc_bytealign_be (w[29], w[30], offset); + w[60] = hc_bytealign_be (w[28], w[29], offset); + w[59] = hc_bytealign_be (w[27], w[28], offset); + w[58] = hc_bytealign_be (w[26], w[27], offset); + w[57] = hc_bytealign_be (w[25], w[26], offset); + w[56] = hc_bytealign_be (w[24], w[25], offset); + w[55] = hc_bytealign_be (w[23], w[24], offset); + 
w[54] = hc_bytealign_be (w[22], w[23], offset); + w[53] = hc_bytealign_be (w[21], w[22], offset); + w[52] = hc_bytealign_be (w[20], w[21], offset); + w[51] = hc_bytealign_be (w[19], w[20], offset); + w[50] = hc_bytealign_be (w[18], w[19], offset); + w[49] = hc_bytealign_be (w[17], w[18], offset); + w[48] = hc_bytealign_be (w[16], w[17], offset); + w[47] = hc_bytealign_be (w[15], w[16], offset); + w[46] = hc_bytealign_be (w[14], w[15], offset); + w[45] = hc_bytealign_be (w[13], w[14], offset); + w[44] = hc_bytealign_be (w[12], w[13], offset); + w[43] = hc_bytealign_be (w[11], w[12], offset); + w[42] = hc_bytealign_be (w[10], w[11], offset); + w[41] = hc_bytealign_be (w[ 9], w[10], offset); + w[40] = hc_bytealign_be (w[ 8], w[ 9], offset); + w[39] = hc_bytealign_be (w[ 7], w[ 8], offset); + w[38] = hc_bytealign_be (w[ 6], w[ 7], offset); + w[37] = hc_bytealign_be (w[ 5], w[ 6], offset); + w[36] = hc_bytealign_be (w[ 4], w[ 5], offset); + w[35] = hc_bytealign_be (w[ 3], w[ 4], offset); + w[34] = hc_bytealign_be (w[ 2], w[ 3], offset); + w[33] = hc_bytealign_be (w[ 1], w[ 2], offset); + w[32] = hc_bytealign_be (w[ 0], w[ 1], offset); + w[31] = hc_bytealign_be ( 0, w[ 0], offset); + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; @@ -23650,46 +27514,46 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; - case 25: - w[63] = hc_bytealign_be (w[37], w[38], offset); - w[62] = hc_bytealign_be (w[36], w[37], offset); - w[61] = hc_bytealign_be (w[35], w[36], offset); - w[60] = hc_bytealign_be (w[34], w[35], offset); - w[59] = hc_bytealign_be (w[33], w[34], offset); - w[58] = hc_bytealign_be (w[32], w[33], offset); - w[57] = hc_bytealign_be (w[31], w[32], offset); - w[56] = hc_bytealign_be (w[30], w[31], offset); - w[55] = hc_bytealign_be (w[29], w[30], offset); - w[54] = hc_bytealign_be (w[28], w[29], offset); - w[53] = hc_bytealign_be (w[27], w[28], offset); - w[52] = 
hc_bytealign_be (w[26], w[27], offset); - w[51] = hc_bytealign_be (w[25], w[26], offset); - w[50] = hc_bytealign_be (w[24], w[25], offset); - w[49] = hc_bytealign_be (w[23], w[24], offset); - w[48] = hc_bytealign_be (w[22], w[23], offset); - w[47] = hc_bytealign_be (w[21], w[22], offset); - w[46] = hc_bytealign_be (w[20], w[21], offset); - w[45] = hc_bytealign_be (w[19], w[20], offset); - w[44] = hc_bytealign_be (w[18], w[19], offset); - w[43] = hc_bytealign_be (w[17], w[18], offset); - w[42] = hc_bytealign_be (w[16], w[17], offset); - w[41] = hc_bytealign_be (w[15], w[16], offset); - w[40] = hc_bytealign_be (w[14], w[15], offset); - w[39] = hc_bytealign_be (w[13], w[14], offset); - w[38] = hc_bytealign_be (w[12], w[13], offset); - w[37] = hc_bytealign_be (w[11], w[12], offset); - w[36] = hc_bytealign_be (w[10], w[11], offset); - w[35] = hc_bytealign_be (w[ 9], w[10], offset); - w[34] = hc_bytealign_be (w[ 8], w[ 9], offset); - w[33] = hc_bytealign_be (w[ 7], w[ 8], offset); - w[32] = hc_bytealign_be (w[ 6], w[ 7], offset); - w[31] = hc_bytealign_be (w[ 5], w[ 6], offset); - w[30] = hc_bytealign_be (w[ 4], w[ 5], offset); - w[29] = hc_bytealign_be (w[ 3], w[ 4], offset); - w[28] = hc_bytealign_be (w[ 2], w[ 3], offset); - w[27] = hc_bytealign_be (w[ 1], w[ 2], offset); - w[26] = hc_bytealign_be (w[ 0], w[ 1], offset); - w[25] = hc_bytealign_be ( 0, w[ 0], offset); + case 32: + w[63] = hc_bytealign_be (w[30], w[31], offset); + w[62] = hc_bytealign_be (w[29], w[30], offset); + w[61] = hc_bytealign_be (w[28], w[29], offset); + w[60] = hc_bytealign_be (w[27], w[28], offset); + w[59] = hc_bytealign_be (w[26], w[27], offset); + w[58] = hc_bytealign_be (w[25], w[26], offset); + w[57] = hc_bytealign_be (w[24], w[25], offset); + w[56] = hc_bytealign_be (w[23], w[24], offset); + w[55] = hc_bytealign_be (w[22], w[23], offset); + w[54] = hc_bytealign_be (w[21], w[22], offset); + w[53] = hc_bytealign_be (w[20], w[21], offset); + w[52] = hc_bytealign_be (w[19], w[20], offset); + 
w[51] = hc_bytealign_be (w[18], w[19], offset); + w[50] = hc_bytealign_be (w[17], w[18], offset); + w[49] = hc_bytealign_be (w[16], w[17], offset); + w[48] = hc_bytealign_be (w[15], w[16], offset); + w[47] = hc_bytealign_be (w[14], w[15], offset); + w[46] = hc_bytealign_be (w[13], w[14], offset); + w[45] = hc_bytealign_be (w[12], w[13], offset); + w[44] = hc_bytealign_be (w[11], w[12], offset); + w[43] = hc_bytealign_be (w[10], w[11], offset); + w[42] = hc_bytealign_be (w[ 9], w[10], offset); + w[41] = hc_bytealign_be (w[ 8], w[ 9], offset); + w[40] = hc_bytealign_be (w[ 7], w[ 8], offset); + w[39] = hc_bytealign_be (w[ 6], w[ 7], offset); + w[38] = hc_bytealign_be (w[ 5], w[ 6], offset); + w[37] = hc_bytealign_be (w[ 4], w[ 5], offset); + w[36] = hc_bytealign_be (w[ 3], w[ 4], offset); + w[35] = hc_bytealign_be (w[ 2], w[ 3], offset); + w[34] = hc_bytealign_be (w[ 1], w[ 2], offset); + w[33] = hc_bytealign_be (w[ 0], w[ 1], offset); + w[32] = hc_bytealign_be ( 0, w[ 0], offset); + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; @@ -23718,45 +27582,45 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; - case 26: - w[63] = hc_bytealign_be (w[36], w[37], offset); - w[62] = hc_bytealign_be (w[35], w[36], offset); - w[61] = hc_bytealign_be (w[34], w[35], offset); - w[60] = hc_bytealign_be (w[33], w[34], offset); - w[59] = hc_bytealign_be (w[32], w[33], offset); - w[58] = hc_bytealign_be (w[31], w[32], offset); - w[57] = hc_bytealign_be (w[30], w[31], offset); - w[56] = hc_bytealign_be (w[29], w[30], offset); - w[55] = hc_bytealign_be (w[28], w[29], offset); - w[54] = hc_bytealign_be (w[27], w[28], offset); - w[53] = hc_bytealign_be (w[26], w[27], offset); - w[52] = hc_bytealign_be (w[25], w[26], offset); - w[51] = hc_bytealign_be (w[24], w[25], offset); - w[50] = hc_bytealign_be (w[23], w[24], offset); - w[49] = hc_bytealign_be (w[22], w[23], offset); - w[48] = 
hc_bytealign_be (w[21], w[22], offset); - w[47] = hc_bytealign_be (w[20], w[21], offset); - w[46] = hc_bytealign_be (w[19], w[20], offset); - w[45] = hc_bytealign_be (w[18], w[19], offset); - w[44] = hc_bytealign_be (w[17], w[18], offset); - w[43] = hc_bytealign_be (w[16], w[17], offset); - w[42] = hc_bytealign_be (w[15], w[16], offset); - w[41] = hc_bytealign_be (w[14], w[15], offset); - w[40] = hc_bytealign_be (w[13], w[14], offset); - w[39] = hc_bytealign_be (w[12], w[13], offset); - w[38] = hc_bytealign_be (w[11], w[12], offset); - w[37] = hc_bytealign_be (w[10], w[11], offset); - w[36] = hc_bytealign_be (w[ 9], w[10], offset); - w[35] = hc_bytealign_be (w[ 8], w[ 9], offset); - w[34] = hc_bytealign_be (w[ 7], w[ 8], offset); - w[33] = hc_bytealign_be (w[ 6], w[ 7], offset); - w[32] = hc_bytealign_be (w[ 5], w[ 6], offset); - w[31] = hc_bytealign_be (w[ 4], w[ 5], offset); - w[30] = hc_bytealign_be (w[ 3], w[ 4], offset); - w[29] = hc_bytealign_be (w[ 2], w[ 3], offset); - w[28] = hc_bytealign_be (w[ 1], w[ 2], offset); - w[27] = hc_bytealign_be (w[ 0], w[ 1], offset); - w[26] = hc_bytealign_be ( 0, w[ 0], offset); + case 33: + w[63] = hc_bytealign_be (w[29], w[30], offset); + w[62] = hc_bytealign_be (w[28], w[29], offset); + w[61] = hc_bytealign_be (w[27], w[28], offset); + w[60] = hc_bytealign_be (w[26], w[27], offset); + w[59] = hc_bytealign_be (w[25], w[26], offset); + w[58] = hc_bytealign_be (w[24], w[25], offset); + w[57] = hc_bytealign_be (w[23], w[24], offset); + w[56] = hc_bytealign_be (w[22], w[23], offset); + w[55] = hc_bytealign_be (w[21], w[22], offset); + w[54] = hc_bytealign_be (w[20], w[21], offset); + w[53] = hc_bytealign_be (w[19], w[20], offset); + w[52] = hc_bytealign_be (w[18], w[19], offset); + w[51] = hc_bytealign_be (w[17], w[18], offset); + w[50] = hc_bytealign_be (w[16], w[17], offset); + w[49] = hc_bytealign_be (w[15], w[16], offset); + w[48] = hc_bytealign_be (w[14], w[15], offset); + w[47] = hc_bytealign_be (w[13], w[14], offset); + 
w[46] = hc_bytealign_be (w[12], w[13], offset); + w[45] = hc_bytealign_be (w[11], w[12], offset); + w[44] = hc_bytealign_be (w[10], w[11], offset); + w[43] = hc_bytealign_be (w[ 9], w[10], offset); + w[42] = hc_bytealign_be (w[ 8], w[ 9], offset); + w[41] = hc_bytealign_be (w[ 7], w[ 8], offset); + w[40] = hc_bytealign_be (w[ 6], w[ 7], offset); + w[39] = hc_bytealign_be (w[ 5], w[ 6], offset); + w[38] = hc_bytealign_be (w[ 4], w[ 5], offset); + w[37] = hc_bytealign_be (w[ 3], w[ 4], offset); + w[36] = hc_bytealign_be (w[ 2], w[ 3], offset); + w[35] = hc_bytealign_be (w[ 1], w[ 2], offset); + w[34] = hc_bytealign_be (w[ 0], w[ 1], offset); + w[33] = hc_bytealign_be ( 0, w[ 0], offset); + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; @@ -23786,44 +27650,44 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; - case 27: - w[63] = hc_bytealign_be (w[35], w[36], offset); - w[62] = hc_bytealign_be (w[34], w[35], offset); - w[61] = hc_bytealign_be (w[33], w[34], offset); - w[60] = hc_bytealign_be (w[32], w[33], offset); - w[59] = hc_bytealign_be (w[31], w[32], offset); - w[58] = hc_bytealign_be (w[30], w[31], offset); - w[57] = hc_bytealign_be (w[29], w[30], offset); - w[56] = hc_bytealign_be (w[28], w[29], offset); - w[55] = hc_bytealign_be (w[27], w[28], offset); - w[54] = hc_bytealign_be (w[26], w[27], offset); - w[53] = hc_bytealign_be (w[25], w[26], offset); - w[52] = hc_bytealign_be (w[24], w[25], offset); - w[51] = hc_bytealign_be (w[23], w[24], offset); - w[50] = hc_bytealign_be (w[22], w[23], offset); - w[49] = hc_bytealign_be (w[21], w[22], offset); - w[48] = hc_bytealign_be (w[20], w[21], offset); - w[47] = hc_bytealign_be (w[19], w[20], offset); - w[46] = hc_bytealign_be (w[18], w[19], offset); - w[45] = hc_bytealign_be (w[17], w[18], offset); - w[44] = hc_bytealign_be (w[16], w[17], offset); - w[43] = hc_bytealign_be (w[15], w[16], offset); - w[42] = 
hc_bytealign_be (w[14], w[15], offset); - w[41] = hc_bytealign_be (w[13], w[14], offset); - w[40] = hc_bytealign_be (w[12], w[13], offset); - w[39] = hc_bytealign_be (w[11], w[12], offset); - w[38] = hc_bytealign_be (w[10], w[11], offset); - w[37] = hc_bytealign_be (w[ 9], w[10], offset); - w[36] = hc_bytealign_be (w[ 8], w[ 9], offset); - w[35] = hc_bytealign_be (w[ 7], w[ 8], offset); - w[34] = hc_bytealign_be (w[ 6], w[ 7], offset); - w[33] = hc_bytealign_be (w[ 5], w[ 6], offset); - w[32] = hc_bytealign_be (w[ 4], w[ 5], offset); - w[31] = hc_bytealign_be (w[ 3], w[ 4], offset); - w[30] = hc_bytealign_be (w[ 2], w[ 3], offset); - w[29] = hc_bytealign_be (w[ 1], w[ 2], offset); - w[28] = hc_bytealign_be (w[ 0], w[ 1], offset); - w[27] = hc_bytealign_be ( 0, w[ 0], offset); + case 34: + w[63] = hc_bytealign_be (w[28], w[29], offset); + w[62] = hc_bytealign_be (w[27], w[28], offset); + w[61] = hc_bytealign_be (w[26], w[27], offset); + w[60] = hc_bytealign_be (w[25], w[26], offset); + w[59] = hc_bytealign_be (w[24], w[25], offset); + w[58] = hc_bytealign_be (w[23], w[24], offset); + w[57] = hc_bytealign_be (w[22], w[23], offset); + w[56] = hc_bytealign_be (w[21], w[22], offset); + w[55] = hc_bytealign_be (w[20], w[21], offset); + w[54] = hc_bytealign_be (w[19], w[20], offset); + w[53] = hc_bytealign_be (w[18], w[19], offset); + w[52] = hc_bytealign_be (w[17], w[18], offset); + w[51] = hc_bytealign_be (w[16], w[17], offset); + w[50] = hc_bytealign_be (w[15], w[16], offset); + w[49] = hc_bytealign_be (w[14], w[15], offset); + w[48] = hc_bytealign_be (w[13], w[14], offset); + w[47] = hc_bytealign_be (w[12], w[13], offset); + w[46] = hc_bytealign_be (w[11], w[12], offset); + w[45] = hc_bytealign_be (w[10], w[11], offset); + w[44] = hc_bytealign_be (w[ 9], w[10], offset); + w[43] = hc_bytealign_be (w[ 8], w[ 9], offset); + w[42] = hc_bytealign_be (w[ 7], w[ 8], offset); + w[41] = hc_bytealign_be (w[ 6], w[ 7], offset); + w[40] = hc_bytealign_be (w[ 5], w[ 6], offset); + 
w[39] = hc_bytealign_be (w[ 4], w[ 5], offset); + w[38] = hc_bytealign_be (w[ 3], w[ 4], offset); + w[37] = hc_bytealign_be (w[ 2], w[ 3], offset); + w[36] = hc_bytealign_be (w[ 1], w[ 2], offset); + w[35] = hc_bytealign_be (w[ 0], w[ 1], offset); + w[34] = hc_bytealign_be ( 0, w[ 0], offset); + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; @@ -23854,43 +27718,43 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; - case 28: - w[63] = hc_bytealign_be (w[34], w[35], offset); - w[62] = hc_bytealign_be (w[33], w[34], offset); - w[61] = hc_bytealign_be (w[32], w[33], offset); - w[60] = hc_bytealign_be (w[31], w[32], offset); - w[59] = hc_bytealign_be (w[30], w[31], offset); - w[58] = hc_bytealign_be (w[29], w[30], offset); - w[57] = hc_bytealign_be (w[28], w[29], offset); - w[56] = hc_bytealign_be (w[27], w[28], offset); - w[55] = hc_bytealign_be (w[26], w[27], offset); - w[54] = hc_bytealign_be (w[25], w[26], offset); - w[53] = hc_bytealign_be (w[24], w[25], offset); - w[52] = hc_bytealign_be (w[23], w[24], offset); - w[51] = hc_bytealign_be (w[22], w[23], offset); - w[50] = hc_bytealign_be (w[21], w[22], offset); - w[49] = hc_bytealign_be (w[20], w[21], offset); - w[48] = hc_bytealign_be (w[19], w[20], offset); - w[47] = hc_bytealign_be (w[18], w[19], offset); - w[46] = hc_bytealign_be (w[17], w[18], offset); - w[45] = hc_bytealign_be (w[16], w[17], offset); - w[44] = hc_bytealign_be (w[15], w[16], offset); - w[43] = hc_bytealign_be (w[14], w[15], offset); - w[42] = hc_bytealign_be (w[13], w[14], offset); - w[41] = hc_bytealign_be (w[12], w[13], offset); - w[40] = hc_bytealign_be (w[11], w[12], offset); - w[39] = hc_bytealign_be (w[10], w[11], offset); - w[38] = hc_bytealign_be (w[ 9], w[10], offset); - w[37] = hc_bytealign_be (w[ 8], w[ 9], offset); - w[36] = hc_bytealign_be (w[ 7], w[ 8], offset); - w[35] = hc_bytealign_be (w[ 6], w[ 7], offset); - w[34] = 
hc_bytealign_be (w[ 5], w[ 6], offset); - w[33] = hc_bytealign_be (w[ 4], w[ 5], offset); - w[32] = hc_bytealign_be (w[ 3], w[ 4], offset); - w[31] = hc_bytealign_be (w[ 2], w[ 3], offset); - w[30] = hc_bytealign_be (w[ 1], w[ 2], offset); - w[29] = hc_bytealign_be (w[ 0], w[ 1], offset); - w[28] = hc_bytealign_be ( 0, w[ 0], offset); + case 35: + w[63] = hc_bytealign_be (w[27], w[28], offset); + w[62] = hc_bytealign_be (w[26], w[27], offset); + w[61] = hc_bytealign_be (w[25], w[26], offset); + w[60] = hc_bytealign_be (w[24], w[25], offset); + w[59] = hc_bytealign_be (w[23], w[24], offset); + w[58] = hc_bytealign_be (w[22], w[23], offset); + w[57] = hc_bytealign_be (w[21], w[22], offset); + w[56] = hc_bytealign_be (w[20], w[21], offset); + w[55] = hc_bytealign_be (w[19], w[20], offset); + w[54] = hc_bytealign_be (w[18], w[19], offset); + w[53] = hc_bytealign_be (w[17], w[18], offset); + w[52] = hc_bytealign_be (w[16], w[17], offset); + w[51] = hc_bytealign_be (w[15], w[16], offset); + w[50] = hc_bytealign_be (w[14], w[15], offset); + w[49] = hc_bytealign_be (w[13], w[14], offset); + w[48] = hc_bytealign_be (w[12], w[13], offset); + w[47] = hc_bytealign_be (w[11], w[12], offset); + w[46] = hc_bytealign_be (w[10], w[11], offset); + w[45] = hc_bytealign_be (w[ 9], w[10], offset); + w[44] = hc_bytealign_be (w[ 8], w[ 9], offset); + w[43] = hc_bytealign_be (w[ 7], w[ 8], offset); + w[42] = hc_bytealign_be (w[ 6], w[ 7], offset); + w[41] = hc_bytealign_be (w[ 5], w[ 6], offset); + w[40] = hc_bytealign_be (w[ 4], w[ 5], offset); + w[39] = hc_bytealign_be (w[ 3], w[ 4], offset); + w[38] = hc_bytealign_be (w[ 2], w[ 3], offset); + w[37] = hc_bytealign_be (w[ 1], w[ 2], offset); + w[36] = hc_bytealign_be (w[ 0], w[ 1], offset); + w[35] = hc_bytealign_be ( 0, w[ 0], offset); + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; @@ -23922,42 +27786,42 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x 
*w, const u32 offset) break; - case 29: - w[63] = hc_bytealign_be (w[33], w[34], offset); - w[62] = hc_bytealign_be (w[32], w[33], offset); - w[61] = hc_bytealign_be (w[31], w[32], offset); - w[60] = hc_bytealign_be (w[30], w[31], offset); - w[59] = hc_bytealign_be (w[29], w[30], offset); - w[58] = hc_bytealign_be (w[28], w[29], offset); - w[57] = hc_bytealign_be (w[27], w[28], offset); - w[56] = hc_bytealign_be (w[26], w[27], offset); - w[55] = hc_bytealign_be (w[25], w[26], offset); - w[54] = hc_bytealign_be (w[24], w[25], offset); - w[53] = hc_bytealign_be (w[23], w[24], offset); - w[52] = hc_bytealign_be (w[22], w[23], offset); - w[51] = hc_bytealign_be (w[21], w[22], offset); - w[50] = hc_bytealign_be (w[20], w[21], offset); - w[49] = hc_bytealign_be (w[19], w[20], offset); - w[48] = hc_bytealign_be (w[18], w[19], offset); - w[47] = hc_bytealign_be (w[17], w[18], offset); - w[46] = hc_bytealign_be (w[16], w[17], offset); - w[45] = hc_bytealign_be (w[15], w[16], offset); - w[44] = hc_bytealign_be (w[14], w[15], offset); - w[43] = hc_bytealign_be (w[13], w[14], offset); - w[42] = hc_bytealign_be (w[12], w[13], offset); - w[41] = hc_bytealign_be (w[11], w[12], offset); - w[40] = hc_bytealign_be (w[10], w[11], offset); - w[39] = hc_bytealign_be (w[ 9], w[10], offset); - w[38] = hc_bytealign_be (w[ 8], w[ 9], offset); - w[37] = hc_bytealign_be (w[ 7], w[ 8], offset); - w[36] = hc_bytealign_be (w[ 6], w[ 7], offset); - w[35] = hc_bytealign_be (w[ 5], w[ 6], offset); - w[34] = hc_bytealign_be (w[ 4], w[ 5], offset); - w[33] = hc_bytealign_be (w[ 3], w[ 4], offset); - w[32] = hc_bytealign_be (w[ 2], w[ 3], offset); - w[31] = hc_bytealign_be (w[ 1], w[ 2], offset); - w[30] = hc_bytealign_be (w[ 0], w[ 1], offset); - w[29] = hc_bytealign_be ( 0, w[ 0], offset); + case 36: + w[63] = hc_bytealign_be (w[26], w[27], offset); + w[62] = hc_bytealign_be (w[25], w[26], offset); + w[61] = hc_bytealign_be (w[24], w[25], offset); + w[60] = hc_bytealign_be (w[23], w[24], offset); + 
w[59] = hc_bytealign_be (w[22], w[23], offset); + w[58] = hc_bytealign_be (w[21], w[22], offset); + w[57] = hc_bytealign_be (w[20], w[21], offset); + w[56] = hc_bytealign_be (w[19], w[20], offset); + w[55] = hc_bytealign_be (w[18], w[19], offset); + w[54] = hc_bytealign_be (w[17], w[18], offset); + w[53] = hc_bytealign_be (w[16], w[17], offset); + w[52] = hc_bytealign_be (w[15], w[16], offset); + w[51] = hc_bytealign_be (w[14], w[15], offset); + w[50] = hc_bytealign_be (w[13], w[14], offset); + w[49] = hc_bytealign_be (w[12], w[13], offset); + w[48] = hc_bytealign_be (w[11], w[12], offset); + w[47] = hc_bytealign_be (w[10], w[11], offset); + w[46] = hc_bytealign_be (w[ 9], w[10], offset); + w[45] = hc_bytealign_be (w[ 8], w[ 9], offset); + w[44] = hc_bytealign_be (w[ 7], w[ 8], offset); + w[43] = hc_bytealign_be (w[ 6], w[ 7], offset); + w[42] = hc_bytealign_be (w[ 5], w[ 6], offset); + w[41] = hc_bytealign_be (w[ 4], w[ 5], offset); + w[40] = hc_bytealign_be (w[ 3], w[ 4], offset); + w[39] = hc_bytealign_be (w[ 2], w[ 3], offset); + w[38] = hc_bytealign_be (w[ 1], w[ 2], offset); + w[37] = hc_bytealign_be (w[ 0], w[ 1], offset); + w[36] = hc_bytealign_be ( 0, w[ 0], offset); + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; @@ -23990,41 +27854,41 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; - case 30: - w[63] = hc_bytealign_be (w[32], w[33], offset); - w[62] = hc_bytealign_be (w[31], w[32], offset); - w[61] = hc_bytealign_be (w[30], w[31], offset); - w[60] = hc_bytealign_be (w[29], w[30], offset); - w[59] = hc_bytealign_be (w[28], w[29], offset); - w[58] = hc_bytealign_be (w[27], w[28], offset); - w[57] = hc_bytealign_be (w[26], w[27], offset); - w[56] = hc_bytealign_be (w[25], w[26], offset); - w[55] = hc_bytealign_be (w[24], w[25], offset); - w[54] = hc_bytealign_be (w[23], w[24], offset); - w[53] = hc_bytealign_be (w[22], w[23], offset); - w[52] = 
hc_bytealign_be (w[21], w[22], offset); - w[51] = hc_bytealign_be (w[20], w[21], offset); - w[50] = hc_bytealign_be (w[19], w[20], offset); - w[49] = hc_bytealign_be (w[18], w[19], offset); - w[48] = hc_bytealign_be (w[17], w[18], offset); - w[47] = hc_bytealign_be (w[16], w[17], offset); - w[46] = hc_bytealign_be (w[15], w[16], offset); - w[45] = hc_bytealign_be (w[14], w[15], offset); - w[44] = hc_bytealign_be (w[13], w[14], offset); - w[43] = hc_bytealign_be (w[12], w[13], offset); - w[42] = hc_bytealign_be (w[11], w[12], offset); - w[41] = hc_bytealign_be (w[10], w[11], offset); - w[40] = hc_bytealign_be (w[ 9], w[10], offset); - w[39] = hc_bytealign_be (w[ 8], w[ 9], offset); - w[38] = hc_bytealign_be (w[ 7], w[ 8], offset); - w[37] = hc_bytealign_be (w[ 6], w[ 7], offset); - w[36] = hc_bytealign_be (w[ 5], w[ 6], offset); - w[35] = hc_bytealign_be (w[ 4], w[ 5], offset); - w[34] = hc_bytealign_be (w[ 3], w[ 4], offset); - w[33] = hc_bytealign_be (w[ 2], w[ 3], offset); - w[32] = hc_bytealign_be (w[ 1], w[ 2], offset); - w[31] = hc_bytealign_be (w[ 0], w[ 1], offset); - w[30] = hc_bytealign_be ( 0, w[ 0], offset); + case 37: + w[63] = hc_bytealign_be (w[25], w[26], offset); + w[62] = hc_bytealign_be (w[24], w[25], offset); + w[61] = hc_bytealign_be (w[23], w[24], offset); + w[60] = hc_bytealign_be (w[22], w[23], offset); + w[59] = hc_bytealign_be (w[21], w[22], offset); + w[58] = hc_bytealign_be (w[20], w[21], offset); + w[57] = hc_bytealign_be (w[19], w[20], offset); + w[56] = hc_bytealign_be (w[18], w[19], offset); + w[55] = hc_bytealign_be (w[17], w[18], offset); + w[54] = hc_bytealign_be (w[16], w[17], offset); + w[53] = hc_bytealign_be (w[15], w[16], offset); + w[52] = hc_bytealign_be (w[14], w[15], offset); + w[51] = hc_bytealign_be (w[13], w[14], offset); + w[50] = hc_bytealign_be (w[12], w[13], offset); + w[49] = hc_bytealign_be (w[11], w[12], offset); + w[48] = hc_bytealign_be (w[10], w[11], offset); + w[47] = hc_bytealign_be (w[ 9], w[10], offset); + 
w[46] = hc_bytealign_be (w[ 8], w[ 9], offset); + w[45] = hc_bytealign_be (w[ 7], w[ 8], offset); + w[44] = hc_bytealign_be (w[ 6], w[ 7], offset); + w[43] = hc_bytealign_be (w[ 5], w[ 6], offset); + w[42] = hc_bytealign_be (w[ 4], w[ 5], offset); + w[41] = hc_bytealign_be (w[ 3], w[ 4], offset); + w[40] = hc_bytealign_be (w[ 2], w[ 3], offset); + w[39] = hc_bytealign_be (w[ 1], w[ 2], offset); + w[38] = hc_bytealign_be (w[ 0], w[ 1], offset); + w[37] = hc_bytealign_be ( 0, w[ 0], offset); + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; @@ -24058,40 +27922,40 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; - case 31: - w[63] = hc_bytealign_be (w[31], w[32], offset); - w[62] = hc_bytealign_be (w[30], w[31], offset); - w[61] = hc_bytealign_be (w[29], w[30], offset); - w[60] = hc_bytealign_be (w[28], w[29], offset); - w[59] = hc_bytealign_be (w[27], w[28], offset); - w[58] = hc_bytealign_be (w[26], w[27], offset); - w[57] = hc_bytealign_be (w[25], w[26], offset); - w[56] = hc_bytealign_be (w[24], w[25], offset); - w[55] = hc_bytealign_be (w[23], w[24], offset); - w[54] = hc_bytealign_be (w[22], w[23], offset); - w[53] = hc_bytealign_be (w[21], w[22], offset); - w[52] = hc_bytealign_be (w[20], w[21], offset); - w[51] = hc_bytealign_be (w[19], w[20], offset); - w[50] = hc_bytealign_be (w[18], w[19], offset); - w[49] = hc_bytealign_be (w[17], w[18], offset); - w[48] = hc_bytealign_be (w[16], w[17], offset); - w[47] = hc_bytealign_be (w[15], w[16], offset); - w[46] = hc_bytealign_be (w[14], w[15], offset); - w[45] = hc_bytealign_be (w[13], w[14], offset); - w[44] = hc_bytealign_be (w[12], w[13], offset); - w[43] = hc_bytealign_be (w[11], w[12], offset); - w[42] = hc_bytealign_be (w[10], w[11], offset); - w[41] = hc_bytealign_be (w[ 9], w[10], offset); - w[40] = hc_bytealign_be (w[ 8], w[ 9], offset); - w[39] = hc_bytealign_be (w[ 7], w[ 8], offset); - w[38] = 
hc_bytealign_be (w[ 6], w[ 7], offset); - w[37] = hc_bytealign_be (w[ 5], w[ 6], offset); - w[36] = hc_bytealign_be (w[ 4], w[ 5], offset); - w[35] = hc_bytealign_be (w[ 3], w[ 4], offset); - w[34] = hc_bytealign_be (w[ 2], w[ 3], offset); - w[33] = hc_bytealign_be (w[ 1], w[ 2], offset); - w[32] = hc_bytealign_be (w[ 0], w[ 1], offset); - w[31] = hc_bytealign_be ( 0, w[ 0], offset); + case 38: + w[63] = hc_bytealign_be (w[24], w[25], offset); + w[62] = hc_bytealign_be (w[23], w[24], offset); + w[61] = hc_bytealign_be (w[22], w[23], offset); + w[60] = hc_bytealign_be (w[21], w[22], offset); + w[59] = hc_bytealign_be (w[20], w[21], offset); + w[58] = hc_bytealign_be (w[19], w[20], offset); + w[57] = hc_bytealign_be (w[18], w[19], offset); + w[56] = hc_bytealign_be (w[17], w[18], offset); + w[55] = hc_bytealign_be (w[16], w[17], offset); + w[54] = hc_bytealign_be (w[15], w[16], offset); + w[53] = hc_bytealign_be (w[14], w[15], offset); + w[52] = hc_bytealign_be (w[13], w[14], offset); + w[51] = hc_bytealign_be (w[12], w[13], offset); + w[50] = hc_bytealign_be (w[11], w[12], offset); + w[49] = hc_bytealign_be (w[10], w[11], offset); + w[48] = hc_bytealign_be (w[ 9], w[10], offset); + w[47] = hc_bytealign_be (w[ 8], w[ 9], offset); + w[46] = hc_bytealign_be (w[ 7], w[ 8], offset); + w[45] = hc_bytealign_be (w[ 6], w[ 7], offset); + w[44] = hc_bytealign_be (w[ 5], w[ 6], offset); + w[43] = hc_bytealign_be (w[ 4], w[ 5], offset); + w[42] = hc_bytealign_be (w[ 3], w[ 4], offset); + w[41] = hc_bytealign_be (w[ 2], w[ 3], offset); + w[40] = hc_bytealign_be (w[ 1], w[ 2], offset); + w[39] = hc_bytealign_be (w[ 0], w[ 1], offset); + w[38] = hc_bytealign_be ( 0, w[ 0], offset); + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; @@ -24126,39 +27990,107 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; - case 32: - w[63] = hc_bytealign_be (w[30], w[31], offset); - w[62] = 
hc_bytealign_be (w[29], w[30], offset); - w[61] = hc_bytealign_be (w[28], w[29], offset); - w[60] = hc_bytealign_be (w[27], w[28], offset); - w[59] = hc_bytealign_be (w[26], w[27], offset); - w[58] = hc_bytealign_be (w[25], w[26], offset); - w[57] = hc_bytealign_be (w[24], w[25], offset); - w[56] = hc_bytealign_be (w[23], w[24], offset); - w[55] = hc_bytealign_be (w[22], w[23], offset); - w[54] = hc_bytealign_be (w[21], w[22], offset); - w[53] = hc_bytealign_be (w[20], w[21], offset); - w[52] = hc_bytealign_be (w[19], w[20], offset); - w[51] = hc_bytealign_be (w[18], w[19], offset); - w[50] = hc_bytealign_be (w[17], w[18], offset); - w[49] = hc_bytealign_be (w[16], w[17], offset); - w[48] = hc_bytealign_be (w[15], w[16], offset); - w[47] = hc_bytealign_be (w[14], w[15], offset); - w[46] = hc_bytealign_be (w[13], w[14], offset); - w[45] = hc_bytealign_be (w[12], w[13], offset); - w[44] = hc_bytealign_be (w[11], w[12], offset); - w[43] = hc_bytealign_be (w[10], w[11], offset); - w[42] = hc_bytealign_be (w[ 9], w[10], offset); - w[41] = hc_bytealign_be (w[ 8], w[ 9], offset); - w[40] = hc_bytealign_be (w[ 7], w[ 8], offset); - w[39] = hc_bytealign_be (w[ 6], w[ 7], offset); - w[38] = hc_bytealign_be (w[ 5], w[ 6], offset); - w[37] = hc_bytealign_be (w[ 4], w[ 5], offset); - w[36] = hc_bytealign_be (w[ 3], w[ 4], offset); - w[35] = hc_bytealign_be (w[ 2], w[ 3], offset); - w[34] = hc_bytealign_be (w[ 1], w[ 2], offset); - w[33] = hc_bytealign_be (w[ 0], w[ 1], offset); - w[32] = hc_bytealign_be ( 0, w[ 0], offset); + case 39: + w[63] = hc_bytealign_be (w[23], w[24], offset); + w[62] = hc_bytealign_be (w[22], w[23], offset); + w[61] = hc_bytealign_be (w[21], w[22], offset); + w[60] = hc_bytealign_be (w[20], w[21], offset); + w[59] = hc_bytealign_be (w[19], w[20], offset); + w[58] = hc_bytealign_be (w[18], w[19], offset); + w[57] = hc_bytealign_be (w[17], w[18], offset); + w[56] = hc_bytealign_be (w[16], w[17], offset); + w[55] = hc_bytealign_be (w[15], w[16], offset); + 
w[54] = hc_bytealign_be (w[14], w[15], offset); + w[53] = hc_bytealign_be (w[13], w[14], offset); + w[52] = hc_bytealign_be (w[12], w[13], offset); + w[51] = hc_bytealign_be (w[11], w[12], offset); + w[50] = hc_bytealign_be (w[10], w[11], offset); + w[49] = hc_bytealign_be (w[ 9], w[10], offset); + w[48] = hc_bytealign_be (w[ 8], w[ 9], offset); + w[47] = hc_bytealign_be (w[ 7], w[ 8], offset); + w[46] = hc_bytealign_be (w[ 6], w[ 7], offset); + w[45] = hc_bytealign_be (w[ 5], w[ 6], offset); + w[44] = hc_bytealign_be (w[ 4], w[ 5], offset); + w[43] = hc_bytealign_be (w[ 3], w[ 4], offset); + w[42] = hc_bytealign_be (w[ 2], w[ 3], offset); + w[41] = hc_bytealign_be (w[ 1], w[ 2], offset); + w[40] = hc_bytealign_be (w[ 0], w[ 1], offset); + w[39] = hc_bytealign_be ( 0, w[ 0], offset); + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 40: + w[63] = hc_bytealign_be (w[22], w[23], offset); + w[62] = hc_bytealign_be (w[21], w[22], offset); + w[61] = hc_bytealign_be (w[20], w[21], offset); + w[60] = hc_bytealign_be (w[19], w[20], offset); + w[59] = hc_bytealign_be (w[18], w[19], offset); + w[58] = hc_bytealign_be (w[17], w[18], offset); + w[57] = hc_bytealign_be (w[16], w[17], offset); + w[56] = hc_bytealign_be (w[15], w[16], offset); + w[55] = hc_bytealign_be (w[14], w[15], offset); + w[54] = hc_bytealign_be (w[13], w[14], offset); + w[53] = hc_bytealign_be (w[12], w[13], offset); + w[52] = hc_bytealign_be (w[11], w[12], offset); + w[51] = hc_bytealign_be (w[10], w[11], offset); + w[50] = 
hc_bytealign_be (w[ 9], w[10], offset); + w[49] = hc_bytealign_be (w[ 8], w[ 9], offset); + w[48] = hc_bytealign_be (w[ 7], w[ 8], offset); + w[47] = hc_bytealign_be (w[ 6], w[ 7], offset); + w[46] = hc_bytealign_be (w[ 5], w[ 6], offset); + w[45] = hc_bytealign_be (w[ 4], w[ 5], offset); + w[44] = hc_bytealign_be (w[ 3], w[ 4], offset); + w[43] = hc_bytealign_be (w[ 2], w[ 3], offset); + w[42] = hc_bytealign_be (w[ 1], w[ 2], offset); + w[41] = hc_bytealign_be (w[ 0], w[ 1], offset); + w[40] = hc_bytealign_be ( 0, w[ 0], offset); + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; @@ -24194,38 +28126,38 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; - case 33: - w[63] = hc_bytealign_be (w[29], w[30], offset); - w[62] = hc_bytealign_be (w[28], w[29], offset); - w[61] = hc_bytealign_be (w[27], w[28], offset); - w[60] = hc_bytealign_be (w[26], w[27], offset); - w[59] = hc_bytealign_be (w[25], w[26], offset); - w[58] = hc_bytealign_be (w[24], w[25], offset); - w[57] = hc_bytealign_be (w[23], w[24], offset); - w[56] = hc_bytealign_be (w[22], w[23], offset); - w[55] = hc_bytealign_be (w[21], w[22], offset); - w[54] = hc_bytealign_be (w[20], w[21], offset); - w[53] = hc_bytealign_be (w[19], w[20], offset); - w[52] = hc_bytealign_be (w[18], w[19], offset); - w[51] = hc_bytealign_be (w[17], w[18], offset); - w[50] = hc_bytealign_be (w[16], w[17], offset); - w[49] = hc_bytealign_be (w[15], w[16], offset); - w[48] = hc_bytealign_be (w[14], w[15], offset); - w[47] = hc_bytealign_be (w[13], w[14], offset); - w[46] = hc_bytealign_be (w[12], w[13], offset); - w[45] = hc_bytealign_be (w[11], w[12], offset); - w[44] = hc_bytealign_be (w[10], w[11], offset); - w[43] = hc_bytealign_be (w[ 9], w[10], offset); - w[42] = hc_bytealign_be (w[ 8], w[ 9], offset); - w[41] = hc_bytealign_be (w[ 7], w[ 8], offset); - w[40] = hc_bytealign_be (w[ 6], w[ 7], offset); - w[39] 
= hc_bytealign_be (w[ 5], w[ 6], offset); - w[38] = hc_bytealign_be (w[ 4], w[ 5], offset); - w[37] = hc_bytealign_be (w[ 3], w[ 4], offset); - w[36] = hc_bytealign_be (w[ 2], w[ 3], offset); - w[35] = hc_bytealign_be (w[ 1], w[ 2], offset); - w[34] = hc_bytealign_be (w[ 0], w[ 1], offset); - w[33] = hc_bytealign_be ( 0, w[ 0], offset); + case 41: + w[63] = hc_bytealign_be (w[21], w[22], offset); + w[62] = hc_bytealign_be (w[20], w[21], offset); + w[61] = hc_bytealign_be (w[19], w[20], offset); + w[60] = hc_bytealign_be (w[18], w[19], offset); + w[59] = hc_bytealign_be (w[17], w[18], offset); + w[58] = hc_bytealign_be (w[16], w[17], offset); + w[57] = hc_bytealign_be (w[15], w[16], offset); + w[56] = hc_bytealign_be (w[14], w[15], offset); + w[55] = hc_bytealign_be (w[13], w[14], offset); + w[54] = hc_bytealign_be (w[12], w[13], offset); + w[53] = hc_bytealign_be (w[11], w[12], offset); + w[52] = hc_bytealign_be (w[10], w[11], offset); + w[51] = hc_bytealign_be (w[ 9], w[10], offset); + w[50] = hc_bytealign_be (w[ 8], w[ 9], offset); + w[49] = hc_bytealign_be (w[ 7], w[ 8], offset); + w[48] = hc_bytealign_be (w[ 6], w[ 7], offset); + w[47] = hc_bytealign_be (w[ 5], w[ 6], offset); + w[46] = hc_bytealign_be (w[ 4], w[ 5], offset); + w[45] = hc_bytealign_be (w[ 3], w[ 4], offset); + w[44] = hc_bytealign_be (w[ 2], w[ 3], offset); + w[43] = hc_bytealign_be (w[ 1], w[ 2], offset); + w[42] = hc_bytealign_be (w[ 0], w[ 1], offset); + w[41] = hc_bytealign_be ( 0, w[ 0], offset); + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; @@ -24262,37 +28194,37 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; - case 34: - w[63] = hc_bytealign_be (w[28], w[29], offset); - w[62] = hc_bytealign_be (w[27], w[28], offset); - w[61] = hc_bytealign_be (w[26], w[27], offset); - w[60] = hc_bytealign_be (w[25], w[26], offset); - w[59] = hc_bytealign_be (w[24], w[25], 
offset); - w[58] = hc_bytealign_be (w[23], w[24], offset); - w[57] = hc_bytealign_be (w[22], w[23], offset); - w[56] = hc_bytealign_be (w[21], w[22], offset); - w[55] = hc_bytealign_be (w[20], w[21], offset); - w[54] = hc_bytealign_be (w[19], w[20], offset); - w[53] = hc_bytealign_be (w[18], w[19], offset); - w[52] = hc_bytealign_be (w[17], w[18], offset); - w[51] = hc_bytealign_be (w[16], w[17], offset); - w[50] = hc_bytealign_be (w[15], w[16], offset); - w[49] = hc_bytealign_be (w[14], w[15], offset); - w[48] = hc_bytealign_be (w[13], w[14], offset); - w[47] = hc_bytealign_be (w[12], w[13], offset); - w[46] = hc_bytealign_be (w[11], w[12], offset); - w[45] = hc_bytealign_be (w[10], w[11], offset); - w[44] = hc_bytealign_be (w[ 9], w[10], offset); - w[43] = hc_bytealign_be (w[ 8], w[ 9], offset); - w[42] = hc_bytealign_be (w[ 7], w[ 8], offset); - w[41] = hc_bytealign_be (w[ 6], w[ 7], offset); - w[40] = hc_bytealign_be (w[ 5], w[ 6], offset); - w[39] = hc_bytealign_be (w[ 4], w[ 5], offset); - w[38] = hc_bytealign_be (w[ 3], w[ 4], offset); - w[37] = hc_bytealign_be (w[ 2], w[ 3], offset); - w[36] = hc_bytealign_be (w[ 1], w[ 2], offset); - w[35] = hc_bytealign_be (w[ 0], w[ 1], offset); - w[34] = hc_bytealign_be ( 0, w[ 0], offset); + case 42: + w[63] = hc_bytealign_be (w[20], w[21], offset); + w[62] = hc_bytealign_be (w[19], w[20], offset); + w[61] = hc_bytealign_be (w[18], w[19], offset); + w[60] = hc_bytealign_be (w[17], w[18], offset); + w[59] = hc_bytealign_be (w[16], w[17], offset); + w[58] = hc_bytealign_be (w[15], w[16], offset); + w[57] = hc_bytealign_be (w[14], w[15], offset); + w[56] = hc_bytealign_be (w[13], w[14], offset); + w[55] = hc_bytealign_be (w[12], w[13], offset); + w[54] = hc_bytealign_be (w[11], w[12], offset); + w[53] = hc_bytealign_be (w[10], w[11], offset); + w[52] = hc_bytealign_be (w[ 9], w[10], offset); + w[51] = hc_bytealign_be (w[ 8], w[ 9], offset); + w[50] = hc_bytealign_be (w[ 7], w[ 8], offset); + w[49] = hc_bytealign_be (w[ 
6], w[ 7], offset); + w[48] = hc_bytealign_be (w[ 5], w[ 6], offset); + w[47] = hc_bytealign_be (w[ 4], w[ 5], offset); + w[46] = hc_bytealign_be (w[ 3], w[ 4], offset); + w[45] = hc_bytealign_be (w[ 2], w[ 3], offset); + w[44] = hc_bytealign_be (w[ 1], w[ 2], offset); + w[43] = hc_bytealign_be (w[ 0], w[ 1], offset); + w[42] = hc_bytealign_be ( 0, w[ 0], offset); + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; @@ -24330,36 +28262,36 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; - case 35: - w[63] = hc_bytealign_be (w[27], w[28], offset); - w[62] = hc_bytealign_be (w[26], w[27], offset); - w[61] = hc_bytealign_be (w[25], w[26], offset); - w[60] = hc_bytealign_be (w[24], w[25], offset); - w[59] = hc_bytealign_be (w[23], w[24], offset); - w[58] = hc_bytealign_be (w[22], w[23], offset); - w[57] = hc_bytealign_be (w[21], w[22], offset); - w[56] = hc_bytealign_be (w[20], w[21], offset); - w[55] = hc_bytealign_be (w[19], w[20], offset); - w[54] = hc_bytealign_be (w[18], w[19], offset); - w[53] = hc_bytealign_be (w[17], w[18], offset); - w[52] = hc_bytealign_be (w[16], w[17], offset); - w[51] = hc_bytealign_be (w[15], w[16], offset); - w[50] = hc_bytealign_be (w[14], w[15], offset); - w[49] = hc_bytealign_be (w[13], w[14], offset); - w[48] = hc_bytealign_be (w[12], w[13], offset); - w[47] = hc_bytealign_be (w[11], w[12], offset); - w[46] = hc_bytealign_be (w[10], w[11], offset); - w[45] = hc_bytealign_be (w[ 9], w[10], offset); - w[44] = hc_bytealign_be (w[ 8], w[ 9], offset); - w[43] = hc_bytealign_be (w[ 7], w[ 8], offset); - w[42] = hc_bytealign_be (w[ 6], w[ 7], offset); - w[41] = hc_bytealign_be (w[ 5], w[ 6], offset); - w[40] = hc_bytealign_be (w[ 4], w[ 5], offset); - w[39] = hc_bytealign_be (w[ 3], w[ 4], offset); - w[38] = hc_bytealign_be (w[ 2], w[ 3], offset); - w[37] = hc_bytealign_be (w[ 1], w[ 2], offset); - w[36] = hc_bytealign_be 
(w[ 0], w[ 1], offset); - w[35] = hc_bytealign_be ( 0, w[ 0], offset); + case 43: + w[63] = hc_bytealign_be (w[19], w[20], offset); + w[62] = hc_bytealign_be (w[18], w[19], offset); + w[61] = hc_bytealign_be (w[17], w[18], offset); + w[60] = hc_bytealign_be (w[16], w[17], offset); + w[59] = hc_bytealign_be (w[15], w[16], offset); + w[58] = hc_bytealign_be (w[14], w[15], offset); + w[57] = hc_bytealign_be (w[13], w[14], offset); + w[56] = hc_bytealign_be (w[12], w[13], offset); + w[55] = hc_bytealign_be (w[11], w[12], offset); + w[54] = hc_bytealign_be (w[10], w[11], offset); + w[53] = hc_bytealign_be (w[ 9], w[10], offset); + w[52] = hc_bytealign_be (w[ 8], w[ 9], offset); + w[51] = hc_bytealign_be (w[ 7], w[ 8], offset); + w[50] = hc_bytealign_be (w[ 6], w[ 7], offset); + w[49] = hc_bytealign_be (w[ 5], w[ 6], offset); + w[48] = hc_bytealign_be (w[ 4], w[ 5], offset); + w[47] = hc_bytealign_be (w[ 3], w[ 4], offset); + w[46] = hc_bytealign_be (w[ 2], w[ 3], offset); + w[45] = hc_bytealign_be (w[ 1], w[ 2], offset); + w[44] = hc_bytealign_be (w[ 0], w[ 1], offset); + w[43] = hc_bytealign_be ( 0, w[ 0], offset); + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; @@ -24374,59 +28306,59 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) w[23] = 0; w[22] = 0; w[21] = 0; - w[20] = 0; - w[19] = 0; - w[18] = 0; - w[17] = 0; - w[16] = 0; - w[15] = 0; - w[14] = 0; - w[13] = 0; - w[12] = 0; - w[11] = 0; - w[10] = 0; - w[ 9] = 0; - w[ 8] = 0; - w[ 7] = 0; - w[ 6] = 0; - w[ 5] = 0; - w[ 4] = 0; - w[ 3] = 0; - w[ 2] = 0; - w[ 1] = 0; - w[ 0] = 0; - - break; - - case 36: - w[63] = hc_bytealign_be (w[26], w[27], offset); - w[62] = hc_bytealign_be (w[25], w[26], offset); - w[61] = hc_bytealign_be (w[24], w[25], offset); - w[60] = hc_bytealign_be (w[23], w[24], offset); - w[59] = hc_bytealign_be (w[22], w[23], offset); - w[58] = hc_bytealign_be (w[21], w[22], offset); - 
w[57] = hc_bytealign_be (w[20], w[21], offset); - w[56] = hc_bytealign_be (w[19], w[20], offset); - w[55] = hc_bytealign_be (w[18], w[19], offset); - w[54] = hc_bytealign_be (w[17], w[18], offset); - w[53] = hc_bytealign_be (w[16], w[17], offset); - w[52] = hc_bytealign_be (w[15], w[16], offset); - w[51] = hc_bytealign_be (w[14], w[15], offset); - w[50] = hc_bytealign_be (w[13], w[14], offset); - w[49] = hc_bytealign_be (w[12], w[13], offset); - w[48] = hc_bytealign_be (w[11], w[12], offset); - w[47] = hc_bytealign_be (w[10], w[11], offset); - w[46] = hc_bytealign_be (w[ 9], w[10], offset); - w[45] = hc_bytealign_be (w[ 8], w[ 9], offset); - w[44] = hc_bytealign_be (w[ 7], w[ 8], offset); - w[43] = hc_bytealign_be (w[ 6], w[ 7], offset); - w[42] = hc_bytealign_be (w[ 5], w[ 6], offset); - w[41] = hc_bytealign_be (w[ 4], w[ 5], offset); - w[40] = hc_bytealign_be (w[ 3], w[ 4], offset); - w[39] = hc_bytealign_be (w[ 2], w[ 3], offset); - w[38] = hc_bytealign_be (w[ 1], w[ 2], offset); - w[37] = hc_bytealign_be (w[ 0], w[ 1], offset); - w[36] = hc_bytealign_be ( 0, w[ 0], offset); + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 44: + w[63] = hc_bytealign_be (w[18], w[19], offset); + w[62] = hc_bytealign_be (w[17], w[18], offset); + w[61] = hc_bytealign_be (w[16], w[17], offset); + w[60] = hc_bytealign_be (w[15], w[16], offset); + w[59] = hc_bytealign_be (w[14], w[15], offset); + w[58] = hc_bytealign_be (w[13], w[14], offset); + w[57] = hc_bytealign_be (w[12], w[13], offset); + w[56] = hc_bytealign_be (w[11], w[12], offset); + w[55] = hc_bytealign_be (w[10], w[11], offset); + w[54] = hc_bytealign_be (w[ 9], w[10], offset); + w[53] = hc_bytealign_be (w[ 8], w[ 9], offset); + w[52] = hc_bytealign_be (w[ 7], w[ 8], offset); + w[51] 
= hc_bytealign_be (w[ 6], w[ 7], offset); + w[50] = hc_bytealign_be (w[ 5], w[ 6], offset); + w[49] = hc_bytealign_be (w[ 4], w[ 5], offset); + w[48] = hc_bytealign_be (w[ 3], w[ 4], offset); + w[47] = hc_bytealign_be (w[ 2], w[ 3], offset); + w[46] = hc_bytealign_be (w[ 1], w[ 2], offset); + w[45] = hc_bytealign_be (w[ 0], w[ 1], offset); + w[44] = hc_bytealign_be ( 0, w[ 0], offset); + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; @@ -24466,34 +28398,34 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; - case 37: - w[63] = hc_bytealign_be (w[25], w[26], offset); - w[62] = hc_bytealign_be (w[24], w[25], offset); - w[61] = hc_bytealign_be (w[23], w[24], offset); - w[60] = hc_bytealign_be (w[22], w[23], offset); - w[59] = hc_bytealign_be (w[21], w[22], offset); - w[58] = hc_bytealign_be (w[20], w[21], offset); - w[57] = hc_bytealign_be (w[19], w[20], offset); - w[56] = hc_bytealign_be (w[18], w[19], offset); - w[55] = hc_bytealign_be (w[17], w[18], offset); - w[54] = hc_bytealign_be (w[16], w[17], offset); - w[53] = hc_bytealign_be (w[15], w[16], offset); - w[52] = hc_bytealign_be (w[14], w[15], offset); - w[51] = hc_bytealign_be (w[13], w[14], offset); - w[50] = hc_bytealign_be (w[12], w[13], offset); - w[49] = hc_bytealign_be (w[11], w[12], offset); - w[48] = hc_bytealign_be (w[10], w[11], offset); - w[47] = hc_bytealign_be (w[ 9], w[10], offset); - w[46] = hc_bytealign_be (w[ 8], w[ 9], offset); - w[45] = hc_bytealign_be (w[ 7], w[ 8], offset); - w[44] = hc_bytealign_be (w[ 6], w[ 7], offset); - w[43] = hc_bytealign_be (w[ 5], w[ 6], offset); - w[42] = hc_bytealign_be (w[ 4], w[ 5], offset); - w[41] = hc_bytealign_be (w[ 3], w[ 4], offset); - w[40] = hc_bytealign_be (w[ 2], w[ 3], offset); - w[39] = hc_bytealign_be (w[ 1], w[ 2], offset); - w[38] = hc_bytealign_be (w[ 0], w[ 1], offset); - w[37] = hc_bytealign_be ( 0, w[ 0], offset); + case 
45: + w[63] = hc_bytealign_be (w[17], w[18], offset); + w[62] = hc_bytealign_be (w[16], w[17], offset); + w[61] = hc_bytealign_be (w[15], w[16], offset); + w[60] = hc_bytealign_be (w[14], w[15], offset); + w[59] = hc_bytealign_be (w[13], w[14], offset); + w[58] = hc_bytealign_be (w[12], w[13], offset); + w[57] = hc_bytealign_be (w[11], w[12], offset); + w[56] = hc_bytealign_be (w[10], w[11], offset); + w[55] = hc_bytealign_be (w[ 9], w[10], offset); + w[54] = hc_bytealign_be (w[ 8], w[ 9], offset); + w[53] = hc_bytealign_be (w[ 7], w[ 8], offset); + w[52] = hc_bytealign_be (w[ 6], w[ 7], offset); + w[51] = hc_bytealign_be (w[ 5], w[ 6], offset); + w[50] = hc_bytealign_be (w[ 4], w[ 5], offset); + w[49] = hc_bytealign_be (w[ 3], w[ 4], offset); + w[48] = hc_bytealign_be (w[ 2], w[ 3], offset); + w[47] = hc_bytealign_be (w[ 1], w[ 2], offset); + w[46] = hc_bytealign_be (w[ 0], w[ 1], offset); + w[45] = hc_bytealign_be ( 0, w[ 0], offset); + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; @@ -24534,33 +28466,33 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; - case 38: - w[63] = hc_bytealign_be (w[24], w[25], offset); - w[62] = hc_bytealign_be (w[23], w[24], offset); - w[61] = hc_bytealign_be (w[22], w[23], offset); - w[60] = hc_bytealign_be (w[21], w[22], offset); - w[59] = hc_bytealign_be (w[20], w[21], offset); - w[58] = hc_bytealign_be (w[19], w[20], offset); - w[57] = hc_bytealign_be (w[18], w[19], offset); - w[56] = hc_bytealign_be (w[17], w[18], offset); - w[55] = hc_bytealign_be (w[16], w[17], offset); - w[54] = hc_bytealign_be (w[15], w[16], offset); - w[53] = hc_bytealign_be (w[14], w[15], offset); - w[52] = hc_bytealign_be (w[13], w[14], offset); - w[51] = hc_bytealign_be (w[12], w[13], offset); - w[50] = hc_bytealign_be (w[11], w[12], offset); - w[49] = hc_bytealign_be (w[10], w[11], offset); - w[48] = hc_bytealign_be (w[ 9], w[10], 
offset); - w[47] = hc_bytealign_be (w[ 8], w[ 9], offset); - w[46] = hc_bytealign_be (w[ 7], w[ 8], offset); - w[45] = hc_bytealign_be (w[ 6], w[ 7], offset); - w[44] = hc_bytealign_be (w[ 5], w[ 6], offset); - w[43] = hc_bytealign_be (w[ 4], w[ 5], offset); - w[42] = hc_bytealign_be (w[ 3], w[ 4], offset); - w[41] = hc_bytealign_be (w[ 2], w[ 3], offset); - w[40] = hc_bytealign_be (w[ 1], w[ 2], offset); - w[39] = hc_bytealign_be (w[ 0], w[ 1], offset); - w[38] = hc_bytealign_be ( 0, w[ 0], offset); + case 46: + w[63] = hc_bytealign_be (w[16], w[17], offset); + w[62] = hc_bytealign_be (w[15], w[16], offset); + w[61] = hc_bytealign_be (w[14], w[15], offset); + w[60] = hc_bytealign_be (w[13], w[14], offset); + w[59] = hc_bytealign_be (w[12], w[13], offset); + w[58] = hc_bytealign_be (w[11], w[12], offset); + w[57] = hc_bytealign_be (w[10], w[11], offset); + w[56] = hc_bytealign_be (w[ 9], w[10], offset); + w[55] = hc_bytealign_be (w[ 8], w[ 9], offset); + w[54] = hc_bytealign_be (w[ 7], w[ 8], offset); + w[53] = hc_bytealign_be (w[ 6], w[ 7], offset); + w[52] = hc_bytealign_be (w[ 5], w[ 6], offset); + w[51] = hc_bytealign_be (w[ 4], w[ 5], offset); + w[50] = hc_bytealign_be (w[ 3], w[ 4], offset); + w[49] = hc_bytealign_be (w[ 2], w[ 3], offset); + w[48] = hc_bytealign_be (w[ 1], w[ 2], offset); + w[47] = hc_bytealign_be (w[ 0], w[ 1], offset); + w[46] = hc_bytealign_be ( 0, w[ 0], offset); + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; @@ -24602,32 +28534,32 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; - case 39: - w[63] = hc_bytealign_be (w[23], w[24], offset); - w[62] = hc_bytealign_be (w[22], w[23], offset); - w[61] = hc_bytealign_be (w[21], w[22], offset); - w[60] = hc_bytealign_be (w[20], w[21], offset); - w[59] = hc_bytealign_be (w[19], w[20], offset); - w[58] = hc_bytealign_be (w[18], w[19], offset); - w[57] = hc_bytealign_be 
(w[17], w[18], offset); - w[56] = hc_bytealign_be (w[16], w[17], offset); - w[55] = hc_bytealign_be (w[15], w[16], offset); - w[54] = hc_bytealign_be (w[14], w[15], offset); - w[53] = hc_bytealign_be (w[13], w[14], offset); - w[52] = hc_bytealign_be (w[12], w[13], offset); - w[51] = hc_bytealign_be (w[11], w[12], offset); - w[50] = hc_bytealign_be (w[10], w[11], offset); - w[49] = hc_bytealign_be (w[ 9], w[10], offset); - w[48] = hc_bytealign_be (w[ 8], w[ 9], offset); - w[47] = hc_bytealign_be (w[ 7], w[ 8], offset); - w[46] = hc_bytealign_be (w[ 6], w[ 7], offset); - w[45] = hc_bytealign_be (w[ 5], w[ 6], offset); - w[44] = hc_bytealign_be (w[ 4], w[ 5], offset); - w[43] = hc_bytealign_be (w[ 3], w[ 4], offset); - w[42] = hc_bytealign_be (w[ 2], w[ 3], offset); - w[41] = hc_bytealign_be (w[ 1], w[ 2], offset); - w[40] = hc_bytealign_be (w[ 0], w[ 1], offset); - w[39] = hc_bytealign_be ( 0, w[ 0], offset); + case 47: + w[63] = hc_bytealign_be (w[15], w[16], offset); + w[62] = hc_bytealign_be (w[14], w[15], offset); + w[61] = hc_bytealign_be (w[13], w[14], offset); + w[60] = hc_bytealign_be (w[12], w[13], offset); + w[59] = hc_bytealign_be (w[11], w[12], offset); + w[58] = hc_bytealign_be (w[10], w[11], offset); + w[57] = hc_bytealign_be (w[ 9], w[10], offset); + w[56] = hc_bytealign_be (w[ 8], w[ 9], offset); + w[55] = hc_bytealign_be (w[ 7], w[ 8], offset); + w[54] = hc_bytealign_be (w[ 6], w[ 7], offset); + w[53] = hc_bytealign_be (w[ 5], w[ 6], offset); + w[52] = hc_bytealign_be (w[ 4], w[ 5], offset); + w[51] = hc_bytealign_be (w[ 3], w[ 4], offset); + w[50] = hc_bytealign_be (w[ 2], w[ 3], offset); + w[49] = hc_bytealign_be (w[ 1], w[ 2], offset); + w[48] = hc_bytealign_be (w[ 0], w[ 1], offset); + w[47] = hc_bytealign_be ( 0, w[ 0], offset); + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; @@ -24670,31 +28602,31 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, 
const u32 offset) break; - case 40: - w[63] = hc_bytealign_be (w[22], w[23], offset); - w[62] = hc_bytealign_be (w[21], w[22], offset); - w[61] = hc_bytealign_be (w[20], w[21], offset); - w[60] = hc_bytealign_be (w[19], w[20], offset); - w[59] = hc_bytealign_be (w[18], w[19], offset); - w[58] = hc_bytealign_be (w[17], w[18], offset); - w[57] = hc_bytealign_be (w[16], w[17], offset); - w[56] = hc_bytealign_be (w[15], w[16], offset); - w[55] = hc_bytealign_be (w[14], w[15], offset); - w[54] = hc_bytealign_be (w[13], w[14], offset); - w[53] = hc_bytealign_be (w[12], w[13], offset); - w[52] = hc_bytealign_be (w[11], w[12], offset); - w[51] = hc_bytealign_be (w[10], w[11], offset); - w[50] = hc_bytealign_be (w[ 9], w[10], offset); - w[49] = hc_bytealign_be (w[ 8], w[ 9], offset); - w[48] = hc_bytealign_be (w[ 7], w[ 8], offset); - w[47] = hc_bytealign_be (w[ 6], w[ 7], offset); - w[46] = hc_bytealign_be (w[ 5], w[ 6], offset); - w[45] = hc_bytealign_be (w[ 4], w[ 5], offset); - w[44] = hc_bytealign_be (w[ 3], w[ 4], offset); - w[43] = hc_bytealign_be (w[ 2], w[ 3], offset); - w[42] = hc_bytealign_be (w[ 1], w[ 2], offset); - w[41] = hc_bytealign_be (w[ 0], w[ 1], offset); - w[40] = hc_bytealign_be ( 0, w[ 0], offset); + case 48: + w[63] = hc_bytealign_be (w[14], w[15], offset); + w[62] = hc_bytealign_be (w[13], w[14], offset); + w[61] = hc_bytealign_be (w[12], w[13], offset); + w[60] = hc_bytealign_be (w[11], w[12], offset); + w[59] = hc_bytealign_be (w[10], w[11], offset); + w[58] = hc_bytealign_be (w[ 9], w[10], offset); + w[57] = hc_bytealign_be (w[ 8], w[ 9], offset); + w[56] = hc_bytealign_be (w[ 7], w[ 8], offset); + w[55] = hc_bytealign_be (w[ 6], w[ 7], offset); + w[54] = hc_bytealign_be (w[ 5], w[ 6], offset); + w[53] = hc_bytealign_be (w[ 4], w[ 5], offset); + w[52] = hc_bytealign_be (w[ 3], w[ 4], offset); + w[51] = hc_bytealign_be (w[ 2], w[ 3], offset); + w[50] = hc_bytealign_be (w[ 1], w[ 2], offset); + w[49] = hc_bytealign_be (w[ 0], w[ 1], offset); + 
w[48] = hc_bytealign_be ( 0, w[ 0], offset); + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; @@ -24738,30 +28670,30 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; - case 41: - w[63] = hc_bytealign_be (w[21], w[22], offset); - w[62] = hc_bytealign_be (w[20], w[21], offset); - w[61] = hc_bytealign_be (w[19], w[20], offset); - w[60] = hc_bytealign_be (w[18], w[19], offset); - w[59] = hc_bytealign_be (w[17], w[18], offset); - w[58] = hc_bytealign_be (w[16], w[17], offset); - w[57] = hc_bytealign_be (w[15], w[16], offset); - w[56] = hc_bytealign_be (w[14], w[15], offset); - w[55] = hc_bytealign_be (w[13], w[14], offset); - w[54] = hc_bytealign_be (w[12], w[13], offset); - w[53] = hc_bytealign_be (w[11], w[12], offset); - w[52] = hc_bytealign_be (w[10], w[11], offset); - w[51] = hc_bytealign_be (w[ 9], w[10], offset); - w[50] = hc_bytealign_be (w[ 8], w[ 9], offset); - w[49] = hc_bytealign_be (w[ 7], w[ 8], offset); - w[48] = hc_bytealign_be (w[ 6], w[ 7], offset); - w[47] = hc_bytealign_be (w[ 5], w[ 6], offset); - w[46] = hc_bytealign_be (w[ 4], w[ 5], offset); - w[45] = hc_bytealign_be (w[ 3], w[ 4], offset); - w[44] = hc_bytealign_be (w[ 2], w[ 3], offset); - w[43] = hc_bytealign_be (w[ 1], w[ 2], offset); - w[42] = hc_bytealign_be (w[ 0], w[ 1], offset); - w[41] = hc_bytealign_be ( 0, w[ 0], offset); + case 49: + w[63] = hc_bytealign_be (w[13], w[14], offset); + w[62] = hc_bytealign_be (w[12], w[13], offset); + w[61] = hc_bytealign_be (w[11], w[12], offset); + w[60] = hc_bytealign_be (w[10], w[11], offset); + w[59] = hc_bytealign_be (w[ 9], w[10], offset); + w[58] = hc_bytealign_be (w[ 8], w[ 9], offset); + w[57] = hc_bytealign_be (w[ 7], w[ 8], offset); + w[56] = hc_bytealign_be (w[ 6], w[ 7], offset); + w[55] = hc_bytealign_be (w[ 5], w[ 6], offset); + w[54] = hc_bytealign_be (w[ 4], w[ 5], offset); + w[53] = hc_bytealign_be (w[ 3], w[ 4], 
offset); + w[52] = hc_bytealign_be (w[ 2], w[ 3], offset); + w[51] = hc_bytealign_be (w[ 1], w[ 2], offset); + w[50] = hc_bytealign_be (w[ 0], w[ 1], offset); + w[49] = hc_bytealign_be ( 0, w[ 0], offset); + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; w[40] = 0; w[39] = 0; w[38] = 0; @@ -24806,29 +28738,29 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; - case 42: - w[63] = hc_bytealign_be (w[20], w[21], offset); - w[62] = hc_bytealign_be (w[19], w[20], offset); - w[61] = hc_bytealign_be (w[18], w[19], offset); - w[60] = hc_bytealign_be (w[17], w[18], offset); - w[59] = hc_bytealign_be (w[16], w[17], offset); - w[58] = hc_bytealign_be (w[15], w[16], offset); - w[57] = hc_bytealign_be (w[14], w[15], offset); - w[56] = hc_bytealign_be (w[13], w[14], offset); - w[55] = hc_bytealign_be (w[12], w[13], offset); - w[54] = hc_bytealign_be (w[11], w[12], offset); - w[53] = hc_bytealign_be (w[10], w[11], offset); - w[52] = hc_bytealign_be (w[ 9], w[10], offset); - w[51] = hc_bytealign_be (w[ 8], w[ 9], offset); - w[50] = hc_bytealign_be (w[ 7], w[ 8], offset); - w[49] = hc_bytealign_be (w[ 6], w[ 7], offset); - w[48] = hc_bytealign_be (w[ 5], w[ 6], offset); - w[47] = hc_bytealign_be (w[ 4], w[ 5], offset); - w[46] = hc_bytealign_be (w[ 3], w[ 4], offset); - w[45] = hc_bytealign_be (w[ 2], w[ 3], offset); - w[44] = hc_bytealign_be (w[ 1], w[ 2], offset); - w[43] = hc_bytealign_be (w[ 0], w[ 1], offset); - w[42] = hc_bytealign_be ( 0, w[ 0], offset); + case 50: + w[63] = hc_bytealign_be (w[12], w[13], offset); + w[62] = hc_bytealign_be (w[11], w[12], offset); + w[61] = hc_bytealign_be (w[10], w[11], offset); + w[60] = hc_bytealign_be (w[ 9], w[10], offset); + w[59] = hc_bytealign_be (w[ 8], w[ 9], offset); + w[58] = hc_bytealign_be (w[ 7], w[ 8], offset); + w[57] = hc_bytealign_be (w[ 6], w[ 7], offset); + w[56] = hc_bytealign_be (w[ 5], w[ 6], offset); + w[55] = hc_bytealign_be (w[ 
4], w[ 5], offset); + w[54] = hc_bytealign_be (w[ 3], w[ 4], offset); + w[53] = hc_bytealign_be (w[ 2], w[ 3], offset); + w[52] = hc_bytealign_be (w[ 1], w[ 2], offset); + w[51] = hc_bytealign_be (w[ 0], w[ 1], offset); + w[50] = hc_bytealign_be ( 0, w[ 0], offset); + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; w[41] = 0; w[40] = 0; w[39] = 0; @@ -24874,28 +28806,28 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; - case 43: - w[63] = hc_bytealign_be (w[19], w[20], offset); - w[62] = hc_bytealign_be (w[18], w[19], offset); - w[61] = hc_bytealign_be (w[17], w[18], offset); - w[60] = hc_bytealign_be (w[16], w[17], offset); - w[59] = hc_bytealign_be (w[15], w[16], offset); - w[58] = hc_bytealign_be (w[14], w[15], offset); - w[57] = hc_bytealign_be (w[13], w[14], offset); - w[56] = hc_bytealign_be (w[12], w[13], offset); - w[55] = hc_bytealign_be (w[11], w[12], offset); - w[54] = hc_bytealign_be (w[10], w[11], offset); - w[53] = hc_bytealign_be (w[ 9], w[10], offset); - w[52] = hc_bytealign_be (w[ 8], w[ 9], offset); - w[51] = hc_bytealign_be (w[ 7], w[ 8], offset); - w[50] = hc_bytealign_be (w[ 6], w[ 7], offset); - w[49] = hc_bytealign_be (w[ 5], w[ 6], offset); - w[48] = hc_bytealign_be (w[ 4], w[ 5], offset); - w[47] = hc_bytealign_be (w[ 3], w[ 4], offset); - w[46] = hc_bytealign_be (w[ 2], w[ 3], offset); - w[45] = hc_bytealign_be (w[ 1], w[ 2], offset); - w[44] = hc_bytealign_be (w[ 0], w[ 1], offset); - w[43] = hc_bytealign_be ( 0, w[ 0], offset); + case 51: + w[63] = hc_bytealign_be (w[11], w[12], offset); + w[62] = hc_bytealign_be (w[10], w[11], offset); + w[61] = hc_bytealign_be (w[ 9], w[10], offset); + w[60] = hc_bytealign_be (w[ 8], w[ 9], offset); + w[59] = hc_bytealign_be (w[ 7], w[ 8], offset); + w[58] = hc_bytealign_be (w[ 6], w[ 7], offset); + w[57] = hc_bytealign_be (w[ 5], w[ 6], offset); + w[56] = hc_bytealign_be (w[ 4], w[ 5], offset); + w[55] = 
hc_bytealign_be (w[ 3], w[ 4], offset); + w[54] = hc_bytealign_be (w[ 2], w[ 3], offset); + w[53] = hc_bytealign_be (w[ 1], w[ 2], offset); + w[52] = hc_bytealign_be (w[ 0], w[ 1], offset); + w[51] = hc_bytealign_be ( 0, w[ 0], offset); + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; w[42] = 0; w[41] = 0; w[40] = 0; @@ -24942,27 +28874,27 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; - case 44: - w[63] = hc_bytealign_be (w[18], w[19], offset); - w[62] = hc_bytealign_be (w[17], w[18], offset); - w[61] = hc_bytealign_be (w[16], w[17], offset); - w[60] = hc_bytealign_be (w[15], w[16], offset); - w[59] = hc_bytealign_be (w[14], w[15], offset); - w[58] = hc_bytealign_be (w[13], w[14], offset); - w[57] = hc_bytealign_be (w[12], w[13], offset); - w[56] = hc_bytealign_be (w[11], w[12], offset); - w[55] = hc_bytealign_be (w[10], w[11], offset); - w[54] = hc_bytealign_be (w[ 9], w[10], offset); - w[53] = hc_bytealign_be (w[ 8], w[ 9], offset); - w[52] = hc_bytealign_be (w[ 7], w[ 8], offset); - w[51] = hc_bytealign_be (w[ 6], w[ 7], offset); - w[50] = hc_bytealign_be (w[ 5], w[ 6], offset); - w[49] = hc_bytealign_be (w[ 4], w[ 5], offset); - w[48] = hc_bytealign_be (w[ 3], w[ 4], offset); - w[47] = hc_bytealign_be (w[ 2], w[ 3], offset); - w[46] = hc_bytealign_be (w[ 1], w[ 2], offset); - w[45] = hc_bytealign_be (w[ 0], w[ 1], offset); - w[44] = hc_bytealign_be ( 0, w[ 0], offset); + case 52: + w[63] = hc_bytealign_be (w[10], w[11], offset); + w[62] = hc_bytealign_be (w[ 9], w[10], offset); + w[61] = hc_bytealign_be (w[ 8], w[ 9], offset); + w[60] = hc_bytealign_be (w[ 7], w[ 8], offset); + w[59] = hc_bytealign_be (w[ 6], w[ 7], offset); + w[58] = hc_bytealign_be (w[ 5], w[ 6], offset); + w[57] = hc_bytealign_be (w[ 4], w[ 5], offset); + w[56] = hc_bytealign_be (w[ 3], w[ 4], offset); + w[55] = hc_bytealign_be (w[ 2], w[ 3], offset); + w[54] = hc_bytealign_be (w[ 1], w[ 2], offset); 
+ w[53] = hc_bytealign_be (w[ 0], w[ 1], offset); + w[52] = hc_bytealign_be ( 0, w[ 0], offset); + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; w[43] = 0; w[42] = 0; w[41] = 0; @@ -25010,26 +28942,26 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; - case 45: - w[63] = hc_bytealign_be (w[17], w[18], offset); - w[62] = hc_bytealign_be (w[16], w[17], offset); - w[61] = hc_bytealign_be (w[15], w[16], offset); - w[60] = hc_bytealign_be (w[14], w[15], offset); - w[59] = hc_bytealign_be (w[13], w[14], offset); - w[58] = hc_bytealign_be (w[12], w[13], offset); - w[57] = hc_bytealign_be (w[11], w[12], offset); - w[56] = hc_bytealign_be (w[10], w[11], offset); - w[55] = hc_bytealign_be (w[ 9], w[10], offset); - w[54] = hc_bytealign_be (w[ 8], w[ 9], offset); - w[53] = hc_bytealign_be (w[ 7], w[ 8], offset); - w[52] = hc_bytealign_be (w[ 6], w[ 7], offset); - w[51] = hc_bytealign_be (w[ 5], w[ 6], offset); - w[50] = hc_bytealign_be (w[ 4], w[ 5], offset); - w[49] = hc_bytealign_be (w[ 3], w[ 4], offset); - w[48] = hc_bytealign_be (w[ 2], w[ 3], offset); - w[47] = hc_bytealign_be (w[ 1], w[ 2], offset); - w[46] = hc_bytealign_be (w[ 0], w[ 1], offset); - w[45] = hc_bytealign_be ( 0, w[ 0], offset); + case 53: + w[63] = hc_bytealign_be (w[ 9], w[10], offset); + w[62] = hc_bytealign_be (w[ 8], w[ 9], offset); + w[61] = hc_bytealign_be (w[ 7], w[ 8], offset); + w[60] = hc_bytealign_be (w[ 6], w[ 7], offset); + w[59] = hc_bytealign_be (w[ 5], w[ 6], offset); + w[58] = hc_bytealign_be (w[ 4], w[ 5], offset); + w[57] = hc_bytealign_be (w[ 3], w[ 4], offset); + w[56] = hc_bytealign_be (w[ 2], w[ 3], offset); + w[55] = hc_bytealign_be (w[ 1], w[ 2], offset); + w[54] = hc_bytealign_be (w[ 0], w[ 1], offset); + w[53] = hc_bytealign_be ( 0, w[ 0], offset); + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; w[44] = 0; w[43] = 0; w[42] = 0; @@ 
-25070,33 +29002,33 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; - w[ 4] = 0; - w[ 3] = 0; - w[ 2] = 0; - w[ 1] = 0; - w[ 0] = 0; - - break; - - case 46: - w[63] = hc_bytealign_be (w[16], w[17], offset); - w[62] = hc_bytealign_be (w[15], w[16], offset); - w[61] = hc_bytealign_be (w[14], w[15], offset); - w[60] = hc_bytealign_be (w[13], w[14], offset); - w[59] = hc_bytealign_be (w[12], w[13], offset); - w[58] = hc_bytealign_be (w[11], w[12], offset); - w[57] = hc_bytealign_be (w[10], w[11], offset); - w[56] = hc_bytealign_be (w[ 9], w[10], offset); - w[55] = hc_bytealign_be (w[ 8], w[ 9], offset); - w[54] = hc_bytealign_be (w[ 7], w[ 8], offset); - w[53] = hc_bytealign_be (w[ 6], w[ 7], offset); - w[52] = hc_bytealign_be (w[ 5], w[ 6], offset); - w[51] = hc_bytealign_be (w[ 4], w[ 5], offset); - w[50] = hc_bytealign_be (w[ 3], w[ 4], offset); - w[49] = hc_bytealign_be (w[ 2], w[ 3], offset); - w[48] = hc_bytealign_be (w[ 1], w[ 2], offset); - w[47] = hc_bytealign_be (w[ 0], w[ 1], offset); - w[46] = hc_bytealign_be ( 0, w[ 0], offset); + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 54: + w[63] = hc_bytealign_be (w[ 8], w[ 9], offset); + w[62] = hc_bytealign_be (w[ 7], w[ 8], offset); + w[61] = hc_bytealign_be (w[ 6], w[ 7], offset); + w[60] = hc_bytealign_be (w[ 5], w[ 6], offset); + w[59] = hc_bytealign_be (w[ 4], w[ 5], offset); + w[58] = hc_bytealign_be (w[ 3], w[ 4], offset); + w[57] = hc_bytealign_be (w[ 2], w[ 3], offset); + w[56] = hc_bytealign_be (w[ 1], w[ 2], offset); + w[55] = hc_bytealign_be (w[ 0], w[ 1], offset); + w[54] = hc_bytealign_be ( 0, w[ 0], offset); + w[53] = 0; + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; w[45] = 0; w[44] = 0; w[43] = 0; @@ -25146,24 +29078,24 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; - case 47: - w[63] = hc_bytealign_be (w[15], w[16], 
offset); - w[62] = hc_bytealign_be (w[14], w[15], offset); - w[61] = hc_bytealign_be (w[13], w[14], offset); - w[60] = hc_bytealign_be (w[12], w[13], offset); - w[59] = hc_bytealign_be (w[11], w[12], offset); - w[58] = hc_bytealign_be (w[10], w[11], offset); - w[57] = hc_bytealign_be (w[ 9], w[10], offset); - w[56] = hc_bytealign_be (w[ 8], w[ 9], offset); - w[55] = hc_bytealign_be (w[ 7], w[ 8], offset); - w[54] = hc_bytealign_be (w[ 6], w[ 7], offset); - w[53] = hc_bytealign_be (w[ 5], w[ 6], offset); - w[52] = hc_bytealign_be (w[ 4], w[ 5], offset); - w[51] = hc_bytealign_be (w[ 3], w[ 4], offset); - w[50] = hc_bytealign_be (w[ 2], w[ 3], offset); - w[49] = hc_bytealign_be (w[ 1], w[ 2], offset); - w[48] = hc_bytealign_be (w[ 0], w[ 1], offset); - w[47] = hc_bytealign_be ( 0, w[ 0], offset); + case 55: + w[63] = hc_bytealign_be (w[ 7], w[ 8], offset); + w[62] = hc_bytealign_be (w[ 6], w[ 7], offset); + w[61] = hc_bytealign_be (w[ 5], w[ 6], offset); + w[60] = hc_bytealign_be (w[ 4], w[ 5], offset); + w[59] = hc_bytealign_be (w[ 3], w[ 4], offset); + w[58] = hc_bytealign_be (w[ 2], w[ 3], offset); + w[57] = hc_bytealign_be (w[ 1], w[ 2], offset); + w[56] = hc_bytealign_be (w[ 0], w[ 1], offset); + w[55] = hc_bytealign_be ( 0, w[ 0], offset); + w[54] = 0; + w[53] = 0; + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; w[46] = 0; w[45] = 0; w[44] = 0; @@ -25214,23 +29146,23 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; - case 48: - w[63] = hc_bytealign_be (w[14], w[15], offset); - w[62] = hc_bytealign_be (w[13], w[14], offset); - w[61] = hc_bytealign_be (w[12], w[13], offset); - w[60] = hc_bytealign_be (w[11], w[12], offset); - w[59] = hc_bytealign_be (w[10], w[11], offset); - w[58] = hc_bytealign_be (w[ 9], w[10], offset); - w[57] = hc_bytealign_be (w[ 8], w[ 9], offset); - w[56] = hc_bytealign_be (w[ 7], w[ 8], offset); - w[55] = hc_bytealign_be (w[ 6], w[ 7], offset); - w[54] = hc_bytealign_be (w[ 
5], w[ 6], offset); - w[53] = hc_bytealign_be (w[ 4], w[ 5], offset); - w[52] = hc_bytealign_be (w[ 3], w[ 4], offset); - w[51] = hc_bytealign_be (w[ 2], w[ 3], offset); - w[50] = hc_bytealign_be (w[ 1], w[ 2], offset); - w[49] = hc_bytealign_be (w[ 0], w[ 1], offset); - w[48] = hc_bytealign_be ( 0, w[ 0], offset); + case 56: + w[63] = hc_bytealign_be (w[ 6], w[ 7], offset); + w[62] = hc_bytealign_be (w[ 5], w[ 6], offset); + w[61] = hc_bytealign_be (w[ 4], w[ 5], offset); + w[60] = hc_bytealign_be (w[ 3], w[ 4], offset); + w[59] = hc_bytealign_be (w[ 2], w[ 3], offset); + w[58] = hc_bytealign_be (w[ 1], w[ 2], offset); + w[57] = hc_bytealign_be (w[ 0], w[ 1], offset); + w[56] = hc_bytealign_be ( 0, w[ 0], offset); + w[55] = 0; + w[54] = 0; + w[53] = 0; + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; w[47] = 0; w[46] = 0; w[45] = 0; @@ -25282,22 +29214,22 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; - case 49: - w[63] = hc_bytealign_be (w[13], w[14], offset); - w[62] = hc_bytealign_be (w[12], w[13], offset); - w[61] = hc_bytealign_be (w[11], w[12], offset); - w[60] = hc_bytealign_be (w[10], w[11], offset); - w[59] = hc_bytealign_be (w[ 9], w[10], offset); - w[58] = hc_bytealign_be (w[ 8], w[ 9], offset); - w[57] = hc_bytealign_be (w[ 7], w[ 8], offset); - w[56] = hc_bytealign_be (w[ 6], w[ 7], offset); - w[55] = hc_bytealign_be (w[ 5], w[ 6], offset); - w[54] = hc_bytealign_be (w[ 4], w[ 5], offset); - w[53] = hc_bytealign_be (w[ 3], w[ 4], offset); - w[52] = hc_bytealign_be (w[ 2], w[ 3], offset); - w[51] = hc_bytealign_be (w[ 1], w[ 2], offset); - w[50] = hc_bytealign_be (w[ 0], w[ 1], offset); - w[49] = hc_bytealign_be ( 0, w[ 0], offset); + case 57: + w[63] = hc_bytealign_be (w[ 5], w[ 6], offset); + w[62] = hc_bytealign_be (w[ 4], w[ 5], offset); + w[61] = hc_bytealign_be (w[ 3], w[ 4], offset); + w[60] = hc_bytealign_be (w[ 2], w[ 3], offset); + w[59] = hc_bytealign_be (w[ 1], w[ 2], offset); + w[58] = 
hc_bytealign_be (w[ 0], w[ 1], offset); + w[57] = hc_bytealign_be ( 0, w[ 0], offset); + w[56] = 0; + w[55] = 0; + w[54] = 0; + w[53] = 0; + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; w[48] = 0; w[47] = 0; w[46] = 0; @@ -25350,21 +29282,21 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; - case 50: - w[63] = hc_bytealign_be (w[12], w[13], offset); - w[62] = hc_bytealign_be (w[11], w[12], offset); - w[61] = hc_bytealign_be (w[10], w[11], offset); - w[60] = hc_bytealign_be (w[ 9], w[10], offset); - w[59] = hc_bytealign_be (w[ 8], w[ 9], offset); - w[58] = hc_bytealign_be (w[ 7], w[ 8], offset); - w[57] = hc_bytealign_be (w[ 6], w[ 7], offset); - w[56] = hc_bytealign_be (w[ 5], w[ 6], offset); - w[55] = hc_bytealign_be (w[ 4], w[ 5], offset); - w[54] = hc_bytealign_be (w[ 3], w[ 4], offset); - w[53] = hc_bytealign_be (w[ 2], w[ 3], offset); - w[52] = hc_bytealign_be (w[ 1], w[ 2], offset); - w[51] = hc_bytealign_be (w[ 0], w[ 1], offset); - w[50] = hc_bytealign_be ( 0, w[ 0], offset); + case 58: + w[63] = hc_bytealign_be (w[ 4], w[ 5], offset); + w[62] = hc_bytealign_be (w[ 3], w[ 4], offset); + w[61] = hc_bytealign_be (w[ 2], w[ 3], offset); + w[60] = hc_bytealign_be (w[ 1], w[ 2], offset); + w[59] = hc_bytealign_be (w[ 0], w[ 1], offset); + w[58] = hc_bytealign_be ( 0, w[ 0], offset); + w[57] = 0; + w[56] = 0; + w[55] = 0; + w[54] = 0; + w[53] = 0; + w[52] = 0; + w[51] = 0; + w[50] = 0; w[49] = 0; w[48] = 0; w[47] = 0; @@ -25418,20 +29350,20 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; - case 51: - w[63] = hc_bytealign_be (w[11], w[12], offset); - w[62] = hc_bytealign_be (w[10], w[11], offset); - w[61] = hc_bytealign_be (w[ 9], w[10], offset); - w[60] = hc_bytealign_be (w[ 8], w[ 9], offset); - w[59] = hc_bytealign_be (w[ 7], w[ 8], offset); - w[58] = hc_bytealign_be (w[ 6], w[ 7], offset); - w[57] = hc_bytealign_be (w[ 5], w[ 6], offset); - w[56] = hc_bytealign_be (w[ 4], w[ 5], 
offset); - w[55] = hc_bytealign_be (w[ 3], w[ 4], offset); - w[54] = hc_bytealign_be (w[ 2], w[ 3], offset); - w[53] = hc_bytealign_be (w[ 1], w[ 2], offset); - w[52] = hc_bytealign_be (w[ 0], w[ 1], offset); - w[51] = hc_bytealign_be ( 0, w[ 0], offset); + case 59: + w[63] = hc_bytealign_be (w[ 3], w[ 4], offset); + w[62] = hc_bytealign_be (w[ 2], w[ 3], offset); + w[61] = hc_bytealign_be (w[ 1], w[ 2], offset); + w[60] = hc_bytealign_be (w[ 0], w[ 1], offset); + w[59] = hc_bytealign_be ( 0, w[ 0], offset); + w[58] = 0; + w[57] = 0; + w[56] = 0; + w[55] = 0; + w[54] = 0; + w[53] = 0; + w[52] = 0; + w[51] = 0; w[50] = 0; w[49] = 0; w[48] = 0; @@ -25486,19 +29418,19 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; - case 52: - w[63] = hc_bytealign_be (w[10], w[11], offset); - w[62] = hc_bytealign_be (w[ 9], w[10], offset); - w[61] = hc_bytealign_be (w[ 8], w[ 9], offset); - w[60] = hc_bytealign_be (w[ 7], w[ 8], offset); - w[59] = hc_bytealign_be (w[ 6], w[ 7], offset); - w[58] = hc_bytealign_be (w[ 5], w[ 6], offset); - w[57] = hc_bytealign_be (w[ 4], w[ 5], offset); - w[56] = hc_bytealign_be (w[ 3], w[ 4], offset); - w[55] = hc_bytealign_be (w[ 2], w[ 3], offset); - w[54] = hc_bytealign_be (w[ 1], w[ 2], offset); - w[53] = hc_bytealign_be (w[ 0], w[ 1], offset); - w[52] = hc_bytealign_be ( 0, w[ 0], offset); + case 60: + w[63] = hc_bytealign_be (w[ 2], w[ 3], offset); + w[62] = hc_bytealign_be (w[ 1], w[ 2], offset); + w[61] = hc_bytealign_be (w[ 0], w[ 1], offset); + w[60] = hc_bytealign_be ( 0, w[ 0], offset); + w[59] = 0; + w[58] = 0; + w[57] = 0; + w[56] = 0; + w[55] = 0; + w[54] = 0; + w[53] = 0; + w[52] = 0; w[51] = 0; w[50] = 0; w[49] = 0; @@ -25554,18 +29486,18 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; - case 53: - w[63] = hc_bytealign_be (w[ 9], w[10], offset); - w[62] = hc_bytealign_be (w[ 8], w[ 9], offset); - w[61] = hc_bytealign_be (w[ 7], w[ 8], offset); - w[60] = 
hc_bytealign_be (w[ 6], w[ 7], offset); - w[59] = hc_bytealign_be (w[ 5], w[ 6], offset); - w[58] = hc_bytealign_be (w[ 4], w[ 5], offset); - w[57] = hc_bytealign_be (w[ 3], w[ 4], offset); - w[56] = hc_bytealign_be (w[ 2], w[ 3], offset); - w[55] = hc_bytealign_be (w[ 1], w[ 2], offset); - w[54] = hc_bytealign_be (w[ 0], w[ 1], offset); - w[53] = hc_bytealign_be ( 0, w[ 0], offset); + case 61: + w[63] = hc_bytealign_be (w[ 1], w[ 2], offset); + w[62] = hc_bytealign_be (w[ 0], w[ 1], offset); + w[61] = hc_bytealign_be ( 0, w[ 0], offset); + w[60] = 0; + w[59] = 0; + w[58] = 0; + w[57] = 0; + w[56] = 0; + w[55] = 0; + w[54] = 0; + w[53] = 0; w[52] = 0; w[51] = 0; w[50] = 0; @@ -25622,17 +29554,17 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; - case 54: - w[63] = hc_bytealign_be (w[ 8], w[ 9], offset); - w[62] = hc_bytealign_be (w[ 7], w[ 8], offset); - w[61] = hc_bytealign_be (w[ 6], w[ 7], offset); - w[60] = hc_bytealign_be (w[ 5], w[ 6], offset); - w[59] = hc_bytealign_be (w[ 4], w[ 5], offset); - w[58] = hc_bytealign_be (w[ 3], w[ 4], offset); - w[57] = hc_bytealign_be (w[ 2], w[ 3], offset); - w[56] = hc_bytealign_be (w[ 1], w[ 2], offset); - w[55] = hc_bytealign_be (w[ 0], w[ 1], offset); - w[54] = hc_bytealign_be ( 0, w[ 0], offset); + case 62: + w[63] = hc_bytealign_be (w[ 0], w[ 1], offset); + w[62] = hc_bytealign_be ( 0, w[ 0], offset); + w[61] = 0; + w[60] = 0; + w[59] = 0; + w[58] = 0; + w[57] = 0; + w[56] = 0; + w[55] = 0; + w[54] = 0; w[53] = 0; w[52] = 0; w[51] = 0; @@ -25690,16 +29622,16 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; - case 55: - w[63] = hc_bytealign_be (w[ 7], w[ 8], offset); - w[62] = hc_bytealign_be (w[ 6], w[ 7], offset); - w[61] = hc_bytealign_be (w[ 5], w[ 6], offset); - w[60] = hc_bytealign_be (w[ 4], w[ 5], offset); - w[59] = hc_bytealign_be (w[ 3], w[ 4], offset); - w[58] = hc_bytealign_be (w[ 2], w[ 3], offset); - w[57] = hc_bytealign_be (w[ 1], w[ 
2], offset); - w[56] = hc_bytealign_be (w[ 0], w[ 1], offset); - w[55] = hc_bytealign_be ( 0, w[ 0], offset); + case 63: + w[63] = hc_bytealign_be ( 0, w[ 0], offset); + w[62] = 0; + w[61] = 0; + w[60] = 0; + w[59] = 0; + w[58] = 0; + w[57] = 0; + w[56] = 0; + w[55] = 0; w[54] = 0; w[53] = 0; w[52] = 0; @@ -25756,69 +29688,355 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) w[ 1] = 0; w[ 0] = 0; + break; + } + #endif + + #if (defined IS_AMD && HAS_VPERM == 1) || defined IS_NV + + #if defined IS_NV + const int selector = (0x76543210 >> ((offset & 3) * 4)) & 0xffff; + #endif + + #if defined IS_AMD + const int selector = 0x0706050403020100 >> ((offset & 3) * 8); + #endif + + switch (offset_switch) + { + case 0: + w[63] = hc_byte_perm (w[63], w[62], selector); + w[62] = hc_byte_perm (w[62], w[61], selector); + w[61] = hc_byte_perm (w[61], w[60], selector); + w[60] = hc_byte_perm (w[60], w[59], selector); + w[59] = hc_byte_perm (w[59], w[58], selector); + w[58] = hc_byte_perm (w[58], w[57], selector); + w[57] = hc_byte_perm (w[57], w[56], selector); + w[56] = hc_byte_perm (w[56], w[55], selector); + w[55] = hc_byte_perm (w[55], w[54], selector); + w[54] = hc_byte_perm (w[54], w[53], selector); + w[53] = hc_byte_perm (w[53], w[52], selector); + w[52] = hc_byte_perm (w[52], w[51], selector); + w[51] = hc_byte_perm (w[51], w[50], selector); + w[50] = hc_byte_perm (w[50], w[49], selector); + w[49] = hc_byte_perm (w[49], w[48], selector); + w[48] = hc_byte_perm (w[48], w[47], selector); + w[47] = hc_byte_perm (w[47], w[46], selector); + w[46] = hc_byte_perm (w[46], w[45], selector); + w[45] = hc_byte_perm (w[45], w[44], selector); + w[44] = hc_byte_perm (w[44], w[43], selector); + w[43] = hc_byte_perm (w[43], w[42], selector); + w[42] = hc_byte_perm (w[42], w[41], selector); + w[41] = hc_byte_perm (w[41], w[40], selector); + w[40] = hc_byte_perm (w[40], w[39], selector); + w[39] = hc_byte_perm (w[39], w[38], selector); + w[38] = hc_byte_perm 
(w[38], w[37], selector); + w[37] = hc_byte_perm (w[37], w[36], selector); + w[36] = hc_byte_perm (w[36], w[35], selector); + w[35] = hc_byte_perm (w[35], w[34], selector); + w[34] = hc_byte_perm (w[34], w[33], selector); + w[33] = hc_byte_perm (w[33], w[32], selector); + w[32] = hc_byte_perm (w[32], w[31], selector); + w[31] = hc_byte_perm (w[31], w[30], selector); + w[30] = hc_byte_perm (w[30], w[29], selector); + w[29] = hc_byte_perm (w[29], w[28], selector); + w[28] = hc_byte_perm (w[28], w[27], selector); + w[27] = hc_byte_perm (w[27], w[26], selector); + w[26] = hc_byte_perm (w[26], w[25], selector); + w[25] = hc_byte_perm (w[25], w[24], selector); + w[24] = hc_byte_perm (w[24], w[23], selector); + w[23] = hc_byte_perm (w[23], w[22], selector); + w[22] = hc_byte_perm (w[22], w[21], selector); + w[21] = hc_byte_perm (w[21], w[20], selector); + w[20] = hc_byte_perm (w[20], w[19], selector); + w[19] = hc_byte_perm (w[19], w[18], selector); + w[18] = hc_byte_perm (w[18], w[17], selector); + w[17] = hc_byte_perm (w[17], w[16], selector); + w[16] = hc_byte_perm (w[16], w[15], selector); + w[15] = hc_byte_perm (w[15], w[14], selector); + w[14] = hc_byte_perm (w[14], w[13], selector); + w[13] = hc_byte_perm (w[13], w[12], selector); + w[12] = hc_byte_perm (w[12], w[11], selector); + w[11] = hc_byte_perm (w[11], w[10], selector); + w[10] = hc_byte_perm (w[10], w[ 9], selector); + w[ 9] = hc_byte_perm (w[ 9], w[ 8], selector); + w[ 8] = hc_byte_perm (w[ 8], w[ 7], selector); + w[ 7] = hc_byte_perm (w[ 7], w[ 6], selector); + w[ 6] = hc_byte_perm (w[ 6], w[ 5], selector); + w[ 5] = hc_byte_perm (w[ 5], w[ 4], selector); + w[ 4] = hc_byte_perm (w[ 4], w[ 3], selector); + w[ 3] = hc_byte_perm (w[ 3], w[ 2], selector); + w[ 2] = hc_byte_perm (w[ 2], w[ 1], selector); + w[ 1] = hc_byte_perm (w[ 1], w[ 0], selector); + w[ 0] = hc_byte_perm (w[ 0], 0, selector); + + break; + + case 1: + w[63] = hc_byte_perm (w[62], w[61], selector); + w[62] = hc_byte_perm (w[61], w[60], 
selector); + w[61] = hc_byte_perm (w[60], w[59], selector); + w[60] = hc_byte_perm (w[59], w[58], selector); + w[59] = hc_byte_perm (w[58], w[57], selector); + w[58] = hc_byte_perm (w[57], w[56], selector); + w[57] = hc_byte_perm (w[56], w[55], selector); + w[56] = hc_byte_perm (w[55], w[54], selector); + w[55] = hc_byte_perm (w[54], w[53], selector); + w[54] = hc_byte_perm (w[53], w[52], selector); + w[53] = hc_byte_perm (w[52], w[51], selector); + w[52] = hc_byte_perm (w[51], w[50], selector); + w[51] = hc_byte_perm (w[50], w[49], selector); + w[50] = hc_byte_perm (w[49], w[48], selector); + w[49] = hc_byte_perm (w[48], w[47], selector); + w[48] = hc_byte_perm (w[47], w[46], selector); + w[47] = hc_byte_perm (w[46], w[45], selector); + w[46] = hc_byte_perm (w[45], w[44], selector); + w[45] = hc_byte_perm (w[44], w[43], selector); + w[44] = hc_byte_perm (w[43], w[42], selector); + w[43] = hc_byte_perm (w[42], w[41], selector); + w[42] = hc_byte_perm (w[41], w[40], selector); + w[41] = hc_byte_perm (w[40], w[39], selector); + w[40] = hc_byte_perm (w[39], w[38], selector); + w[39] = hc_byte_perm (w[38], w[37], selector); + w[38] = hc_byte_perm (w[37], w[36], selector); + w[37] = hc_byte_perm (w[36], w[35], selector); + w[36] = hc_byte_perm (w[35], w[34], selector); + w[35] = hc_byte_perm (w[34], w[33], selector); + w[34] = hc_byte_perm (w[33], w[32], selector); + w[33] = hc_byte_perm (w[32], w[31], selector); + w[32] = hc_byte_perm (w[31], w[30], selector); + w[31] = hc_byte_perm (w[30], w[29], selector); + w[30] = hc_byte_perm (w[29], w[28], selector); + w[29] = hc_byte_perm (w[28], w[27], selector); + w[28] = hc_byte_perm (w[27], w[26], selector); + w[27] = hc_byte_perm (w[26], w[25], selector); + w[26] = hc_byte_perm (w[25], w[24], selector); + w[25] = hc_byte_perm (w[24], w[23], selector); + w[24] = hc_byte_perm (w[23], w[22], selector); + w[23] = hc_byte_perm (w[22], w[21], selector); + w[22] = hc_byte_perm (w[21], w[20], selector); + w[21] = hc_byte_perm 
(w[20], w[19], selector); + w[20] = hc_byte_perm (w[19], w[18], selector); + w[19] = hc_byte_perm (w[18], w[17], selector); + w[18] = hc_byte_perm (w[17], w[16], selector); + w[17] = hc_byte_perm (w[16], w[15], selector); + w[16] = hc_byte_perm (w[15], w[14], selector); + w[15] = hc_byte_perm (w[14], w[13], selector); + w[14] = hc_byte_perm (w[13], w[12], selector); + w[13] = hc_byte_perm (w[12], w[11], selector); + w[12] = hc_byte_perm (w[11], w[10], selector); + w[11] = hc_byte_perm (w[10], w[ 9], selector); + w[10] = hc_byte_perm (w[ 9], w[ 8], selector); + w[ 9] = hc_byte_perm (w[ 8], w[ 7], selector); + w[ 8] = hc_byte_perm (w[ 7], w[ 6], selector); + w[ 7] = hc_byte_perm (w[ 6], w[ 5], selector); + w[ 6] = hc_byte_perm (w[ 5], w[ 4], selector); + w[ 5] = hc_byte_perm (w[ 4], w[ 3], selector); + w[ 4] = hc_byte_perm (w[ 3], w[ 2], selector); + w[ 3] = hc_byte_perm (w[ 2], w[ 1], selector); + w[ 2] = hc_byte_perm (w[ 1], w[ 0], selector); + w[ 1] = hc_byte_perm (w[ 0], 0, selector); + w[ 0] = 0; + + break; + + case 2: + w[63] = hc_byte_perm (w[61], w[60], selector); + w[62] = hc_byte_perm (w[60], w[59], selector); + w[61] = hc_byte_perm (w[59], w[58], selector); + w[60] = hc_byte_perm (w[58], w[57], selector); + w[59] = hc_byte_perm (w[57], w[56], selector); + w[58] = hc_byte_perm (w[56], w[55], selector); + w[57] = hc_byte_perm (w[55], w[54], selector); + w[56] = hc_byte_perm (w[54], w[53], selector); + w[55] = hc_byte_perm (w[53], w[52], selector); + w[54] = hc_byte_perm (w[52], w[51], selector); + w[53] = hc_byte_perm (w[51], w[50], selector); + w[52] = hc_byte_perm (w[50], w[49], selector); + w[51] = hc_byte_perm (w[49], w[48], selector); + w[50] = hc_byte_perm (w[48], w[47], selector); + w[49] = hc_byte_perm (w[47], w[46], selector); + w[48] = hc_byte_perm (w[46], w[45], selector); + w[47] = hc_byte_perm (w[45], w[44], selector); + w[46] = hc_byte_perm (w[44], w[43], selector); + w[45] = hc_byte_perm (w[43], w[42], selector); + w[44] = hc_byte_perm (w[42], 
w[41], selector); + w[43] = hc_byte_perm (w[41], w[40], selector); + w[42] = hc_byte_perm (w[40], w[39], selector); + w[41] = hc_byte_perm (w[39], w[38], selector); + w[40] = hc_byte_perm (w[38], w[37], selector); + w[39] = hc_byte_perm (w[37], w[36], selector); + w[38] = hc_byte_perm (w[36], w[35], selector); + w[37] = hc_byte_perm (w[35], w[34], selector); + w[36] = hc_byte_perm (w[34], w[33], selector); + w[35] = hc_byte_perm (w[33], w[32], selector); + w[34] = hc_byte_perm (w[32], w[31], selector); + w[33] = hc_byte_perm (w[31], w[30], selector); + w[32] = hc_byte_perm (w[30], w[29], selector); + w[31] = hc_byte_perm (w[29], w[28], selector); + w[30] = hc_byte_perm (w[28], w[27], selector); + w[29] = hc_byte_perm (w[27], w[26], selector); + w[28] = hc_byte_perm (w[26], w[25], selector); + w[27] = hc_byte_perm (w[25], w[24], selector); + w[26] = hc_byte_perm (w[24], w[23], selector); + w[25] = hc_byte_perm (w[23], w[22], selector); + w[24] = hc_byte_perm (w[22], w[21], selector); + w[23] = hc_byte_perm (w[21], w[20], selector); + w[22] = hc_byte_perm (w[20], w[19], selector); + w[21] = hc_byte_perm (w[19], w[18], selector); + w[20] = hc_byte_perm (w[18], w[17], selector); + w[19] = hc_byte_perm (w[17], w[16], selector); + w[18] = hc_byte_perm (w[16], w[15], selector); + w[17] = hc_byte_perm (w[15], w[14], selector); + w[16] = hc_byte_perm (w[14], w[13], selector); + w[15] = hc_byte_perm (w[13], w[12], selector); + w[14] = hc_byte_perm (w[12], w[11], selector); + w[13] = hc_byte_perm (w[11], w[10], selector); + w[12] = hc_byte_perm (w[10], w[ 9], selector); + w[11] = hc_byte_perm (w[ 9], w[ 8], selector); + w[10] = hc_byte_perm (w[ 8], w[ 7], selector); + w[ 9] = hc_byte_perm (w[ 7], w[ 6], selector); + w[ 8] = hc_byte_perm (w[ 6], w[ 5], selector); + w[ 7] = hc_byte_perm (w[ 5], w[ 4], selector); + w[ 6] = hc_byte_perm (w[ 4], w[ 3], selector); + w[ 5] = hc_byte_perm (w[ 3], w[ 2], selector); + w[ 4] = hc_byte_perm (w[ 2], w[ 1], selector); + w[ 3] = 
hc_byte_perm (w[ 1], w[ 0], selector); + w[ 2] = hc_byte_perm (w[ 0], 0, selector); + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 3: + w[63] = hc_byte_perm (w[60], w[59], selector); + w[62] = hc_byte_perm (w[59], w[58], selector); + w[61] = hc_byte_perm (w[58], w[57], selector); + w[60] = hc_byte_perm (w[57], w[56], selector); + w[59] = hc_byte_perm (w[56], w[55], selector); + w[58] = hc_byte_perm (w[55], w[54], selector); + w[57] = hc_byte_perm (w[54], w[53], selector); + w[56] = hc_byte_perm (w[53], w[52], selector); + w[55] = hc_byte_perm (w[52], w[51], selector); + w[54] = hc_byte_perm (w[51], w[50], selector); + w[53] = hc_byte_perm (w[50], w[49], selector); + w[52] = hc_byte_perm (w[49], w[48], selector); + w[51] = hc_byte_perm (w[48], w[47], selector); + w[50] = hc_byte_perm (w[47], w[46], selector); + w[49] = hc_byte_perm (w[46], w[45], selector); + w[48] = hc_byte_perm (w[45], w[44], selector); + w[47] = hc_byte_perm (w[44], w[43], selector); + w[46] = hc_byte_perm (w[43], w[42], selector); + w[45] = hc_byte_perm (w[42], w[41], selector); + w[44] = hc_byte_perm (w[41], w[40], selector); + w[43] = hc_byte_perm (w[40], w[39], selector); + w[42] = hc_byte_perm (w[39], w[38], selector); + w[41] = hc_byte_perm (w[38], w[37], selector); + w[40] = hc_byte_perm (w[37], w[36], selector); + w[39] = hc_byte_perm (w[36], w[35], selector); + w[38] = hc_byte_perm (w[35], w[34], selector); + w[37] = hc_byte_perm (w[34], w[33], selector); + w[36] = hc_byte_perm (w[33], w[32], selector); + w[35] = hc_byte_perm (w[32], w[31], selector); + w[34] = hc_byte_perm (w[31], w[30], selector); + w[33] = hc_byte_perm (w[30], w[29], selector); + w[32] = hc_byte_perm (w[29], w[28], selector); + w[31] = hc_byte_perm (w[28], w[27], selector); + w[30] = hc_byte_perm (w[27], w[26], selector); + w[29] = hc_byte_perm (w[26], w[25], selector); + w[28] = hc_byte_perm (w[25], w[24], selector); + w[27] = hc_byte_perm (w[24], w[23], selector); + w[26] = hc_byte_perm (w[23], w[22], selector); + 
w[25] = hc_byte_perm (w[22], w[21], selector); + w[24] = hc_byte_perm (w[21], w[20], selector); + w[23] = hc_byte_perm (w[20], w[19], selector); + w[22] = hc_byte_perm (w[19], w[18], selector); + w[21] = hc_byte_perm (w[18], w[17], selector); + w[20] = hc_byte_perm (w[17], w[16], selector); + w[19] = hc_byte_perm (w[16], w[15], selector); + w[18] = hc_byte_perm (w[15], w[14], selector); + w[17] = hc_byte_perm (w[14], w[13], selector); + w[16] = hc_byte_perm (w[13], w[12], selector); + w[15] = hc_byte_perm (w[12], w[11], selector); + w[14] = hc_byte_perm (w[11], w[10], selector); + w[13] = hc_byte_perm (w[10], w[ 9], selector); + w[12] = hc_byte_perm (w[ 9], w[ 8], selector); + w[11] = hc_byte_perm (w[ 8], w[ 7], selector); + w[10] = hc_byte_perm (w[ 7], w[ 6], selector); + w[ 9] = hc_byte_perm (w[ 6], w[ 5], selector); + w[ 8] = hc_byte_perm (w[ 5], w[ 4], selector); + w[ 7] = hc_byte_perm (w[ 4], w[ 3], selector); + w[ 6] = hc_byte_perm (w[ 3], w[ 2], selector); + w[ 5] = hc_byte_perm (w[ 2], w[ 1], selector); + w[ 4] = hc_byte_perm (w[ 1], w[ 0], selector); + w[ 3] = hc_byte_perm (w[ 0], 0, selector); + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + break; - case 56: - w[63] = hc_bytealign_be (w[ 6], w[ 7], offset); - w[62] = hc_bytealign_be (w[ 5], w[ 6], offset); - w[61] = hc_bytealign_be (w[ 4], w[ 5], offset); - w[60] = hc_bytealign_be (w[ 3], w[ 4], offset); - w[59] = hc_bytealign_be (w[ 2], w[ 3], offset); - w[58] = hc_bytealign_be (w[ 1], w[ 2], offset); - w[57] = hc_bytealign_be (w[ 0], w[ 1], offset); - w[56] = hc_bytealign_be ( 0, w[ 0], offset); - w[55] = 0; - w[54] = 0; - w[53] = 0; - w[52] = 0; - w[51] = 0; - w[50] = 0; - w[49] = 0; - w[48] = 0; - w[47] = 0; - w[46] = 0; - w[45] = 0; - w[44] = 0; - w[43] = 0; - w[42] = 0; - w[41] = 0; - w[40] = 0; - w[39] = 0; - w[38] = 0; - w[37] = 0; - w[36] = 0; - w[35] = 0; - w[34] = 0; - w[33] = 0; - w[32] = 0; - w[31] = 0; - w[30] = 0; - w[29] = 0; - w[28] = 0; - w[27] = 0; - w[26] = 0; - w[25] = 0; - w[24] = 0; - 
w[23] = 0; - w[22] = 0; - w[21] = 0; - w[20] = 0; - w[19] = 0; - w[18] = 0; - w[17] = 0; - w[16] = 0; - w[15] = 0; - w[14] = 0; - w[13] = 0; - w[12] = 0; - w[11] = 0; - w[10] = 0; - w[ 9] = 0; - w[ 8] = 0; - w[ 7] = 0; - w[ 6] = 0; - w[ 5] = 0; - w[ 4] = 0; + case 4: + w[63] = hc_byte_perm (w[59], w[58], selector); + w[62] = hc_byte_perm (w[58], w[57], selector); + w[61] = hc_byte_perm (w[57], w[56], selector); + w[60] = hc_byte_perm (w[56], w[55], selector); + w[59] = hc_byte_perm (w[55], w[54], selector); + w[58] = hc_byte_perm (w[54], w[53], selector); + w[57] = hc_byte_perm (w[53], w[52], selector); + w[56] = hc_byte_perm (w[52], w[51], selector); + w[55] = hc_byte_perm (w[51], w[50], selector); + w[54] = hc_byte_perm (w[50], w[49], selector); + w[53] = hc_byte_perm (w[49], w[48], selector); + w[52] = hc_byte_perm (w[48], w[47], selector); + w[51] = hc_byte_perm (w[47], w[46], selector); + w[50] = hc_byte_perm (w[46], w[45], selector); + w[49] = hc_byte_perm (w[45], w[44], selector); + w[48] = hc_byte_perm (w[44], w[43], selector); + w[47] = hc_byte_perm (w[43], w[42], selector); + w[46] = hc_byte_perm (w[42], w[41], selector); + w[45] = hc_byte_perm (w[41], w[40], selector); + w[44] = hc_byte_perm (w[40], w[39], selector); + w[43] = hc_byte_perm (w[39], w[38], selector); + w[42] = hc_byte_perm (w[38], w[37], selector); + w[41] = hc_byte_perm (w[37], w[36], selector); + w[40] = hc_byte_perm (w[36], w[35], selector); + w[39] = hc_byte_perm (w[35], w[34], selector); + w[38] = hc_byte_perm (w[34], w[33], selector); + w[37] = hc_byte_perm (w[33], w[32], selector); + w[36] = hc_byte_perm (w[32], w[31], selector); + w[35] = hc_byte_perm (w[31], w[30], selector); + w[34] = hc_byte_perm (w[30], w[29], selector); + w[33] = hc_byte_perm (w[29], w[28], selector); + w[32] = hc_byte_perm (w[28], w[27], selector); + w[31] = hc_byte_perm (w[27], w[26], selector); + w[30] = hc_byte_perm (w[26], w[25], selector); + w[29] = hc_byte_perm (w[25], w[24], selector); + w[28] = 
hc_byte_perm (w[24], w[23], selector); + w[27] = hc_byte_perm (w[23], w[22], selector); + w[26] = hc_byte_perm (w[22], w[21], selector); + w[25] = hc_byte_perm (w[21], w[20], selector); + w[24] = hc_byte_perm (w[20], w[19], selector); + w[23] = hc_byte_perm (w[19], w[18], selector); + w[22] = hc_byte_perm (w[18], w[17], selector); + w[21] = hc_byte_perm (w[17], w[16], selector); + w[20] = hc_byte_perm (w[16], w[15], selector); + w[19] = hc_byte_perm (w[15], w[14], selector); + w[18] = hc_byte_perm (w[14], w[13], selector); + w[17] = hc_byte_perm (w[13], w[12], selector); + w[16] = hc_byte_perm (w[12], w[11], selector); + w[15] = hc_byte_perm (w[11], w[10], selector); + w[14] = hc_byte_perm (w[10], w[ 9], selector); + w[13] = hc_byte_perm (w[ 9], w[ 8], selector); + w[12] = hc_byte_perm (w[ 8], w[ 7], selector); + w[11] = hc_byte_perm (w[ 7], w[ 6], selector); + w[10] = hc_byte_perm (w[ 6], w[ 5], selector); + w[ 9] = hc_byte_perm (w[ 5], w[ 4], selector); + w[ 8] = hc_byte_perm (w[ 4], w[ 3], selector); + w[ 7] = hc_byte_perm (w[ 3], w[ 2], selector); + w[ 6] = hc_byte_perm (w[ 2], w[ 1], selector); + w[ 5] = hc_byte_perm (w[ 1], w[ 0], selector); + w[ 4] = hc_byte_perm (w[ 0], 0, selector); w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; @@ -25826,66 +30044,66 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; - case 57: - w[63] = hc_bytealign_be (w[ 5], w[ 6], offset); - w[62] = hc_bytealign_be (w[ 4], w[ 5], offset); - w[61] = hc_bytealign_be (w[ 3], w[ 4], offset); - w[60] = hc_bytealign_be (w[ 2], w[ 3], offset); - w[59] = hc_bytealign_be (w[ 1], w[ 2], offset); - w[58] = hc_bytealign_be (w[ 0], w[ 1], offset); - w[57] = hc_bytealign_be ( 0, w[ 0], offset); - w[56] = 0; - w[55] = 0; - w[54] = 0; - w[53] = 0; - w[52] = 0; - w[51] = 0; - w[50] = 0; - w[49] = 0; - w[48] = 0; - w[47] = 0; - w[46] = 0; - w[45] = 0; - w[44] = 0; - w[43] = 0; - w[42] = 0; - w[41] = 0; - w[40] = 0; - w[39] = 0; - w[38] = 0; - w[37] = 0; - w[36] = 0; - w[35] = 0; - 
w[34] = 0; - w[33] = 0; - w[32] = 0; - w[31] = 0; - w[30] = 0; - w[29] = 0; - w[28] = 0; - w[27] = 0; - w[26] = 0; - w[25] = 0; - w[24] = 0; - w[23] = 0; - w[22] = 0; - w[21] = 0; - w[20] = 0; - w[19] = 0; - w[18] = 0; - w[17] = 0; - w[16] = 0; - w[15] = 0; - w[14] = 0; - w[13] = 0; - w[12] = 0; - w[11] = 0; - w[10] = 0; - w[ 9] = 0; - w[ 8] = 0; - w[ 7] = 0; - w[ 6] = 0; - w[ 5] = 0; + case 5: + w[63] = hc_byte_perm (w[58], w[57], selector); + w[62] = hc_byte_perm (w[57], w[56], selector); + w[61] = hc_byte_perm (w[56], w[55], selector); + w[60] = hc_byte_perm (w[55], w[54], selector); + w[59] = hc_byte_perm (w[54], w[53], selector); + w[58] = hc_byte_perm (w[53], w[52], selector); + w[57] = hc_byte_perm (w[52], w[51], selector); + w[56] = hc_byte_perm (w[51], w[50], selector); + w[55] = hc_byte_perm (w[50], w[49], selector); + w[54] = hc_byte_perm (w[49], w[48], selector); + w[53] = hc_byte_perm (w[48], w[47], selector); + w[52] = hc_byte_perm (w[47], w[46], selector); + w[51] = hc_byte_perm (w[46], w[45], selector); + w[50] = hc_byte_perm (w[45], w[44], selector); + w[49] = hc_byte_perm (w[44], w[43], selector); + w[48] = hc_byte_perm (w[43], w[42], selector); + w[47] = hc_byte_perm (w[42], w[41], selector); + w[46] = hc_byte_perm (w[41], w[40], selector); + w[45] = hc_byte_perm (w[40], w[39], selector); + w[44] = hc_byte_perm (w[39], w[38], selector); + w[43] = hc_byte_perm (w[38], w[37], selector); + w[42] = hc_byte_perm (w[37], w[36], selector); + w[41] = hc_byte_perm (w[36], w[35], selector); + w[40] = hc_byte_perm (w[35], w[34], selector); + w[39] = hc_byte_perm (w[34], w[33], selector); + w[38] = hc_byte_perm (w[33], w[32], selector); + w[37] = hc_byte_perm (w[32], w[31], selector); + w[36] = hc_byte_perm (w[31], w[30], selector); + w[35] = hc_byte_perm (w[30], w[29], selector); + w[34] = hc_byte_perm (w[29], w[28], selector); + w[33] = hc_byte_perm (w[28], w[27], selector); + w[32] = hc_byte_perm (w[27], w[26], selector); + w[31] = hc_byte_perm (w[26], 
w[25], selector); + w[30] = hc_byte_perm (w[25], w[24], selector); + w[29] = hc_byte_perm (w[24], w[23], selector); + w[28] = hc_byte_perm (w[23], w[22], selector); + w[27] = hc_byte_perm (w[22], w[21], selector); + w[26] = hc_byte_perm (w[21], w[20], selector); + w[25] = hc_byte_perm (w[20], w[19], selector); + w[24] = hc_byte_perm (w[19], w[18], selector); + w[23] = hc_byte_perm (w[18], w[17], selector); + w[22] = hc_byte_perm (w[17], w[16], selector); + w[21] = hc_byte_perm (w[16], w[15], selector); + w[20] = hc_byte_perm (w[15], w[14], selector); + w[19] = hc_byte_perm (w[14], w[13], selector); + w[18] = hc_byte_perm (w[13], w[12], selector); + w[17] = hc_byte_perm (w[12], w[11], selector); + w[16] = hc_byte_perm (w[11], w[10], selector); + w[15] = hc_byte_perm (w[10], w[ 9], selector); + w[14] = hc_byte_perm (w[ 9], w[ 8], selector); + w[13] = hc_byte_perm (w[ 8], w[ 7], selector); + w[12] = hc_byte_perm (w[ 7], w[ 6], selector); + w[11] = hc_byte_perm (w[ 6], w[ 5], selector); + w[10] = hc_byte_perm (w[ 5], w[ 4], selector); + w[ 9] = hc_byte_perm (w[ 4], w[ 3], selector); + w[ 8] = hc_byte_perm (w[ 3], w[ 2], selector); + w[ 7] = hc_byte_perm (w[ 2], w[ 1], selector); + w[ 6] = hc_byte_perm (w[ 1], w[ 0], selector); + w[ 5] = hc_byte_perm (w[ 0], 0, selector); w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; @@ -25894,65 +30112,65 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; - case 58: - w[63] = hc_bytealign_be (w[ 4], w[ 5], offset); - w[62] = hc_bytealign_be (w[ 3], w[ 4], offset); - w[61] = hc_bytealign_be (w[ 2], w[ 3], offset); - w[60] = hc_bytealign_be (w[ 1], w[ 2], offset); - w[59] = hc_bytealign_be (w[ 0], w[ 1], offset); - w[58] = hc_bytealign_be ( 0, w[ 0], offset); - w[57] = 0; - w[56] = 0; - w[55] = 0; - w[54] = 0; - w[53] = 0; - w[52] = 0; - w[51] = 0; - w[50] = 0; - w[49] = 0; - w[48] = 0; - w[47] = 0; - w[46] = 0; - w[45] = 0; - w[44] = 0; - w[43] = 0; - w[42] = 0; - w[41] = 0; - w[40] = 0; - w[39] = 0; - w[38] = 0; - 
w[37] = 0; - w[36] = 0; - w[35] = 0; - w[34] = 0; - w[33] = 0; - w[32] = 0; - w[31] = 0; - w[30] = 0; - w[29] = 0; - w[28] = 0; - w[27] = 0; - w[26] = 0; - w[25] = 0; - w[24] = 0; - w[23] = 0; - w[22] = 0; - w[21] = 0; - w[20] = 0; - w[19] = 0; - w[18] = 0; - w[17] = 0; - w[16] = 0; - w[15] = 0; - w[14] = 0; - w[13] = 0; - w[12] = 0; - w[11] = 0; - w[10] = 0; - w[ 9] = 0; - w[ 8] = 0; - w[ 7] = 0; - w[ 6] = 0; + case 6: + w[63] = hc_byte_perm (w[57], w[56], selector); + w[62] = hc_byte_perm (w[56], w[55], selector); + w[61] = hc_byte_perm (w[55], w[54], selector); + w[60] = hc_byte_perm (w[54], w[53], selector); + w[59] = hc_byte_perm (w[53], w[52], selector); + w[58] = hc_byte_perm (w[52], w[51], selector); + w[57] = hc_byte_perm (w[51], w[50], selector); + w[56] = hc_byte_perm (w[50], w[49], selector); + w[55] = hc_byte_perm (w[49], w[48], selector); + w[54] = hc_byte_perm (w[48], w[47], selector); + w[53] = hc_byte_perm (w[47], w[46], selector); + w[52] = hc_byte_perm (w[46], w[45], selector); + w[51] = hc_byte_perm (w[45], w[44], selector); + w[50] = hc_byte_perm (w[44], w[43], selector); + w[49] = hc_byte_perm (w[43], w[42], selector); + w[48] = hc_byte_perm (w[42], w[41], selector); + w[47] = hc_byte_perm (w[41], w[40], selector); + w[46] = hc_byte_perm (w[40], w[39], selector); + w[45] = hc_byte_perm (w[39], w[38], selector); + w[44] = hc_byte_perm (w[38], w[37], selector); + w[43] = hc_byte_perm (w[37], w[36], selector); + w[42] = hc_byte_perm (w[36], w[35], selector); + w[41] = hc_byte_perm (w[35], w[34], selector); + w[40] = hc_byte_perm (w[34], w[33], selector); + w[39] = hc_byte_perm (w[33], w[32], selector); + w[38] = hc_byte_perm (w[32], w[31], selector); + w[37] = hc_byte_perm (w[31], w[30], selector); + w[36] = hc_byte_perm (w[30], w[29], selector); + w[35] = hc_byte_perm (w[29], w[28], selector); + w[34] = hc_byte_perm (w[28], w[27], selector); + w[33] = hc_byte_perm (w[27], w[26], selector); + w[32] = hc_byte_perm (w[26], w[25], selector); + w[31] 
= hc_byte_perm (w[25], w[24], selector); + w[30] = hc_byte_perm (w[24], w[23], selector); + w[29] = hc_byte_perm (w[23], w[22], selector); + w[28] = hc_byte_perm (w[22], w[21], selector); + w[27] = hc_byte_perm (w[21], w[20], selector); + w[26] = hc_byte_perm (w[20], w[19], selector); + w[25] = hc_byte_perm (w[19], w[18], selector); + w[24] = hc_byte_perm (w[18], w[17], selector); + w[23] = hc_byte_perm (w[17], w[16], selector); + w[22] = hc_byte_perm (w[16], w[15], selector); + w[21] = hc_byte_perm (w[15], w[14], selector); + w[20] = hc_byte_perm (w[14], w[13], selector); + w[19] = hc_byte_perm (w[13], w[12], selector); + w[18] = hc_byte_perm (w[12], w[11], selector); + w[17] = hc_byte_perm (w[11], w[10], selector); + w[16] = hc_byte_perm (w[10], w[ 9], selector); + w[15] = hc_byte_perm (w[ 9], w[ 8], selector); + w[14] = hc_byte_perm (w[ 8], w[ 7], selector); + w[13] = hc_byte_perm (w[ 7], w[ 6], selector); + w[12] = hc_byte_perm (w[ 6], w[ 5], selector); + w[11] = hc_byte_perm (w[ 5], w[ 4], selector); + w[10] = hc_byte_perm (w[ 4], w[ 3], selector); + w[ 9] = hc_byte_perm (w[ 3], w[ 2], selector); + w[ 8] = hc_byte_perm (w[ 2], w[ 1], selector); + w[ 7] = hc_byte_perm (w[ 1], w[ 0], selector); + w[ 6] = hc_byte_perm (w[ 0], 0, selector); w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; @@ -25962,63 +30180,131 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; - case 59: - w[63] = hc_bytealign_be (w[ 3], w[ 4], offset); - w[62] = hc_bytealign_be (w[ 2], w[ 3], offset); - w[61] = hc_bytealign_be (w[ 1], w[ 2], offset); - w[60] = hc_bytealign_be (w[ 0], w[ 1], offset); - w[59] = hc_bytealign_be ( 0, w[ 0], offset); - w[58] = 0; - w[57] = 0; - w[56] = 0; - w[55] = 0; - w[54] = 0; - w[53] = 0; - w[52] = 0; - w[51] = 0; - w[50] = 0; - w[49] = 0; - w[48] = 0; - w[47] = 0; - w[46] = 0; - w[45] = 0; - w[44] = 0; - w[43] = 0; - w[42] = 0; - w[41] = 0; - w[40] = 0; - w[39] = 0; - w[38] = 0; - w[37] = 0; - w[36] = 0; - w[35] = 0; - w[34] = 0; - w[33] = 0; 
- w[32] = 0; - w[31] = 0; - w[30] = 0; - w[29] = 0; - w[28] = 0; - w[27] = 0; - w[26] = 0; - w[25] = 0; - w[24] = 0; - w[23] = 0; - w[22] = 0; - w[21] = 0; - w[20] = 0; - w[19] = 0; - w[18] = 0; - w[17] = 0; - w[16] = 0; - w[15] = 0; - w[14] = 0; - w[13] = 0; - w[12] = 0; - w[11] = 0; - w[10] = 0; - w[ 9] = 0; - w[ 8] = 0; + case 7: + w[63] = hc_byte_perm (w[56], w[55], selector); + w[62] = hc_byte_perm (w[55], w[54], selector); + w[61] = hc_byte_perm (w[54], w[53], selector); + w[60] = hc_byte_perm (w[53], w[52], selector); + w[59] = hc_byte_perm (w[52], w[51], selector); + w[58] = hc_byte_perm (w[51], w[50], selector); + w[57] = hc_byte_perm (w[50], w[49], selector); + w[56] = hc_byte_perm (w[49], w[48], selector); + w[55] = hc_byte_perm (w[48], w[47], selector); + w[54] = hc_byte_perm (w[47], w[46], selector); + w[53] = hc_byte_perm (w[46], w[45], selector); + w[52] = hc_byte_perm (w[45], w[44], selector); + w[51] = hc_byte_perm (w[44], w[43], selector); + w[50] = hc_byte_perm (w[43], w[42], selector); + w[49] = hc_byte_perm (w[42], w[41], selector); + w[48] = hc_byte_perm (w[41], w[40], selector); + w[47] = hc_byte_perm (w[40], w[39], selector); + w[46] = hc_byte_perm (w[39], w[38], selector); + w[45] = hc_byte_perm (w[38], w[37], selector); + w[44] = hc_byte_perm (w[37], w[36], selector); + w[43] = hc_byte_perm (w[36], w[35], selector); + w[42] = hc_byte_perm (w[35], w[34], selector); + w[41] = hc_byte_perm (w[34], w[33], selector); + w[40] = hc_byte_perm (w[33], w[32], selector); + w[39] = hc_byte_perm (w[32], w[31], selector); + w[38] = hc_byte_perm (w[31], w[30], selector); + w[37] = hc_byte_perm (w[30], w[29], selector); + w[36] = hc_byte_perm (w[29], w[28], selector); + w[35] = hc_byte_perm (w[28], w[27], selector); + w[34] = hc_byte_perm (w[27], w[26], selector); + w[33] = hc_byte_perm (w[26], w[25], selector); + w[32] = hc_byte_perm (w[25], w[24], selector); + w[31] = hc_byte_perm (w[24], w[23], selector); + w[30] = hc_byte_perm (w[23], w[22], 
selector); + w[29] = hc_byte_perm (w[22], w[21], selector); + w[28] = hc_byte_perm (w[21], w[20], selector); + w[27] = hc_byte_perm (w[20], w[19], selector); + w[26] = hc_byte_perm (w[19], w[18], selector); + w[25] = hc_byte_perm (w[18], w[17], selector); + w[24] = hc_byte_perm (w[17], w[16], selector); + w[23] = hc_byte_perm (w[16], w[15], selector); + w[22] = hc_byte_perm (w[15], w[14], selector); + w[21] = hc_byte_perm (w[14], w[13], selector); + w[20] = hc_byte_perm (w[13], w[12], selector); + w[19] = hc_byte_perm (w[12], w[11], selector); + w[18] = hc_byte_perm (w[11], w[10], selector); + w[17] = hc_byte_perm (w[10], w[ 9], selector); + w[16] = hc_byte_perm (w[ 9], w[ 8], selector); + w[15] = hc_byte_perm (w[ 8], w[ 7], selector); + w[14] = hc_byte_perm (w[ 7], w[ 6], selector); + w[13] = hc_byte_perm (w[ 6], w[ 5], selector); + w[12] = hc_byte_perm (w[ 5], w[ 4], selector); + w[11] = hc_byte_perm (w[ 4], w[ 3], selector); + w[10] = hc_byte_perm (w[ 3], w[ 2], selector); + w[ 9] = hc_byte_perm (w[ 2], w[ 1], selector); + w[ 8] = hc_byte_perm (w[ 1], w[ 0], selector); + w[ 7] = hc_byte_perm (w[ 0], 0, selector); + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 8: + w[63] = hc_byte_perm (w[55], w[54], selector); + w[62] = hc_byte_perm (w[54], w[53], selector); + w[61] = hc_byte_perm (w[53], w[52], selector); + w[60] = hc_byte_perm (w[52], w[51], selector); + w[59] = hc_byte_perm (w[51], w[50], selector); + w[58] = hc_byte_perm (w[50], w[49], selector); + w[57] = hc_byte_perm (w[49], w[48], selector); + w[56] = hc_byte_perm (w[48], w[47], selector); + w[55] = hc_byte_perm (w[47], w[46], selector); + w[54] = hc_byte_perm (w[46], w[45], selector); + w[53] = hc_byte_perm (w[45], w[44], selector); + w[52] = hc_byte_perm (w[44], w[43], selector); + w[51] = hc_byte_perm (w[43], w[42], selector); + w[50] = hc_byte_perm (w[42], w[41], selector); + w[49] = hc_byte_perm (w[41], w[40], selector); + w[48] = 
hc_byte_perm (w[40], w[39], selector); + w[47] = hc_byte_perm (w[39], w[38], selector); + w[46] = hc_byte_perm (w[38], w[37], selector); + w[45] = hc_byte_perm (w[37], w[36], selector); + w[44] = hc_byte_perm (w[36], w[35], selector); + w[43] = hc_byte_perm (w[35], w[34], selector); + w[42] = hc_byte_perm (w[34], w[33], selector); + w[41] = hc_byte_perm (w[33], w[32], selector); + w[40] = hc_byte_perm (w[32], w[31], selector); + w[39] = hc_byte_perm (w[31], w[30], selector); + w[38] = hc_byte_perm (w[30], w[29], selector); + w[37] = hc_byte_perm (w[29], w[28], selector); + w[36] = hc_byte_perm (w[28], w[27], selector); + w[35] = hc_byte_perm (w[27], w[26], selector); + w[34] = hc_byte_perm (w[26], w[25], selector); + w[33] = hc_byte_perm (w[25], w[24], selector); + w[32] = hc_byte_perm (w[24], w[23], selector); + w[31] = hc_byte_perm (w[23], w[22], selector); + w[30] = hc_byte_perm (w[22], w[21], selector); + w[29] = hc_byte_perm (w[21], w[20], selector); + w[28] = hc_byte_perm (w[20], w[19], selector); + w[27] = hc_byte_perm (w[19], w[18], selector); + w[26] = hc_byte_perm (w[18], w[17], selector); + w[25] = hc_byte_perm (w[17], w[16], selector); + w[24] = hc_byte_perm (w[16], w[15], selector); + w[23] = hc_byte_perm (w[15], w[14], selector); + w[22] = hc_byte_perm (w[14], w[13], selector); + w[21] = hc_byte_perm (w[13], w[12], selector); + w[20] = hc_byte_perm (w[12], w[11], selector); + w[19] = hc_byte_perm (w[11], w[10], selector); + w[18] = hc_byte_perm (w[10], w[ 9], selector); + w[17] = hc_byte_perm (w[ 9], w[ 8], selector); + w[16] = hc_byte_perm (w[ 8], w[ 7], selector); + w[15] = hc_byte_perm (w[ 7], w[ 6], selector); + w[14] = hc_byte_perm (w[ 6], w[ 5], selector); + w[13] = hc_byte_perm (w[ 5], w[ 4], selector); + w[12] = hc_byte_perm (w[ 4], w[ 3], selector); + w[11] = hc_byte_perm (w[ 3], w[ 2], selector); + w[10] = hc_byte_perm (w[ 2], w[ 1], selector); + w[ 9] = hc_byte_perm (w[ 1], w[ 0], selector); + w[ 8] = hc_byte_perm (w[ 0], 0, selector); w[ 
7] = 0; w[ 6] = 0; w[ 5] = 0; @@ -26030,62 +30316,62 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; - case 60: - w[63] = hc_bytealign_be (w[ 2], w[ 3], offset); - w[62] = hc_bytealign_be (w[ 1], w[ 2], offset); - w[61] = hc_bytealign_be (w[ 0], w[ 1], offset); - w[60] = hc_bytealign_be ( 0, w[ 0], offset); - w[59] = 0; - w[58] = 0; - w[57] = 0; - w[56] = 0; - w[55] = 0; - w[54] = 0; - w[53] = 0; - w[52] = 0; - w[51] = 0; - w[50] = 0; - w[49] = 0; - w[48] = 0; - w[47] = 0; - w[46] = 0; - w[45] = 0; - w[44] = 0; - w[43] = 0; - w[42] = 0; - w[41] = 0; - w[40] = 0; - w[39] = 0; - w[38] = 0; - w[37] = 0; - w[36] = 0; - w[35] = 0; - w[34] = 0; - w[33] = 0; - w[32] = 0; - w[31] = 0; - w[30] = 0; - w[29] = 0; - w[28] = 0; - w[27] = 0; - w[26] = 0; - w[25] = 0; - w[24] = 0; - w[23] = 0; - w[22] = 0; - w[21] = 0; - w[20] = 0; - w[19] = 0; - w[18] = 0; - w[17] = 0; - w[16] = 0; - w[15] = 0; - w[14] = 0; - w[13] = 0; - w[12] = 0; - w[11] = 0; - w[10] = 0; - w[ 9] = 0; + case 9: + w[63] = hc_byte_perm (w[54], w[53], selector); + w[62] = hc_byte_perm (w[53], w[52], selector); + w[61] = hc_byte_perm (w[52], w[51], selector); + w[60] = hc_byte_perm (w[51], w[50], selector); + w[59] = hc_byte_perm (w[50], w[49], selector); + w[58] = hc_byte_perm (w[49], w[48], selector); + w[57] = hc_byte_perm (w[48], w[47], selector); + w[56] = hc_byte_perm (w[47], w[46], selector); + w[55] = hc_byte_perm (w[46], w[45], selector); + w[54] = hc_byte_perm (w[45], w[44], selector); + w[53] = hc_byte_perm (w[44], w[43], selector); + w[52] = hc_byte_perm (w[43], w[42], selector); + w[51] = hc_byte_perm (w[42], w[41], selector); + w[50] = hc_byte_perm (w[41], w[40], selector); + w[49] = hc_byte_perm (w[40], w[39], selector); + w[48] = hc_byte_perm (w[39], w[38], selector); + w[47] = hc_byte_perm (w[38], w[37], selector); + w[46] = hc_byte_perm (w[37], w[36], selector); + w[45] = hc_byte_perm (w[36], w[35], selector); + w[44] = hc_byte_perm (w[35], w[34], selector); + 
w[43] = hc_byte_perm (w[34], w[33], selector); + w[42] = hc_byte_perm (w[33], w[32], selector); + w[41] = hc_byte_perm (w[32], w[31], selector); + w[40] = hc_byte_perm (w[31], w[30], selector); + w[39] = hc_byte_perm (w[30], w[29], selector); + w[38] = hc_byte_perm (w[29], w[28], selector); + w[37] = hc_byte_perm (w[28], w[27], selector); + w[36] = hc_byte_perm (w[27], w[26], selector); + w[35] = hc_byte_perm (w[26], w[25], selector); + w[34] = hc_byte_perm (w[25], w[24], selector); + w[33] = hc_byte_perm (w[24], w[23], selector); + w[32] = hc_byte_perm (w[23], w[22], selector); + w[31] = hc_byte_perm (w[22], w[21], selector); + w[30] = hc_byte_perm (w[21], w[20], selector); + w[29] = hc_byte_perm (w[20], w[19], selector); + w[28] = hc_byte_perm (w[19], w[18], selector); + w[27] = hc_byte_perm (w[18], w[17], selector); + w[26] = hc_byte_perm (w[17], w[16], selector); + w[25] = hc_byte_perm (w[16], w[15], selector); + w[24] = hc_byte_perm (w[15], w[14], selector); + w[23] = hc_byte_perm (w[14], w[13], selector); + w[22] = hc_byte_perm (w[13], w[12], selector); + w[21] = hc_byte_perm (w[12], w[11], selector); + w[20] = hc_byte_perm (w[11], w[10], selector); + w[19] = hc_byte_perm (w[10], w[ 9], selector); + w[18] = hc_byte_perm (w[ 9], w[ 8], selector); + w[17] = hc_byte_perm (w[ 8], w[ 7], selector); + w[16] = hc_byte_perm (w[ 7], w[ 6], selector); + w[15] = hc_byte_perm (w[ 6], w[ 5], selector); + w[14] = hc_byte_perm (w[ 5], w[ 4], selector); + w[13] = hc_byte_perm (w[ 4], w[ 3], selector); + w[12] = hc_byte_perm (w[ 3], w[ 2], selector); + w[11] = hc_byte_perm (w[ 2], w[ 1], selector); + w[10] = hc_byte_perm (w[ 1], w[ 0], selector); + w[ 9] = hc_byte_perm (w[ 0], 0, selector); w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; @@ -26098,61 +30384,61 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; - case 61: - w[63] = hc_bytealign_be (w[ 1], w[ 2], offset); - w[62] = hc_bytealign_be (w[ 0], w[ 1], offset); - w[61] = hc_bytealign_be ( 0, w[ 0], 
offset); - w[60] = 0; - w[59] = 0; - w[58] = 0; - w[57] = 0; - w[56] = 0; - w[55] = 0; - w[54] = 0; - w[53] = 0; - w[52] = 0; - w[51] = 0; - w[50] = 0; - w[49] = 0; - w[48] = 0; - w[47] = 0; - w[46] = 0; - w[45] = 0; - w[44] = 0; - w[43] = 0; - w[42] = 0; - w[41] = 0; - w[40] = 0; - w[39] = 0; - w[38] = 0; - w[37] = 0; - w[36] = 0; - w[35] = 0; - w[34] = 0; - w[33] = 0; - w[32] = 0; - w[31] = 0; - w[30] = 0; - w[29] = 0; - w[28] = 0; - w[27] = 0; - w[26] = 0; - w[25] = 0; - w[24] = 0; - w[23] = 0; - w[22] = 0; - w[21] = 0; - w[20] = 0; - w[19] = 0; - w[18] = 0; - w[17] = 0; - w[16] = 0; - w[15] = 0; - w[14] = 0; - w[13] = 0; - w[12] = 0; - w[11] = 0; - w[10] = 0; + case 10: + w[63] = hc_byte_perm (w[53], w[52], selector); + w[62] = hc_byte_perm (w[52], w[51], selector); + w[61] = hc_byte_perm (w[51], w[50], selector); + w[60] = hc_byte_perm (w[50], w[49], selector); + w[59] = hc_byte_perm (w[49], w[48], selector); + w[58] = hc_byte_perm (w[48], w[47], selector); + w[57] = hc_byte_perm (w[47], w[46], selector); + w[56] = hc_byte_perm (w[46], w[45], selector); + w[55] = hc_byte_perm (w[45], w[44], selector); + w[54] = hc_byte_perm (w[44], w[43], selector); + w[53] = hc_byte_perm (w[43], w[42], selector); + w[52] = hc_byte_perm (w[42], w[41], selector); + w[51] = hc_byte_perm (w[41], w[40], selector); + w[50] = hc_byte_perm (w[40], w[39], selector); + w[49] = hc_byte_perm (w[39], w[38], selector); + w[48] = hc_byte_perm (w[38], w[37], selector); + w[47] = hc_byte_perm (w[37], w[36], selector); + w[46] = hc_byte_perm (w[36], w[35], selector); + w[45] = hc_byte_perm (w[35], w[34], selector); + w[44] = hc_byte_perm (w[34], w[33], selector); + w[43] = hc_byte_perm (w[33], w[32], selector); + w[42] = hc_byte_perm (w[32], w[31], selector); + w[41] = hc_byte_perm (w[31], w[30], selector); + w[40] = hc_byte_perm (w[30], w[29], selector); + w[39] = hc_byte_perm (w[29], w[28], selector); + w[38] = hc_byte_perm (w[28], w[27], selector); + w[37] = hc_byte_perm (w[27], w[26], 
selector); + w[36] = hc_byte_perm (w[26], w[25], selector); + w[35] = hc_byte_perm (w[25], w[24], selector); + w[34] = hc_byte_perm (w[24], w[23], selector); + w[33] = hc_byte_perm (w[23], w[22], selector); + w[32] = hc_byte_perm (w[22], w[21], selector); + w[31] = hc_byte_perm (w[21], w[20], selector); + w[30] = hc_byte_perm (w[20], w[19], selector); + w[29] = hc_byte_perm (w[19], w[18], selector); + w[28] = hc_byte_perm (w[18], w[17], selector); + w[27] = hc_byte_perm (w[17], w[16], selector); + w[26] = hc_byte_perm (w[16], w[15], selector); + w[25] = hc_byte_perm (w[15], w[14], selector); + w[24] = hc_byte_perm (w[14], w[13], selector); + w[23] = hc_byte_perm (w[13], w[12], selector); + w[22] = hc_byte_perm (w[12], w[11], selector); + w[21] = hc_byte_perm (w[11], w[10], selector); + w[20] = hc_byte_perm (w[10], w[ 9], selector); + w[19] = hc_byte_perm (w[ 9], w[ 8], selector); + w[18] = hc_byte_perm (w[ 8], w[ 7], selector); + w[17] = hc_byte_perm (w[ 7], w[ 6], selector); + w[16] = hc_byte_perm (w[ 6], w[ 5], selector); + w[15] = hc_byte_perm (w[ 5], w[ 4], selector); + w[14] = hc_byte_perm (w[ 4], w[ 3], selector); + w[13] = hc_byte_perm (w[ 3], w[ 2], selector); + w[12] = hc_byte_perm (w[ 2], w[ 1], selector); + w[11] = hc_byte_perm (w[ 1], w[ 0], selector); + w[10] = hc_byte_perm (w[ 0], 0, selector); w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; @@ -26166,60 +30452,60 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; - case 62: - w[63] = hc_bytealign_be (w[ 0], w[ 1], offset); - w[62] = hc_bytealign_be ( 0, w[ 0], offset); - w[61] = 0; - w[60] = 0; - w[59] = 0; - w[58] = 0; - w[57] = 0; - w[56] = 0; - w[55] = 0; - w[54] = 0; - w[53] = 0; - w[52] = 0; - w[51] = 0; - w[50] = 0; - w[49] = 0; - w[48] = 0; - w[47] = 0; - w[46] = 0; - w[45] = 0; - w[44] = 0; - w[43] = 0; - w[42] = 0; - w[41] = 0; - w[40] = 0; - w[39] = 0; - w[38] = 0; - w[37] = 0; - w[36] = 0; - w[35] = 0; - w[34] = 0; - w[33] = 0; - w[32] = 0; - w[31] = 0; - w[30] = 0; - 
w[29] = 0; - w[28] = 0; - w[27] = 0; - w[26] = 0; - w[25] = 0; - w[24] = 0; - w[23] = 0; - w[22] = 0; - w[21] = 0; - w[20] = 0; - w[19] = 0; - w[18] = 0; - w[17] = 0; - w[16] = 0; - w[15] = 0; - w[14] = 0; - w[13] = 0; - w[12] = 0; - w[11] = 0; + case 11: + w[63] = hc_byte_perm (w[52], w[51], selector); + w[62] = hc_byte_perm (w[51], w[50], selector); + w[61] = hc_byte_perm (w[50], w[49], selector); + w[60] = hc_byte_perm (w[49], w[48], selector); + w[59] = hc_byte_perm (w[48], w[47], selector); + w[58] = hc_byte_perm (w[47], w[46], selector); + w[57] = hc_byte_perm (w[46], w[45], selector); + w[56] = hc_byte_perm (w[45], w[44], selector); + w[55] = hc_byte_perm (w[44], w[43], selector); + w[54] = hc_byte_perm (w[43], w[42], selector); + w[53] = hc_byte_perm (w[42], w[41], selector); + w[52] = hc_byte_perm (w[41], w[40], selector); + w[51] = hc_byte_perm (w[40], w[39], selector); + w[50] = hc_byte_perm (w[39], w[38], selector); + w[49] = hc_byte_perm (w[38], w[37], selector); + w[48] = hc_byte_perm (w[37], w[36], selector); + w[47] = hc_byte_perm (w[36], w[35], selector); + w[46] = hc_byte_perm (w[35], w[34], selector); + w[45] = hc_byte_perm (w[34], w[33], selector); + w[44] = hc_byte_perm (w[33], w[32], selector); + w[43] = hc_byte_perm (w[32], w[31], selector); + w[42] = hc_byte_perm (w[31], w[30], selector); + w[41] = hc_byte_perm (w[30], w[29], selector); + w[40] = hc_byte_perm (w[29], w[28], selector); + w[39] = hc_byte_perm (w[28], w[27], selector); + w[38] = hc_byte_perm (w[27], w[26], selector); + w[37] = hc_byte_perm (w[26], w[25], selector); + w[36] = hc_byte_perm (w[25], w[24], selector); + w[35] = hc_byte_perm (w[24], w[23], selector); + w[34] = hc_byte_perm (w[23], w[22], selector); + w[33] = hc_byte_perm (w[22], w[21], selector); + w[32] = hc_byte_perm (w[21], w[20], selector); + w[31] = hc_byte_perm (w[20], w[19], selector); + w[30] = hc_byte_perm (w[19], w[18], selector); + w[29] = hc_byte_perm (w[18], w[17], selector); + w[28] = hc_byte_perm 
(w[17], w[16], selector); + w[27] = hc_byte_perm (w[16], w[15], selector); + w[26] = hc_byte_perm (w[15], w[14], selector); + w[25] = hc_byte_perm (w[14], w[13], selector); + w[24] = hc_byte_perm (w[13], w[12], selector); + w[23] = hc_byte_perm (w[12], w[11], selector); + w[22] = hc_byte_perm (w[11], w[10], selector); + w[21] = hc_byte_perm (w[10], w[ 9], selector); + w[20] = hc_byte_perm (w[ 9], w[ 8], selector); + w[19] = hc_byte_perm (w[ 8], w[ 7], selector); + w[18] = hc_byte_perm (w[ 7], w[ 6], selector); + w[17] = hc_byte_perm (w[ 6], w[ 5], selector); + w[16] = hc_byte_perm (w[ 5], w[ 4], selector); + w[15] = hc_byte_perm (w[ 4], w[ 3], selector); + w[14] = hc_byte_perm (w[ 3], w[ 2], selector); + w[13] = hc_byte_perm (w[ 2], w[ 1], selector); + w[12] = hc_byte_perm (w[ 1], w[ 0], selector); + w[11] = hc_byte_perm (w[ 0], 0, selector); w[10] = 0; w[ 9] = 0; w[ 8] = 0; @@ -26234,59 +30520,59 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; - case 63: - w[63] = hc_bytealign_be ( 0, w[ 0], offset); - w[62] = 0; - w[61] = 0; - w[60] = 0; - w[59] = 0; - w[58] = 0; - w[57] = 0; - w[56] = 0; - w[55] = 0; - w[54] = 0; - w[53] = 0; - w[52] = 0; - w[51] = 0; - w[50] = 0; - w[49] = 0; - w[48] = 0; - w[47] = 0; - w[46] = 0; - w[45] = 0; - w[44] = 0; - w[43] = 0; - w[42] = 0; - w[41] = 0; - w[40] = 0; - w[39] = 0; - w[38] = 0; - w[37] = 0; - w[36] = 0; - w[35] = 0; - w[34] = 0; - w[33] = 0; - w[32] = 0; - w[31] = 0; - w[30] = 0; - w[29] = 0; - w[28] = 0; - w[27] = 0; - w[26] = 0; - w[25] = 0; - w[24] = 0; - w[23] = 0; - w[22] = 0; - w[21] = 0; - w[20] = 0; - w[19] = 0; - w[18] = 0; - w[17] = 0; - w[16] = 0; - w[15] = 0; - w[14] = 0; - w[13] = 0; - w[12] = 0; + case 12: + w[63] = hc_byte_perm (w[51], w[50], selector); + w[62] = hc_byte_perm (w[50], w[49], selector); + w[61] = hc_byte_perm (w[49], w[48], selector); + w[60] = hc_byte_perm (w[48], w[47], selector); + w[59] = hc_byte_perm (w[47], w[46], selector); + w[58] = hc_byte_perm 
(w[46], w[45], selector); + w[57] = hc_byte_perm (w[45], w[44], selector); + w[56] = hc_byte_perm (w[44], w[43], selector); + w[55] = hc_byte_perm (w[43], w[42], selector); + w[54] = hc_byte_perm (w[42], w[41], selector); + w[53] = hc_byte_perm (w[41], w[40], selector); + w[52] = hc_byte_perm (w[40], w[39], selector); + w[51] = hc_byte_perm (w[39], w[38], selector); + w[50] = hc_byte_perm (w[38], w[37], selector); + w[49] = hc_byte_perm (w[37], w[36], selector); + w[48] = hc_byte_perm (w[36], w[35], selector); + w[47] = hc_byte_perm (w[35], w[34], selector); + w[46] = hc_byte_perm (w[34], w[33], selector); + w[45] = hc_byte_perm (w[33], w[32], selector); + w[44] = hc_byte_perm (w[32], w[31], selector); + w[43] = hc_byte_perm (w[31], w[30], selector); + w[42] = hc_byte_perm (w[30], w[29], selector); + w[41] = hc_byte_perm (w[29], w[28], selector); + w[40] = hc_byte_perm (w[28], w[27], selector); + w[39] = hc_byte_perm (w[27], w[26], selector); + w[38] = hc_byte_perm (w[26], w[25], selector); + w[37] = hc_byte_perm (w[25], w[24], selector); + w[36] = hc_byte_perm (w[24], w[23], selector); + w[35] = hc_byte_perm (w[23], w[22], selector); + w[34] = hc_byte_perm (w[22], w[21], selector); + w[33] = hc_byte_perm (w[21], w[20], selector); + w[32] = hc_byte_perm (w[20], w[19], selector); + w[31] = hc_byte_perm (w[19], w[18], selector); + w[30] = hc_byte_perm (w[18], w[17], selector); + w[29] = hc_byte_perm (w[17], w[16], selector); + w[28] = hc_byte_perm (w[16], w[15], selector); + w[27] = hc_byte_perm (w[15], w[14], selector); + w[26] = hc_byte_perm (w[14], w[13], selector); + w[25] = hc_byte_perm (w[13], w[12], selector); + w[24] = hc_byte_perm (w[12], w[11], selector); + w[23] = hc_byte_perm (w[11], w[10], selector); + w[22] = hc_byte_perm (w[10], w[ 9], selector); + w[21] = hc_byte_perm (w[ 9], w[ 8], selector); + w[20] = hc_byte_perm (w[ 8], w[ 7], selector); + w[19] = hc_byte_perm (w[ 7], w[ 6], selector); + w[18] = hc_byte_perm (w[ 6], w[ 5], selector); + w[17] = 
hc_byte_perm (w[ 5], w[ 4], selector); + w[16] = hc_byte_perm (w[ 4], w[ 3], selector); + w[15] = hc_byte_perm (w[ 3], w[ 2], selector); + w[14] = hc_byte_perm (w[ 2], w[ 1], selector); + w[13] = hc_byte_perm (w[ 1], w[ 0], selector); + w[12] = hc_byte_perm (w[ 0], 0, selector); w[11] = 0; w[10] = 0; w[ 9] = 0; @@ -26301,354 +30587,340 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) w[ 0] = 0; break; - } - #endif - - #if (defined IS_AMD && HAS_VPERM == 1) || defined IS_NV - - #if defined IS_NV - const int selector = (0x76543210 >> ((offset & 3) * 4)) & 0xffff; - #endif - - #if defined IS_AMD - const int selector = 0x0706050403020100 >> ((offset & 3) * 8); - #endif - switch (offset_switch) - { - case 0: - w[63] = hc_byte_perm (w[63], w[62], selector); - w[62] = hc_byte_perm (w[62], w[61], selector); - w[61] = hc_byte_perm (w[61], w[60], selector); - w[60] = hc_byte_perm (w[60], w[59], selector); - w[59] = hc_byte_perm (w[59], w[58], selector); - w[58] = hc_byte_perm (w[58], w[57], selector); - w[57] = hc_byte_perm (w[57], w[56], selector); - w[56] = hc_byte_perm (w[56], w[55], selector); - w[55] = hc_byte_perm (w[55], w[54], selector); - w[54] = hc_byte_perm (w[54], w[53], selector); - w[53] = hc_byte_perm (w[53], w[52], selector); - w[52] = hc_byte_perm (w[52], w[51], selector); - w[51] = hc_byte_perm (w[51], w[50], selector); - w[50] = hc_byte_perm (w[50], w[49], selector); - w[49] = hc_byte_perm (w[49], w[48], selector); - w[48] = hc_byte_perm (w[48], w[47], selector); - w[47] = hc_byte_perm (w[47], w[46], selector); - w[46] = hc_byte_perm (w[46], w[45], selector); - w[45] = hc_byte_perm (w[45], w[44], selector); - w[44] = hc_byte_perm (w[44], w[43], selector); - w[43] = hc_byte_perm (w[43], w[42], selector); - w[42] = hc_byte_perm (w[42], w[41], selector); - w[41] = hc_byte_perm (w[41], w[40], selector); - w[40] = hc_byte_perm (w[40], w[39], selector); - w[39] = hc_byte_perm (w[39], w[38], selector); - w[38] = hc_byte_perm (w[38], 
w[37], selector); - w[37] = hc_byte_perm (w[37], w[36], selector); - w[36] = hc_byte_perm (w[36], w[35], selector); - w[35] = hc_byte_perm (w[35], w[34], selector); - w[34] = hc_byte_perm (w[34], w[33], selector); - w[33] = hc_byte_perm (w[33], w[32], selector); - w[32] = hc_byte_perm (w[32], w[31], selector); - w[31] = hc_byte_perm (w[31], w[30], selector); - w[30] = hc_byte_perm (w[30], w[29], selector); - w[29] = hc_byte_perm (w[29], w[28], selector); - w[28] = hc_byte_perm (w[28], w[27], selector); - w[27] = hc_byte_perm (w[27], w[26], selector); - w[26] = hc_byte_perm (w[26], w[25], selector); - w[25] = hc_byte_perm (w[25], w[24], selector); - w[24] = hc_byte_perm (w[24], w[23], selector); - w[23] = hc_byte_perm (w[23], w[22], selector); - w[22] = hc_byte_perm (w[22], w[21], selector); - w[21] = hc_byte_perm (w[21], w[20], selector); - w[20] = hc_byte_perm (w[20], w[19], selector); - w[19] = hc_byte_perm (w[19], w[18], selector); - w[18] = hc_byte_perm (w[18], w[17], selector); - w[17] = hc_byte_perm (w[17], w[16], selector); - w[16] = hc_byte_perm (w[16], w[15], selector); - w[15] = hc_byte_perm (w[15], w[14], selector); - w[14] = hc_byte_perm (w[14], w[13], selector); - w[13] = hc_byte_perm (w[13], w[12], selector); - w[12] = hc_byte_perm (w[12], w[11], selector); - w[11] = hc_byte_perm (w[11], w[10], selector); - w[10] = hc_byte_perm (w[10], w[ 9], selector); - w[ 9] = hc_byte_perm (w[ 9], w[ 8], selector); - w[ 8] = hc_byte_perm (w[ 8], w[ 7], selector); - w[ 7] = hc_byte_perm (w[ 7], w[ 6], selector); - w[ 6] = hc_byte_perm (w[ 6], w[ 5], selector); - w[ 5] = hc_byte_perm (w[ 5], w[ 4], selector); - w[ 4] = hc_byte_perm (w[ 4], w[ 3], selector); - w[ 3] = hc_byte_perm (w[ 3], w[ 2], selector); - w[ 2] = hc_byte_perm (w[ 2], w[ 1], selector); - w[ 1] = hc_byte_perm (w[ 1], w[ 0], selector); - w[ 0] = hc_byte_perm (w[ 0], 0, selector); + case 13: + w[63] = hc_byte_perm (w[50], w[49], selector); + w[62] = hc_byte_perm (w[49], w[48], selector); + w[61] = 
hc_byte_perm (w[48], w[47], selector); + w[60] = hc_byte_perm (w[47], w[46], selector); + w[59] = hc_byte_perm (w[46], w[45], selector); + w[58] = hc_byte_perm (w[45], w[44], selector); + w[57] = hc_byte_perm (w[44], w[43], selector); + w[56] = hc_byte_perm (w[43], w[42], selector); + w[55] = hc_byte_perm (w[42], w[41], selector); + w[54] = hc_byte_perm (w[41], w[40], selector); + w[53] = hc_byte_perm (w[40], w[39], selector); + w[52] = hc_byte_perm (w[39], w[38], selector); + w[51] = hc_byte_perm (w[38], w[37], selector); + w[50] = hc_byte_perm (w[37], w[36], selector); + w[49] = hc_byte_perm (w[36], w[35], selector); + w[48] = hc_byte_perm (w[35], w[34], selector); + w[47] = hc_byte_perm (w[34], w[33], selector); + w[46] = hc_byte_perm (w[33], w[32], selector); + w[45] = hc_byte_perm (w[32], w[31], selector); + w[44] = hc_byte_perm (w[31], w[30], selector); + w[43] = hc_byte_perm (w[30], w[29], selector); + w[42] = hc_byte_perm (w[29], w[28], selector); + w[41] = hc_byte_perm (w[28], w[27], selector); + w[40] = hc_byte_perm (w[27], w[26], selector); + w[39] = hc_byte_perm (w[26], w[25], selector); + w[38] = hc_byte_perm (w[25], w[24], selector); + w[37] = hc_byte_perm (w[24], w[23], selector); + w[36] = hc_byte_perm (w[23], w[22], selector); + w[35] = hc_byte_perm (w[22], w[21], selector); + w[34] = hc_byte_perm (w[21], w[20], selector); + w[33] = hc_byte_perm (w[20], w[19], selector); + w[32] = hc_byte_perm (w[19], w[18], selector); + w[31] = hc_byte_perm (w[18], w[17], selector); + w[30] = hc_byte_perm (w[17], w[16], selector); + w[29] = hc_byte_perm (w[16], w[15], selector); + w[28] = hc_byte_perm (w[15], w[14], selector); + w[27] = hc_byte_perm (w[14], w[13], selector); + w[26] = hc_byte_perm (w[13], w[12], selector); + w[25] = hc_byte_perm (w[12], w[11], selector); + w[24] = hc_byte_perm (w[11], w[10], selector); + w[23] = hc_byte_perm (w[10], w[ 9], selector); + w[22] = hc_byte_perm (w[ 9], w[ 8], selector); + w[21] = hc_byte_perm (w[ 8], w[ 7], selector); 
+ w[20] = hc_byte_perm (w[ 7], w[ 6], selector); + w[19] = hc_byte_perm (w[ 6], w[ 5], selector); + w[18] = hc_byte_perm (w[ 5], w[ 4], selector); + w[17] = hc_byte_perm (w[ 4], w[ 3], selector); + w[16] = hc_byte_perm (w[ 3], w[ 2], selector); + w[15] = hc_byte_perm (w[ 2], w[ 1], selector); + w[14] = hc_byte_perm (w[ 1], w[ 0], selector); + w[13] = hc_byte_perm (w[ 0], 0, selector); + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; break; - case 1: - w[63] = hc_byte_perm (w[62], w[61], selector); - w[62] = hc_byte_perm (w[61], w[60], selector); - w[61] = hc_byte_perm (w[60], w[59], selector); - w[60] = hc_byte_perm (w[59], w[58], selector); - w[59] = hc_byte_perm (w[58], w[57], selector); - w[58] = hc_byte_perm (w[57], w[56], selector); - w[57] = hc_byte_perm (w[56], w[55], selector); - w[56] = hc_byte_perm (w[55], w[54], selector); - w[55] = hc_byte_perm (w[54], w[53], selector); - w[54] = hc_byte_perm (w[53], w[52], selector); - w[53] = hc_byte_perm (w[52], w[51], selector); - w[52] = hc_byte_perm (w[51], w[50], selector); - w[51] = hc_byte_perm (w[50], w[49], selector); - w[50] = hc_byte_perm (w[49], w[48], selector); - w[49] = hc_byte_perm (w[48], w[47], selector); - w[48] = hc_byte_perm (w[47], w[46], selector); - w[47] = hc_byte_perm (w[46], w[45], selector); - w[46] = hc_byte_perm (w[45], w[44], selector); - w[45] = hc_byte_perm (w[44], w[43], selector); - w[44] = hc_byte_perm (w[43], w[42], selector); - w[43] = hc_byte_perm (w[42], w[41], selector); - w[42] = hc_byte_perm (w[41], w[40], selector); - w[41] = hc_byte_perm (w[40], w[39], selector); - w[40] = hc_byte_perm (w[39], w[38], selector); - w[39] = hc_byte_perm (w[38], w[37], selector); - w[38] = hc_byte_perm (w[37], w[36], selector); - w[37] = hc_byte_perm (w[36], w[35], selector); - w[36] = hc_byte_perm (w[35], w[34], selector); - w[35] = hc_byte_perm (w[34], w[33], selector); - 
w[34] = hc_byte_perm (w[33], w[32], selector); - w[33] = hc_byte_perm (w[32], w[31], selector); - w[32] = hc_byte_perm (w[31], w[30], selector); - w[31] = hc_byte_perm (w[30], w[29], selector); - w[30] = hc_byte_perm (w[29], w[28], selector); - w[29] = hc_byte_perm (w[28], w[27], selector); - w[28] = hc_byte_perm (w[27], w[26], selector); - w[27] = hc_byte_perm (w[26], w[25], selector); - w[26] = hc_byte_perm (w[25], w[24], selector); - w[25] = hc_byte_perm (w[24], w[23], selector); - w[24] = hc_byte_perm (w[23], w[22], selector); - w[23] = hc_byte_perm (w[22], w[21], selector); - w[22] = hc_byte_perm (w[21], w[20], selector); - w[21] = hc_byte_perm (w[20], w[19], selector); - w[20] = hc_byte_perm (w[19], w[18], selector); - w[19] = hc_byte_perm (w[18], w[17], selector); - w[18] = hc_byte_perm (w[17], w[16], selector); - w[17] = hc_byte_perm (w[16], w[15], selector); - w[16] = hc_byte_perm (w[15], w[14], selector); - w[15] = hc_byte_perm (w[14], w[13], selector); - w[14] = hc_byte_perm (w[13], w[12], selector); - w[13] = hc_byte_perm (w[12], w[11], selector); - w[12] = hc_byte_perm (w[11], w[10], selector); - w[11] = hc_byte_perm (w[10], w[ 9], selector); - w[10] = hc_byte_perm (w[ 9], w[ 8], selector); - w[ 9] = hc_byte_perm (w[ 8], w[ 7], selector); - w[ 8] = hc_byte_perm (w[ 7], w[ 6], selector); - w[ 7] = hc_byte_perm (w[ 6], w[ 5], selector); - w[ 6] = hc_byte_perm (w[ 5], w[ 4], selector); - w[ 5] = hc_byte_perm (w[ 4], w[ 3], selector); - w[ 4] = hc_byte_perm (w[ 3], w[ 2], selector); - w[ 3] = hc_byte_perm (w[ 2], w[ 1], selector); - w[ 2] = hc_byte_perm (w[ 1], w[ 0], selector); - w[ 1] = hc_byte_perm (w[ 0], 0, selector); + case 14: + w[63] = hc_byte_perm (w[49], w[48], selector); + w[62] = hc_byte_perm (w[48], w[47], selector); + w[61] = hc_byte_perm (w[47], w[46], selector); + w[60] = hc_byte_perm (w[46], w[45], selector); + w[59] = hc_byte_perm (w[45], w[44], selector); + w[58] = hc_byte_perm (w[44], w[43], selector); + w[57] = hc_byte_perm (w[43], 
w[42], selector); + w[56] = hc_byte_perm (w[42], w[41], selector); + w[55] = hc_byte_perm (w[41], w[40], selector); + w[54] = hc_byte_perm (w[40], w[39], selector); + w[53] = hc_byte_perm (w[39], w[38], selector); + w[52] = hc_byte_perm (w[38], w[37], selector); + w[51] = hc_byte_perm (w[37], w[36], selector); + w[50] = hc_byte_perm (w[36], w[35], selector); + w[49] = hc_byte_perm (w[35], w[34], selector); + w[48] = hc_byte_perm (w[34], w[33], selector); + w[47] = hc_byte_perm (w[33], w[32], selector); + w[46] = hc_byte_perm (w[32], w[31], selector); + w[45] = hc_byte_perm (w[31], w[30], selector); + w[44] = hc_byte_perm (w[30], w[29], selector); + w[43] = hc_byte_perm (w[29], w[28], selector); + w[42] = hc_byte_perm (w[28], w[27], selector); + w[41] = hc_byte_perm (w[27], w[26], selector); + w[40] = hc_byte_perm (w[26], w[25], selector); + w[39] = hc_byte_perm (w[25], w[24], selector); + w[38] = hc_byte_perm (w[24], w[23], selector); + w[37] = hc_byte_perm (w[23], w[22], selector); + w[36] = hc_byte_perm (w[22], w[21], selector); + w[35] = hc_byte_perm (w[21], w[20], selector); + w[34] = hc_byte_perm (w[20], w[19], selector); + w[33] = hc_byte_perm (w[19], w[18], selector); + w[32] = hc_byte_perm (w[18], w[17], selector); + w[31] = hc_byte_perm (w[17], w[16], selector); + w[30] = hc_byte_perm (w[16], w[15], selector); + w[29] = hc_byte_perm (w[15], w[14], selector); + w[28] = hc_byte_perm (w[14], w[13], selector); + w[27] = hc_byte_perm (w[13], w[12], selector); + w[26] = hc_byte_perm (w[12], w[11], selector); + w[25] = hc_byte_perm (w[11], w[10], selector); + w[24] = hc_byte_perm (w[10], w[ 9], selector); + w[23] = hc_byte_perm (w[ 9], w[ 8], selector); + w[22] = hc_byte_perm (w[ 8], w[ 7], selector); + w[21] = hc_byte_perm (w[ 7], w[ 6], selector); + w[20] = hc_byte_perm (w[ 6], w[ 5], selector); + w[19] = hc_byte_perm (w[ 5], w[ 4], selector); + w[18] = hc_byte_perm (w[ 4], w[ 3], selector); + w[17] = hc_byte_perm (w[ 3], w[ 2], selector); + w[16] = 
hc_byte_perm (w[ 2], w[ 1], selector); + w[15] = hc_byte_perm (w[ 1], w[ 0], selector); + w[14] = hc_byte_perm (w[ 0], 0, selector); + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; w[ 0] = 0; break; - case 2: - w[63] = hc_byte_perm (w[61], w[60], selector); - w[62] = hc_byte_perm (w[60], w[59], selector); - w[61] = hc_byte_perm (w[59], w[58], selector); - w[60] = hc_byte_perm (w[58], w[57], selector); - w[59] = hc_byte_perm (w[57], w[56], selector); - w[58] = hc_byte_perm (w[56], w[55], selector); - w[57] = hc_byte_perm (w[55], w[54], selector); - w[56] = hc_byte_perm (w[54], w[53], selector); - w[55] = hc_byte_perm (w[53], w[52], selector); - w[54] = hc_byte_perm (w[52], w[51], selector); - w[53] = hc_byte_perm (w[51], w[50], selector); - w[52] = hc_byte_perm (w[50], w[49], selector); - w[51] = hc_byte_perm (w[49], w[48], selector); - w[50] = hc_byte_perm (w[48], w[47], selector); - w[49] = hc_byte_perm (w[47], w[46], selector); - w[48] = hc_byte_perm (w[46], w[45], selector); - w[47] = hc_byte_perm (w[45], w[44], selector); - w[46] = hc_byte_perm (w[44], w[43], selector); - w[45] = hc_byte_perm (w[43], w[42], selector); - w[44] = hc_byte_perm (w[42], w[41], selector); - w[43] = hc_byte_perm (w[41], w[40], selector); - w[42] = hc_byte_perm (w[40], w[39], selector); - w[41] = hc_byte_perm (w[39], w[38], selector); - w[40] = hc_byte_perm (w[38], w[37], selector); - w[39] = hc_byte_perm (w[37], w[36], selector); - w[38] = hc_byte_perm (w[36], w[35], selector); - w[37] = hc_byte_perm (w[35], w[34], selector); - w[36] = hc_byte_perm (w[34], w[33], selector); - w[35] = hc_byte_perm (w[33], w[32], selector); - w[34] = hc_byte_perm (w[32], w[31], selector); - w[33] = hc_byte_perm (w[31], w[30], selector); - w[32] = hc_byte_perm (w[30], w[29], selector); - w[31] = hc_byte_perm (w[29], w[28], selector); - w[30] = hc_byte_perm (w[28], w[27], selector); - 
w[29] = hc_byte_perm (w[27], w[26], selector); - w[28] = hc_byte_perm (w[26], w[25], selector); - w[27] = hc_byte_perm (w[25], w[24], selector); - w[26] = hc_byte_perm (w[24], w[23], selector); - w[25] = hc_byte_perm (w[23], w[22], selector); - w[24] = hc_byte_perm (w[22], w[21], selector); - w[23] = hc_byte_perm (w[21], w[20], selector); - w[22] = hc_byte_perm (w[20], w[19], selector); - w[21] = hc_byte_perm (w[19], w[18], selector); - w[20] = hc_byte_perm (w[18], w[17], selector); - w[19] = hc_byte_perm (w[17], w[16], selector); - w[18] = hc_byte_perm (w[16], w[15], selector); - w[17] = hc_byte_perm (w[15], w[14], selector); - w[16] = hc_byte_perm (w[14], w[13], selector); - w[15] = hc_byte_perm (w[13], w[12], selector); - w[14] = hc_byte_perm (w[12], w[11], selector); - w[13] = hc_byte_perm (w[11], w[10], selector); - w[12] = hc_byte_perm (w[10], w[ 9], selector); - w[11] = hc_byte_perm (w[ 9], w[ 8], selector); - w[10] = hc_byte_perm (w[ 8], w[ 7], selector); - w[ 9] = hc_byte_perm (w[ 7], w[ 6], selector); - w[ 8] = hc_byte_perm (w[ 6], w[ 5], selector); - w[ 7] = hc_byte_perm (w[ 5], w[ 4], selector); - w[ 6] = hc_byte_perm (w[ 4], w[ 3], selector); - w[ 5] = hc_byte_perm (w[ 3], w[ 2], selector); - w[ 4] = hc_byte_perm (w[ 2], w[ 1], selector); - w[ 3] = hc_byte_perm (w[ 1], w[ 0], selector); - w[ 2] = hc_byte_perm (w[ 0], 0, selector); + case 15: + w[63] = hc_byte_perm (w[48], w[47], selector); + w[62] = hc_byte_perm (w[47], w[46], selector); + w[61] = hc_byte_perm (w[46], w[45], selector); + w[60] = hc_byte_perm (w[45], w[44], selector); + w[59] = hc_byte_perm (w[44], w[43], selector); + w[58] = hc_byte_perm (w[43], w[42], selector); + w[57] = hc_byte_perm (w[42], w[41], selector); + w[56] = hc_byte_perm (w[41], w[40], selector); + w[55] = hc_byte_perm (w[40], w[39], selector); + w[54] = hc_byte_perm (w[39], w[38], selector); + w[53] = hc_byte_perm (w[38], w[37], selector); + w[52] = hc_byte_perm (w[37], w[36], selector); + w[51] = hc_byte_perm (w[36], 
w[35], selector); + w[50] = hc_byte_perm (w[35], w[34], selector); + w[49] = hc_byte_perm (w[34], w[33], selector); + w[48] = hc_byte_perm (w[33], w[32], selector); + w[47] = hc_byte_perm (w[32], w[31], selector); + w[46] = hc_byte_perm (w[31], w[30], selector); + w[45] = hc_byte_perm (w[30], w[29], selector); + w[44] = hc_byte_perm (w[29], w[28], selector); + w[43] = hc_byte_perm (w[28], w[27], selector); + w[42] = hc_byte_perm (w[27], w[26], selector); + w[41] = hc_byte_perm (w[26], w[25], selector); + w[40] = hc_byte_perm (w[25], w[24], selector); + w[39] = hc_byte_perm (w[24], w[23], selector); + w[38] = hc_byte_perm (w[23], w[22], selector); + w[37] = hc_byte_perm (w[22], w[21], selector); + w[36] = hc_byte_perm (w[21], w[20], selector); + w[35] = hc_byte_perm (w[20], w[19], selector); + w[34] = hc_byte_perm (w[19], w[18], selector); + w[33] = hc_byte_perm (w[18], w[17], selector); + w[32] = hc_byte_perm (w[17], w[16], selector); + w[31] = hc_byte_perm (w[16], w[15], selector); + w[30] = hc_byte_perm (w[15], w[14], selector); + w[29] = hc_byte_perm (w[14], w[13], selector); + w[28] = hc_byte_perm (w[13], w[12], selector); + w[27] = hc_byte_perm (w[12], w[11], selector); + w[26] = hc_byte_perm (w[11], w[10], selector); + w[25] = hc_byte_perm (w[10], w[ 9], selector); + w[24] = hc_byte_perm (w[ 9], w[ 8], selector); + w[23] = hc_byte_perm (w[ 8], w[ 7], selector); + w[22] = hc_byte_perm (w[ 7], w[ 6], selector); + w[21] = hc_byte_perm (w[ 6], w[ 5], selector); + w[20] = hc_byte_perm (w[ 5], w[ 4], selector); + w[19] = hc_byte_perm (w[ 4], w[ 3], selector); + w[18] = hc_byte_perm (w[ 3], w[ 2], selector); + w[17] = hc_byte_perm (w[ 2], w[ 1], selector); + w[16] = hc_byte_perm (w[ 1], w[ 0], selector); + w[15] = hc_byte_perm (w[ 0], 0, selector); + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; - case 3: - w[63] = 
hc_byte_perm (w[60], w[59], selector); - w[62] = hc_byte_perm (w[59], w[58], selector); - w[61] = hc_byte_perm (w[58], w[57], selector); - w[60] = hc_byte_perm (w[57], w[56], selector); - w[59] = hc_byte_perm (w[56], w[55], selector); - w[58] = hc_byte_perm (w[55], w[54], selector); - w[57] = hc_byte_perm (w[54], w[53], selector); - w[56] = hc_byte_perm (w[53], w[52], selector); - w[55] = hc_byte_perm (w[52], w[51], selector); - w[54] = hc_byte_perm (w[51], w[50], selector); - w[53] = hc_byte_perm (w[50], w[49], selector); - w[52] = hc_byte_perm (w[49], w[48], selector); - w[51] = hc_byte_perm (w[48], w[47], selector); - w[50] = hc_byte_perm (w[47], w[46], selector); - w[49] = hc_byte_perm (w[46], w[45], selector); - w[48] = hc_byte_perm (w[45], w[44], selector); - w[47] = hc_byte_perm (w[44], w[43], selector); - w[46] = hc_byte_perm (w[43], w[42], selector); - w[45] = hc_byte_perm (w[42], w[41], selector); - w[44] = hc_byte_perm (w[41], w[40], selector); - w[43] = hc_byte_perm (w[40], w[39], selector); - w[42] = hc_byte_perm (w[39], w[38], selector); - w[41] = hc_byte_perm (w[38], w[37], selector); - w[40] = hc_byte_perm (w[37], w[36], selector); - w[39] = hc_byte_perm (w[36], w[35], selector); - w[38] = hc_byte_perm (w[35], w[34], selector); - w[37] = hc_byte_perm (w[34], w[33], selector); - w[36] = hc_byte_perm (w[33], w[32], selector); - w[35] = hc_byte_perm (w[32], w[31], selector); - w[34] = hc_byte_perm (w[31], w[30], selector); - w[33] = hc_byte_perm (w[30], w[29], selector); - w[32] = hc_byte_perm (w[29], w[28], selector); - w[31] = hc_byte_perm (w[28], w[27], selector); - w[30] = hc_byte_perm (w[27], w[26], selector); - w[29] = hc_byte_perm (w[26], w[25], selector); - w[28] = hc_byte_perm (w[25], w[24], selector); - w[27] = hc_byte_perm (w[24], w[23], selector); - w[26] = hc_byte_perm (w[23], w[22], selector); - w[25] = hc_byte_perm (w[22], w[21], selector); - w[24] = hc_byte_perm (w[21], w[20], selector); - w[23] = hc_byte_perm (w[20], w[19], selector); 
- w[22] = hc_byte_perm (w[19], w[18], selector); - w[21] = hc_byte_perm (w[18], w[17], selector); - w[20] = hc_byte_perm (w[17], w[16], selector); - w[19] = hc_byte_perm (w[16], w[15], selector); - w[18] = hc_byte_perm (w[15], w[14], selector); - w[17] = hc_byte_perm (w[14], w[13], selector); - w[16] = hc_byte_perm (w[13], w[12], selector); - w[15] = hc_byte_perm (w[12], w[11], selector); - w[14] = hc_byte_perm (w[11], w[10], selector); - w[13] = hc_byte_perm (w[10], w[ 9], selector); - w[12] = hc_byte_perm (w[ 9], w[ 8], selector); - w[11] = hc_byte_perm (w[ 8], w[ 7], selector); - w[10] = hc_byte_perm (w[ 7], w[ 6], selector); - w[ 9] = hc_byte_perm (w[ 6], w[ 5], selector); - w[ 8] = hc_byte_perm (w[ 5], w[ 4], selector); - w[ 7] = hc_byte_perm (w[ 4], w[ 3], selector); - w[ 6] = hc_byte_perm (w[ 3], w[ 2], selector); - w[ 5] = hc_byte_perm (w[ 2], w[ 1], selector); - w[ 4] = hc_byte_perm (w[ 1], w[ 0], selector); - w[ 3] = hc_byte_perm (w[ 0], 0, selector); + case 16: + w[63] = hc_byte_perm (w[47], w[46], selector); + w[62] = hc_byte_perm (w[46], w[45], selector); + w[61] = hc_byte_perm (w[45], w[44], selector); + w[60] = hc_byte_perm (w[44], w[43], selector); + w[59] = hc_byte_perm (w[43], w[42], selector); + w[58] = hc_byte_perm (w[42], w[41], selector); + w[57] = hc_byte_perm (w[41], w[40], selector); + w[56] = hc_byte_perm (w[40], w[39], selector); + w[55] = hc_byte_perm (w[39], w[38], selector); + w[54] = hc_byte_perm (w[38], w[37], selector); + w[53] = hc_byte_perm (w[37], w[36], selector); + w[52] = hc_byte_perm (w[36], w[35], selector); + w[51] = hc_byte_perm (w[35], w[34], selector); + w[50] = hc_byte_perm (w[34], w[33], selector); + w[49] = hc_byte_perm (w[33], w[32], selector); + w[48] = hc_byte_perm (w[32], w[31], selector); + w[47] = hc_byte_perm (w[31], w[30], selector); + w[46] = hc_byte_perm (w[30], w[29], selector); + w[45] = hc_byte_perm (w[29], w[28], selector); + w[44] = hc_byte_perm (w[28], w[27], selector); + w[43] = hc_byte_perm (w[27], 
w[26], selector); + w[42] = hc_byte_perm (w[26], w[25], selector); + w[41] = hc_byte_perm (w[25], w[24], selector); + w[40] = hc_byte_perm (w[24], w[23], selector); + w[39] = hc_byte_perm (w[23], w[22], selector); + w[38] = hc_byte_perm (w[22], w[21], selector); + w[37] = hc_byte_perm (w[21], w[20], selector); + w[36] = hc_byte_perm (w[20], w[19], selector); + w[35] = hc_byte_perm (w[19], w[18], selector); + w[34] = hc_byte_perm (w[18], w[17], selector); + w[33] = hc_byte_perm (w[17], w[16], selector); + w[32] = hc_byte_perm (w[16], w[15], selector); + w[31] = hc_byte_perm (w[15], w[14], selector); + w[30] = hc_byte_perm (w[14], w[13], selector); + w[29] = hc_byte_perm (w[13], w[12], selector); + w[28] = hc_byte_perm (w[12], w[11], selector); + w[27] = hc_byte_perm (w[11], w[10], selector); + w[26] = hc_byte_perm (w[10], w[ 9], selector); + w[25] = hc_byte_perm (w[ 9], w[ 8], selector); + w[24] = hc_byte_perm (w[ 8], w[ 7], selector); + w[23] = hc_byte_perm (w[ 7], w[ 6], selector); + w[22] = hc_byte_perm (w[ 6], w[ 5], selector); + w[21] = hc_byte_perm (w[ 5], w[ 4], selector); + w[20] = hc_byte_perm (w[ 4], w[ 3], selector); + w[19] = hc_byte_perm (w[ 3], w[ 2], selector); + w[18] = hc_byte_perm (w[ 2], w[ 1], selector); + w[17] = hc_byte_perm (w[ 1], w[ 0], selector); + w[16] = hc_byte_perm (w[ 0], 0, selector); + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; - case 4: - w[63] = hc_byte_perm (w[59], w[58], selector); - w[62] = hc_byte_perm (w[58], w[57], selector); - w[61] = hc_byte_perm (w[57], w[56], selector); - w[60] = hc_byte_perm (w[56], w[55], selector); - w[59] = hc_byte_perm (w[55], w[54], selector); - w[58] = hc_byte_perm (w[54], w[53], selector); - w[57] = hc_byte_perm (w[53], w[52], selector); - w[56] = hc_byte_perm (w[52], w[51], selector); - w[55] = hc_byte_perm (w[51], w[50], selector); - 
w[54] = hc_byte_perm (w[50], w[49], selector); - w[53] = hc_byte_perm (w[49], w[48], selector); - w[52] = hc_byte_perm (w[48], w[47], selector); - w[51] = hc_byte_perm (w[47], w[46], selector); - w[50] = hc_byte_perm (w[46], w[45], selector); - w[49] = hc_byte_perm (w[45], w[44], selector); - w[48] = hc_byte_perm (w[44], w[43], selector); - w[47] = hc_byte_perm (w[43], w[42], selector); - w[46] = hc_byte_perm (w[42], w[41], selector); - w[45] = hc_byte_perm (w[41], w[40], selector); - w[44] = hc_byte_perm (w[40], w[39], selector); - w[43] = hc_byte_perm (w[39], w[38], selector); - w[42] = hc_byte_perm (w[38], w[37], selector); - w[41] = hc_byte_perm (w[37], w[36], selector); - w[40] = hc_byte_perm (w[36], w[35], selector); - w[39] = hc_byte_perm (w[35], w[34], selector); - w[38] = hc_byte_perm (w[34], w[33], selector); - w[37] = hc_byte_perm (w[33], w[32], selector); - w[36] = hc_byte_perm (w[32], w[31], selector); - w[35] = hc_byte_perm (w[31], w[30], selector); - w[34] = hc_byte_perm (w[30], w[29], selector); - w[33] = hc_byte_perm (w[29], w[28], selector); - w[32] = hc_byte_perm (w[28], w[27], selector); - w[31] = hc_byte_perm (w[27], w[26], selector); - w[30] = hc_byte_perm (w[26], w[25], selector); - w[29] = hc_byte_perm (w[25], w[24], selector); - w[28] = hc_byte_perm (w[24], w[23], selector); - w[27] = hc_byte_perm (w[23], w[22], selector); - w[26] = hc_byte_perm (w[22], w[21], selector); - w[25] = hc_byte_perm (w[21], w[20], selector); - w[24] = hc_byte_perm (w[20], w[19], selector); - w[23] = hc_byte_perm (w[19], w[18], selector); - w[22] = hc_byte_perm (w[18], w[17], selector); - w[21] = hc_byte_perm (w[17], w[16], selector); - w[20] = hc_byte_perm (w[16], w[15], selector); - w[19] = hc_byte_perm (w[15], w[14], selector); - w[18] = hc_byte_perm (w[14], w[13], selector); - w[17] = hc_byte_perm (w[13], w[12], selector); - w[16] = hc_byte_perm (w[12], w[11], selector); - w[15] = hc_byte_perm (w[11], w[10], selector); - w[14] = hc_byte_perm (w[10], w[ 9], 
selector); - w[13] = hc_byte_perm (w[ 9], w[ 8], selector); - w[12] = hc_byte_perm (w[ 8], w[ 7], selector); - w[11] = hc_byte_perm (w[ 7], w[ 6], selector); - w[10] = hc_byte_perm (w[ 6], w[ 5], selector); - w[ 9] = hc_byte_perm (w[ 5], w[ 4], selector); - w[ 8] = hc_byte_perm (w[ 4], w[ 3], selector); - w[ 7] = hc_byte_perm (w[ 3], w[ 2], selector); - w[ 6] = hc_byte_perm (w[ 2], w[ 1], selector); - w[ 5] = hc_byte_perm (w[ 1], w[ 0], selector); - w[ 4] = hc_byte_perm (w[ 0], 0, selector); + case 17: + w[63] = hc_byte_perm (w[46], w[45], selector); + w[62] = hc_byte_perm (w[45], w[44], selector); + w[61] = hc_byte_perm (w[44], w[43], selector); + w[60] = hc_byte_perm (w[43], w[42], selector); + w[59] = hc_byte_perm (w[42], w[41], selector); + w[58] = hc_byte_perm (w[41], w[40], selector); + w[57] = hc_byte_perm (w[40], w[39], selector); + w[56] = hc_byte_perm (w[39], w[38], selector); + w[55] = hc_byte_perm (w[38], w[37], selector); + w[54] = hc_byte_perm (w[37], w[36], selector); + w[53] = hc_byte_perm (w[36], w[35], selector); + w[52] = hc_byte_perm (w[35], w[34], selector); + w[51] = hc_byte_perm (w[34], w[33], selector); + w[50] = hc_byte_perm (w[33], w[32], selector); + w[49] = hc_byte_perm (w[32], w[31], selector); + w[48] = hc_byte_perm (w[31], w[30], selector); + w[47] = hc_byte_perm (w[30], w[29], selector); + w[46] = hc_byte_perm (w[29], w[28], selector); + w[45] = hc_byte_perm (w[28], w[27], selector); + w[44] = hc_byte_perm (w[27], w[26], selector); + w[43] = hc_byte_perm (w[26], w[25], selector); + w[42] = hc_byte_perm (w[25], w[24], selector); + w[41] = hc_byte_perm (w[24], w[23], selector); + w[40] = hc_byte_perm (w[23], w[22], selector); + w[39] = hc_byte_perm (w[22], w[21], selector); + w[38] = hc_byte_perm (w[21], w[20], selector); + w[37] = hc_byte_perm (w[20], w[19], selector); + w[36] = hc_byte_perm (w[19], w[18], selector); + w[35] = hc_byte_perm (w[18], w[17], selector); + w[34] = hc_byte_perm (w[17], w[16], selector); + w[33] = 
hc_byte_perm (w[16], w[15], selector); + w[32] = hc_byte_perm (w[15], w[14], selector); + w[31] = hc_byte_perm (w[14], w[13], selector); + w[30] = hc_byte_perm (w[13], w[12], selector); + w[29] = hc_byte_perm (w[12], w[11], selector); + w[28] = hc_byte_perm (w[11], w[10], selector); + w[27] = hc_byte_perm (w[10], w[ 9], selector); + w[26] = hc_byte_perm (w[ 9], w[ 8], selector); + w[25] = hc_byte_perm (w[ 8], w[ 7], selector); + w[24] = hc_byte_perm (w[ 7], w[ 6], selector); + w[23] = hc_byte_perm (w[ 6], w[ 5], selector); + w[22] = hc_byte_perm (w[ 5], w[ 4], selector); + w[21] = hc_byte_perm (w[ 4], w[ 3], selector); + w[20] = hc_byte_perm (w[ 3], w[ 2], selector); + w[19] = hc_byte_perm (w[ 2], w[ 1], selector); + w[18] = hc_byte_perm (w[ 1], w[ 0], selector); + w[17] = hc_byte_perm (w[ 0], 0, selector); + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; @@ -26656,66 +30928,66 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; - case 5: - w[63] = hc_byte_perm (w[58], w[57], selector); - w[62] = hc_byte_perm (w[57], w[56], selector); - w[61] = hc_byte_perm (w[56], w[55], selector); - w[60] = hc_byte_perm (w[55], w[54], selector); - w[59] = hc_byte_perm (w[54], w[53], selector); - w[58] = hc_byte_perm (w[53], w[52], selector); - w[57] = hc_byte_perm (w[52], w[51], selector); - w[56] = hc_byte_perm (w[51], w[50], selector); - w[55] = hc_byte_perm (w[50], w[49], selector); - w[54] = hc_byte_perm (w[49], w[48], selector); - w[53] = hc_byte_perm (w[48], w[47], selector); - w[52] = hc_byte_perm (w[47], w[46], selector); - w[51] = hc_byte_perm (w[46], w[45], selector); - w[50] = hc_byte_perm (w[45], w[44], selector); - w[49] = hc_byte_perm (w[44], w[43], selector); - w[48] = hc_byte_perm (w[43], w[42], selector); - w[47] = hc_byte_perm (w[42], w[41], selector); - w[46] = hc_byte_perm 
(w[41], w[40], selector); - w[45] = hc_byte_perm (w[40], w[39], selector); - w[44] = hc_byte_perm (w[39], w[38], selector); - w[43] = hc_byte_perm (w[38], w[37], selector); - w[42] = hc_byte_perm (w[37], w[36], selector); - w[41] = hc_byte_perm (w[36], w[35], selector); - w[40] = hc_byte_perm (w[35], w[34], selector); - w[39] = hc_byte_perm (w[34], w[33], selector); - w[38] = hc_byte_perm (w[33], w[32], selector); - w[37] = hc_byte_perm (w[32], w[31], selector); - w[36] = hc_byte_perm (w[31], w[30], selector); - w[35] = hc_byte_perm (w[30], w[29], selector); - w[34] = hc_byte_perm (w[29], w[28], selector); - w[33] = hc_byte_perm (w[28], w[27], selector); - w[32] = hc_byte_perm (w[27], w[26], selector); - w[31] = hc_byte_perm (w[26], w[25], selector); - w[30] = hc_byte_perm (w[25], w[24], selector); - w[29] = hc_byte_perm (w[24], w[23], selector); - w[28] = hc_byte_perm (w[23], w[22], selector); - w[27] = hc_byte_perm (w[22], w[21], selector); - w[26] = hc_byte_perm (w[21], w[20], selector); - w[25] = hc_byte_perm (w[20], w[19], selector); - w[24] = hc_byte_perm (w[19], w[18], selector); - w[23] = hc_byte_perm (w[18], w[17], selector); - w[22] = hc_byte_perm (w[17], w[16], selector); - w[21] = hc_byte_perm (w[16], w[15], selector); - w[20] = hc_byte_perm (w[15], w[14], selector); - w[19] = hc_byte_perm (w[14], w[13], selector); - w[18] = hc_byte_perm (w[13], w[12], selector); - w[17] = hc_byte_perm (w[12], w[11], selector); - w[16] = hc_byte_perm (w[11], w[10], selector); - w[15] = hc_byte_perm (w[10], w[ 9], selector); - w[14] = hc_byte_perm (w[ 9], w[ 8], selector); - w[13] = hc_byte_perm (w[ 8], w[ 7], selector); - w[12] = hc_byte_perm (w[ 7], w[ 6], selector); - w[11] = hc_byte_perm (w[ 6], w[ 5], selector); - w[10] = hc_byte_perm (w[ 5], w[ 4], selector); - w[ 9] = hc_byte_perm (w[ 4], w[ 3], selector); - w[ 8] = hc_byte_perm (w[ 3], w[ 2], selector); - w[ 7] = hc_byte_perm (w[ 2], w[ 1], selector); - w[ 6] = hc_byte_perm (w[ 1], w[ 0], selector); - w[ 5] = 
hc_byte_perm (w[ 0], 0, selector); + case 18: + w[63] = hc_byte_perm (w[45], w[44], selector); + w[62] = hc_byte_perm (w[44], w[43], selector); + w[61] = hc_byte_perm (w[43], w[42], selector); + w[60] = hc_byte_perm (w[42], w[41], selector); + w[59] = hc_byte_perm (w[41], w[40], selector); + w[58] = hc_byte_perm (w[40], w[39], selector); + w[57] = hc_byte_perm (w[39], w[38], selector); + w[56] = hc_byte_perm (w[38], w[37], selector); + w[55] = hc_byte_perm (w[37], w[36], selector); + w[54] = hc_byte_perm (w[36], w[35], selector); + w[53] = hc_byte_perm (w[35], w[34], selector); + w[52] = hc_byte_perm (w[34], w[33], selector); + w[51] = hc_byte_perm (w[33], w[32], selector); + w[50] = hc_byte_perm (w[32], w[31], selector); + w[49] = hc_byte_perm (w[31], w[30], selector); + w[48] = hc_byte_perm (w[30], w[29], selector); + w[47] = hc_byte_perm (w[29], w[28], selector); + w[46] = hc_byte_perm (w[28], w[27], selector); + w[45] = hc_byte_perm (w[27], w[26], selector); + w[44] = hc_byte_perm (w[26], w[25], selector); + w[43] = hc_byte_perm (w[25], w[24], selector); + w[42] = hc_byte_perm (w[24], w[23], selector); + w[41] = hc_byte_perm (w[23], w[22], selector); + w[40] = hc_byte_perm (w[22], w[21], selector); + w[39] = hc_byte_perm (w[21], w[20], selector); + w[38] = hc_byte_perm (w[20], w[19], selector); + w[37] = hc_byte_perm (w[19], w[18], selector); + w[36] = hc_byte_perm (w[18], w[17], selector); + w[35] = hc_byte_perm (w[17], w[16], selector); + w[34] = hc_byte_perm (w[16], w[15], selector); + w[33] = hc_byte_perm (w[15], w[14], selector); + w[32] = hc_byte_perm (w[14], w[13], selector); + w[31] = hc_byte_perm (w[13], w[12], selector); + w[30] = hc_byte_perm (w[12], w[11], selector); + w[29] = hc_byte_perm (w[11], w[10], selector); + w[28] = hc_byte_perm (w[10], w[ 9], selector); + w[27] = hc_byte_perm (w[ 9], w[ 8], selector); + w[26] = hc_byte_perm (w[ 8], w[ 7], selector); + w[25] = hc_byte_perm (w[ 7], w[ 6], selector); + w[24] = hc_byte_perm (w[ 6], w[ 5], 
selector); + w[23] = hc_byte_perm (w[ 5], w[ 4], selector); + w[22] = hc_byte_perm (w[ 4], w[ 3], selector); + w[21] = hc_byte_perm (w[ 3], w[ 2], selector); + w[20] = hc_byte_perm (w[ 2], w[ 1], selector); + w[19] = hc_byte_perm (w[ 1], w[ 0], selector); + w[18] = hc_byte_perm (w[ 0], 0, selector); + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; @@ -26724,65 +30996,65 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; - case 6: - w[63] = hc_byte_perm (w[57], w[56], selector); - w[62] = hc_byte_perm (w[56], w[55], selector); - w[61] = hc_byte_perm (w[55], w[54], selector); - w[60] = hc_byte_perm (w[54], w[53], selector); - w[59] = hc_byte_perm (w[53], w[52], selector); - w[58] = hc_byte_perm (w[52], w[51], selector); - w[57] = hc_byte_perm (w[51], w[50], selector); - w[56] = hc_byte_perm (w[50], w[49], selector); - w[55] = hc_byte_perm (w[49], w[48], selector); - w[54] = hc_byte_perm (w[48], w[47], selector); - w[53] = hc_byte_perm (w[47], w[46], selector); - w[52] = hc_byte_perm (w[46], w[45], selector); - w[51] = hc_byte_perm (w[45], w[44], selector); - w[50] = hc_byte_perm (w[44], w[43], selector); - w[49] = hc_byte_perm (w[43], w[42], selector); - w[48] = hc_byte_perm (w[42], w[41], selector); - w[47] = hc_byte_perm (w[41], w[40], selector); - w[46] = hc_byte_perm (w[40], w[39], selector); - w[45] = hc_byte_perm (w[39], w[38], selector); - w[44] = hc_byte_perm (w[38], w[37], selector); - w[43] = hc_byte_perm (w[37], w[36], selector); - w[42] = hc_byte_perm (w[36], w[35], selector); - w[41] = hc_byte_perm (w[35], w[34], selector); - w[40] = hc_byte_perm (w[34], w[33], selector); - w[39] = hc_byte_perm (w[33], w[32], selector); - w[38] = hc_byte_perm (w[32], w[31], selector); - w[37] = hc_byte_perm (w[31], w[30], selector); - w[36] = hc_byte_perm (w[30], w[29], selector); - w[35] 
= hc_byte_perm (w[29], w[28], selector); - w[34] = hc_byte_perm (w[28], w[27], selector); - w[33] = hc_byte_perm (w[27], w[26], selector); - w[32] = hc_byte_perm (w[26], w[25], selector); - w[31] = hc_byte_perm (w[25], w[24], selector); - w[30] = hc_byte_perm (w[24], w[23], selector); - w[29] = hc_byte_perm (w[23], w[22], selector); - w[28] = hc_byte_perm (w[22], w[21], selector); - w[27] = hc_byte_perm (w[21], w[20], selector); - w[26] = hc_byte_perm (w[20], w[19], selector); - w[25] = hc_byte_perm (w[19], w[18], selector); - w[24] = hc_byte_perm (w[18], w[17], selector); - w[23] = hc_byte_perm (w[17], w[16], selector); - w[22] = hc_byte_perm (w[16], w[15], selector); - w[21] = hc_byte_perm (w[15], w[14], selector); - w[20] = hc_byte_perm (w[14], w[13], selector); - w[19] = hc_byte_perm (w[13], w[12], selector); - w[18] = hc_byte_perm (w[12], w[11], selector); - w[17] = hc_byte_perm (w[11], w[10], selector); - w[16] = hc_byte_perm (w[10], w[ 9], selector); - w[15] = hc_byte_perm (w[ 9], w[ 8], selector); - w[14] = hc_byte_perm (w[ 8], w[ 7], selector); - w[13] = hc_byte_perm (w[ 7], w[ 6], selector); - w[12] = hc_byte_perm (w[ 6], w[ 5], selector); - w[11] = hc_byte_perm (w[ 5], w[ 4], selector); - w[10] = hc_byte_perm (w[ 4], w[ 3], selector); - w[ 9] = hc_byte_perm (w[ 3], w[ 2], selector); - w[ 8] = hc_byte_perm (w[ 2], w[ 1], selector); - w[ 7] = hc_byte_perm (w[ 1], w[ 0], selector); - w[ 6] = hc_byte_perm (w[ 0], 0, selector); + case 19: + w[63] = hc_byte_perm (w[44], w[43], selector); + w[62] = hc_byte_perm (w[43], w[42], selector); + w[61] = hc_byte_perm (w[42], w[41], selector); + w[60] = hc_byte_perm (w[41], w[40], selector); + w[59] = hc_byte_perm (w[40], w[39], selector); + w[58] = hc_byte_perm (w[39], w[38], selector); + w[57] = hc_byte_perm (w[38], w[37], selector); + w[56] = hc_byte_perm (w[37], w[36], selector); + w[55] = hc_byte_perm (w[36], w[35], selector); + w[54] = hc_byte_perm (w[35], w[34], selector); + w[53] = hc_byte_perm (w[34], w[33], 
selector); + w[52] = hc_byte_perm (w[33], w[32], selector); + w[51] = hc_byte_perm (w[32], w[31], selector); + w[50] = hc_byte_perm (w[31], w[30], selector); + w[49] = hc_byte_perm (w[30], w[29], selector); + w[48] = hc_byte_perm (w[29], w[28], selector); + w[47] = hc_byte_perm (w[28], w[27], selector); + w[46] = hc_byte_perm (w[27], w[26], selector); + w[45] = hc_byte_perm (w[26], w[25], selector); + w[44] = hc_byte_perm (w[25], w[24], selector); + w[43] = hc_byte_perm (w[24], w[23], selector); + w[42] = hc_byte_perm (w[23], w[22], selector); + w[41] = hc_byte_perm (w[22], w[21], selector); + w[40] = hc_byte_perm (w[21], w[20], selector); + w[39] = hc_byte_perm (w[20], w[19], selector); + w[38] = hc_byte_perm (w[19], w[18], selector); + w[37] = hc_byte_perm (w[18], w[17], selector); + w[36] = hc_byte_perm (w[17], w[16], selector); + w[35] = hc_byte_perm (w[16], w[15], selector); + w[34] = hc_byte_perm (w[15], w[14], selector); + w[33] = hc_byte_perm (w[14], w[13], selector); + w[32] = hc_byte_perm (w[13], w[12], selector); + w[31] = hc_byte_perm (w[12], w[11], selector); + w[30] = hc_byte_perm (w[11], w[10], selector); + w[29] = hc_byte_perm (w[10], w[ 9], selector); + w[28] = hc_byte_perm (w[ 9], w[ 8], selector); + w[27] = hc_byte_perm (w[ 8], w[ 7], selector); + w[26] = hc_byte_perm (w[ 7], w[ 6], selector); + w[25] = hc_byte_perm (w[ 6], w[ 5], selector); + w[24] = hc_byte_perm (w[ 5], w[ 4], selector); + w[23] = hc_byte_perm (w[ 4], w[ 3], selector); + w[22] = hc_byte_perm (w[ 3], w[ 2], selector); + w[21] = hc_byte_perm (w[ 2], w[ 1], selector); + w[20] = hc_byte_perm (w[ 1], w[ 0], selector); + w[19] = hc_byte_perm (w[ 0], 0, selector); + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; @@ -26792,64 +31064,132 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; - case 7: - 
w[63] = hc_byte_perm (w[56], w[55], selector); - w[62] = hc_byte_perm (w[55], w[54], selector); - w[61] = hc_byte_perm (w[54], w[53], selector); - w[60] = hc_byte_perm (w[53], w[52], selector); - w[59] = hc_byte_perm (w[52], w[51], selector); - w[58] = hc_byte_perm (w[51], w[50], selector); - w[57] = hc_byte_perm (w[50], w[49], selector); - w[56] = hc_byte_perm (w[49], w[48], selector); - w[55] = hc_byte_perm (w[48], w[47], selector); - w[54] = hc_byte_perm (w[47], w[46], selector); - w[53] = hc_byte_perm (w[46], w[45], selector); - w[52] = hc_byte_perm (w[45], w[44], selector); - w[51] = hc_byte_perm (w[44], w[43], selector); - w[50] = hc_byte_perm (w[43], w[42], selector); - w[49] = hc_byte_perm (w[42], w[41], selector); - w[48] = hc_byte_perm (w[41], w[40], selector); - w[47] = hc_byte_perm (w[40], w[39], selector); - w[46] = hc_byte_perm (w[39], w[38], selector); - w[45] = hc_byte_perm (w[38], w[37], selector); - w[44] = hc_byte_perm (w[37], w[36], selector); - w[43] = hc_byte_perm (w[36], w[35], selector); - w[42] = hc_byte_perm (w[35], w[34], selector); - w[41] = hc_byte_perm (w[34], w[33], selector); - w[40] = hc_byte_perm (w[33], w[32], selector); - w[39] = hc_byte_perm (w[32], w[31], selector); - w[38] = hc_byte_perm (w[31], w[30], selector); - w[37] = hc_byte_perm (w[30], w[29], selector); - w[36] = hc_byte_perm (w[29], w[28], selector); - w[35] = hc_byte_perm (w[28], w[27], selector); - w[34] = hc_byte_perm (w[27], w[26], selector); - w[33] = hc_byte_perm (w[26], w[25], selector); - w[32] = hc_byte_perm (w[25], w[24], selector); - w[31] = hc_byte_perm (w[24], w[23], selector); - w[30] = hc_byte_perm (w[23], w[22], selector); - w[29] = hc_byte_perm (w[22], w[21], selector); - w[28] = hc_byte_perm (w[21], w[20], selector); - w[27] = hc_byte_perm (w[20], w[19], selector); - w[26] = hc_byte_perm (w[19], w[18], selector); - w[25] = hc_byte_perm (w[18], w[17], selector); - w[24] = hc_byte_perm (w[17], w[16], selector); - w[23] = hc_byte_perm (w[16], w[15], 
selector); - w[22] = hc_byte_perm (w[15], w[14], selector); - w[21] = hc_byte_perm (w[14], w[13], selector); - w[20] = hc_byte_perm (w[13], w[12], selector); - w[19] = hc_byte_perm (w[12], w[11], selector); - w[18] = hc_byte_perm (w[11], w[10], selector); - w[17] = hc_byte_perm (w[10], w[ 9], selector); - w[16] = hc_byte_perm (w[ 9], w[ 8], selector); - w[15] = hc_byte_perm (w[ 8], w[ 7], selector); - w[14] = hc_byte_perm (w[ 7], w[ 6], selector); - w[13] = hc_byte_perm (w[ 6], w[ 5], selector); - w[12] = hc_byte_perm (w[ 5], w[ 4], selector); - w[11] = hc_byte_perm (w[ 4], w[ 3], selector); - w[10] = hc_byte_perm (w[ 3], w[ 2], selector); - w[ 9] = hc_byte_perm (w[ 2], w[ 1], selector); - w[ 8] = hc_byte_perm (w[ 1], w[ 0], selector); - w[ 7] = hc_byte_perm (w[ 0], 0, selector); + case 20: + w[63] = hc_byte_perm (w[43], w[42], selector); + w[62] = hc_byte_perm (w[42], w[41], selector); + w[61] = hc_byte_perm (w[41], w[40], selector); + w[60] = hc_byte_perm (w[40], w[39], selector); + w[59] = hc_byte_perm (w[39], w[38], selector); + w[58] = hc_byte_perm (w[38], w[37], selector); + w[57] = hc_byte_perm (w[37], w[36], selector); + w[56] = hc_byte_perm (w[36], w[35], selector); + w[55] = hc_byte_perm (w[35], w[34], selector); + w[54] = hc_byte_perm (w[34], w[33], selector); + w[53] = hc_byte_perm (w[33], w[32], selector); + w[52] = hc_byte_perm (w[32], w[31], selector); + w[51] = hc_byte_perm (w[31], w[30], selector); + w[50] = hc_byte_perm (w[30], w[29], selector); + w[49] = hc_byte_perm (w[29], w[28], selector); + w[48] = hc_byte_perm (w[28], w[27], selector); + w[47] = hc_byte_perm (w[27], w[26], selector); + w[46] = hc_byte_perm (w[26], w[25], selector); + w[45] = hc_byte_perm (w[25], w[24], selector); + w[44] = hc_byte_perm (w[24], w[23], selector); + w[43] = hc_byte_perm (w[23], w[22], selector); + w[42] = hc_byte_perm (w[22], w[21], selector); + w[41] = hc_byte_perm (w[21], w[20], selector); + w[40] = hc_byte_perm (w[20], w[19], selector); + w[39] = 
hc_byte_perm (w[19], w[18], selector); + w[38] = hc_byte_perm (w[18], w[17], selector); + w[37] = hc_byte_perm (w[17], w[16], selector); + w[36] = hc_byte_perm (w[16], w[15], selector); + w[35] = hc_byte_perm (w[15], w[14], selector); + w[34] = hc_byte_perm (w[14], w[13], selector); + w[33] = hc_byte_perm (w[13], w[12], selector); + w[32] = hc_byte_perm (w[12], w[11], selector); + w[31] = hc_byte_perm (w[11], w[10], selector); + w[30] = hc_byte_perm (w[10], w[ 9], selector); + w[29] = hc_byte_perm (w[ 9], w[ 8], selector); + w[28] = hc_byte_perm (w[ 8], w[ 7], selector); + w[27] = hc_byte_perm (w[ 7], w[ 6], selector); + w[26] = hc_byte_perm (w[ 6], w[ 5], selector); + w[25] = hc_byte_perm (w[ 5], w[ 4], selector); + w[24] = hc_byte_perm (w[ 4], w[ 3], selector); + w[23] = hc_byte_perm (w[ 3], w[ 2], selector); + w[22] = hc_byte_perm (w[ 2], w[ 1], selector); + w[21] = hc_byte_perm (w[ 1], w[ 0], selector); + w[20] = hc_byte_perm (w[ 0], 0, selector); + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 21: + w[63] = hc_byte_perm (w[42], w[41], selector); + w[62] = hc_byte_perm (w[41], w[40], selector); + w[61] = hc_byte_perm (w[40], w[39], selector); + w[60] = hc_byte_perm (w[39], w[38], selector); + w[59] = hc_byte_perm (w[38], w[37], selector); + w[58] = hc_byte_perm (w[37], w[36], selector); + w[57] = hc_byte_perm (w[36], w[35], selector); + w[56] = hc_byte_perm (w[35], w[34], selector); + w[55] = hc_byte_perm (w[34], w[33], selector); + w[54] = hc_byte_perm (w[33], w[32], selector); + w[53] = hc_byte_perm (w[32], w[31], selector); + w[52] = hc_byte_perm (w[31], w[30], selector); + w[51] = hc_byte_perm (w[30], w[29], selector); + w[50] = hc_byte_perm (w[29], w[28], selector); + w[49] = hc_byte_perm (w[28], w[27], selector); + w[48] = 
hc_byte_perm (w[27], w[26], selector); + w[47] = hc_byte_perm (w[26], w[25], selector); + w[46] = hc_byte_perm (w[25], w[24], selector); + w[45] = hc_byte_perm (w[24], w[23], selector); + w[44] = hc_byte_perm (w[23], w[22], selector); + w[43] = hc_byte_perm (w[22], w[21], selector); + w[42] = hc_byte_perm (w[21], w[20], selector); + w[41] = hc_byte_perm (w[20], w[19], selector); + w[40] = hc_byte_perm (w[19], w[18], selector); + w[39] = hc_byte_perm (w[18], w[17], selector); + w[38] = hc_byte_perm (w[17], w[16], selector); + w[37] = hc_byte_perm (w[16], w[15], selector); + w[36] = hc_byte_perm (w[15], w[14], selector); + w[35] = hc_byte_perm (w[14], w[13], selector); + w[34] = hc_byte_perm (w[13], w[12], selector); + w[33] = hc_byte_perm (w[12], w[11], selector); + w[32] = hc_byte_perm (w[11], w[10], selector); + w[31] = hc_byte_perm (w[10], w[ 9], selector); + w[30] = hc_byte_perm (w[ 9], w[ 8], selector); + w[29] = hc_byte_perm (w[ 8], w[ 7], selector); + w[28] = hc_byte_perm (w[ 7], w[ 6], selector); + w[27] = hc_byte_perm (w[ 6], w[ 5], selector); + w[26] = hc_byte_perm (w[ 5], w[ 4], selector); + w[25] = hc_byte_perm (w[ 4], w[ 3], selector); + w[24] = hc_byte_perm (w[ 3], w[ 2], selector); + w[23] = hc_byte_perm (w[ 2], w[ 1], selector); + w[22] = hc_byte_perm (w[ 1], w[ 0], selector); + w[21] = hc_byte_perm (w[ 0], 0, selector); + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; @@ -26860,63 +31200,63 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; - case 8: - w[63] = hc_byte_perm (w[55], w[54], selector); - w[62] = hc_byte_perm (w[54], w[53], selector); - w[61] = hc_byte_perm (w[53], w[52], selector); - w[60] = hc_byte_perm (w[52], w[51], selector); - w[59] = hc_byte_perm (w[51], w[50], selector); - w[58] = hc_byte_perm (w[50], w[49], selector); - w[57] = 
hc_byte_perm (w[49], w[48], selector); - w[56] = hc_byte_perm (w[48], w[47], selector); - w[55] = hc_byte_perm (w[47], w[46], selector); - w[54] = hc_byte_perm (w[46], w[45], selector); - w[53] = hc_byte_perm (w[45], w[44], selector); - w[52] = hc_byte_perm (w[44], w[43], selector); - w[51] = hc_byte_perm (w[43], w[42], selector); - w[50] = hc_byte_perm (w[42], w[41], selector); - w[49] = hc_byte_perm (w[41], w[40], selector); - w[48] = hc_byte_perm (w[40], w[39], selector); - w[47] = hc_byte_perm (w[39], w[38], selector); - w[46] = hc_byte_perm (w[38], w[37], selector); - w[45] = hc_byte_perm (w[37], w[36], selector); - w[44] = hc_byte_perm (w[36], w[35], selector); - w[43] = hc_byte_perm (w[35], w[34], selector); - w[42] = hc_byte_perm (w[34], w[33], selector); - w[41] = hc_byte_perm (w[33], w[32], selector); - w[40] = hc_byte_perm (w[32], w[31], selector); - w[39] = hc_byte_perm (w[31], w[30], selector); - w[38] = hc_byte_perm (w[30], w[29], selector); - w[37] = hc_byte_perm (w[29], w[28], selector); - w[36] = hc_byte_perm (w[28], w[27], selector); - w[35] = hc_byte_perm (w[27], w[26], selector); - w[34] = hc_byte_perm (w[26], w[25], selector); - w[33] = hc_byte_perm (w[25], w[24], selector); - w[32] = hc_byte_perm (w[24], w[23], selector); - w[31] = hc_byte_perm (w[23], w[22], selector); - w[30] = hc_byte_perm (w[22], w[21], selector); - w[29] = hc_byte_perm (w[21], w[20], selector); - w[28] = hc_byte_perm (w[20], w[19], selector); - w[27] = hc_byte_perm (w[19], w[18], selector); - w[26] = hc_byte_perm (w[18], w[17], selector); - w[25] = hc_byte_perm (w[17], w[16], selector); - w[24] = hc_byte_perm (w[16], w[15], selector); - w[23] = hc_byte_perm (w[15], w[14], selector); - w[22] = hc_byte_perm (w[14], w[13], selector); - w[21] = hc_byte_perm (w[13], w[12], selector); - w[20] = hc_byte_perm (w[12], w[11], selector); - w[19] = hc_byte_perm (w[11], w[10], selector); - w[18] = hc_byte_perm (w[10], w[ 9], selector); - w[17] = hc_byte_perm (w[ 9], w[ 8], selector); 
- w[16] = hc_byte_perm (w[ 8], w[ 7], selector); - w[15] = hc_byte_perm (w[ 7], w[ 6], selector); - w[14] = hc_byte_perm (w[ 6], w[ 5], selector); - w[13] = hc_byte_perm (w[ 5], w[ 4], selector); - w[12] = hc_byte_perm (w[ 4], w[ 3], selector); - w[11] = hc_byte_perm (w[ 3], w[ 2], selector); - w[10] = hc_byte_perm (w[ 2], w[ 1], selector); - w[ 9] = hc_byte_perm (w[ 1], w[ 0], selector); - w[ 8] = hc_byte_perm (w[ 0], 0, selector); + case 22: + w[63] = hc_byte_perm (w[41], w[40], selector); + w[62] = hc_byte_perm (w[40], w[39], selector); + w[61] = hc_byte_perm (w[39], w[38], selector); + w[60] = hc_byte_perm (w[38], w[37], selector); + w[59] = hc_byte_perm (w[37], w[36], selector); + w[58] = hc_byte_perm (w[36], w[35], selector); + w[57] = hc_byte_perm (w[35], w[34], selector); + w[56] = hc_byte_perm (w[34], w[33], selector); + w[55] = hc_byte_perm (w[33], w[32], selector); + w[54] = hc_byte_perm (w[32], w[31], selector); + w[53] = hc_byte_perm (w[31], w[30], selector); + w[52] = hc_byte_perm (w[30], w[29], selector); + w[51] = hc_byte_perm (w[29], w[28], selector); + w[50] = hc_byte_perm (w[28], w[27], selector); + w[49] = hc_byte_perm (w[27], w[26], selector); + w[48] = hc_byte_perm (w[26], w[25], selector); + w[47] = hc_byte_perm (w[25], w[24], selector); + w[46] = hc_byte_perm (w[24], w[23], selector); + w[45] = hc_byte_perm (w[23], w[22], selector); + w[44] = hc_byte_perm (w[22], w[21], selector); + w[43] = hc_byte_perm (w[21], w[20], selector); + w[42] = hc_byte_perm (w[20], w[19], selector); + w[41] = hc_byte_perm (w[19], w[18], selector); + w[40] = hc_byte_perm (w[18], w[17], selector); + w[39] = hc_byte_perm (w[17], w[16], selector); + w[38] = hc_byte_perm (w[16], w[15], selector); + w[37] = hc_byte_perm (w[15], w[14], selector); + w[36] = hc_byte_perm (w[14], w[13], selector); + w[35] = hc_byte_perm (w[13], w[12], selector); + w[34] = hc_byte_perm (w[12], w[11], selector); + w[33] = hc_byte_perm (w[11], w[10], selector); + w[32] = hc_byte_perm (w[10], 
w[ 9], selector); + w[31] = hc_byte_perm (w[ 9], w[ 8], selector); + w[30] = hc_byte_perm (w[ 8], w[ 7], selector); + w[29] = hc_byte_perm (w[ 7], w[ 6], selector); + w[28] = hc_byte_perm (w[ 6], w[ 5], selector); + w[27] = hc_byte_perm (w[ 5], w[ 4], selector); + w[26] = hc_byte_perm (w[ 4], w[ 3], selector); + w[25] = hc_byte_perm (w[ 3], w[ 2], selector); + w[24] = hc_byte_perm (w[ 2], w[ 1], selector); + w[23] = hc_byte_perm (w[ 1], w[ 0], selector); + w[22] = hc_byte_perm (w[ 0], 0, selector); + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; @@ -26928,62 +31268,62 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; - case 9: - w[63] = hc_byte_perm (w[54], w[53], selector); - w[62] = hc_byte_perm (w[53], w[52], selector); - w[61] = hc_byte_perm (w[52], w[51], selector); - w[60] = hc_byte_perm (w[51], w[50], selector); - w[59] = hc_byte_perm (w[50], w[49], selector); - w[58] = hc_byte_perm (w[49], w[48], selector); - w[57] = hc_byte_perm (w[48], w[47], selector); - w[56] = hc_byte_perm (w[47], w[46], selector); - w[55] = hc_byte_perm (w[46], w[45], selector); - w[54] = hc_byte_perm (w[45], w[44], selector); - w[53] = hc_byte_perm (w[44], w[43], selector); - w[52] = hc_byte_perm (w[43], w[42], selector); - w[51] = hc_byte_perm (w[42], w[41], selector); - w[50] = hc_byte_perm (w[41], w[40], selector); - w[49] = hc_byte_perm (w[40], w[39], selector); - w[48] = hc_byte_perm (w[39], w[38], selector); - w[47] = hc_byte_perm (w[38], w[37], selector); - w[46] = hc_byte_perm (w[37], w[36], selector); - w[45] = hc_byte_perm (w[36], w[35], selector); - w[44] = hc_byte_perm (w[35], w[34], selector); - w[43] = hc_byte_perm (w[34], w[33], selector); - w[42] = hc_byte_perm (w[33], w[32], selector); - w[41] = hc_byte_perm (w[32], w[31], selector); - w[40] = hc_byte_perm (w[31], 
w[30], selector); - w[39] = hc_byte_perm (w[30], w[29], selector); - w[38] = hc_byte_perm (w[29], w[28], selector); - w[37] = hc_byte_perm (w[28], w[27], selector); - w[36] = hc_byte_perm (w[27], w[26], selector); - w[35] = hc_byte_perm (w[26], w[25], selector); - w[34] = hc_byte_perm (w[25], w[24], selector); - w[33] = hc_byte_perm (w[24], w[23], selector); - w[32] = hc_byte_perm (w[23], w[22], selector); - w[31] = hc_byte_perm (w[22], w[21], selector); - w[30] = hc_byte_perm (w[21], w[20], selector); - w[29] = hc_byte_perm (w[20], w[19], selector); - w[28] = hc_byte_perm (w[19], w[18], selector); - w[27] = hc_byte_perm (w[18], w[17], selector); - w[26] = hc_byte_perm (w[17], w[16], selector); - w[25] = hc_byte_perm (w[16], w[15], selector); - w[24] = hc_byte_perm (w[15], w[14], selector); - w[23] = hc_byte_perm (w[14], w[13], selector); - w[22] = hc_byte_perm (w[13], w[12], selector); - w[21] = hc_byte_perm (w[12], w[11], selector); - w[20] = hc_byte_perm (w[11], w[10], selector); - w[19] = hc_byte_perm (w[10], w[ 9], selector); - w[18] = hc_byte_perm (w[ 9], w[ 8], selector); - w[17] = hc_byte_perm (w[ 8], w[ 7], selector); - w[16] = hc_byte_perm (w[ 7], w[ 6], selector); - w[15] = hc_byte_perm (w[ 6], w[ 5], selector); - w[14] = hc_byte_perm (w[ 5], w[ 4], selector); - w[13] = hc_byte_perm (w[ 4], w[ 3], selector); - w[12] = hc_byte_perm (w[ 3], w[ 2], selector); - w[11] = hc_byte_perm (w[ 2], w[ 1], selector); - w[10] = hc_byte_perm (w[ 1], w[ 0], selector); - w[ 9] = hc_byte_perm (w[ 0], 0, selector); + case 23: + w[63] = hc_byte_perm (w[40], w[39], selector); + w[62] = hc_byte_perm (w[39], w[38], selector); + w[61] = hc_byte_perm (w[38], w[37], selector); + w[60] = hc_byte_perm (w[37], w[36], selector); + w[59] = hc_byte_perm (w[36], w[35], selector); + w[58] = hc_byte_perm (w[35], w[34], selector); + w[57] = hc_byte_perm (w[34], w[33], selector); + w[56] = hc_byte_perm (w[33], w[32], selector); + w[55] = hc_byte_perm (w[32], w[31], selector); + w[54] = 
hc_byte_perm (w[31], w[30], selector); + w[53] = hc_byte_perm (w[30], w[29], selector); + w[52] = hc_byte_perm (w[29], w[28], selector); + w[51] = hc_byte_perm (w[28], w[27], selector); + w[50] = hc_byte_perm (w[27], w[26], selector); + w[49] = hc_byte_perm (w[26], w[25], selector); + w[48] = hc_byte_perm (w[25], w[24], selector); + w[47] = hc_byte_perm (w[24], w[23], selector); + w[46] = hc_byte_perm (w[23], w[22], selector); + w[45] = hc_byte_perm (w[22], w[21], selector); + w[44] = hc_byte_perm (w[21], w[20], selector); + w[43] = hc_byte_perm (w[20], w[19], selector); + w[42] = hc_byte_perm (w[19], w[18], selector); + w[41] = hc_byte_perm (w[18], w[17], selector); + w[40] = hc_byte_perm (w[17], w[16], selector); + w[39] = hc_byte_perm (w[16], w[15], selector); + w[38] = hc_byte_perm (w[15], w[14], selector); + w[37] = hc_byte_perm (w[14], w[13], selector); + w[36] = hc_byte_perm (w[13], w[12], selector); + w[35] = hc_byte_perm (w[12], w[11], selector); + w[34] = hc_byte_perm (w[11], w[10], selector); + w[33] = hc_byte_perm (w[10], w[ 9], selector); + w[32] = hc_byte_perm (w[ 9], w[ 8], selector); + w[31] = hc_byte_perm (w[ 8], w[ 7], selector); + w[30] = hc_byte_perm (w[ 7], w[ 6], selector); + w[29] = hc_byte_perm (w[ 6], w[ 5], selector); + w[28] = hc_byte_perm (w[ 5], w[ 4], selector); + w[27] = hc_byte_perm (w[ 4], w[ 3], selector); + w[26] = hc_byte_perm (w[ 3], w[ 2], selector); + w[25] = hc_byte_perm (w[ 2], w[ 1], selector); + w[24] = hc_byte_perm (w[ 1], w[ 0], selector); + w[23] = hc_byte_perm (w[ 0], 0, selector); + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; @@ -26996,61 +31336,61 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; - case 10: - w[63] = hc_byte_perm (w[53], w[52], selector); - w[62] = hc_byte_perm (w[52], w[51], selector); - w[61] = 
hc_byte_perm (w[51], w[50], selector); - w[60] = hc_byte_perm (w[50], w[49], selector); - w[59] = hc_byte_perm (w[49], w[48], selector); - w[58] = hc_byte_perm (w[48], w[47], selector); - w[57] = hc_byte_perm (w[47], w[46], selector); - w[56] = hc_byte_perm (w[46], w[45], selector); - w[55] = hc_byte_perm (w[45], w[44], selector); - w[54] = hc_byte_perm (w[44], w[43], selector); - w[53] = hc_byte_perm (w[43], w[42], selector); - w[52] = hc_byte_perm (w[42], w[41], selector); - w[51] = hc_byte_perm (w[41], w[40], selector); - w[50] = hc_byte_perm (w[40], w[39], selector); - w[49] = hc_byte_perm (w[39], w[38], selector); - w[48] = hc_byte_perm (w[38], w[37], selector); - w[47] = hc_byte_perm (w[37], w[36], selector); - w[46] = hc_byte_perm (w[36], w[35], selector); - w[45] = hc_byte_perm (w[35], w[34], selector); - w[44] = hc_byte_perm (w[34], w[33], selector); - w[43] = hc_byte_perm (w[33], w[32], selector); - w[42] = hc_byte_perm (w[32], w[31], selector); - w[41] = hc_byte_perm (w[31], w[30], selector); - w[40] = hc_byte_perm (w[30], w[29], selector); - w[39] = hc_byte_perm (w[29], w[28], selector); - w[38] = hc_byte_perm (w[28], w[27], selector); - w[37] = hc_byte_perm (w[27], w[26], selector); - w[36] = hc_byte_perm (w[26], w[25], selector); - w[35] = hc_byte_perm (w[25], w[24], selector); - w[34] = hc_byte_perm (w[24], w[23], selector); - w[33] = hc_byte_perm (w[23], w[22], selector); - w[32] = hc_byte_perm (w[22], w[21], selector); - w[31] = hc_byte_perm (w[21], w[20], selector); - w[30] = hc_byte_perm (w[20], w[19], selector); - w[29] = hc_byte_perm (w[19], w[18], selector); - w[28] = hc_byte_perm (w[18], w[17], selector); - w[27] = hc_byte_perm (w[17], w[16], selector); - w[26] = hc_byte_perm (w[16], w[15], selector); - w[25] = hc_byte_perm (w[15], w[14], selector); - w[24] = hc_byte_perm (w[14], w[13], selector); - w[23] = hc_byte_perm (w[13], w[12], selector); - w[22] = hc_byte_perm (w[12], w[11], selector); - w[21] = hc_byte_perm (w[11], w[10], selector); 
- w[20] = hc_byte_perm (w[10], w[ 9], selector); - w[19] = hc_byte_perm (w[ 9], w[ 8], selector); - w[18] = hc_byte_perm (w[ 8], w[ 7], selector); - w[17] = hc_byte_perm (w[ 7], w[ 6], selector); - w[16] = hc_byte_perm (w[ 6], w[ 5], selector); - w[15] = hc_byte_perm (w[ 5], w[ 4], selector); - w[14] = hc_byte_perm (w[ 4], w[ 3], selector); - w[13] = hc_byte_perm (w[ 3], w[ 2], selector); - w[12] = hc_byte_perm (w[ 2], w[ 1], selector); - w[11] = hc_byte_perm (w[ 1], w[ 0], selector); - w[10] = hc_byte_perm (w[ 0], 0, selector); + case 24: + w[63] = hc_byte_perm (w[39], w[38], selector); + w[62] = hc_byte_perm (w[38], w[37], selector); + w[61] = hc_byte_perm (w[37], w[36], selector); + w[60] = hc_byte_perm (w[36], w[35], selector); + w[59] = hc_byte_perm (w[35], w[34], selector); + w[58] = hc_byte_perm (w[34], w[33], selector); + w[57] = hc_byte_perm (w[33], w[32], selector); + w[56] = hc_byte_perm (w[32], w[31], selector); + w[55] = hc_byte_perm (w[31], w[30], selector); + w[54] = hc_byte_perm (w[30], w[29], selector); + w[53] = hc_byte_perm (w[29], w[28], selector); + w[52] = hc_byte_perm (w[28], w[27], selector); + w[51] = hc_byte_perm (w[27], w[26], selector); + w[50] = hc_byte_perm (w[26], w[25], selector); + w[49] = hc_byte_perm (w[25], w[24], selector); + w[48] = hc_byte_perm (w[24], w[23], selector); + w[47] = hc_byte_perm (w[23], w[22], selector); + w[46] = hc_byte_perm (w[22], w[21], selector); + w[45] = hc_byte_perm (w[21], w[20], selector); + w[44] = hc_byte_perm (w[20], w[19], selector); + w[43] = hc_byte_perm (w[19], w[18], selector); + w[42] = hc_byte_perm (w[18], w[17], selector); + w[41] = hc_byte_perm (w[17], w[16], selector); + w[40] = hc_byte_perm (w[16], w[15], selector); + w[39] = hc_byte_perm (w[15], w[14], selector); + w[38] = hc_byte_perm (w[14], w[13], selector); + w[37] = hc_byte_perm (w[13], w[12], selector); + w[36] = hc_byte_perm (w[12], w[11], selector); + w[35] = hc_byte_perm (w[11], w[10], selector); + w[34] = hc_byte_perm (w[10], 
w[ 9], selector); + w[33] = hc_byte_perm (w[ 9], w[ 8], selector); + w[32] = hc_byte_perm (w[ 8], w[ 7], selector); + w[31] = hc_byte_perm (w[ 7], w[ 6], selector); + w[30] = hc_byte_perm (w[ 6], w[ 5], selector); + w[29] = hc_byte_perm (w[ 5], w[ 4], selector); + w[28] = hc_byte_perm (w[ 4], w[ 3], selector); + w[27] = hc_byte_perm (w[ 3], w[ 2], selector); + w[26] = hc_byte_perm (w[ 2], w[ 1], selector); + w[25] = hc_byte_perm (w[ 1], w[ 0], selector); + w[24] = hc_byte_perm (w[ 0], 0, selector); + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; @@ -27064,60 +31404,128 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; - case 11: - w[63] = hc_byte_perm (w[52], w[51], selector); - w[62] = hc_byte_perm (w[51], w[50], selector); - w[61] = hc_byte_perm (w[50], w[49], selector); - w[60] = hc_byte_perm (w[49], w[48], selector); - w[59] = hc_byte_perm (w[48], w[47], selector); - w[58] = hc_byte_perm (w[47], w[46], selector); - w[57] = hc_byte_perm (w[46], w[45], selector); - w[56] = hc_byte_perm (w[45], w[44], selector); - w[55] = hc_byte_perm (w[44], w[43], selector); - w[54] = hc_byte_perm (w[43], w[42], selector); - w[53] = hc_byte_perm (w[42], w[41], selector); - w[52] = hc_byte_perm (w[41], w[40], selector); - w[51] = hc_byte_perm (w[40], w[39], selector); - w[50] = hc_byte_perm (w[39], w[38], selector); - w[49] = hc_byte_perm (w[38], w[37], selector); - w[48] = hc_byte_perm (w[37], w[36], selector); - w[47] = hc_byte_perm (w[36], w[35], selector); - w[46] = hc_byte_perm (w[35], w[34], selector); - w[45] = hc_byte_perm (w[34], w[33], selector); - w[44] = hc_byte_perm (w[33], w[32], selector); - w[43] = hc_byte_perm (w[32], w[31], selector); - w[42] = hc_byte_perm (w[31], w[30], selector); - w[41] = hc_byte_perm (w[30], w[29], selector); - w[40] = hc_byte_perm (w[29], 
w[28], selector); - w[39] = hc_byte_perm (w[28], w[27], selector); - w[38] = hc_byte_perm (w[27], w[26], selector); - w[37] = hc_byte_perm (w[26], w[25], selector); - w[36] = hc_byte_perm (w[25], w[24], selector); - w[35] = hc_byte_perm (w[24], w[23], selector); - w[34] = hc_byte_perm (w[23], w[22], selector); - w[33] = hc_byte_perm (w[22], w[21], selector); - w[32] = hc_byte_perm (w[21], w[20], selector); - w[31] = hc_byte_perm (w[20], w[19], selector); - w[30] = hc_byte_perm (w[19], w[18], selector); - w[29] = hc_byte_perm (w[18], w[17], selector); - w[28] = hc_byte_perm (w[17], w[16], selector); - w[27] = hc_byte_perm (w[16], w[15], selector); - w[26] = hc_byte_perm (w[15], w[14], selector); - w[25] = hc_byte_perm (w[14], w[13], selector); - w[24] = hc_byte_perm (w[13], w[12], selector); - w[23] = hc_byte_perm (w[12], w[11], selector); - w[22] = hc_byte_perm (w[11], w[10], selector); - w[21] = hc_byte_perm (w[10], w[ 9], selector); - w[20] = hc_byte_perm (w[ 9], w[ 8], selector); - w[19] = hc_byte_perm (w[ 8], w[ 7], selector); - w[18] = hc_byte_perm (w[ 7], w[ 6], selector); - w[17] = hc_byte_perm (w[ 6], w[ 5], selector); - w[16] = hc_byte_perm (w[ 5], w[ 4], selector); - w[15] = hc_byte_perm (w[ 4], w[ 3], selector); - w[14] = hc_byte_perm (w[ 3], w[ 2], selector); - w[13] = hc_byte_perm (w[ 2], w[ 1], selector); - w[12] = hc_byte_perm (w[ 1], w[ 0], selector); - w[11] = hc_byte_perm (w[ 0], 0, selector); + case 25: + w[63] = hc_byte_perm (w[38], w[37], selector); + w[62] = hc_byte_perm (w[37], w[36], selector); + w[61] = hc_byte_perm (w[36], w[35], selector); + w[60] = hc_byte_perm (w[35], w[34], selector); + w[59] = hc_byte_perm (w[34], w[33], selector); + w[58] = hc_byte_perm (w[33], w[32], selector); + w[57] = hc_byte_perm (w[32], w[31], selector); + w[56] = hc_byte_perm (w[31], w[30], selector); + w[55] = hc_byte_perm (w[30], w[29], selector); + w[54] = hc_byte_perm (w[29], w[28], selector); + w[53] = hc_byte_perm (w[28], w[27], selector); + w[52] = 
hc_byte_perm (w[27], w[26], selector); + w[51] = hc_byte_perm (w[26], w[25], selector); + w[50] = hc_byte_perm (w[25], w[24], selector); + w[49] = hc_byte_perm (w[24], w[23], selector); + w[48] = hc_byte_perm (w[23], w[22], selector); + w[47] = hc_byte_perm (w[22], w[21], selector); + w[46] = hc_byte_perm (w[21], w[20], selector); + w[45] = hc_byte_perm (w[20], w[19], selector); + w[44] = hc_byte_perm (w[19], w[18], selector); + w[43] = hc_byte_perm (w[18], w[17], selector); + w[42] = hc_byte_perm (w[17], w[16], selector); + w[41] = hc_byte_perm (w[16], w[15], selector); + w[40] = hc_byte_perm (w[15], w[14], selector); + w[39] = hc_byte_perm (w[14], w[13], selector); + w[38] = hc_byte_perm (w[13], w[12], selector); + w[37] = hc_byte_perm (w[12], w[11], selector); + w[36] = hc_byte_perm (w[11], w[10], selector); + w[35] = hc_byte_perm (w[10], w[ 9], selector); + w[34] = hc_byte_perm (w[ 9], w[ 8], selector); + w[33] = hc_byte_perm (w[ 8], w[ 7], selector); + w[32] = hc_byte_perm (w[ 7], w[ 6], selector); + w[31] = hc_byte_perm (w[ 6], w[ 5], selector); + w[30] = hc_byte_perm (w[ 5], w[ 4], selector); + w[29] = hc_byte_perm (w[ 4], w[ 3], selector); + w[28] = hc_byte_perm (w[ 3], w[ 2], selector); + w[27] = hc_byte_perm (w[ 2], w[ 1], selector); + w[26] = hc_byte_perm (w[ 1], w[ 0], selector); + w[25] = hc_byte_perm (w[ 0], 0, selector); + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 26: + w[63] = hc_byte_perm (w[37], w[36], selector); + w[62] = hc_byte_perm (w[36], w[35], selector); + w[61] = hc_byte_perm (w[35], w[34], selector); + w[60] = hc_byte_perm (w[34], w[33], selector); + w[59] = hc_byte_perm (w[33], w[32], selector); + w[58] = hc_byte_perm (w[32], w[31], 
selector); + w[57] = hc_byte_perm (w[31], w[30], selector); + w[56] = hc_byte_perm (w[30], w[29], selector); + w[55] = hc_byte_perm (w[29], w[28], selector); + w[54] = hc_byte_perm (w[28], w[27], selector); + w[53] = hc_byte_perm (w[27], w[26], selector); + w[52] = hc_byte_perm (w[26], w[25], selector); + w[51] = hc_byte_perm (w[25], w[24], selector); + w[50] = hc_byte_perm (w[24], w[23], selector); + w[49] = hc_byte_perm (w[23], w[22], selector); + w[48] = hc_byte_perm (w[22], w[21], selector); + w[47] = hc_byte_perm (w[21], w[20], selector); + w[46] = hc_byte_perm (w[20], w[19], selector); + w[45] = hc_byte_perm (w[19], w[18], selector); + w[44] = hc_byte_perm (w[18], w[17], selector); + w[43] = hc_byte_perm (w[17], w[16], selector); + w[42] = hc_byte_perm (w[16], w[15], selector); + w[41] = hc_byte_perm (w[15], w[14], selector); + w[40] = hc_byte_perm (w[14], w[13], selector); + w[39] = hc_byte_perm (w[13], w[12], selector); + w[38] = hc_byte_perm (w[12], w[11], selector); + w[37] = hc_byte_perm (w[11], w[10], selector); + w[36] = hc_byte_perm (w[10], w[ 9], selector); + w[35] = hc_byte_perm (w[ 9], w[ 8], selector); + w[34] = hc_byte_perm (w[ 8], w[ 7], selector); + w[33] = hc_byte_perm (w[ 7], w[ 6], selector); + w[32] = hc_byte_perm (w[ 6], w[ 5], selector); + w[31] = hc_byte_perm (w[ 5], w[ 4], selector); + w[30] = hc_byte_perm (w[ 4], w[ 3], selector); + w[29] = hc_byte_perm (w[ 3], w[ 2], selector); + w[28] = hc_byte_perm (w[ 2], w[ 1], selector); + w[27] = hc_byte_perm (w[ 1], w[ 0], selector); + w[26] = hc_byte_perm (w[ 0], 0, selector); + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; @@ -27132,59 +31540,59 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; - case 12: - w[63] = hc_byte_perm (w[51], w[50], selector); - w[62] = hc_byte_perm 
(w[50], w[49], selector); - w[61] = hc_byte_perm (w[49], w[48], selector); - w[60] = hc_byte_perm (w[48], w[47], selector); - w[59] = hc_byte_perm (w[47], w[46], selector); - w[58] = hc_byte_perm (w[46], w[45], selector); - w[57] = hc_byte_perm (w[45], w[44], selector); - w[56] = hc_byte_perm (w[44], w[43], selector); - w[55] = hc_byte_perm (w[43], w[42], selector); - w[54] = hc_byte_perm (w[42], w[41], selector); - w[53] = hc_byte_perm (w[41], w[40], selector); - w[52] = hc_byte_perm (w[40], w[39], selector); - w[51] = hc_byte_perm (w[39], w[38], selector); - w[50] = hc_byte_perm (w[38], w[37], selector); - w[49] = hc_byte_perm (w[37], w[36], selector); - w[48] = hc_byte_perm (w[36], w[35], selector); - w[47] = hc_byte_perm (w[35], w[34], selector); - w[46] = hc_byte_perm (w[34], w[33], selector); - w[45] = hc_byte_perm (w[33], w[32], selector); - w[44] = hc_byte_perm (w[32], w[31], selector); - w[43] = hc_byte_perm (w[31], w[30], selector); - w[42] = hc_byte_perm (w[30], w[29], selector); - w[41] = hc_byte_perm (w[29], w[28], selector); - w[40] = hc_byte_perm (w[28], w[27], selector); - w[39] = hc_byte_perm (w[27], w[26], selector); - w[38] = hc_byte_perm (w[26], w[25], selector); - w[37] = hc_byte_perm (w[25], w[24], selector); - w[36] = hc_byte_perm (w[24], w[23], selector); - w[35] = hc_byte_perm (w[23], w[22], selector); - w[34] = hc_byte_perm (w[22], w[21], selector); - w[33] = hc_byte_perm (w[21], w[20], selector); - w[32] = hc_byte_perm (w[20], w[19], selector); - w[31] = hc_byte_perm (w[19], w[18], selector); - w[30] = hc_byte_perm (w[18], w[17], selector); - w[29] = hc_byte_perm (w[17], w[16], selector); - w[28] = hc_byte_perm (w[16], w[15], selector); - w[27] = hc_byte_perm (w[15], w[14], selector); - w[26] = hc_byte_perm (w[14], w[13], selector); - w[25] = hc_byte_perm (w[13], w[12], selector); - w[24] = hc_byte_perm (w[12], w[11], selector); - w[23] = hc_byte_perm (w[11], w[10], selector); - w[22] = hc_byte_perm (w[10], w[ 9], selector); - w[21] = 
hc_byte_perm (w[ 9], w[ 8], selector); - w[20] = hc_byte_perm (w[ 8], w[ 7], selector); - w[19] = hc_byte_perm (w[ 7], w[ 6], selector); - w[18] = hc_byte_perm (w[ 6], w[ 5], selector); - w[17] = hc_byte_perm (w[ 5], w[ 4], selector); - w[16] = hc_byte_perm (w[ 4], w[ 3], selector); - w[15] = hc_byte_perm (w[ 3], w[ 2], selector); - w[14] = hc_byte_perm (w[ 2], w[ 1], selector); - w[13] = hc_byte_perm (w[ 1], w[ 0], selector); - w[12] = hc_byte_perm (w[ 0], 0, selector); + case 27: + w[63] = hc_byte_perm (w[36], w[35], selector); + w[62] = hc_byte_perm (w[35], w[34], selector); + w[61] = hc_byte_perm (w[34], w[33], selector); + w[60] = hc_byte_perm (w[33], w[32], selector); + w[59] = hc_byte_perm (w[32], w[31], selector); + w[58] = hc_byte_perm (w[31], w[30], selector); + w[57] = hc_byte_perm (w[30], w[29], selector); + w[56] = hc_byte_perm (w[29], w[28], selector); + w[55] = hc_byte_perm (w[28], w[27], selector); + w[54] = hc_byte_perm (w[27], w[26], selector); + w[53] = hc_byte_perm (w[26], w[25], selector); + w[52] = hc_byte_perm (w[25], w[24], selector); + w[51] = hc_byte_perm (w[24], w[23], selector); + w[50] = hc_byte_perm (w[23], w[22], selector); + w[49] = hc_byte_perm (w[22], w[21], selector); + w[48] = hc_byte_perm (w[21], w[20], selector); + w[47] = hc_byte_perm (w[20], w[19], selector); + w[46] = hc_byte_perm (w[19], w[18], selector); + w[45] = hc_byte_perm (w[18], w[17], selector); + w[44] = hc_byte_perm (w[17], w[16], selector); + w[43] = hc_byte_perm (w[16], w[15], selector); + w[42] = hc_byte_perm (w[15], w[14], selector); + w[41] = hc_byte_perm (w[14], w[13], selector); + w[40] = hc_byte_perm (w[13], w[12], selector); + w[39] = hc_byte_perm (w[12], w[11], selector); + w[38] = hc_byte_perm (w[11], w[10], selector); + w[37] = hc_byte_perm (w[10], w[ 9], selector); + w[36] = hc_byte_perm (w[ 9], w[ 8], selector); + w[35] = hc_byte_perm (w[ 8], w[ 7], selector); + w[34] = hc_byte_perm (w[ 7], w[ 6], selector); + w[33] = hc_byte_perm (w[ 6], w[ 5], 
selector); + w[32] = hc_byte_perm (w[ 5], w[ 4], selector); + w[31] = hc_byte_perm (w[ 4], w[ 3], selector); + w[30] = hc_byte_perm (w[ 3], w[ 2], selector); + w[29] = hc_byte_perm (w[ 2], w[ 1], selector); + w[28] = hc_byte_perm (w[ 1], w[ 0], selector); + w[27] = hc_byte_perm (w[ 0], 0, selector); + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; @@ -27200,58 +31608,58 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; - case 13: - w[63] = hc_byte_perm (w[50], w[49], selector); - w[62] = hc_byte_perm (w[49], w[48], selector); - w[61] = hc_byte_perm (w[48], w[47], selector); - w[60] = hc_byte_perm (w[47], w[46], selector); - w[59] = hc_byte_perm (w[46], w[45], selector); - w[58] = hc_byte_perm (w[45], w[44], selector); - w[57] = hc_byte_perm (w[44], w[43], selector); - w[56] = hc_byte_perm (w[43], w[42], selector); - w[55] = hc_byte_perm (w[42], w[41], selector); - w[54] = hc_byte_perm (w[41], w[40], selector); - w[53] = hc_byte_perm (w[40], w[39], selector); - w[52] = hc_byte_perm (w[39], w[38], selector); - w[51] = hc_byte_perm (w[38], w[37], selector); - w[50] = hc_byte_perm (w[37], w[36], selector); - w[49] = hc_byte_perm (w[36], w[35], selector); - w[48] = hc_byte_perm (w[35], w[34], selector); - w[47] = hc_byte_perm (w[34], w[33], selector); - w[46] = hc_byte_perm (w[33], w[32], selector); - w[45] = hc_byte_perm (w[32], w[31], selector); - w[44] = hc_byte_perm (w[31], w[30], selector); - w[43] = hc_byte_perm (w[30], w[29], selector); - w[42] = hc_byte_perm (w[29], w[28], selector); - w[41] = hc_byte_perm (w[28], w[27], selector); - w[40] = hc_byte_perm (w[27], w[26], selector); - w[39] = hc_byte_perm (w[26], w[25], selector); - w[38] = hc_byte_perm (w[25], w[24], selector); - w[37] = hc_byte_perm (w[24], w[23], selector); - w[36] = hc_byte_perm 
(w[23], w[22], selector); - w[35] = hc_byte_perm (w[22], w[21], selector); - w[34] = hc_byte_perm (w[21], w[20], selector); - w[33] = hc_byte_perm (w[20], w[19], selector); - w[32] = hc_byte_perm (w[19], w[18], selector); - w[31] = hc_byte_perm (w[18], w[17], selector); - w[30] = hc_byte_perm (w[17], w[16], selector); - w[29] = hc_byte_perm (w[16], w[15], selector); - w[28] = hc_byte_perm (w[15], w[14], selector); - w[27] = hc_byte_perm (w[14], w[13], selector); - w[26] = hc_byte_perm (w[13], w[12], selector); - w[25] = hc_byte_perm (w[12], w[11], selector); - w[24] = hc_byte_perm (w[11], w[10], selector); - w[23] = hc_byte_perm (w[10], w[ 9], selector); - w[22] = hc_byte_perm (w[ 9], w[ 8], selector); - w[21] = hc_byte_perm (w[ 8], w[ 7], selector); - w[20] = hc_byte_perm (w[ 7], w[ 6], selector); - w[19] = hc_byte_perm (w[ 6], w[ 5], selector); - w[18] = hc_byte_perm (w[ 5], w[ 4], selector); - w[17] = hc_byte_perm (w[ 4], w[ 3], selector); - w[16] = hc_byte_perm (w[ 3], w[ 2], selector); - w[15] = hc_byte_perm (w[ 2], w[ 1], selector); - w[14] = hc_byte_perm (w[ 1], w[ 0], selector); - w[13] = hc_byte_perm (w[ 0], 0, selector); + case 28: + w[63] = hc_byte_perm (w[35], w[34], selector); + w[62] = hc_byte_perm (w[34], w[33], selector); + w[61] = hc_byte_perm (w[33], w[32], selector); + w[60] = hc_byte_perm (w[32], w[31], selector); + w[59] = hc_byte_perm (w[31], w[30], selector); + w[58] = hc_byte_perm (w[30], w[29], selector); + w[57] = hc_byte_perm (w[29], w[28], selector); + w[56] = hc_byte_perm (w[28], w[27], selector); + w[55] = hc_byte_perm (w[27], w[26], selector); + w[54] = hc_byte_perm (w[26], w[25], selector); + w[53] = hc_byte_perm (w[25], w[24], selector); + w[52] = hc_byte_perm (w[24], w[23], selector); + w[51] = hc_byte_perm (w[23], w[22], selector); + w[50] = hc_byte_perm (w[22], w[21], selector); + w[49] = hc_byte_perm (w[21], w[20], selector); + w[48] = hc_byte_perm (w[20], w[19], selector); + w[47] = hc_byte_perm (w[19], w[18], selector); + 
w[46] = hc_byte_perm (w[18], w[17], selector); + w[45] = hc_byte_perm (w[17], w[16], selector); + w[44] = hc_byte_perm (w[16], w[15], selector); + w[43] = hc_byte_perm (w[15], w[14], selector); + w[42] = hc_byte_perm (w[14], w[13], selector); + w[41] = hc_byte_perm (w[13], w[12], selector); + w[40] = hc_byte_perm (w[12], w[11], selector); + w[39] = hc_byte_perm (w[11], w[10], selector); + w[38] = hc_byte_perm (w[10], w[ 9], selector); + w[37] = hc_byte_perm (w[ 9], w[ 8], selector); + w[36] = hc_byte_perm (w[ 8], w[ 7], selector); + w[35] = hc_byte_perm (w[ 7], w[ 6], selector); + w[34] = hc_byte_perm (w[ 6], w[ 5], selector); + w[33] = hc_byte_perm (w[ 5], w[ 4], selector); + w[32] = hc_byte_perm (w[ 4], w[ 3], selector); + w[31] = hc_byte_perm (w[ 3], w[ 2], selector); + w[30] = hc_byte_perm (w[ 2], w[ 1], selector); + w[29] = hc_byte_perm (w[ 1], w[ 0], selector); + w[28] = hc_byte_perm (w[ 0], 0, selector); + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; @@ -27268,57 +31676,57 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; - case 14: - w[63] = hc_byte_perm (w[49], w[48], selector); - w[62] = hc_byte_perm (w[48], w[47], selector); - w[61] = hc_byte_perm (w[47], w[46], selector); - w[60] = hc_byte_perm (w[46], w[45], selector); - w[59] = hc_byte_perm (w[45], w[44], selector); - w[58] = hc_byte_perm (w[44], w[43], selector); - w[57] = hc_byte_perm (w[43], w[42], selector); - w[56] = hc_byte_perm (w[42], w[41], selector); - w[55] = hc_byte_perm (w[41], w[40], selector); - w[54] = hc_byte_perm (w[40], w[39], selector); - w[53] = hc_byte_perm (w[39], w[38], selector); - w[52] = hc_byte_perm (w[38], w[37], selector); - w[51] = hc_byte_perm (w[37], w[36], selector); - w[50] = hc_byte_perm (w[36], w[35], selector); - w[49] = hc_byte_perm (w[35], w[34], 
selector); - w[48] = hc_byte_perm (w[34], w[33], selector); - w[47] = hc_byte_perm (w[33], w[32], selector); - w[46] = hc_byte_perm (w[32], w[31], selector); - w[45] = hc_byte_perm (w[31], w[30], selector); - w[44] = hc_byte_perm (w[30], w[29], selector); - w[43] = hc_byte_perm (w[29], w[28], selector); - w[42] = hc_byte_perm (w[28], w[27], selector); - w[41] = hc_byte_perm (w[27], w[26], selector); - w[40] = hc_byte_perm (w[26], w[25], selector); - w[39] = hc_byte_perm (w[25], w[24], selector); - w[38] = hc_byte_perm (w[24], w[23], selector); - w[37] = hc_byte_perm (w[23], w[22], selector); - w[36] = hc_byte_perm (w[22], w[21], selector); - w[35] = hc_byte_perm (w[21], w[20], selector); - w[34] = hc_byte_perm (w[20], w[19], selector); - w[33] = hc_byte_perm (w[19], w[18], selector); - w[32] = hc_byte_perm (w[18], w[17], selector); - w[31] = hc_byte_perm (w[17], w[16], selector); - w[30] = hc_byte_perm (w[16], w[15], selector); - w[29] = hc_byte_perm (w[15], w[14], selector); - w[28] = hc_byte_perm (w[14], w[13], selector); - w[27] = hc_byte_perm (w[13], w[12], selector); - w[26] = hc_byte_perm (w[12], w[11], selector); - w[25] = hc_byte_perm (w[11], w[10], selector); - w[24] = hc_byte_perm (w[10], w[ 9], selector); - w[23] = hc_byte_perm (w[ 9], w[ 8], selector); - w[22] = hc_byte_perm (w[ 8], w[ 7], selector); - w[21] = hc_byte_perm (w[ 7], w[ 6], selector); - w[20] = hc_byte_perm (w[ 6], w[ 5], selector); - w[19] = hc_byte_perm (w[ 5], w[ 4], selector); - w[18] = hc_byte_perm (w[ 4], w[ 3], selector); - w[17] = hc_byte_perm (w[ 3], w[ 2], selector); - w[16] = hc_byte_perm (w[ 2], w[ 1], selector); - w[15] = hc_byte_perm (w[ 1], w[ 0], selector); - w[14] = hc_byte_perm (w[ 0], 0, selector); + case 29: + w[63] = hc_byte_perm (w[34], w[33], selector); + w[62] = hc_byte_perm (w[33], w[32], selector); + w[61] = hc_byte_perm (w[32], w[31], selector); + w[60] = hc_byte_perm (w[31], w[30], selector); + w[59] = hc_byte_perm (w[30], w[29], selector); + w[58] = 
hc_byte_perm (w[29], w[28], selector); + w[57] = hc_byte_perm (w[28], w[27], selector); + w[56] = hc_byte_perm (w[27], w[26], selector); + w[55] = hc_byte_perm (w[26], w[25], selector); + w[54] = hc_byte_perm (w[25], w[24], selector); + w[53] = hc_byte_perm (w[24], w[23], selector); + w[52] = hc_byte_perm (w[23], w[22], selector); + w[51] = hc_byte_perm (w[22], w[21], selector); + w[50] = hc_byte_perm (w[21], w[20], selector); + w[49] = hc_byte_perm (w[20], w[19], selector); + w[48] = hc_byte_perm (w[19], w[18], selector); + w[47] = hc_byte_perm (w[18], w[17], selector); + w[46] = hc_byte_perm (w[17], w[16], selector); + w[45] = hc_byte_perm (w[16], w[15], selector); + w[44] = hc_byte_perm (w[15], w[14], selector); + w[43] = hc_byte_perm (w[14], w[13], selector); + w[42] = hc_byte_perm (w[13], w[12], selector); + w[41] = hc_byte_perm (w[12], w[11], selector); + w[40] = hc_byte_perm (w[11], w[10], selector); + w[39] = hc_byte_perm (w[10], w[ 9], selector); + w[38] = hc_byte_perm (w[ 9], w[ 8], selector); + w[37] = hc_byte_perm (w[ 8], w[ 7], selector); + w[36] = hc_byte_perm (w[ 7], w[ 6], selector); + w[35] = hc_byte_perm (w[ 6], w[ 5], selector); + w[34] = hc_byte_perm (w[ 5], w[ 4], selector); + w[33] = hc_byte_perm (w[ 4], w[ 3], selector); + w[32] = hc_byte_perm (w[ 3], w[ 2], selector); + w[31] = hc_byte_perm (w[ 2], w[ 1], selector); + w[30] = hc_byte_perm (w[ 1], w[ 0], selector); + w[29] = hc_byte_perm (w[ 0], 0, selector); + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; @@ -27336,56 +31744,124 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; - case 15: - w[63] = hc_byte_perm (w[48], w[47], selector); - w[62] = hc_byte_perm (w[47], w[46], selector); - w[61] = hc_byte_perm (w[46], w[45], selector); - w[60] = hc_byte_perm (w[45], w[44], selector); 
- w[59] = hc_byte_perm (w[44], w[43], selector); - w[58] = hc_byte_perm (w[43], w[42], selector); - w[57] = hc_byte_perm (w[42], w[41], selector); - w[56] = hc_byte_perm (w[41], w[40], selector); - w[55] = hc_byte_perm (w[40], w[39], selector); - w[54] = hc_byte_perm (w[39], w[38], selector); - w[53] = hc_byte_perm (w[38], w[37], selector); - w[52] = hc_byte_perm (w[37], w[36], selector); - w[51] = hc_byte_perm (w[36], w[35], selector); - w[50] = hc_byte_perm (w[35], w[34], selector); - w[49] = hc_byte_perm (w[34], w[33], selector); - w[48] = hc_byte_perm (w[33], w[32], selector); - w[47] = hc_byte_perm (w[32], w[31], selector); - w[46] = hc_byte_perm (w[31], w[30], selector); - w[45] = hc_byte_perm (w[30], w[29], selector); - w[44] = hc_byte_perm (w[29], w[28], selector); - w[43] = hc_byte_perm (w[28], w[27], selector); - w[42] = hc_byte_perm (w[27], w[26], selector); - w[41] = hc_byte_perm (w[26], w[25], selector); - w[40] = hc_byte_perm (w[25], w[24], selector); - w[39] = hc_byte_perm (w[24], w[23], selector); - w[38] = hc_byte_perm (w[23], w[22], selector); - w[37] = hc_byte_perm (w[22], w[21], selector); - w[36] = hc_byte_perm (w[21], w[20], selector); - w[35] = hc_byte_perm (w[20], w[19], selector); - w[34] = hc_byte_perm (w[19], w[18], selector); - w[33] = hc_byte_perm (w[18], w[17], selector); - w[32] = hc_byte_perm (w[17], w[16], selector); - w[31] = hc_byte_perm (w[16], w[15], selector); - w[30] = hc_byte_perm (w[15], w[14], selector); - w[29] = hc_byte_perm (w[14], w[13], selector); - w[28] = hc_byte_perm (w[13], w[12], selector); - w[27] = hc_byte_perm (w[12], w[11], selector); - w[26] = hc_byte_perm (w[11], w[10], selector); - w[25] = hc_byte_perm (w[10], w[ 9], selector); - w[24] = hc_byte_perm (w[ 9], w[ 8], selector); - w[23] = hc_byte_perm (w[ 8], w[ 7], selector); - w[22] = hc_byte_perm (w[ 7], w[ 6], selector); - w[21] = hc_byte_perm (w[ 6], w[ 5], selector); - w[20] = hc_byte_perm (w[ 5], w[ 4], selector); - w[19] = hc_byte_perm (w[ 4], w[ 3], 
selector); - w[18] = hc_byte_perm (w[ 3], w[ 2], selector); - w[17] = hc_byte_perm (w[ 2], w[ 1], selector); - w[16] = hc_byte_perm (w[ 1], w[ 0], selector); - w[15] = hc_byte_perm (w[ 0], 0, selector); + case 30: + w[63] = hc_byte_perm (w[33], w[32], selector); + w[62] = hc_byte_perm (w[32], w[31], selector); + w[61] = hc_byte_perm (w[31], w[30], selector); + w[60] = hc_byte_perm (w[30], w[29], selector); + w[59] = hc_byte_perm (w[29], w[28], selector); + w[58] = hc_byte_perm (w[28], w[27], selector); + w[57] = hc_byte_perm (w[27], w[26], selector); + w[56] = hc_byte_perm (w[26], w[25], selector); + w[55] = hc_byte_perm (w[25], w[24], selector); + w[54] = hc_byte_perm (w[24], w[23], selector); + w[53] = hc_byte_perm (w[23], w[22], selector); + w[52] = hc_byte_perm (w[22], w[21], selector); + w[51] = hc_byte_perm (w[21], w[20], selector); + w[50] = hc_byte_perm (w[20], w[19], selector); + w[49] = hc_byte_perm (w[19], w[18], selector); + w[48] = hc_byte_perm (w[18], w[17], selector); + w[47] = hc_byte_perm (w[17], w[16], selector); + w[46] = hc_byte_perm (w[16], w[15], selector); + w[45] = hc_byte_perm (w[15], w[14], selector); + w[44] = hc_byte_perm (w[14], w[13], selector); + w[43] = hc_byte_perm (w[13], w[12], selector); + w[42] = hc_byte_perm (w[12], w[11], selector); + w[41] = hc_byte_perm (w[11], w[10], selector); + w[40] = hc_byte_perm (w[10], w[ 9], selector); + w[39] = hc_byte_perm (w[ 9], w[ 8], selector); + w[38] = hc_byte_perm (w[ 8], w[ 7], selector); + w[37] = hc_byte_perm (w[ 7], w[ 6], selector); + w[36] = hc_byte_perm (w[ 6], w[ 5], selector); + w[35] = hc_byte_perm (w[ 5], w[ 4], selector); + w[34] = hc_byte_perm (w[ 4], w[ 3], selector); + w[33] = hc_byte_perm (w[ 3], w[ 2], selector); + w[32] = hc_byte_perm (w[ 2], w[ 1], selector); + w[31] = hc_byte_perm (w[ 1], w[ 0], selector); + w[30] = hc_byte_perm (w[ 0], 0, selector); + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + 
w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 31: + w[63] = hc_byte_perm (w[32], w[31], selector); + w[62] = hc_byte_perm (w[31], w[30], selector); + w[61] = hc_byte_perm (w[30], w[29], selector); + w[60] = hc_byte_perm (w[29], w[28], selector); + w[59] = hc_byte_perm (w[28], w[27], selector); + w[58] = hc_byte_perm (w[27], w[26], selector); + w[57] = hc_byte_perm (w[26], w[25], selector); + w[56] = hc_byte_perm (w[25], w[24], selector); + w[55] = hc_byte_perm (w[24], w[23], selector); + w[54] = hc_byte_perm (w[23], w[22], selector); + w[53] = hc_byte_perm (w[22], w[21], selector); + w[52] = hc_byte_perm (w[21], w[20], selector); + w[51] = hc_byte_perm (w[20], w[19], selector); + w[50] = hc_byte_perm (w[19], w[18], selector); + w[49] = hc_byte_perm (w[18], w[17], selector); + w[48] = hc_byte_perm (w[17], w[16], selector); + w[47] = hc_byte_perm (w[16], w[15], selector); + w[46] = hc_byte_perm (w[15], w[14], selector); + w[45] = hc_byte_perm (w[14], w[13], selector); + w[44] = hc_byte_perm (w[13], w[12], selector); + w[43] = hc_byte_perm (w[12], w[11], selector); + w[42] = hc_byte_perm (w[11], w[10], selector); + w[41] = hc_byte_perm (w[10], w[ 9], selector); + w[40] = hc_byte_perm (w[ 9], w[ 8], selector); + w[39] = hc_byte_perm (w[ 8], w[ 7], selector); + w[38] = hc_byte_perm (w[ 7], w[ 6], selector); + w[37] = hc_byte_perm (w[ 6], w[ 5], selector); + w[36] = hc_byte_perm (w[ 5], w[ 4], selector); + w[35] = hc_byte_perm (w[ 4], w[ 3], selector); + w[34] = hc_byte_perm (w[ 3], w[ 2], selector); + w[33] = hc_byte_perm (w[ 2], w[ 1], selector); + w[32] = hc_byte_perm (w[ 1], w[ 0], selector); + w[31] = hc_byte_perm (w[ 0], 0, selector); + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; 
+ w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; @@ -27404,55 +31880,55 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; - case 16: - w[63] = hc_byte_perm (w[47], w[46], selector); - w[62] = hc_byte_perm (w[46], w[45], selector); - w[61] = hc_byte_perm (w[45], w[44], selector); - w[60] = hc_byte_perm (w[44], w[43], selector); - w[59] = hc_byte_perm (w[43], w[42], selector); - w[58] = hc_byte_perm (w[42], w[41], selector); - w[57] = hc_byte_perm (w[41], w[40], selector); - w[56] = hc_byte_perm (w[40], w[39], selector); - w[55] = hc_byte_perm (w[39], w[38], selector); - w[54] = hc_byte_perm (w[38], w[37], selector); - w[53] = hc_byte_perm (w[37], w[36], selector); - w[52] = hc_byte_perm (w[36], w[35], selector); - w[51] = hc_byte_perm (w[35], w[34], selector); - w[50] = hc_byte_perm (w[34], w[33], selector); - w[49] = hc_byte_perm (w[33], w[32], selector); - w[48] = hc_byte_perm (w[32], w[31], selector); - w[47] = hc_byte_perm (w[31], w[30], selector); - w[46] = hc_byte_perm (w[30], w[29], selector); - w[45] = hc_byte_perm (w[29], w[28], selector); - w[44] = hc_byte_perm (w[28], w[27], selector); - w[43] = hc_byte_perm (w[27], w[26], selector); - w[42] = hc_byte_perm (w[26], w[25], selector); - w[41] = hc_byte_perm (w[25], w[24], selector); - w[40] = hc_byte_perm (w[24], w[23], selector); - w[39] = hc_byte_perm (w[23], w[22], selector); - w[38] = hc_byte_perm (w[22], w[21], selector); - w[37] = hc_byte_perm (w[21], w[20], selector); - w[36] = hc_byte_perm (w[20], w[19], selector); - w[35] = hc_byte_perm (w[19], w[18], selector); - w[34] = hc_byte_perm (w[18], w[17], selector); - w[33] = hc_byte_perm (w[17], w[16], selector); - w[32] = hc_byte_perm (w[16], w[15], selector); - w[31] = hc_byte_perm (w[15], w[14], selector); - w[30] = hc_byte_perm (w[14], w[13], selector); - w[29] = hc_byte_perm (w[13], w[12], selector); - w[28] = 
hc_byte_perm (w[12], w[11], selector); - w[27] = hc_byte_perm (w[11], w[10], selector); - w[26] = hc_byte_perm (w[10], w[ 9], selector); - w[25] = hc_byte_perm (w[ 9], w[ 8], selector); - w[24] = hc_byte_perm (w[ 8], w[ 7], selector); - w[23] = hc_byte_perm (w[ 7], w[ 6], selector); - w[22] = hc_byte_perm (w[ 6], w[ 5], selector); - w[21] = hc_byte_perm (w[ 5], w[ 4], selector); - w[20] = hc_byte_perm (w[ 4], w[ 3], selector); - w[19] = hc_byte_perm (w[ 3], w[ 2], selector); - w[18] = hc_byte_perm (w[ 2], w[ 1], selector); - w[17] = hc_byte_perm (w[ 1], w[ 0], selector); - w[16] = hc_byte_perm (w[ 0], 0, selector); + case 32: + w[63] = hc_byte_perm (w[31], w[30], selector); + w[62] = hc_byte_perm (w[30], w[29], selector); + w[61] = hc_byte_perm (w[29], w[28], selector); + w[60] = hc_byte_perm (w[28], w[27], selector); + w[59] = hc_byte_perm (w[27], w[26], selector); + w[58] = hc_byte_perm (w[26], w[25], selector); + w[57] = hc_byte_perm (w[25], w[24], selector); + w[56] = hc_byte_perm (w[24], w[23], selector); + w[55] = hc_byte_perm (w[23], w[22], selector); + w[54] = hc_byte_perm (w[22], w[21], selector); + w[53] = hc_byte_perm (w[21], w[20], selector); + w[52] = hc_byte_perm (w[20], w[19], selector); + w[51] = hc_byte_perm (w[19], w[18], selector); + w[50] = hc_byte_perm (w[18], w[17], selector); + w[49] = hc_byte_perm (w[17], w[16], selector); + w[48] = hc_byte_perm (w[16], w[15], selector); + w[47] = hc_byte_perm (w[15], w[14], selector); + w[46] = hc_byte_perm (w[14], w[13], selector); + w[45] = hc_byte_perm (w[13], w[12], selector); + w[44] = hc_byte_perm (w[12], w[11], selector); + w[43] = hc_byte_perm (w[11], w[10], selector); + w[42] = hc_byte_perm (w[10], w[ 9], selector); + w[41] = hc_byte_perm (w[ 9], w[ 8], selector); + w[40] = hc_byte_perm (w[ 8], w[ 7], selector); + w[39] = hc_byte_perm (w[ 7], w[ 6], selector); + w[38] = hc_byte_perm (w[ 6], w[ 5], selector); + w[37] = hc_byte_perm (w[ 5], w[ 4], selector); + w[36] = hc_byte_perm (w[ 4], w[ 3], 
selector); + w[35] = hc_byte_perm (w[ 3], w[ 2], selector); + w[34] = hc_byte_perm (w[ 2], w[ 1], selector); + w[33] = hc_byte_perm (w[ 1], w[ 0], selector); + w[32] = hc_byte_perm (w[ 0], 0, selector); + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; @@ -27472,54 +31948,122 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; - case 17: - w[63] = hc_byte_perm (w[46], w[45], selector); - w[62] = hc_byte_perm (w[45], w[44], selector); - w[61] = hc_byte_perm (w[44], w[43], selector); - w[60] = hc_byte_perm (w[43], w[42], selector); - w[59] = hc_byte_perm (w[42], w[41], selector); - w[58] = hc_byte_perm (w[41], w[40], selector); - w[57] = hc_byte_perm (w[40], w[39], selector); - w[56] = hc_byte_perm (w[39], w[38], selector); - w[55] = hc_byte_perm (w[38], w[37], selector); - w[54] = hc_byte_perm (w[37], w[36], selector); - w[53] = hc_byte_perm (w[36], w[35], selector); - w[52] = hc_byte_perm (w[35], w[34], selector); - w[51] = hc_byte_perm (w[34], w[33], selector); - w[50] = hc_byte_perm (w[33], w[32], selector); - w[49] = hc_byte_perm (w[32], w[31], selector); - w[48] = hc_byte_perm (w[31], w[30], selector); - w[47] = hc_byte_perm (w[30], w[29], selector); - w[46] = hc_byte_perm (w[29], w[28], selector); - w[45] = hc_byte_perm (w[28], w[27], selector); - w[44] = hc_byte_perm (w[27], w[26], selector); - w[43] = hc_byte_perm (w[26], w[25], selector); - w[42] = hc_byte_perm (w[25], w[24], selector); - w[41] = hc_byte_perm (w[24], w[23], selector); - w[40] = hc_byte_perm (w[23], w[22], selector); - w[39] = hc_byte_perm (w[22], w[21], selector); - w[38] = hc_byte_perm (w[21], w[20], selector); - w[37] = hc_byte_perm (w[20], w[19], selector); - w[36] = hc_byte_perm (w[19], w[18], selector); - w[35] = hc_byte_perm (w[18], w[17], selector); - w[34] = 
hc_byte_perm (w[17], w[16], selector); - w[33] = hc_byte_perm (w[16], w[15], selector); - w[32] = hc_byte_perm (w[15], w[14], selector); - w[31] = hc_byte_perm (w[14], w[13], selector); - w[30] = hc_byte_perm (w[13], w[12], selector); - w[29] = hc_byte_perm (w[12], w[11], selector); - w[28] = hc_byte_perm (w[11], w[10], selector); - w[27] = hc_byte_perm (w[10], w[ 9], selector); - w[26] = hc_byte_perm (w[ 9], w[ 8], selector); - w[25] = hc_byte_perm (w[ 8], w[ 7], selector); - w[24] = hc_byte_perm (w[ 7], w[ 6], selector); - w[23] = hc_byte_perm (w[ 6], w[ 5], selector); - w[22] = hc_byte_perm (w[ 5], w[ 4], selector); - w[21] = hc_byte_perm (w[ 4], w[ 3], selector); - w[20] = hc_byte_perm (w[ 3], w[ 2], selector); - w[19] = hc_byte_perm (w[ 2], w[ 1], selector); - w[18] = hc_byte_perm (w[ 1], w[ 0], selector); - w[17] = hc_byte_perm (w[ 0], 0, selector); + case 33: + w[63] = hc_byte_perm (w[30], w[29], selector); + w[62] = hc_byte_perm (w[29], w[28], selector); + w[61] = hc_byte_perm (w[28], w[27], selector); + w[60] = hc_byte_perm (w[27], w[26], selector); + w[59] = hc_byte_perm (w[26], w[25], selector); + w[58] = hc_byte_perm (w[25], w[24], selector); + w[57] = hc_byte_perm (w[24], w[23], selector); + w[56] = hc_byte_perm (w[23], w[22], selector); + w[55] = hc_byte_perm (w[22], w[21], selector); + w[54] = hc_byte_perm (w[21], w[20], selector); + w[53] = hc_byte_perm (w[20], w[19], selector); + w[52] = hc_byte_perm (w[19], w[18], selector); + w[51] = hc_byte_perm (w[18], w[17], selector); + w[50] = hc_byte_perm (w[17], w[16], selector); + w[49] = hc_byte_perm (w[16], w[15], selector); + w[48] = hc_byte_perm (w[15], w[14], selector); + w[47] = hc_byte_perm (w[14], w[13], selector); + w[46] = hc_byte_perm (w[13], w[12], selector); + w[45] = hc_byte_perm (w[12], w[11], selector); + w[44] = hc_byte_perm (w[11], w[10], selector); + w[43] = hc_byte_perm (w[10], w[ 9], selector); + w[42] = hc_byte_perm (w[ 9], w[ 8], selector); + w[41] = hc_byte_perm (w[ 8], w[ 7], 
selector); + w[40] = hc_byte_perm (w[ 7], w[ 6], selector); + w[39] = hc_byte_perm (w[ 6], w[ 5], selector); + w[38] = hc_byte_perm (w[ 5], w[ 4], selector); + w[37] = hc_byte_perm (w[ 4], w[ 3], selector); + w[36] = hc_byte_perm (w[ 3], w[ 2], selector); + w[35] = hc_byte_perm (w[ 2], w[ 1], selector); + w[34] = hc_byte_perm (w[ 1], w[ 0], selector); + w[33] = hc_byte_perm (w[ 0], 0, selector); + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 34: + w[63] = hc_byte_perm (w[29], w[28], selector); + w[62] = hc_byte_perm (w[28], w[27], selector); + w[61] = hc_byte_perm (w[27], w[26], selector); + w[60] = hc_byte_perm (w[26], w[25], selector); + w[59] = hc_byte_perm (w[25], w[24], selector); + w[58] = hc_byte_perm (w[24], w[23], selector); + w[57] = hc_byte_perm (w[23], w[22], selector); + w[56] = hc_byte_perm (w[22], w[21], selector); + w[55] = hc_byte_perm (w[21], w[20], selector); + w[54] = hc_byte_perm (w[20], w[19], selector); + w[53] = hc_byte_perm (w[19], w[18], selector); + w[52] = hc_byte_perm (w[18], w[17], selector); + w[51] = hc_byte_perm (w[17], w[16], selector); + w[50] = hc_byte_perm (w[16], w[15], selector); + w[49] = hc_byte_perm (w[15], w[14], selector); + w[48] = hc_byte_perm (w[14], w[13], selector); + w[47] = hc_byte_perm (w[13], w[12], selector); + w[46] = hc_byte_perm (w[12], w[11], selector); + w[45] = hc_byte_perm (w[11], w[10], selector); + w[44] = hc_byte_perm (w[10], w[ 9], selector); + w[43] = hc_byte_perm (w[ 9], w[ 8], selector); + w[42] = hc_byte_perm (w[ 8], w[ 7], selector); + w[41] = hc_byte_perm (w[ 7], w[ 6], selector); + w[40] = 
hc_byte_perm (w[ 6], w[ 5], selector); + w[39] = hc_byte_perm (w[ 5], w[ 4], selector); + w[38] = hc_byte_perm (w[ 4], w[ 3], selector); + w[37] = hc_byte_perm (w[ 3], w[ 2], selector); + w[36] = hc_byte_perm (w[ 2], w[ 1], selector); + w[35] = hc_byte_perm (w[ 1], w[ 0], selector); + w[34] = hc_byte_perm (w[ 0], 0, selector); + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; @@ -27540,53 +32084,53 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; - case 18: - w[63] = hc_byte_perm (w[45], w[44], selector); - w[62] = hc_byte_perm (w[44], w[43], selector); - w[61] = hc_byte_perm (w[43], w[42], selector); - w[60] = hc_byte_perm (w[42], w[41], selector); - w[59] = hc_byte_perm (w[41], w[40], selector); - w[58] = hc_byte_perm (w[40], w[39], selector); - w[57] = hc_byte_perm (w[39], w[38], selector); - w[56] = hc_byte_perm (w[38], w[37], selector); - w[55] = hc_byte_perm (w[37], w[36], selector); - w[54] = hc_byte_perm (w[36], w[35], selector); - w[53] = hc_byte_perm (w[35], w[34], selector); - w[52] = hc_byte_perm (w[34], w[33], selector); - w[51] = hc_byte_perm (w[33], w[32], selector); - w[50] = hc_byte_perm (w[32], w[31], selector); - w[49] = hc_byte_perm (w[31], w[30], selector); - w[48] = hc_byte_perm (w[30], w[29], selector); - w[47] = hc_byte_perm (w[29], w[28], selector); - w[46] = hc_byte_perm (w[28], w[27], selector); - w[45] = hc_byte_perm (w[27], w[26], selector); - w[44] = hc_byte_perm (w[26], w[25], selector); - w[43] = hc_byte_perm (w[25], w[24], selector); - w[42] = hc_byte_perm (w[24], w[23], selector); - w[41] = hc_byte_perm (w[23], w[22], selector); - w[40] = hc_byte_perm (w[22], w[21], selector); - w[39] = hc_byte_perm (w[21], w[20], selector); - w[38] = hc_byte_perm (w[20], w[19], selector); - w[37] = hc_byte_perm 
(w[19], w[18], selector); - w[36] = hc_byte_perm (w[18], w[17], selector); - w[35] = hc_byte_perm (w[17], w[16], selector); - w[34] = hc_byte_perm (w[16], w[15], selector); - w[33] = hc_byte_perm (w[15], w[14], selector); - w[32] = hc_byte_perm (w[14], w[13], selector); - w[31] = hc_byte_perm (w[13], w[12], selector); - w[30] = hc_byte_perm (w[12], w[11], selector); - w[29] = hc_byte_perm (w[11], w[10], selector); - w[28] = hc_byte_perm (w[10], w[ 9], selector); - w[27] = hc_byte_perm (w[ 9], w[ 8], selector); - w[26] = hc_byte_perm (w[ 8], w[ 7], selector); - w[25] = hc_byte_perm (w[ 7], w[ 6], selector); - w[24] = hc_byte_perm (w[ 6], w[ 5], selector); - w[23] = hc_byte_perm (w[ 5], w[ 4], selector); - w[22] = hc_byte_perm (w[ 4], w[ 3], selector); - w[21] = hc_byte_perm (w[ 3], w[ 2], selector); - w[20] = hc_byte_perm (w[ 2], w[ 1], selector); - w[19] = hc_byte_perm (w[ 1], w[ 0], selector); - w[18] = hc_byte_perm (w[ 0], 0, selector); + case 35: + w[63] = hc_byte_perm (w[28], w[27], selector); + w[62] = hc_byte_perm (w[27], w[26], selector); + w[61] = hc_byte_perm (w[26], w[25], selector); + w[60] = hc_byte_perm (w[25], w[24], selector); + w[59] = hc_byte_perm (w[24], w[23], selector); + w[58] = hc_byte_perm (w[23], w[22], selector); + w[57] = hc_byte_perm (w[22], w[21], selector); + w[56] = hc_byte_perm (w[21], w[20], selector); + w[55] = hc_byte_perm (w[20], w[19], selector); + w[54] = hc_byte_perm (w[19], w[18], selector); + w[53] = hc_byte_perm (w[18], w[17], selector); + w[52] = hc_byte_perm (w[17], w[16], selector); + w[51] = hc_byte_perm (w[16], w[15], selector); + w[50] = hc_byte_perm (w[15], w[14], selector); + w[49] = hc_byte_perm (w[14], w[13], selector); + w[48] = hc_byte_perm (w[13], w[12], selector); + w[47] = hc_byte_perm (w[12], w[11], selector); + w[46] = hc_byte_perm (w[11], w[10], selector); + w[45] = hc_byte_perm (w[10], w[ 9], selector); + w[44] = hc_byte_perm (w[ 9], w[ 8], selector); + w[43] = hc_byte_perm (w[ 8], w[ 7], selector); + 
w[42] = hc_byte_perm (w[ 7], w[ 6], selector); + w[41] = hc_byte_perm (w[ 6], w[ 5], selector); + w[40] = hc_byte_perm (w[ 5], w[ 4], selector); + w[39] = hc_byte_perm (w[ 4], w[ 3], selector); + w[38] = hc_byte_perm (w[ 3], w[ 2], selector); + w[37] = hc_byte_perm (w[ 2], w[ 1], selector); + w[36] = hc_byte_perm (w[ 1], w[ 0], selector); + w[35] = hc_byte_perm (w[ 0], 0, selector); + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; @@ -27608,52 +32152,120 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; - case 19: - w[63] = hc_byte_perm (w[44], w[43], selector); - w[62] = hc_byte_perm (w[43], w[42], selector); - w[61] = hc_byte_perm (w[42], w[41], selector); - w[60] = hc_byte_perm (w[41], w[40], selector); - w[59] = hc_byte_perm (w[40], w[39], selector); - w[58] = hc_byte_perm (w[39], w[38], selector); - w[57] = hc_byte_perm (w[38], w[37], selector); - w[56] = hc_byte_perm (w[37], w[36], selector); - w[55] = hc_byte_perm (w[36], w[35], selector); - w[54] = hc_byte_perm (w[35], w[34], selector); - w[53] = hc_byte_perm (w[34], w[33], selector); - w[52] = hc_byte_perm (w[33], w[32], selector); - w[51] = hc_byte_perm (w[32], w[31], selector); - w[50] = hc_byte_perm (w[31], w[30], selector); - w[49] = hc_byte_perm (w[30], w[29], selector); - w[48] = hc_byte_perm (w[29], w[28], selector); - w[47] = hc_byte_perm (w[28], w[27], selector); - w[46] = hc_byte_perm (w[27], w[26], selector); - w[45] = hc_byte_perm (w[26], w[25], selector); - w[44] = hc_byte_perm (w[25], w[24], selector); - w[43] = hc_byte_perm (w[24], w[23], selector); - w[42] = hc_byte_perm (w[23], w[22], selector); - w[41] = hc_byte_perm (w[22], w[21], selector); - w[40] = hc_byte_perm (w[21], w[20], selector); - w[39] = hc_byte_perm (w[20], w[19], selector); - w[38] = 
hc_byte_perm (w[19], w[18], selector); - w[37] = hc_byte_perm (w[18], w[17], selector); - w[36] = hc_byte_perm (w[17], w[16], selector); - w[35] = hc_byte_perm (w[16], w[15], selector); - w[34] = hc_byte_perm (w[15], w[14], selector); - w[33] = hc_byte_perm (w[14], w[13], selector); - w[32] = hc_byte_perm (w[13], w[12], selector); - w[31] = hc_byte_perm (w[12], w[11], selector); - w[30] = hc_byte_perm (w[11], w[10], selector); - w[29] = hc_byte_perm (w[10], w[ 9], selector); - w[28] = hc_byte_perm (w[ 9], w[ 8], selector); - w[27] = hc_byte_perm (w[ 8], w[ 7], selector); - w[26] = hc_byte_perm (w[ 7], w[ 6], selector); - w[25] = hc_byte_perm (w[ 6], w[ 5], selector); - w[24] = hc_byte_perm (w[ 5], w[ 4], selector); - w[23] = hc_byte_perm (w[ 4], w[ 3], selector); - w[22] = hc_byte_perm (w[ 3], w[ 2], selector); - w[21] = hc_byte_perm (w[ 2], w[ 1], selector); - w[20] = hc_byte_perm (w[ 1], w[ 0], selector); - w[19] = hc_byte_perm (w[ 0], 0, selector); + case 36: + w[63] = hc_byte_perm (w[27], w[26], selector); + w[62] = hc_byte_perm (w[26], w[25], selector); + w[61] = hc_byte_perm (w[25], w[24], selector); + w[60] = hc_byte_perm (w[24], w[23], selector); + w[59] = hc_byte_perm (w[23], w[22], selector); + w[58] = hc_byte_perm (w[22], w[21], selector); + w[57] = hc_byte_perm (w[21], w[20], selector); + w[56] = hc_byte_perm (w[20], w[19], selector); + w[55] = hc_byte_perm (w[19], w[18], selector); + w[54] = hc_byte_perm (w[18], w[17], selector); + w[53] = hc_byte_perm (w[17], w[16], selector); + w[52] = hc_byte_perm (w[16], w[15], selector); + w[51] = hc_byte_perm (w[15], w[14], selector); + w[50] = hc_byte_perm (w[14], w[13], selector); + w[49] = hc_byte_perm (w[13], w[12], selector); + w[48] = hc_byte_perm (w[12], w[11], selector); + w[47] = hc_byte_perm (w[11], w[10], selector); + w[46] = hc_byte_perm (w[10], w[ 9], selector); + w[45] = hc_byte_perm (w[ 9], w[ 8], selector); + w[44] = hc_byte_perm (w[ 8], w[ 7], selector); + w[43] = hc_byte_perm (w[ 7], w[ 6], 
selector); + w[42] = hc_byte_perm (w[ 6], w[ 5], selector); + w[41] = hc_byte_perm (w[ 5], w[ 4], selector); + w[40] = hc_byte_perm (w[ 4], w[ 3], selector); + w[39] = hc_byte_perm (w[ 3], w[ 2], selector); + w[38] = hc_byte_perm (w[ 2], w[ 1], selector); + w[37] = hc_byte_perm (w[ 1], w[ 0], selector); + w[36] = hc_byte_perm (w[ 0], 0, selector); + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 37: + w[63] = hc_byte_perm (w[26], w[25], selector); + w[62] = hc_byte_perm (w[25], w[24], selector); + w[61] = hc_byte_perm (w[24], w[23], selector); + w[60] = hc_byte_perm (w[23], w[22], selector); + w[59] = hc_byte_perm (w[22], w[21], selector); + w[58] = hc_byte_perm (w[21], w[20], selector); + w[57] = hc_byte_perm (w[20], w[19], selector); + w[56] = hc_byte_perm (w[19], w[18], selector); + w[55] = hc_byte_perm (w[18], w[17], selector); + w[54] = hc_byte_perm (w[17], w[16], selector); + w[53] = hc_byte_perm (w[16], w[15], selector); + w[52] = hc_byte_perm (w[15], w[14], selector); + w[51] = hc_byte_perm (w[14], w[13], selector); + w[50] = hc_byte_perm (w[13], w[12], selector); + w[49] = hc_byte_perm (w[12], w[11], selector); + w[48] = hc_byte_perm (w[11], w[10], selector); + w[47] = hc_byte_perm (w[10], w[ 9], selector); + w[46] = hc_byte_perm (w[ 9], w[ 8], selector); + w[45] = hc_byte_perm (w[ 8], w[ 7], selector); + w[44] = hc_byte_perm (w[ 7], w[ 6], selector); + w[43] = hc_byte_perm (w[ 6], w[ 5], selector); + w[42] = hc_byte_perm (w[ 5], w[ 4], selector); + w[41] = hc_byte_perm (w[ 4], w[ 3], selector); + w[40] = hc_byte_perm (w[ 3], 
w[ 2], selector); + w[39] = hc_byte_perm (w[ 2], w[ 1], selector); + w[38] = hc_byte_perm (w[ 1], w[ 0], selector); + w[37] = hc_byte_perm (w[ 0], 0, selector); + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; @@ -27676,51 +32288,51 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; - case 20: - w[63] = hc_byte_perm (w[43], w[42], selector); - w[62] = hc_byte_perm (w[42], w[41], selector); - w[61] = hc_byte_perm (w[41], w[40], selector); - w[60] = hc_byte_perm (w[40], w[39], selector); - w[59] = hc_byte_perm (w[39], w[38], selector); - w[58] = hc_byte_perm (w[38], w[37], selector); - w[57] = hc_byte_perm (w[37], w[36], selector); - w[56] = hc_byte_perm (w[36], w[35], selector); - w[55] = hc_byte_perm (w[35], w[34], selector); - w[54] = hc_byte_perm (w[34], w[33], selector); - w[53] = hc_byte_perm (w[33], w[32], selector); - w[52] = hc_byte_perm (w[32], w[31], selector); - w[51] = hc_byte_perm (w[31], w[30], selector); - w[50] = hc_byte_perm (w[30], w[29], selector); - w[49] = hc_byte_perm (w[29], w[28], selector); - w[48] = hc_byte_perm (w[28], w[27], selector); - w[47] = hc_byte_perm (w[27], w[26], selector); - w[46] = hc_byte_perm (w[26], w[25], selector); - w[45] = hc_byte_perm (w[25], w[24], selector); - w[44] = hc_byte_perm (w[24], w[23], selector); - w[43] = hc_byte_perm (w[23], w[22], selector); - w[42] = hc_byte_perm (w[22], w[21], selector); - w[41] = hc_byte_perm (w[21], w[20], selector); - w[40] = hc_byte_perm (w[20], w[19], selector); - w[39] = hc_byte_perm (w[19], w[18], selector); - w[38] = hc_byte_perm (w[18], w[17], selector); - w[37] = hc_byte_perm (w[17], w[16], selector); - w[36] = hc_byte_perm (w[16], w[15], selector); - w[35] = hc_byte_perm (w[15], w[14], selector); - w[34] = hc_byte_perm (w[14], 
w[13], selector); - w[33] = hc_byte_perm (w[13], w[12], selector); - w[32] = hc_byte_perm (w[12], w[11], selector); - w[31] = hc_byte_perm (w[11], w[10], selector); - w[30] = hc_byte_perm (w[10], w[ 9], selector); - w[29] = hc_byte_perm (w[ 9], w[ 8], selector); - w[28] = hc_byte_perm (w[ 8], w[ 7], selector); - w[27] = hc_byte_perm (w[ 7], w[ 6], selector); - w[26] = hc_byte_perm (w[ 6], w[ 5], selector); - w[25] = hc_byte_perm (w[ 5], w[ 4], selector); - w[24] = hc_byte_perm (w[ 4], w[ 3], selector); - w[23] = hc_byte_perm (w[ 3], w[ 2], selector); - w[22] = hc_byte_perm (w[ 2], w[ 1], selector); - w[21] = hc_byte_perm (w[ 1], w[ 0], selector); - w[20] = hc_byte_perm (w[ 0], 0, selector); + case 38: + w[63] = hc_byte_perm (w[25], w[24], selector); + w[62] = hc_byte_perm (w[24], w[23], selector); + w[61] = hc_byte_perm (w[23], w[22], selector); + w[60] = hc_byte_perm (w[22], w[21], selector); + w[59] = hc_byte_perm (w[21], w[20], selector); + w[58] = hc_byte_perm (w[20], w[19], selector); + w[57] = hc_byte_perm (w[19], w[18], selector); + w[56] = hc_byte_perm (w[18], w[17], selector); + w[55] = hc_byte_perm (w[17], w[16], selector); + w[54] = hc_byte_perm (w[16], w[15], selector); + w[53] = hc_byte_perm (w[15], w[14], selector); + w[52] = hc_byte_perm (w[14], w[13], selector); + w[51] = hc_byte_perm (w[13], w[12], selector); + w[50] = hc_byte_perm (w[12], w[11], selector); + w[49] = hc_byte_perm (w[11], w[10], selector); + w[48] = hc_byte_perm (w[10], w[ 9], selector); + w[47] = hc_byte_perm (w[ 9], w[ 8], selector); + w[46] = hc_byte_perm (w[ 8], w[ 7], selector); + w[45] = hc_byte_perm (w[ 7], w[ 6], selector); + w[44] = hc_byte_perm (w[ 6], w[ 5], selector); + w[43] = hc_byte_perm (w[ 5], w[ 4], selector); + w[42] = hc_byte_perm (w[ 4], w[ 3], selector); + w[41] = hc_byte_perm (w[ 3], w[ 2], selector); + w[40] = hc_byte_perm (w[ 2], w[ 1], selector); + w[39] = hc_byte_perm (w[ 1], w[ 0], selector); + w[38] = hc_byte_perm (w[ 0], 0, selector); + w[37] = 0; + 
w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; @@ -27744,50 +32356,118 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; - case 21: - w[63] = hc_byte_perm (w[42], w[41], selector); - w[62] = hc_byte_perm (w[41], w[40], selector); - w[61] = hc_byte_perm (w[40], w[39], selector); - w[60] = hc_byte_perm (w[39], w[38], selector); - w[59] = hc_byte_perm (w[38], w[37], selector); - w[58] = hc_byte_perm (w[37], w[36], selector); - w[57] = hc_byte_perm (w[36], w[35], selector); - w[56] = hc_byte_perm (w[35], w[34], selector); - w[55] = hc_byte_perm (w[34], w[33], selector); - w[54] = hc_byte_perm (w[33], w[32], selector); - w[53] = hc_byte_perm (w[32], w[31], selector); - w[52] = hc_byte_perm (w[31], w[30], selector); - w[51] = hc_byte_perm (w[30], w[29], selector); - w[50] = hc_byte_perm (w[29], w[28], selector); - w[49] = hc_byte_perm (w[28], w[27], selector); - w[48] = hc_byte_perm (w[27], w[26], selector); - w[47] = hc_byte_perm (w[26], w[25], selector); - w[46] = hc_byte_perm (w[25], w[24], selector); - w[45] = hc_byte_perm (w[24], w[23], selector); - w[44] = hc_byte_perm (w[23], w[22], selector); - w[43] = hc_byte_perm (w[22], w[21], selector); - w[42] = hc_byte_perm (w[21], w[20], selector); - w[41] = hc_byte_perm (w[20], w[19], selector); - w[40] = hc_byte_perm (w[19], w[18], selector); - w[39] = hc_byte_perm (w[18], w[17], selector); - w[38] = hc_byte_perm (w[17], w[16], selector); - w[37] = hc_byte_perm (w[16], w[15], selector); - w[36] = hc_byte_perm (w[15], w[14], selector); - w[35] = hc_byte_perm (w[14], w[13], selector); - w[34] = hc_byte_perm (w[13], w[12], selector); - w[33] = hc_byte_perm (w[12], w[11], selector); - w[32] = hc_byte_perm (w[11], w[10], selector); - w[31] = hc_byte_perm (w[10], w[ 9], selector); - w[30] = 
hc_byte_perm (w[ 9], w[ 8], selector); - w[29] = hc_byte_perm (w[ 8], w[ 7], selector); - w[28] = hc_byte_perm (w[ 7], w[ 6], selector); - w[27] = hc_byte_perm (w[ 6], w[ 5], selector); - w[26] = hc_byte_perm (w[ 5], w[ 4], selector); - w[25] = hc_byte_perm (w[ 4], w[ 3], selector); - w[24] = hc_byte_perm (w[ 3], w[ 2], selector); - w[23] = hc_byte_perm (w[ 2], w[ 1], selector); - w[22] = hc_byte_perm (w[ 1], w[ 0], selector); - w[21] = hc_byte_perm (w[ 0], 0, selector); + case 39: + w[63] = hc_byte_perm (w[24], w[23], selector); + w[62] = hc_byte_perm (w[23], w[22], selector); + w[61] = hc_byte_perm (w[22], w[21], selector); + w[60] = hc_byte_perm (w[21], w[20], selector); + w[59] = hc_byte_perm (w[20], w[19], selector); + w[58] = hc_byte_perm (w[19], w[18], selector); + w[57] = hc_byte_perm (w[18], w[17], selector); + w[56] = hc_byte_perm (w[17], w[16], selector); + w[55] = hc_byte_perm (w[16], w[15], selector); + w[54] = hc_byte_perm (w[15], w[14], selector); + w[53] = hc_byte_perm (w[14], w[13], selector); + w[52] = hc_byte_perm (w[13], w[12], selector); + w[51] = hc_byte_perm (w[12], w[11], selector); + w[50] = hc_byte_perm (w[11], w[10], selector); + w[49] = hc_byte_perm (w[10], w[ 9], selector); + w[48] = hc_byte_perm (w[ 9], w[ 8], selector); + w[47] = hc_byte_perm (w[ 8], w[ 7], selector); + w[46] = hc_byte_perm (w[ 7], w[ 6], selector); + w[45] = hc_byte_perm (w[ 6], w[ 5], selector); + w[44] = hc_byte_perm (w[ 5], w[ 4], selector); + w[43] = hc_byte_perm (w[ 4], w[ 3], selector); + w[42] = hc_byte_perm (w[ 3], w[ 2], selector); + w[41] = hc_byte_perm (w[ 2], w[ 1], selector); + w[40] = hc_byte_perm (w[ 1], w[ 0], selector); + w[39] = hc_byte_perm (w[ 0], 0, selector); + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + 
w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 40: + w[63] = hc_byte_perm (w[23], w[22], selector); + w[62] = hc_byte_perm (w[22], w[21], selector); + w[61] = hc_byte_perm (w[21], w[20], selector); + w[60] = hc_byte_perm (w[20], w[19], selector); + w[59] = hc_byte_perm (w[19], w[18], selector); + w[58] = hc_byte_perm (w[18], w[17], selector); + w[57] = hc_byte_perm (w[17], w[16], selector); + w[56] = hc_byte_perm (w[16], w[15], selector); + w[55] = hc_byte_perm (w[15], w[14], selector); + w[54] = hc_byte_perm (w[14], w[13], selector); + w[53] = hc_byte_perm (w[13], w[12], selector); + w[52] = hc_byte_perm (w[12], w[11], selector); + w[51] = hc_byte_perm (w[11], w[10], selector); + w[50] = hc_byte_perm (w[10], w[ 9], selector); + w[49] = hc_byte_perm (w[ 9], w[ 8], selector); + w[48] = hc_byte_perm (w[ 8], w[ 7], selector); + w[47] = hc_byte_perm (w[ 7], w[ 6], selector); + w[46] = hc_byte_perm (w[ 6], w[ 5], selector); + w[45] = hc_byte_perm (w[ 5], w[ 4], selector); + w[44] = hc_byte_perm (w[ 4], w[ 3], selector); + w[43] = hc_byte_perm (w[ 3], w[ 2], selector); + w[42] = hc_byte_perm (w[ 2], w[ 1], selector); + w[41] = hc_byte_perm (w[ 1], w[ 0], selector); + w[40] = hc_byte_perm (w[ 0], 0, selector); + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; @@ -27812,49 +32492,49 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; - case 22: - w[63] = hc_byte_perm (w[41], w[40], selector); - w[62] = hc_byte_perm (w[40], w[39], selector); - w[61] = hc_byte_perm (w[39], w[38], selector); - w[60] = hc_byte_perm (w[38], w[37], 
selector); - w[59] = hc_byte_perm (w[37], w[36], selector); - w[58] = hc_byte_perm (w[36], w[35], selector); - w[57] = hc_byte_perm (w[35], w[34], selector); - w[56] = hc_byte_perm (w[34], w[33], selector); - w[55] = hc_byte_perm (w[33], w[32], selector); - w[54] = hc_byte_perm (w[32], w[31], selector); - w[53] = hc_byte_perm (w[31], w[30], selector); - w[52] = hc_byte_perm (w[30], w[29], selector); - w[51] = hc_byte_perm (w[29], w[28], selector); - w[50] = hc_byte_perm (w[28], w[27], selector); - w[49] = hc_byte_perm (w[27], w[26], selector); - w[48] = hc_byte_perm (w[26], w[25], selector); - w[47] = hc_byte_perm (w[25], w[24], selector); - w[46] = hc_byte_perm (w[24], w[23], selector); - w[45] = hc_byte_perm (w[23], w[22], selector); - w[44] = hc_byte_perm (w[22], w[21], selector); - w[43] = hc_byte_perm (w[21], w[20], selector); - w[42] = hc_byte_perm (w[20], w[19], selector); - w[41] = hc_byte_perm (w[19], w[18], selector); - w[40] = hc_byte_perm (w[18], w[17], selector); - w[39] = hc_byte_perm (w[17], w[16], selector); - w[38] = hc_byte_perm (w[16], w[15], selector); - w[37] = hc_byte_perm (w[15], w[14], selector); - w[36] = hc_byte_perm (w[14], w[13], selector); - w[35] = hc_byte_perm (w[13], w[12], selector); - w[34] = hc_byte_perm (w[12], w[11], selector); - w[33] = hc_byte_perm (w[11], w[10], selector); - w[32] = hc_byte_perm (w[10], w[ 9], selector); - w[31] = hc_byte_perm (w[ 9], w[ 8], selector); - w[30] = hc_byte_perm (w[ 8], w[ 7], selector); - w[29] = hc_byte_perm (w[ 7], w[ 6], selector); - w[28] = hc_byte_perm (w[ 6], w[ 5], selector); - w[27] = hc_byte_perm (w[ 5], w[ 4], selector); - w[26] = hc_byte_perm (w[ 4], w[ 3], selector); - w[25] = hc_byte_perm (w[ 3], w[ 2], selector); - w[24] = hc_byte_perm (w[ 2], w[ 1], selector); - w[23] = hc_byte_perm (w[ 1], w[ 0], selector); - w[22] = hc_byte_perm (w[ 0], 0, selector); + case 41: + w[63] = hc_byte_perm (w[22], w[21], selector); + w[62] = hc_byte_perm (w[21], w[20], selector); + w[61] = 
hc_byte_perm (w[20], w[19], selector); + w[60] = hc_byte_perm (w[19], w[18], selector); + w[59] = hc_byte_perm (w[18], w[17], selector); + w[58] = hc_byte_perm (w[17], w[16], selector); + w[57] = hc_byte_perm (w[16], w[15], selector); + w[56] = hc_byte_perm (w[15], w[14], selector); + w[55] = hc_byte_perm (w[14], w[13], selector); + w[54] = hc_byte_perm (w[13], w[12], selector); + w[53] = hc_byte_perm (w[12], w[11], selector); + w[52] = hc_byte_perm (w[11], w[10], selector); + w[51] = hc_byte_perm (w[10], w[ 9], selector); + w[50] = hc_byte_perm (w[ 9], w[ 8], selector); + w[49] = hc_byte_perm (w[ 8], w[ 7], selector); + w[48] = hc_byte_perm (w[ 7], w[ 6], selector); + w[47] = hc_byte_perm (w[ 6], w[ 5], selector); + w[46] = hc_byte_perm (w[ 5], w[ 4], selector); + w[45] = hc_byte_perm (w[ 4], w[ 3], selector); + w[44] = hc_byte_perm (w[ 3], w[ 2], selector); + w[43] = hc_byte_perm (w[ 2], w[ 1], selector); + w[42] = hc_byte_perm (w[ 1], w[ 0], selector); + w[41] = hc_byte_perm (w[ 0], 0, selector); + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; @@ -27880,48 +32560,48 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; - case 23: - w[63] = hc_byte_perm (w[40], w[39], selector); - w[62] = hc_byte_perm (w[39], w[38], selector); - w[61] = hc_byte_perm (w[38], w[37], selector); - w[60] = hc_byte_perm (w[37], w[36], selector); - w[59] = hc_byte_perm (w[36], w[35], selector); - w[58] = hc_byte_perm (w[35], w[34], selector); - w[57] = hc_byte_perm (w[34], w[33], selector); - w[56] = hc_byte_perm (w[33], w[32], selector); - w[55] = hc_byte_perm (w[32], w[31], selector); - w[54] = hc_byte_perm (w[31], w[30], selector); - w[53] = hc_byte_perm (w[30], w[29], selector); - w[52] = hc_byte_perm (w[29], w[28], 
selector); - w[51] = hc_byte_perm (w[28], w[27], selector); - w[50] = hc_byte_perm (w[27], w[26], selector); - w[49] = hc_byte_perm (w[26], w[25], selector); - w[48] = hc_byte_perm (w[25], w[24], selector); - w[47] = hc_byte_perm (w[24], w[23], selector); - w[46] = hc_byte_perm (w[23], w[22], selector); - w[45] = hc_byte_perm (w[22], w[21], selector); - w[44] = hc_byte_perm (w[21], w[20], selector); - w[43] = hc_byte_perm (w[20], w[19], selector); - w[42] = hc_byte_perm (w[19], w[18], selector); - w[41] = hc_byte_perm (w[18], w[17], selector); - w[40] = hc_byte_perm (w[17], w[16], selector); - w[39] = hc_byte_perm (w[16], w[15], selector); - w[38] = hc_byte_perm (w[15], w[14], selector); - w[37] = hc_byte_perm (w[14], w[13], selector); - w[36] = hc_byte_perm (w[13], w[12], selector); - w[35] = hc_byte_perm (w[12], w[11], selector); - w[34] = hc_byte_perm (w[11], w[10], selector); - w[33] = hc_byte_perm (w[10], w[ 9], selector); - w[32] = hc_byte_perm (w[ 9], w[ 8], selector); - w[31] = hc_byte_perm (w[ 8], w[ 7], selector); - w[30] = hc_byte_perm (w[ 7], w[ 6], selector); - w[29] = hc_byte_perm (w[ 6], w[ 5], selector); - w[28] = hc_byte_perm (w[ 5], w[ 4], selector); - w[27] = hc_byte_perm (w[ 4], w[ 3], selector); - w[26] = hc_byte_perm (w[ 3], w[ 2], selector); - w[25] = hc_byte_perm (w[ 2], w[ 1], selector); - w[24] = hc_byte_perm (w[ 1], w[ 0], selector); - w[23] = hc_byte_perm (w[ 0], 0, selector); + case 42: + w[63] = hc_byte_perm (w[21], w[20], selector); + w[62] = hc_byte_perm (w[20], w[19], selector); + w[61] = hc_byte_perm (w[19], w[18], selector); + w[60] = hc_byte_perm (w[18], w[17], selector); + w[59] = hc_byte_perm (w[17], w[16], selector); + w[58] = hc_byte_perm (w[16], w[15], selector); + w[57] = hc_byte_perm (w[15], w[14], selector); + w[56] = hc_byte_perm (w[14], w[13], selector); + w[55] = hc_byte_perm (w[13], w[12], selector); + w[54] = hc_byte_perm (w[12], w[11], selector); + w[53] = hc_byte_perm (w[11], w[10], selector); + w[52] = 
hc_byte_perm (w[10], w[ 9], selector); + w[51] = hc_byte_perm (w[ 9], w[ 8], selector); + w[50] = hc_byte_perm (w[ 8], w[ 7], selector); + w[49] = hc_byte_perm (w[ 7], w[ 6], selector); + w[48] = hc_byte_perm (w[ 6], w[ 5], selector); + w[47] = hc_byte_perm (w[ 5], w[ 4], selector); + w[46] = hc_byte_perm (w[ 4], w[ 3], selector); + w[45] = hc_byte_perm (w[ 3], w[ 2], selector); + w[44] = hc_byte_perm (w[ 2], w[ 1], selector); + w[43] = hc_byte_perm (w[ 1], w[ 0], selector); + w[42] = hc_byte_perm (w[ 0], 0, selector); + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; @@ -27948,47 +32628,47 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; - case 24: - w[63] = hc_byte_perm (w[39], w[38], selector); - w[62] = hc_byte_perm (w[38], w[37], selector); - w[61] = hc_byte_perm (w[37], w[36], selector); - w[60] = hc_byte_perm (w[36], w[35], selector); - w[59] = hc_byte_perm (w[35], w[34], selector); - w[58] = hc_byte_perm (w[34], w[33], selector); - w[57] = hc_byte_perm (w[33], w[32], selector); - w[56] = hc_byte_perm (w[32], w[31], selector); - w[55] = hc_byte_perm (w[31], w[30], selector); - w[54] = hc_byte_perm (w[30], w[29], selector); - w[53] = hc_byte_perm (w[29], w[28], selector); - w[52] = hc_byte_perm (w[28], w[27], selector); - w[51] = hc_byte_perm (w[27], w[26], selector); - w[50] = hc_byte_perm (w[26], w[25], selector); - w[49] = hc_byte_perm (w[25], w[24], selector); - w[48] = hc_byte_perm (w[24], w[23], selector); - w[47] = hc_byte_perm (w[23], w[22], selector); - w[46] = hc_byte_perm (w[22], w[21], selector); - w[45] = hc_byte_perm (w[21], w[20], selector); - w[44] = hc_byte_perm (w[20], w[19], selector); - w[43] = hc_byte_perm (w[19], w[18], selector); - w[42] = hc_byte_perm (w[18], w[17], 
selector); - w[41] = hc_byte_perm (w[17], w[16], selector); - w[40] = hc_byte_perm (w[16], w[15], selector); - w[39] = hc_byte_perm (w[15], w[14], selector); - w[38] = hc_byte_perm (w[14], w[13], selector); - w[37] = hc_byte_perm (w[13], w[12], selector); - w[36] = hc_byte_perm (w[12], w[11], selector); - w[35] = hc_byte_perm (w[11], w[10], selector); - w[34] = hc_byte_perm (w[10], w[ 9], selector); - w[33] = hc_byte_perm (w[ 9], w[ 8], selector); - w[32] = hc_byte_perm (w[ 8], w[ 7], selector); - w[31] = hc_byte_perm (w[ 7], w[ 6], selector); - w[30] = hc_byte_perm (w[ 6], w[ 5], selector); - w[29] = hc_byte_perm (w[ 5], w[ 4], selector); - w[28] = hc_byte_perm (w[ 4], w[ 3], selector); - w[27] = hc_byte_perm (w[ 3], w[ 2], selector); - w[26] = hc_byte_perm (w[ 2], w[ 1], selector); - w[25] = hc_byte_perm (w[ 1], w[ 0], selector); - w[24] = hc_byte_perm (w[ 0], 0, selector); + case 43: + w[63] = hc_byte_perm (w[20], w[19], selector); + w[62] = hc_byte_perm (w[19], w[18], selector); + w[61] = hc_byte_perm (w[18], w[17], selector); + w[60] = hc_byte_perm (w[17], w[16], selector); + w[59] = hc_byte_perm (w[16], w[15], selector); + w[58] = hc_byte_perm (w[15], w[14], selector); + w[57] = hc_byte_perm (w[14], w[13], selector); + w[56] = hc_byte_perm (w[13], w[12], selector); + w[55] = hc_byte_perm (w[12], w[11], selector); + w[54] = hc_byte_perm (w[11], w[10], selector); + w[53] = hc_byte_perm (w[10], w[ 9], selector); + w[52] = hc_byte_perm (w[ 9], w[ 8], selector); + w[51] = hc_byte_perm (w[ 8], w[ 7], selector); + w[50] = hc_byte_perm (w[ 7], w[ 6], selector); + w[49] = hc_byte_perm (w[ 6], w[ 5], selector); + w[48] = hc_byte_perm (w[ 5], w[ 4], selector); + w[47] = hc_byte_perm (w[ 4], w[ 3], selector); + w[46] = hc_byte_perm (w[ 3], w[ 2], selector); + w[45] = hc_byte_perm (w[ 2], w[ 1], selector); + w[44] = hc_byte_perm (w[ 1], w[ 0], selector); + w[43] = hc_byte_perm (w[ 0], 0, selector); + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 
0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; @@ -28016,46 +32696,46 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; - case 25: - w[63] = hc_byte_perm (w[38], w[37], selector); - w[62] = hc_byte_perm (w[37], w[36], selector); - w[61] = hc_byte_perm (w[36], w[35], selector); - w[60] = hc_byte_perm (w[35], w[34], selector); - w[59] = hc_byte_perm (w[34], w[33], selector); - w[58] = hc_byte_perm (w[33], w[32], selector); - w[57] = hc_byte_perm (w[32], w[31], selector); - w[56] = hc_byte_perm (w[31], w[30], selector); - w[55] = hc_byte_perm (w[30], w[29], selector); - w[54] = hc_byte_perm (w[29], w[28], selector); - w[53] = hc_byte_perm (w[28], w[27], selector); - w[52] = hc_byte_perm (w[27], w[26], selector); - w[51] = hc_byte_perm (w[26], w[25], selector); - w[50] = hc_byte_perm (w[25], w[24], selector); - w[49] = hc_byte_perm (w[24], w[23], selector); - w[48] = hc_byte_perm (w[23], w[22], selector); - w[47] = hc_byte_perm (w[22], w[21], selector); - w[46] = hc_byte_perm (w[21], w[20], selector); - w[45] = hc_byte_perm (w[20], w[19], selector); - w[44] = hc_byte_perm (w[19], w[18], selector); - w[43] = hc_byte_perm (w[18], w[17], selector); - w[42] = hc_byte_perm (w[17], w[16], selector); - w[41] = hc_byte_perm (w[16], w[15], selector); - w[40] = hc_byte_perm (w[15], w[14], selector); - w[39] = hc_byte_perm (w[14], w[13], selector); - w[38] = hc_byte_perm (w[13], w[12], selector); - w[37] = hc_byte_perm (w[12], w[11], selector); - w[36] = hc_byte_perm (w[11], w[10], selector); - w[35] = hc_byte_perm (w[10], w[ 9], selector); - w[34] = hc_byte_perm (w[ 9], w[ 8], selector); - w[33] = hc_byte_perm (w[ 8], w[ 7], selector); - w[32] = hc_byte_perm (w[ 7], w[ 6], selector); - w[31] = hc_byte_perm (w[ 6], w[ 5], selector); - w[30] = hc_byte_perm (w[ 5], w[ 4], selector); - w[29] = 
hc_byte_perm (w[ 4], w[ 3], selector); - w[28] = hc_byte_perm (w[ 3], w[ 2], selector); - w[27] = hc_byte_perm (w[ 2], w[ 1], selector); - w[26] = hc_byte_perm (w[ 1], w[ 0], selector); - w[25] = hc_byte_perm (w[ 0], 0, selector); + case 44: + w[63] = hc_byte_perm (w[19], w[18], selector); + w[62] = hc_byte_perm (w[18], w[17], selector); + w[61] = hc_byte_perm (w[17], w[16], selector); + w[60] = hc_byte_perm (w[16], w[15], selector); + w[59] = hc_byte_perm (w[15], w[14], selector); + w[58] = hc_byte_perm (w[14], w[13], selector); + w[57] = hc_byte_perm (w[13], w[12], selector); + w[56] = hc_byte_perm (w[12], w[11], selector); + w[55] = hc_byte_perm (w[11], w[10], selector); + w[54] = hc_byte_perm (w[10], w[ 9], selector); + w[53] = hc_byte_perm (w[ 9], w[ 8], selector); + w[52] = hc_byte_perm (w[ 8], w[ 7], selector); + w[51] = hc_byte_perm (w[ 7], w[ 6], selector); + w[50] = hc_byte_perm (w[ 6], w[ 5], selector); + w[49] = hc_byte_perm (w[ 5], w[ 4], selector); + w[48] = hc_byte_perm (w[ 4], w[ 3], selector); + w[47] = hc_byte_perm (w[ 3], w[ 2], selector); + w[46] = hc_byte_perm (w[ 2], w[ 1], selector); + w[45] = hc_byte_perm (w[ 1], w[ 0], selector); + w[44] = hc_byte_perm (w[ 0], 0, selector); + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; @@ -28084,45 +32764,45 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; - case 26: - w[63] = hc_byte_perm (w[37], w[36], selector); - w[62] = hc_byte_perm (w[36], w[35], selector); - w[61] = hc_byte_perm (w[35], w[34], selector); - w[60] = hc_byte_perm (w[34], w[33], selector); - w[59] = hc_byte_perm (w[33], w[32], selector); - w[58] = hc_byte_perm (w[32], w[31], selector); - w[57] = hc_byte_perm (w[31], w[30], selector); - w[56] = hc_byte_perm (w[30], w[29], 
selector); - w[55] = hc_byte_perm (w[29], w[28], selector); - w[54] = hc_byte_perm (w[28], w[27], selector); - w[53] = hc_byte_perm (w[27], w[26], selector); - w[52] = hc_byte_perm (w[26], w[25], selector); - w[51] = hc_byte_perm (w[25], w[24], selector); - w[50] = hc_byte_perm (w[24], w[23], selector); - w[49] = hc_byte_perm (w[23], w[22], selector); - w[48] = hc_byte_perm (w[22], w[21], selector); - w[47] = hc_byte_perm (w[21], w[20], selector); - w[46] = hc_byte_perm (w[20], w[19], selector); - w[45] = hc_byte_perm (w[19], w[18], selector); - w[44] = hc_byte_perm (w[18], w[17], selector); - w[43] = hc_byte_perm (w[17], w[16], selector); - w[42] = hc_byte_perm (w[16], w[15], selector); - w[41] = hc_byte_perm (w[15], w[14], selector); - w[40] = hc_byte_perm (w[14], w[13], selector); - w[39] = hc_byte_perm (w[13], w[12], selector); - w[38] = hc_byte_perm (w[12], w[11], selector); - w[37] = hc_byte_perm (w[11], w[10], selector); - w[36] = hc_byte_perm (w[10], w[ 9], selector); - w[35] = hc_byte_perm (w[ 9], w[ 8], selector); - w[34] = hc_byte_perm (w[ 8], w[ 7], selector); - w[33] = hc_byte_perm (w[ 7], w[ 6], selector); - w[32] = hc_byte_perm (w[ 6], w[ 5], selector); - w[31] = hc_byte_perm (w[ 5], w[ 4], selector); - w[30] = hc_byte_perm (w[ 4], w[ 3], selector); - w[29] = hc_byte_perm (w[ 3], w[ 2], selector); - w[28] = hc_byte_perm (w[ 2], w[ 1], selector); - w[27] = hc_byte_perm (w[ 1], w[ 0], selector); - w[26] = hc_byte_perm (w[ 0], 0, selector); + case 45: + w[63] = hc_byte_perm (w[18], w[17], selector); + w[62] = hc_byte_perm (w[17], w[16], selector); + w[61] = hc_byte_perm (w[16], w[15], selector); + w[60] = hc_byte_perm (w[15], w[14], selector); + w[59] = hc_byte_perm (w[14], w[13], selector); + w[58] = hc_byte_perm (w[13], w[12], selector); + w[57] = hc_byte_perm (w[12], w[11], selector); + w[56] = hc_byte_perm (w[11], w[10], selector); + w[55] = hc_byte_perm (w[10], w[ 9], selector); + w[54] = hc_byte_perm (w[ 9], w[ 8], selector); + w[53] = 
hc_byte_perm (w[ 8], w[ 7], selector); + w[52] = hc_byte_perm (w[ 7], w[ 6], selector); + w[51] = hc_byte_perm (w[ 6], w[ 5], selector); + w[50] = hc_byte_perm (w[ 5], w[ 4], selector); + w[49] = hc_byte_perm (w[ 4], w[ 3], selector); + w[48] = hc_byte_perm (w[ 3], w[ 2], selector); + w[47] = hc_byte_perm (w[ 2], w[ 1], selector); + w[46] = hc_byte_perm (w[ 1], w[ 0], selector); + w[45] = hc_byte_perm (w[ 0], 0, selector); + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; @@ -28152,44 +32832,112 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; - case 27: - w[63] = hc_byte_perm (w[36], w[35], selector); - w[62] = hc_byte_perm (w[35], w[34], selector); - w[61] = hc_byte_perm (w[34], w[33], selector); - w[60] = hc_byte_perm (w[33], w[32], selector); - w[59] = hc_byte_perm (w[32], w[31], selector); - w[58] = hc_byte_perm (w[31], w[30], selector); - w[57] = hc_byte_perm (w[30], w[29], selector); - w[56] = hc_byte_perm (w[29], w[28], selector); - w[55] = hc_byte_perm (w[28], w[27], selector); - w[54] = hc_byte_perm (w[27], w[26], selector); - w[53] = hc_byte_perm (w[26], w[25], selector); - w[52] = hc_byte_perm (w[25], w[24], selector); - w[51] = hc_byte_perm (w[24], w[23], selector); - w[50] = hc_byte_perm (w[23], w[22], selector); - w[49] = hc_byte_perm (w[22], w[21], selector); - w[48] = hc_byte_perm (w[21], w[20], selector); - w[47] = hc_byte_perm (w[20], w[19], selector); - w[46] = hc_byte_perm (w[19], w[18], selector); - w[45] = hc_byte_perm (w[18], w[17], selector); - w[44] = hc_byte_perm (w[17], w[16], selector); - w[43] = hc_byte_perm (w[16], w[15], selector); - w[42] = hc_byte_perm (w[15], w[14], selector); - w[41] = hc_byte_perm (w[14], w[13], selector); - w[40] = hc_byte_perm (w[13], w[12], 
selector); - w[39] = hc_byte_perm (w[12], w[11], selector); - w[38] = hc_byte_perm (w[11], w[10], selector); - w[37] = hc_byte_perm (w[10], w[ 9], selector); - w[36] = hc_byte_perm (w[ 9], w[ 8], selector); - w[35] = hc_byte_perm (w[ 8], w[ 7], selector); - w[34] = hc_byte_perm (w[ 7], w[ 6], selector); - w[33] = hc_byte_perm (w[ 6], w[ 5], selector); - w[32] = hc_byte_perm (w[ 5], w[ 4], selector); - w[31] = hc_byte_perm (w[ 4], w[ 3], selector); - w[30] = hc_byte_perm (w[ 3], w[ 2], selector); - w[29] = hc_byte_perm (w[ 2], w[ 1], selector); - w[28] = hc_byte_perm (w[ 1], w[ 0], selector); - w[27] = hc_byte_perm (w[ 0], 0, selector); + case 46: + w[63] = hc_byte_perm (w[17], w[16], selector); + w[62] = hc_byte_perm (w[16], w[15], selector); + w[61] = hc_byte_perm (w[15], w[14], selector); + w[60] = hc_byte_perm (w[14], w[13], selector); + w[59] = hc_byte_perm (w[13], w[12], selector); + w[58] = hc_byte_perm (w[12], w[11], selector); + w[57] = hc_byte_perm (w[11], w[10], selector); + w[56] = hc_byte_perm (w[10], w[ 9], selector); + w[55] = hc_byte_perm (w[ 9], w[ 8], selector); + w[54] = hc_byte_perm (w[ 8], w[ 7], selector); + w[53] = hc_byte_perm (w[ 7], w[ 6], selector); + w[52] = hc_byte_perm (w[ 6], w[ 5], selector); + w[51] = hc_byte_perm (w[ 5], w[ 4], selector); + w[50] = hc_byte_perm (w[ 4], w[ 3], selector); + w[49] = hc_byte_perm (w[ 3], w[ 2], selector); + w[48] = hc_byte_perm (w[ 2], w[ 1], selector); + w[47] = hc_byte_perm (w[ 1], w[ 0], selector); + w[46] = hc_byte_perm (w[ 0], 0, selector); + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 
0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 47: + w[63] = hc_byte_perm (w[16], w[15], selector); + w[62] = hc_byte_perm (w[15], w[14], selector); + w[61] = hc_byte_perm (w[14], w[13], selector); + w[60] = hc_byte_perm (w[13], w[12], selector); + w[59] = hc_byte_perm (w[12], w[11], selector); + w[58] = hc_byte_perm (w[11], w[10], selector); + w[57] = hc_byte_perm (w[10], w[ 9], selector); + w[56] = hc_byte_perm (w[ 9], w[ 8], selector); + w[55] = hc_byte_perm (w[ 8], w[ 7], selector); + w[54] = hc_byte_perm (w[ 7], w[ 6], selector); + w[53] = hc_byte_perm (w[ 6], w[ 5], selector); + w[52] = hc_byte_perm (w[ 5], w[ 4], selector); + w[51] = hc_byte_perm (w[ 4], w[ 3], selector); + w[50] = hc_byte_perm (w[ 3], w[ 2], selector); + w[49] = hc_byte_perm (w[ 2], w[ 1], selector); + w[48] = hc_byte_perm (w[ 1], w[ 0], selector); + w[47] = hc_byte_perm (w[ 0], 0, selector); + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; @@ -28220,43 +32968,43 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; - case 28: - w[63] = hc_byte_perm (w[35], w[34], selector); - w[62] = hc_byte_perm (w[34], w[33], selector); - w[61] = hc_byte_perm (w[33], w[32], selector); - w[60] = hc_byte_perm (w[32], w[31], selector); - w[59] = hc_byte_perm (w[31], w[30], selector); - w[58] = hc_byte_perm (w[30], w[29], selector); - w[57] = hc_byte_perm (w[29], w[28], selector); - w[56] = hc_byte_perm (w[28], w[27], selector); - w[55] = hc_byte_perm (w[27], w[26], selector); - w[54] = hc_byte_perm (w[26], w[25], selector); - w[53] = hc_byte_perm (w[25], w[24], selector); - w[52] = hc_byte_perm (w[24], w[23], selector); - w[51] = 
hc_byte_perm (w[23], w[22], selector); - w[50] = hc_byte_perm (w[22], w[21], selector); - w[49] = hc_byte_perm (w[21], w[20], selector); - w[48] = hc_byte_perm (w[20], w[19], selector); - w[47] = hc_byte_perm (w[19], w[18], selector); - w[46] = hc_byte_perm (w[18], w[17], selector); - w[45] = hc_byte_perm (w[17], w[16], selector); - w[44] = hc_byte_perm (w[16], w[15], selector); - w[43] = hc_byte_perm (w[15], w[14], selector); - w[42] = hc_byte_perm (w[14], w[13], selector); - w[41] = hc_byte_perm (w[13], w[12], selector); - w[40] = hc_byte_perm (w[12], w[11], selector); - w[39] = hc_byte_perm (w[11], w[10], selector); - w[38] = hc_byte_perm (w[10], w[ 9], selector); - w[37] = hc_byte_perm (w[ 9], w[ 8], selector); - w[36] = hc_byte_perm (w[ 8], w[ 7], selector); - w[35] = hc_byte_perm (w[ 7], w[ 6], selector); - w[34] = hc_byte_perm (w[ 6], w[ 5], selector); - w[33] = hc_byte_perm (w[ 5], w[ 4], selector); - w[32] = hc_byte_perm (w[ 4], w[ 3], selector); - w[31] = hc_byte_perm (w[ 3], w[ 2], selector); - w[30] = hc_byte_perm (w[ 2], w[ 1], selector); - w[29] = hc_byte_perm (w[ 1], w[ 0], selector); - w[28] = hc_byte_perm (w[ 0], 0, selector); + case 48: + w[63] = hc_byte_perm (w[15], w[14], selector); + w[62] = hc_byte_perm (w[14], w[13], selector); + w[61] = hc_byte_perm (w[13], w[12], selector); + w[60] = hc_byte_perm (w[12], w[11], selector); + w[59] = hc_byte_perm (w[11], w[10], selector); + w[58] = hc_byte_perm (w[10], w[ 9], selector); + w[57] = hc_byte_perm (w[ 9], w[ 8], selector); + w[56] = hc_byte_perm (w[ 8], w[ 7], selector); + w[55] = hc_byte_perm (w[ 7], w[ 6], selector); + w[54] = hc_byte_perm (w[ 6], w[ 5], selector); + w[53] = hc_byte_perm (w[ 5], w[ 4], selector); + w[52] = hc_byte_perm (w[ 4], w[ 3], selector); + w[51] = hc_byte_perm (w[ 3], w[ 2], selector); + w[50] = hc_byte_perm (w[ 2], w[ 1], selector); + w[49] = hc_byte_perm (w[ 1], w[ 0], selector); + w[48] = hc_byte_perm (w[ 0], 0, selector); + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] 
= 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; @@ -28288,42 +33036,42 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; - case 29: - w[63] = hc_byte_perm (w[34], w[33], selector); - w[62] = hc_byte_perm (w[33], w[32], selector); - w[61] = hc_byte_perm (w[32], w[31], selector); - w[60] = hc_byte_perm (w[31], w[30], selector); - w[59] = hc_byte_perm (w[30], w[29], selector); - w[58] = hc_byte_perm (w[29], w[28], selector); - w[57] = hc_byte_perm (w[28], w[27], selector); - w[56] = hc_byte_perm (w[27], w[26], selector); - w[55] = hc_byte_perm (w[26], w[25], selector); - w[54] = hc_byte_perm (w[25], w[24], selector); - w[53] = hc_byte_perm (w[24], w[23], selector); - w[52] = hc_byte_perm (w[23], w[22], selector); - w[51] = hc_byte_perm (w[22], w[21], selector); - w[50] = hc_byte_perm (w[21], w[20], selector); - w[49] = hc_byte_perm (w[20], w[19], selector); - w[48] = hc_byte_perm (w[19], w[18], selector); - w[47] = hc_byte_perm (w[18], w[17], selector); - w[46] = hc_byte_perm (w[17], w[16], selector); - w[45] = hc_byte_perm (w[16], w[15], selector); - w[44] = hc_byte_perm (w[15], w[14], selector); - w[43] = hc_byte_perm (w[14], w[13], selector); - w[42] = hc_byte_perm (w[13], w[12], selector); - w[41] = hc_byte_perm (w[12], w[11], selector); - w[40] = hc_byte_perm (w[11], w[10], selector); - w[39] = hc_byte_perm (w[10], w[ 9], selector); - w[38] = hc_byte_perm (w[ 9], w[ 8], selector); - w[37] = hc_byte_perm (w[ 8], w[ 7], selector); - w[36] = hc_byte_perm (w[ 7], w[ 6], selector); - w[35] = hc_byte_perm (w[ 6], w[ 5], selector); - w[34] = hc_byte_perm (w[ 5], w[ 4], selector); - w[33] = hc_byte_perm (w[ 4], w[ 3], selector); - w[32] = hc_byte_perm (w[ 3], w[ 2], selector); - w[31] = hc_byte_perm (w[ 2], w[ 1], selector); - w[30] = 
hc_byte_perm (w[ 1], w[ 0], selector); - w[29] = hc_byte_perm (w[ 0], 0, selector); + case 49: + w[63] = hc_byte_perm (w[14], w[13], selector); + w[62] = hc_byte_perm (w[13], w[12], selector); + w[61] = hc_byte_perm (w[12], w[11], selector); + w[60] = hc_byte_perm (w[11], w[10], selector); + w[59] = hc_byte_perm (w[10], w[ 9], selector); + w[58] = hc_byte_perm (w[ 9], w[ 8], selector); + w[57] = hc_byte_perm (w[ 8], w[ 7], selector); + w[56] = hc_byte_perm (w[ 7], w[ 6], selector); + w[55] = hc_byte_perm (w[ 6], w[ 5], selector); + w[54] = hc_byte_perm (w[ 5], w[ 4], selector); + w[53] = hc_byte_perm (w[ 4], w[ 3], selector); + w[52] = hc_byte_perm (w[ 3], w[ 2], selector); + w[51] = hc_byte_perm (w[ 2], w[ 1], selector); + w[50] = hc_byte_perm (w[ 1], w[ 0], selector); + w[49] = hc_byte_perm (w[ 0], 0, selector); + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; @@ -28356,41 +33104,41 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; - case 30: - w[63] = hc_byte_perm (w[33], w[32], selector); - w[62] = hc_byte_perm (w[32], w[31], selector); - w[61] = hc_byte_perm (w[31], w[30], selector); - w[60] = hc_byte_perm (w[30], w[29], selector); - w[59] = hc_byte_perm (w[29], w[28], selector); - w[58] = hc_byte_perm (w[28], w[27], selector); - w[57] = hc_byte_perm (w[27], w[26], selector); - w[56] = hc_byte_perm (w[26], w[25], selector); - w[55] = hc_byte_perm (w[25], w[24], selector); - w[54] = hc_byte_perm (w[24], w[23], selector); - w[53] = hc_byte_perm (w[23], w[22], selector); - w[52] = hc_byte_perm (w[22], w[21], selector); - w[51] = hc_byte_perm (w[21], w[20], selector); - w[50] = hc_byte_perm (w[20], w[19], selector); - w[49] = hc_byte_perm (w[19], w[18], selector); - w[48] = hc_byte_perm 
(w[18], w[17], selector); - w[47] = hc_byte_perm (w[17], w[16], selector); - w[46] = hc_byte_perm (w[16], w[15], selector); - w[45] = hc_byte_perm (w[15], w[14], selector); - w[44] = hc_byte_perm (w[14], w[13], selector); - w[43] = hc_byte_perm (w[13], w[12], selector); - w[42] = hc_byte_perm (w[12], w[11], selector); - w[41] = hc_byte_perm (w[11], w[10], selector); - w[40] = hc_byte_perm (w[10], w[ 9], selector); - w[39] = hc_byte_perm (w[ 9], w[ 8], selector); - w[38] = hc_byte_perm (w[ 8], w[ 7], selector); - w[37] = hc_byte_perm (w[ 7], w[ 6], selector); - w[36] = hc_byte_perm (w[ 6], w[ 5], selector); - w[35] = hc_byte_perm (w[ 5], w[ 4], selector); - w[34] = hc_byte_perm (w[ 4], w[ 3], selector); - w[33] = hc_byte_perm (w[ 3], w[ 2], selector); - w[32] = hc_byte_perm (w[ 2], w[ 1], selector); - w[31] = hc_byte_perm (w[ 1], w[ 0], selector); - w[30] = hc_byte_perm (w[ 0], 0, selector); + case 50: + w[63] = hc_byte_perm (w[13], w[12], selector); + w[62] = hc_byte_perm (w[12], w[11], selector); + w[61] = hc_byte_perm (w[11], w[10], selector); + w[60] = hc_byte_perm (w[10], w[ 9], selector); + w[59] = hc_byte_perm (w[ 9], w[ 8], selector); + w[58] = hc_byte_perm (w[ 8], w[ 7], selector); + w[57] = hc_byte_perm (w[ 7], w[ 6], selector); + w[56] = hc_byte_perm (w[ 6], w[ 5], selector); + w[55] = hc_byte_perm (w[ 5], w[ 4], selector); + w[54] = hc_byte_perm (w[ 4], w[ 3], selector); + w[53] = hc_byte_perm (w[ 3], w[ 2], selector); + w[52] = hc_byte_perm (w[ 2], w[ 1], selector); + w[51] = hc_byte_perm (w[ 1], w[ 0], selector); + w[50] = hc_byte_perm (w[ 0], 0, selector); + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; @@ -28424,40 +33172,40 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; - 
case 31: - w[63] = hc_byte_perm (w[32], w[31], selector); - w[62] = hc_byte_perm (w[31], w[30], selector); - w[61] = hc_byte_perm (w[30], w[29], selector); - w[60] = hc_byte_perm (w[29], w[28], selector); - w[59] = hc_byte_perm (w[28], w[27], selector); - w[58] = hc_byte_perm (w[27], w[26], selector); - w[57] = hc_byte_perm (w[26], w[25], selector); - w[56] = hc_byte_perm (w[25], w[24], selector); - w[55] = hc_byte_perm (w[24], w[23], selector); - w[54] = hc_byte_perm (w[23], w[22], selector); - w[53] = hc_byte_perm (w[22], w[21], selector); - w[52] = hc_byte_perm (w[21], w[20], selector); - w[51] = hc_byte_perm (w[20], w[19], selector); - w[50] = hc_byte_perm (w[19], w[18], selector); - w[49] = hc_byte_perm (w[18], w[17], selector); - w[48] = hc_byte_perm (w[17], w[16], selector); - w[47] = hc_byte_perm (w[16], w[15], selector); - w[46] = hc_byte_perm (w[15], w[14], selector); - w[45] = hc_byte_perm (w[14], w[13], selector); - w[44] = hc_byte_perm (w[13], w[12], selector); - w[43] = hc_byte_perm (w[12], w[11], selector); - w[42] = hc_byte_perm (w[11], w[10], selector); - w[41] = hc_byte_perm (w[10], w[ 9], selector); - w[40] = hc_byte_perm (w[ 9], w[ 8], selector); - w[39] = hc_byte_perm (w[ 8], w[ 7], selector); - w[38] = hc_byte_perm (w[ 7], w[ 6], selector); - w[37] = hc_byte_perm (w[ 6], w[ 5], selector); - w[36] = hc_byte_perm (w[ 5], w[ 4], selector); - w[35] = hc_byte_perm (w[ 4], w[ 3], selector); - w[34] = hc_byte_perm (w[ 3], w[ 2], selector); - w[33] = hc_byte_perm (w[ 2], w[ 1], selector); - w[32] = hc_byte_perm (w[ 1], w[ 0], selector); - w[31] = hc_byte_perm (w[ 0], 0, selector); + case 51: + w[63] = hc_byte_perm (w[12], w[11], selector); + w[62] = hc_byte_perm (w[11], w[10], selector); + w[61] = hc_byte_perm (w[10], w[ 9], selector); + w[60] = hc_byte_perm (w[ 9], w[ 8], selector); + w[59] = hc_byte_perm (w[ 8], w[ 7], selector); + w[58] = hc_byte_perm (w[ 7], w[ 6], selector); + w[57] = hc_byte_perm (w[ 6], w[ 5], selector); + w[56] = hc_byte_perm 
(w[ 5], w[ 4], selector); + w[55] = hc_byte_perm (w[ 4], w[ 3], selector); + w[54] = hc_byte_perm (w[ 3], w[ 2], selector); + w[53] = hc_byte_perm (w[ 2], w[ 1], selector); + w[52] = hc_byte_perm (w[ 1], w[ 0], selector); + w[51] = hc_byte_perm (w[ 0], 0, selector); + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; @@ -28492,39 +33240,39 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; - case 32: - w[63] = hc_byte_perm (w[31], w[30], selector); - w[62] = hc_byte_perm (w[30], w[29], selector); - w[61] = hc_byte_perm (w[29], w[28], selector); - w[60] = hc_byte_perm (w[28], w[27], selector); - w[59] = hc_byte_perm (w[27], w[26], selector); - w[58] = hc_byte_perm (w[26], w[25], selector); - w[57] = hc_byte_perm (w[25], w[24], selector); - w[56] = hc_byte_perm (w[24], w[23], selector); - w[55] = hc_byte_perm (w[23], w[22], selector); - w[54] = hc_byte_perm (w[22], w[21], selector); - w[53] = hc_byte_perm (w[21], w[20], selector); - w[52] = hc_byte_perm (w[20], w[19], selector); - w[51] = hc_byte_perm (w[19], w[18], selector); - w[50] = hc_byte_perm (w[18], w[17], selector); - w[49] = hc_byte_perm (w[17], w[16], selector); - w[48] = hc_byte_perm (w[16], w[15], selector); - w[47] = hc_byte_perm (w[15], w[14], selector); - w[46] = hc_byte_perm (w[14], w[13], selector); - w[45] = hc_byte_perm (w[13], w[12], selector); - w[44] = hc_byte_perm (w[12], w[11], selector); - w[43] = hc_byte_perm (w[11], w[10], selector); - w[42] = hc_byte_perm (w[10], w[ 9], selector); - w[41] = hc_byte_perm (w[ 9], w[ 8], selector); - w[40] = hc_byte_perm (w[ 8], w[ 7], selector); - w[39] = hc_byte_perm (w[ 7], w[ 6], selector); - w[38] = hc_byte_perm (w[ 6], w[ 5], selector); - w[37] = hc_byte_perm (w[ 5], w[ 4], 
selector); - w[36] = hc_byte_perm (w[ 4], w[ 3], selector); - w[35] = hc_byte_perm (w[ 3], w[ 2], selector); - w[34] = hc_byte_perm (w[ 2], w[ 1], selector); - w[33] = hc_byte_perm (w[ 1], w[ 0], selector); - w[32] = hc_byte_perm (w[ 0], 0, selector); + case 52: + w[63] = hc_byte_perm (w[11], w[10], selector); + w[62] = hc_byte_perm (w[10], w[ 9], selector); + w[61] = hc_byte_perm (w[ 9], w[ 8], selector); + w[60] = hc_byte_perm (w[ 8], w[ 7], selector); + w[59] = hc_byte_perm (w[ 7], w[ 6], selector); + w[58] = hc_byte_perm (w[ 6], w[ 5], selector); + w[57] = hc_byte_perm (w[ 5], w[ 4], selector); + w[56] = hc_byte_perm (w[ 4], w[ 3], selector); + w[55] = hc_byte_perm (w[ 3], w[ 2], selector); + w[54] = hc_byte_perm (w[ 2], w[ 1], selector); + w[53] = hc_byte_perm (w[ 1], w[ 0], selector); + w[52] = hc_byte_perm (w[ 0], 0, selector); + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; @@ -28560,38 +33308,38 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; - case 33: - w[63] = hc_byte_perm (w[30], w[29], selector); - w[62] = hc_byte_perm (w[29], w[28], selector); - w[61] = hc_byte_perm (w[28], w[27], selector); - w[60] = hc_byte_perm (w[27], w[26], selector); - w[59] = hc_byte_perm (w[26], w[25], selector); - w[58] = hc_byte_perm (w[25], w[24], selector); - w[57] = hc_byte_perm (w[24], w[23], selector); - w[56] = hc_byte_perm (w[23], w[22], selector); - w[55] = hc_byte_perm (w[22], w[21], selector); - w[54] = hc_byte_perm (w[21], w[20], selector); - w[53] = hc_byte_perm (w[20], w[19], selector); - w[52] = hc_byte_perm (w[19], w[18], selector); - w[51] = hc_byte_perm (w[18], w[17], selector); - w[50] = hc_byte_perm (w[17], w[16], selector); - w[49] = hc_byte_perm (w[16], w[15], selector); - 
w[48] = hc_byte_perm (w[15], w[14], selector); - w[47] = hc_byte_perm (w[14], w[13], selector); - w[46] = hc_byte_perm (w[13], w[12], selector); - w[45] = hc_byte_perm (w[12], w[11], selector); - w[44] = hc_byte_perm (w[11], w[10], selector); - w[43] = hc_byte_perm (w[10], w[ 9], selector); - w[42] = hc_byte_perm (w[ 9], w[ 8], selector); - w[41] = hc_byte_perm (w[ 8], w[ 7], selector); - w[40] = hc_byte_perm (w[ 7], w[ 6], selector); - w[39] = hc_byte_perm (w[ 6], w[ 5], selector); - w[38] = hc_byte_perm (w[ 5], w[ 4], selector); - w[37] = hc_byte_perm (w[ 4], w[ 3], selector); - w[36] = hc_byte_perm (w[ 3], w[ 2], selector); - w[35] = hc_byte_perm (w[ 2], w[ 1], selector); - w[34] = hc_byte_perm (w[ 1], w[ 0], selector); - w[33] = hc_byte_perm (w[ 0], 0, selector); + case 53: + w[63] = hc_byte_perm (w[10], w[ 9], selector); + w[62] = hc_byte_perm (w[ 9], w[ 8], selector); + w[61] = hc_byte_perm (w[ 8], w[ 7], selector); + w[60] = hc_byte_perm (w[ 7], w[ 6], selector); + w[59] = hc_byte_perm (w[ 6], w[ 5], selector); + w[58] = hc_byte_perm (w[ 5], w[ 4], selector); + w[57] = hc_byte_perm (w[ 4], w[ 3], selector); + w[56] = hc_byte_perm (w[ 3], w[ 2], selector); + w[55] = hc_byte_perm (w[ 2], w[ 1], selector); + w[54] = hc_byte_perm (w[ 1], w[ 0], selector); + w[53] = hc_byte_perm (w[ 0], 0, selector); + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; @@ -28614,51 +33362,51 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) w[13] = 0; w[12] = 0; w[11] = 0; - w[10] = 0; - w[ 9] = 0; - w[ 8] = 0; - w[ 7] = 0; - w[ 6] = 0; - w[ 5] = 0; - w[ 4] = 0; - w[ 3] = 0; - w[ 2] = 0; - w[ 1] = 0; - w[ 0] = 0; - - break; - - case 34: - w[63] = hc_byte_perm (w[29], w[28], selector); - w[62] = hc_byte_perm (w[28], 
w[27], selector); - w[61] = hc_byte_perm (w[27], w[26], selector); - w[60] = hc_byte_perm (w[26], w[25], selector); - w[59] = hc_byte_perm (w[25], w[24], selector); - w[58] = hc_byte_perm (w[24], w[23], selector); - w[57] = hc_byte_perm (w[23], w[22], selector); - w[56] = hc_byte_perm (w[22], w[21], selector); - w[55] = hc_byte_perm (w[21], w[20], selector); - w[54] = hc_byte_perm (w[20], w[19], selector); - w[53] = hc_byte_perm (w[19], w[18], selector); - w[52] = hc_byte_perm (w[18], w[17], selector); - w[51] = hc_byte_perm (w[17], w[16], selector); - w[50] = hc_byte_perm (w[16], w[15], selector); - w[49] = hc_byte_perm (w[15], w[14], selector); - w[48] = hc_byte_perm (w[14], w[13], selector); - w[47] = hc_byte_perm (w[13], w[12], selector); - w[46] = hc_byte_perm (w[12], w[11], selector); - w[45] = hc_byte_perm (w[11], w[10], selector); - w[44] = hc_byte_perm (w[10], w[ 9], selector); - w[43] = hc_byte_perm (w[ 9], w[ 8], selector); - w[42] = hc_byte_perm (w[ 8], w[ 7], selector); - w[41] = hc_byte_perm (w[ 7], w[ 6], selector); - w[40] = hc_byte_perm (w[ 6], w[ 5], selector); - w[39] = hc_byte_perm (w[ 5], w[ 4], selector); - w[38] = hc_byte_perm (w[ 4], w[ 3], selector); - w[37] = hc_byte_perm (w[ 3], w[ 2], selector); - w[36] = hc_byte_perm (w[ 2], w[ 1], selector); - w[35] = hc_byte_perm (w[ 1], w[ 0], selector); - w[34] = hc_byte_perm (w[ 0], 0, selector); + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 54: + w[63] = hc_byte_perm (w[ 9], w[ 8], selector); + w[62] = hc_byte_perm (w[ 8], w[ 7], selector); + w[61] = hc_byte_perm (w[ 7], w[ 6], selector); + w[60] = hc_byte_perm (w[ 6], w[ 5], selector); + w[59] = hc_byte_perm (w[ 5], w[ 4], selector); + w[58] = hc_byte_perm (w[ 4], w[ 3], selector); + w[57] = hc_byte_perm (w[ 3], w[ 2], selector); + w[56] = hc_byte_perm (w[ 2], w[ 1], selector); + w[55] = hc_byte_perm (w[ 1], w[ 0], selector); + 
w[54] = hc_byte_perm (w[ 0], 0, selector); + w[53] = 0; + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; @@ -28696,36 +33444,36 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; - case 35: - w[63] = hc_byte_perm (w[28], w[27], selector); - w[62] = hc_byte_perm (w[27], w[26], selector); - w[61] = hc_byte_perm (w[26], w[25], selector); - w[60] = hc_byte_perm (w[25], w[24], selector); - w[59] = hc_byte_perm (w[24], w[23], selector); - w[58] = hc_byte_perm (w[23], w[22], selector); - w[57] = hc_byte_perm (w[22], w[21], selector); - w[56] = hc_byte_perm (w[21], w[20], selector); - w[55] = hc_byte_perm (w[20], w[19], selector); - w[54] = hc_byte_perm (w[19], w[18], selector); - w[53] = hc_byte_perm (w[18], w[17], selector); - w[52] = hc_byte_perm (w[17], w[16], selector); - w[51] = hc_byte_perm (w[16], w[15], selector); - w[50] = hc_byte_perm (w[15], w[14], selector); - w[49] = hc_byte_perm (w[14], w[13], selector); - w[48] = hc_byte_perm (w[13], w[12], selector); - w[47] = hc_byte_perm (w[12], w[11], selector); - w[46] = hc_byte_perm (w[11], w[10], selector); - w[45] = hc_byte_perm (w[10], w[ 9], selector); - w[44] = hc_byte_perm (w[ 9], w[ 8], selector); - w[43] = hc_byte_perm (w[ 8], w[ 7], selector); - w[42] = hc_byte_perm (w[ 7], w[ 6], selector); - w[41] = hc_byte_perm (w[ 6], w[ 5], selector); - w[40] = hc_byte_perm (w[ 5], w[ 4], selector); - w[39] = hc_byte_perm (w[ 4], w[ 3], selector); - w[38] = hc_byte_perm (w[ 3], w[ 2], selector); - w[37] = hc_byte_perm (w[ 2], w[ 1], selector); - w[36] = hc_byte_perm (w[ 1], w[ 0], selector); - w[35] = hc_byte_perm (w[ 0], 0, selector); + case 55: + w[63] = hc_byte_perm (w[ 8], w[ 7], selector); + w[62] = hc_byte_perm (w[ 7], w[ 6], selector); + w[61] = 
hc_byte_perm (w[ 6], w[ 5], selector); + w[60] = hc_byte_perm (w[ 5], w[ 4], selector); + w[59] = hc_byte_perm (w[ 4], w[ 3], selector); + w[58] = hc_byte_perm (w[ 3], w[ 2], selector); + w[57] = hc_byte_perm (w[ 2], w[ 1], selector); + w[56] = hc_byte_perm (w[ 1], w[ 0], selector); + w[55] = hc_byte_perm (w[ 0], 0, selector); + w[54] = 0; + w[53] = 0; + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; @@ -28764,35 +33512,35 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; - case 36: - w[63] = hc_byte_perm (w[27], w[26], selector); - w[62] = hc_byte_perm (w[26], w[25], selector); - w[61] = hc_byte_perm (w[25], w[24], selector); - w[60] = hc_byte_perm (w[24], w[23], selector); - w[59] = hc_byte_perm (w[23], w[22], selector); - w[58] = hc_byte_perm (w[22], w[21], selector); - w[57] = hc_byte_perm (w[21], w[20], selector); - w[56] = hc_byte_perm (w[20], w[19], selector); - w[55] = hc_byte_perm (w[19], w[18], selector); - w[54] = hc_byte_perm (w[18], w[17], selector); - w[53] = hc_byte_perm (w[17], w[16], selector); - w[52] = hc_byte_perm (w[16], w[15], selector); - w[51] = hc_byte_perm (w[15], w[14], selector); - w[50] = hc_byte_perm (w[14], w[13], selector); - w[49] = hc_byte_perm (w[13], w[12], selector); - w[48] = hc_byte_perm (w[12], w[11], selector); - w[47] = hc_byte_perm (w[11], w[10], selector); - w[46] = hc_byte_perm (w[10], w[ 9], selector); - w[45] = hc_byte_perm (w[ 9], w[ 8], selector); - w[44] = hc_byte_perm (w[ 8], w[ 7], selector); - w[43] = hc_byte_perm (w[ 7], w[ 6], selector); - w[42] = hc_byte_perm (w[ 6], w[ 5], selector); - w[41] = hc_byte_perm (w[ 5], w[ 4], selector); - w[40] = hc_byte_perm (w[ 4], w[ 3], selector); - w[39] = hc_byte_perm (w[ 3], w[ 2], selector); - w[38] = hc_byte_perm (w[ 2], w[ 
1], selector); - w[37] = hc_byte_perm (w[ 1], w[ 0], selector); - w[36] = hc_byte_perm (w[ 0], 0, selector); + case 56: + w[63] = hc_byte_perm (w[ 7], w[ 6], selector); + w[62] = hc_byte_perm (w[ 6], w[ 5], selector); + w[61] = hc_byte_perm (w[ 5], w[ 4], selector); + w[60] = hc_byte_perm (w[ 4], w[ 3], selector); + w[59] = hc_byte_perm (w[ 3], w[ 2], selector); + w[58] = hc_byte_perm (w[ 2], w[ 1], selector); + w[57] = hc_byte_perm (w[ 1], w[ 0], selector); + w[56] = hc_byte_perm (w[ 0], 0, selector); + w[55] = 0; + w[54] = 0; + w[53] = 0; + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; @@ -28832,34 +33580,34 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; - case 37: - w[63] = hc_byte_perm (w[26], w[25], selector); - w[62] = hc_byte_perm (w[25], w[24], selector); - w[61] = hc_byte_perm (w[24], w[23], selector); - w[60] = hc_byte_perm (w[23], w[22], selector); - w[59] = hc_byte_perm (w[22], w[21], selector); - w[58] = hc_byte_perm (w[21], w[20], selector); - w[57] = hc_byte_perm (w[20], w[19], selector); - w[56] = hc_byte_perm (w[19], w[18], selector); - w[55] = hc_byte_perm (w[18], w[17], selector); - w[54] = hc_byte_perm (w[17], w[16], selector); - w[53] = hc_byte_perm (w[16], w[15], selector); - w[52] = hc_byte_perm (w[15], w[14], selector); - w[51] = hc_byte_perm (w[14], w[13], selector); - w[50] = hc_byte_perm (w[13], w[12], selector); - w[49] = hc_byte_perm (w[12], w[11], selector); - w[48] = hc_byte_perm (w[11], w[10], selector); - w[47] = hc_byte_perm (w[10], w[ 9], selector); - w[46] = hc_byte_perm (w[ 9], w[ 8], selector); - w[45] = hc_byte_perm (w[ 8], w[ 7], selector); - w[44] = hc_byte_perm (w[ 7], w[ 6], selector); - w[43] = hc_byte_perm (w[ 6], w[ 5], selector); - w[42] = hc_byte_perm (w[ 5], w[ 4], selector); - 
w[41] = hc_byte_perm (w[ 4], w[ 3], selector); - w[40] = hc_byte_perm (w[ 3], w[ 2], selector); - w[39] = hc_byte_perm (w[ 2], w[ 1], selector); - w[38] = hc_byte_perm (w[ 1], w[ 0], selector); - w[37] = hc_byte_perm (w[ 0], 0, selector); + case 57: + w[63] = hc_byte_perm (w[ 6], w[ 5], selector); + w[62] = hc_byte_perm (w[ 5], w[ 4], selector); + w[61] = hc_byte_perm (w[ 4], w[ 3], selector); + w[60] = hc_byte_perm (w[ 3], w[ 2], selector); + w[59] = hc_byte_perm (w[ 2], w[ 1], selector); + w[58] = hc_byte_perm (w[ 1], w[ 0], selector); + w[57] = hc_byte_perm (w[ 0], 0, selector); + w[56] = 0; + w[55] = 0; + w[54] = 0; + w[53] = 0; + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; @@ -28900,33 +33648,33 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; - case 38: - w[63] = hc_byte_perm (w[25], w[24], selector); - w[62] = hc_byte_perm (w[24], w[23], selector); - w[61] = hc_byte_perm (w[23], w[22], selector); - w[60] = hc_byte_perm (w[22], w[21], selector); - w[59] = hc_byte_perm (w[21], w[20], selector); - w[58] = hc_byte_perm (w[20], w[19], selector); - w[57] = hc_byte_perm (w[19], w[18], selector); - w[56] = hc_byte_perm (w[18], w[17], selector); - w[55] = hc_byte_perm (w[17], w[16], selector); - w[54] = hc_byte_perm (w[16], w[15], selector); - w[53] = hc_byte_perm (w[15], w[14], selector); - w[52] = hc_byte_perm (w[14], w[13], selector); - w[51] = hc_byte_perm (w[13], w[12], selector); - w[50] = hc_byte_perm (w[12], w[11], selector); - w[49] = hc_byte_perm (w[11], w[10], selector); - w[48] = hc_byte_perm (w[10], w[ 9], selector); - w[47] = hc_byte_perm (w[ 9], w[ 8], selector); - w[46] = hc_byte_perm (w[ 8], w[ 7], selector); - w[45] = hc_byte_perm (w[ 7], w[ 6], selector); - w[44] = hc_byte_perm (w[ 6], w[ 5], selector); - w[43] = 
hc_byte_perm (w[ 5], w[ 4], selector); - w[42] = hc_byte_perm (w[ 4], w[ 3], selector); - w[41] = hc_byte_perm (w[ 3], w[ 2], selector); - w[40] = hc_byte_perm (w[ 2], w[ 1], selector); - w[39] = hc_byte_perm (w[ 1], w[ 0], selector); - w[38] = hc_byte_perm (w[ 0], 0, selector); + case 58: + w[63] = hc_byte_perm (w[ 5], w[ 4], selector); + w[62] = hc_byte_perm (w[ 4], w[ 3], selector); + w[61] = hc_byte_perm (w[ 3], w[ 2], selector); + w[60] = hc_byte_perm (w[ 2], w[ 1], selector); + w[59] = hc_byte_perm (w[ 1], w[ 0], selector); + w[58] = hc_byte_perm (w[ 0], 0, selector); + w[57] = 0; + w[56] = 0; + w[55] = 0; + w[54] = 0; + w[53] = 0; + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; @@ -28968,32 +33716,32 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; - case 39: - w[63] = hc_byte_perm (w[24], w[23], selector); - w[62] = hc_byte_perm (w[23], w[22], selector); - w[61] = hc_byte_perm (w[22], w[21], selector); - w[60] = hc_byte_perm (w[21], w[20], selector); - w[59] = hc_byte_perm (w[20], w[19], selector); - w[58] = hc_byte_perm (w[19], w[18], selector); - w[57] = hc_byte_perm (w[18], w[17], selector); - w[56] = hc_byte_perm (w[17], w[16], selector); - w[55] = hc_byte_perm (w[16], w[15], selector); - w[54] = hc_byte_perm (w[15], w[14], selector); - w[53] = hc_byte_perm (w[14], w[13], selector); - w[52] = hc_byte_perm (w[13], w[12], selector); - w[51] = hc_byte_perm (w[12], w[11], selector); - w[50] = hc_byte_perm (w[11], w[10], selector); - w[49] = hc_byte_perm (w[10], w[ 9], selector); - w[48] = hc_byte_perm (w[ 9], w[ 8], selector); - w[47] = hc_byte_perm (w[ 8], w[ 7], selector); - w[46] = hc_byte_perm (w[ 7], w[ 6], selector); - w[45] = hc_byte_perm (w[ 6], w[ 5], selector); - w[44] = hc_byte_perm (w[ 5], w[ 4], selector); - w[43] = hc_byte_perm (w[ 
4], w[ 3], selector); - w[42] = hc_byte_perm (w[ 3], w[ 2], selector); - w[41] = hc_byte_perm (w[ 2], w[ 1], selector); - w[40] = hc_byte_perm (w[ 1], w[ 0], selector); - w[39] = hc_byte_perm (w[ 0], 0, selector); + case 59: + w[63] = hc_byte_perm (w[ 4], w[ 3], selector); + w[62] = hc_byte_perm (w[ 3], w[ 2], selector); + w[61] = hc_byte_perm (w[ 2], w[ 1], selector); + w[60] = hc_byte_perm (w[ 1], w[ 0], selector); + w[59] = hc_byte_perm (w[ 0], 0, selector); + w[58] = 0; + w[57] = 0; + w[56] = 0; + w[55] = 0; + w[54] = 0; + w[53] = 0; + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; @@ -29036,31 +33784,31 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; - case 40: - w[63] = hc_byte_perm (w[23], w[22], selector); - w[62] = hc_byte_perm (w[22], w[21], selector); - w[61] = hc_byte_perm (w[21], w[20], selector); - w[60] = hc_byte_perm (w[20], w[19], selector); - w[59] = hc_byte_perm (w[19], w[18], selector); - w[58] = hc_byte_perm (w[18], w[17], selector); - w[57] = hc_byte_perm (w[17], w[16], selector); - w[56] = hc_byte_perm (w[16], w[15], selector); - w[55] = hc_byte_perm (w[15], w[14], selector); - w[54] = hc_byte_perm (w[14], w[13], selector); - w[53] = hc_byte_perm (w[13], w[12], selector); - w[52] = hc_byte_perm (w[12], w[11], selector); - w[51] = hc_byte_perm (w[11], w[10], selector); - w[50] = hc_byte_perm (w[10], w[ 9], selector); - w[49] = hc_byte_perm (w[ 9], w[ 8], selector); - w[48] = hc_byte_perm (w[ 8], w[ 7], selector); - w[47] = hc_byte_perm (w[ 7], w[ 6], selector); - w[46] = hc_byte_perm (w[ 6], w[ 5], selector); - w[45] = hc_byte_perm (w[ 5], w[ 4], selector); - w[44] = hc_byte_perm (w[ 4], w[ 3], selector); - w[43] = hc_byte_perm (w[ 3], w[ 2], selector); - w[42] = hc_byte_perm (w[ 2], w[ 1], selector); - w[41] = hc_byte_perm (w[ 1], w[ 0], 
selector); - w[40] = hc_byte_perm (w[ 0], 0, selector); + case 60: + w[63] = hc_byte_perm (w[ 3], w[ 2], selector); + w[62] = hc_byte_perm (w[ 2], w[ 1], selector); + w[61] = hc_byte_perm (w[ 1], w[ 0], selector); + w[60] = hc_byte_perm (w[ 0], 0, selector); + w[59] = 0; + w[58] = 0; + w[57] = 0; + w[56] = 0; + w[55] = 0; + w[54] = 0; + w[53] = 0; + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; @@ -29104,30 +33852,30 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; - case 41: - w[63] = hc_byte_perm (w[22], w[21], selector); - w[62] = hc_byte_perm (w[21], w[20], selector); - w[61] = hc_byte_perm (w[20], w[19], selector); - w[60] = hc_byte_perm (w[19], w[18], selector); - w[59] = hc_byte_perm (w[18], w[17], selector); - w[58] = hc_byte_perm (w[17], w[16], selector); - w[57] = hc_byte_perm (w[16], w[15], selector); - w[56] = hc_byte_perm (w[15], w[14], selector); - w[55] = hc_byte_perm (w[14], w[13], selector); - w[54] = hc_byte_perm (w[13], w[12], selector); - w[53] = hc_byte_perm (w[12], w[11], selector); - w[52] = hc_byte_perm (w[11], w[10], selector); - w[51] = hc_byte_perm (w[10], w[ 9], selector); - w[50] = hc_byte_perm (w[ 9], w[ 8], selector); - w[49] = hc_byte_perm (w[ 8], w[ 7], selector); - w[48] = hc_byte_perm (w[ 7], w[ 6], selector); - w[47] = hc_byte_perm (w[ 6], w[ 5], selector); - w[46] = hc_byte_perm (w[ 5], w[ 4], selector); - w[45] = hc_byte_perm (w[ 4], w[ 3], selector); - w[44] = hc_byte_perm (w[ 3], w[ 2], selector); - w[43] = hc_byte_perm (w[ 2], w[ 1], selector); - w[42] = hc_byte_perm (w[ 1], w[ 0], selector); - w[41] = hc_byte_perm (w[ 0], 0, selector); + case 61: + w[63] = hc_byte_perm (w[ 2], w[ 1], selector); + w[62] = hc_byte_perm (w[ 1], w[ 0], selector); + w[61] = hc_byte_perm (w[ 0], 0, selector); + w[60] = 0; + w[59] = 0; + w[58] = 0; + w[57] = 0; 
+ w[56] = 0; + w[55] = 0; + w[54] = 0; + w[53] = 0; + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; w[40] = 0; w[39] = 0; w[38] = 0; @@ -29172,29 +33920,29 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; - case 42: - w[63] = hc_byte_perm (w[21], w[20], selector); - w[62] = hc_byte_perm (w[20], w[19], selector); - w[61] = hc_byte_perm (w[19], w[18], selector); - w[60] = hc_byte_perm (w[18], w[17], selector); - w[59] = hc_byte_perm (w[17], w[16], selector); - w[58] = hc_byte_perm (w[16], w[15], selector); - w[57] = hc_byte_perm (w[15], w[14], selector); - w[56] = hc_byte_perm (w[14], w[13], selector); - w[55] = hc_byte_perm (w[13], w[12], selector); - w[54] = hc_byte_perm (w[12], w[11], selector); - w[53] = hc_byte_perm (w[11], w[10], selector); - w[52] = hc_byte_perm (w[10], w[ 9], selector); - w[51] = hc_byte_perm (w[ 9], w[ 8], selector); - w[50] = hc_byte_perm (w[ 8], w[ 7], selector); - w[49] = hc_byte_perm (w[ 7], w[ 6], selector); - w[48] = hc_byte_perm (w[ 6], w[ 5], selector); - w[47] = hc_byte_perm (w[ 5], w[ 4], selector); - w[46] = hc_byte_perm (w[ 4], w[ 3], selector); - w[45] = hc_byte_perm (w[ 3], w[ 2], selector); - w[44] = hc_byte_perm (w[ 2], w[ 1], selector); - w[43] = hc_byte_perm (w[ 1], w[ 0], selector); - w[42] = hc_byte_perm (w[ 0], 0, selector); + case 62: + w[63] = hc_byte_perm (w[ 1], w[ 0], selector); + w[62] = hc_byte_perm (w[ 0], 0, selector); + w[61] = 0; + w[60] = 0; + w[59] = 0; + w[58] = 0; + w[57] = 0; + w[56] = 0; + w[55] = 0; + w[54] = 0; + w[53] = 0; + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; w[41] = 0; w[40] = 0; w[39] = 0; @@ -29240,28 +33988,28 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) break; - case 43: - w[63] = hc_byte_perm (w[20], w[19], selector); - 
w[62] = hc_byte_perm (w[19], w[18], selector); - w[61] = hc_byte_perm (w[18], w[17], selector); - w[60] = hc_byte_perm (w[17], w[16], selector); - w[59] = hc_byte_perm (w[16], w[15], selector); - w[58] = hc_byte_perm (w[15], w[14], selector); - w[57] = hc_byte_perm (w[14], w[13], selector); - w[56] = hc_byte_perm (w[13], w[12], selector); - w[55] = hc_byte_perm (w[12], w[11], selector); - w[54] = hc_byte_perm (w[11], w[10], selector); - w[53] = hc_byte_perm (w[10], w[ 9], selector); - w[52] = hc_byte_perm (w[ 9], w[ 8], selector); - w[51] = hc_byte_perm (w[ 8], w[ 7], selector); - w[50] = hc_byte_perm (w[ 7], w[ 6], selector); - w[49] = hc_byte_perm (w[ 6], w[ 5], selector); - w[48] = hc_byte_perm (w[ 5], w[ 4], selector); - w[47] = hc_byte_perm (w[ 4], w[ 3], selector); - w[46] = hc_byte_perm (w[ 3], w[ 2], selector); - w[45] = hc_byte_perm (w[ 2], w[ 1], selector); - w[44] = hc_byte_perm (w[ 1], w[ 0], selector); - w[43] = hc_byte_perm (w[ 0], 0, selector); + case 63: + w[63] = hc_byte_perm (w[ 0], 0, selector); + w[62] = 0; + w[61] = 0; + w[60] = 0; + w[59] = 0; + w[58] = 0; + w[57] = 0; + w[56] = 0; + w[55] = 0; + w[54] = 0; + w[53] = 0; + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; w[42] = 0; w[41] = 0; w[40] = 0; @@ -29306,1596 +34054,1042 @@ DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset) w[ 1] = 0; w[ 0] = 0; + break; + } + #endif +} + +/** + * vector functions as scalar (for outer loop usage) + */ + +DECLSPEC void truncate_block_4x4_le_S (u32 *w0, const u32 len) +{ + switch (len) + { + case 0: + w0[0] = 0; + w0[1] = 0; + w0[2] = 0; + w0[3] = 0; + + break; + + case 1: + w0[0] &= 0x000000ff; + w0[1] = 0; + w0[2] = 0; + w0[3] = 0; + + break; + + case 2: + w0[0] &= 0x0000ffff; + w0[1] = 0; + w0[2] = 0; + w0[3] = 0; + + break; + + case 3: + w0[0] &= 0x00ffffff; + w0[1] = 0; + w0[2] = 0; + w0[3] = 0; + + break; + + case 4: + w0[1] = 0; + w0[2] = 0; + 
w0[3] = 0; + + break; + + case 5: + w0[1] &= 0x000000ff; + w0[2] = 0; + w0[3] = 0; + + break; + + case 6: + w0[1] &= 0x0000ffff; + w0[2] = 0; + w0[3] = 0; + + break; + + case 7: + w0[1] &= 0x00ffffff; + w0[2] = 0; + w0[3] = 0; + + break; + + case 8: + w0[2] = 0; + w0[3] = 0; + + break; + + case 9: + w0[2] &= 0x000000ff; + w0[3] = 0; + + break; + + case 10: + w0[2] &= 0x0000ffff; + w0[3] = 0; + + break; + + case 11: + w0[2] &= 0x00ffffff; + w0[3] = 0; + + break; + + case 12: + w0[3] = 0; + + break; + + case 13: + w0[3] &= 0x000000ff; + + break; + + case 14: + w0[3] &= 0x0000ffff; + + break; + + case 15: + w0[3] &= 0x00ffffff; + + break; + } +} + +DECLSPEC void truncate_block_4x4_be_S (u32 *w0, const u32 len) +{ + switch (len) + { + case 0: + w0[0] = 0; + w0[1] = 0; + w0[2] = 0; + w0[3] = 0; + + break; + + case 1: + w0[0] &= 0xff000000; + w0[1] = 0; + w0[2] = 0; + w0[3] = 0; + + break; + + case 2: + w0[0] &= 0xffff0000; + w0[1] = 0; + w0[2] = 0; + w0[3] = 0; + + break; + + case 3: + w0[0] &= 0xffffff00; + w0[1] = 0; + w0[2] = 0; + w0[3] = 0; + + break; + + case 4: + w0[1] = 0; + w0[2] = 0; + w0[3] = 0; + + break; + + case 5: + w0[1] &= 0xff000000; + w0[2] = 0; + w0[3] = 0; + + break; + + case 6: + w0[1] &= 0xffff0000; + w0[2] = 0; + w0[3] = 0; + + break; + + case 7: + w0[1] &= 0xffffff00; + w0[2] = 0; + w0[3] = 0; + + break; + + case 8: + w0[2] = 0; + w0[3] = 0; + + break; + + case 9: + w0[2] &= 0xff000000; + w0[3] = 0; + + break; + + case 10: + w0[2] &= 0xffff0000; + w0[3] = 0; + + break; + + case 11: + w0[2] &= 0xffffff00; + w0[3] = 0; + + break; + + case 12: + w0[3] = 0; + + break; + + case 13: + w0[3] &= 0xff000000; + + break; + + case 14: + w0[3] &= 0xffff0000; + + break; + + case 15: + w0[3] &= 0xffffff00; + + break; + } +} + +DECLSPEC void truncate_block_16x4_le_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, const u32 len) +{ + switch (len) + { + case 0: + w0[0] = 0; + w0[1] = 0; + w0[2] = 0; + w0[3] = 0; + w1[0] = 0; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 
0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + break; + + case 1: + w0[0] &= 0x000000ff; + w0[1] = 0; + w0[2] = 0; + w0[3] = 0; + w1[0] = 0; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + break; - case 44: - w[63] = hc_byte_perm (w[19], w[18], selector); - w[62] = hc_byte_perm (w[18], w[17], selector); - w[61] = hc_byte_perm (w[17], w[16], selector); - w[60] = hc_byte_perm (w[16], w[15], selector); - w[59] = hc_byte_perm (w[15], w[14], selector); - w[58] = hc_byte_perm (w[14], w[13], selector); - w[57] = hc_byte_perm (w[13], w[12], selector); - w[56] = hc_byte_perm (w[12], w[11], selector); - w[55] = hc_byte_perm (w[11], w[10], selector); - w[54] = hc_byte_perm (w[10], w[ 9], selector); - w[53] = hc_byte_perm (w[ 9], w[ 8], selector); - w[52] = hc_byte_perm (w[ 8], w[ 7], selector); - w[51] = hc_byte_perm (w[ 7], w[ 6], selector); - w[50] = hc_byte_perm (w[ 6], w[ 5], selector); - w[49] = hc_byte_perm (w[ 5], w[ 4], selector); - w[48] = hc_byte_perm (w[ 4], w[ 3], selector); - w[47] = hc_byte_perm (w[ 3], w[ 2], selector); - w[46] = hc_byte_perm (w[ 2], w[ 1], selector); - w[45] = hc_byte_perm (w[ 1], w[ 0], selector); - w[44] = hc_byte_perm (w[ 0], 0, selector); - w[43] = 0; - w[42] = 0; - w[41] = 0; - w[40] = 0; - w[39] = 0; - w[38] = 0; - w[37] = 0; - w[36] = 0; - w[35] = 0; - w[34] = 0; - w[33] = 0; - w[32] = 0; - w[31] = 0; - w[30] = 0; - w[29] = 0; - w[28] = 0; - w[27] = 0; - w[26] = 0; - w[25] = 0; - w[24] = 0; - w[23] = 0; - w[22] = 0; - w[21] = 0; - w[20] = 0; - w[19] = 0; - w[18] = 0; - w[17] = 0; - w[16] = 0; - w[15] = 0; - w[14] = 0; - w[13] = 0; - w[12] = 0; - w[11] = 0; - w[10] = 0; - w[ 9] = 0; - w[ 8] = 0; - w[ 7] = 0; - w[ 6] = 0; - w[ 5] = 0; - w[ 4] = 0; - w[ 3] = 0; - w[ 2] = 0; - w[ 1] = 0; - w[ 0] = 0; + case 2: + w0[0] &= 0x0000ffff; + w0[1] = 0; + w0[2] = 0; + w0[3] = 0; + w1[0] = 0; + w1[1] = 
0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; break; - case 45: - w[63] = hc_byte_perm (w[18], w[17], selector); - w[62] = hc_byte_perm (w[17], w[16], selector); - w[61] = hc_byte_perm (w[16], w[15], selector); - w[60] = hc_byte_perm (w[15], w[14], selector); - w[59] = hc_byte_perm (w[14], w[13], selector); - w[58] = hc_byte_perm (w[13], w[12], selector); - w[57] = hc_byte_perm (w[12], w[11], selector); - w[56] = hc_byte_perm (w[11], w[10], selector); - w[55] = hc_byte_perm (w[10], w[ 9], selector); - w[54] = hc_byte_perm (w[ 9], w[ 8], selector); - w[53] = hc_byte_perm (w[ 8], w[ 7], selector); - w[52] = hc_byte_perm (w[ 7], w[ 6], selector); - w[51] = hc_byte_perm (w[ 6], w[ 5], selector); - w[50] = hc_byte_perm (w[ 5], w[ 4], selector); - w[49] = hc_byte_perm (w[ 4], w[ 3], selector); - w[48] = hc_byte_perm (w[ 3], w[ 2], selector); - w[47] = hc_byte_perm (w[ 2], w[ 1], selector); - w[46] = hc_byte_perm (w[ 1], w[ 0], selector); - w[45] = hc_byte_perm (w[ 0], 0, selector); - w[44] = 0; - w[43] = 0; - w[42] = 0; - w[41] = 0; - w[40] = 0; - w[39] = 0; - w[38] = 0; - w[37] = 0; - w[36] = 0; - w[35] = 0; - w[34] = 0; - w[33] = 0; - w[32] = 0; - w[31] = 0; - w[30] = 0; - w[29] = 0; - w[28] = 0; - w[27] = 0; - w[26] = 0; - w[25] = 0; - w[24] = 0; - w[23] = 0; - w[22] = 0; - w[21] = 0; - w[20] = 0; - w[19] = 0; - w[18] = 0; - w[17] = 0; - w[16] = 0; - w[15] = 0; - w[14] = 0; - w[13] = 0; - w[12] = 0; - w[11] = 0; - w[10] = 0; - w[ 9] = 0; - w[ 8] = 0; - w[ 7] = 0; - w[ 6] = 0; - w[ 5] = 0; - w[ 4] = 0; - w[ 3] = 0; - w[ 2] = 0; - w[ 1] = 0; - w[ 0] = 0; + case 3: + w0[0] &= 0x00ffffff; + w0[1] = 0; + w0[2] = 0; + w0[3] = 0; + w1[0] = 0; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; break; - case 46: - w[63] = hc_byte_perm (w[17], w[16], selector); - w[62] = hc_byte_perm (w[16], w[15], 
selector); - w[61] = hc_byte_perm (w[15], w[14], selector); - w[60] = hc_byte_perm (w[14], w[13], selector); - w[59] = hc_byte_perm (w[13], w[12], selector); - w[58] = hc_byte_perm (w[12], w[11], selector); - w[57] = hc_byte_perm (w[11], w[10], selector); - w[56] = hc_byte_perm (w[10], w[ 9], selector); - w[55] = hc_byte_perm (w[ 9], w[ 8], selector); - w[54] = hc_byte_perm (w[ 8], w[ 7], selector); - w[53] = hc_byte_perm (w[ 7], w[ 6], selector); - w[52] = hc_byte_perm (w[ 6], w[ 5], selector); - w[51] = hc_byte_perm (w[ 5], w[ 4], selector); - w[50] = hc_byte_perm (w[ 4], w[ 3], selector); - w[49] = hc_byte_perm (w[ 3], w[ 2], selector); - w[48] = hc_byte_perm (w[ 2], w[ 1], selector); - w[47] = hc_byte_perm (w[ 1], w[ 0], selector); - w[46] = hc_byte_perm (w[ 0], 0, selector); - w[45] = 0; - w[44] = 0; - w[43] = 0; - w[42] = 0; - w[41] = 0; - w[40] = 0; - w[39] = 0; - w[38] = 0; - w[37] = 0; - w[36] = 0; - w[35] = 0; - w[34] = 0; - w[33] = 0; - w[32] = 0; - w[31] = 0; - w[30] = 0; - w[29] = 0; - w[28] = 0; - w[27] = 0; - w[26] = 0; - w[25] = 0; - w[24] = 0; - w[23] = 0; - w[22] = 0; - w[21] = 0; - w[20] = 0; - w[19] = 0; - w[18] = 0; - w[17] = 0; - w[16] = 0; - w[15] = 0; - w[14] = 0; - w[13] = 0; - w[12] = 0; - w[11] = 0; - w[10] = 0; - w[ 9] = 0; - w[ 8] = 0; - w[ 7] = 0; - w[ 6] = 0; - w[ 5] = 0; - w[ 4] = 0; - w[ 3] = 0; - w[ 2] = 0; - w[ 1] = 0; - w[ 0] = 0; + case 4: + w0[1] = 0; + w0[2] = 0; + w0[3] = 0; + w1[0] = 0; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; break; - case 47: - w[63] = hc_byte_perm (w[16], w[15], selector); - w[62] = hc_byte_perm (w[15], w[14], selector); - w[61] = hc_byte_perm (w[14], w[13], selector); - w[60] = hc_byte_perm (w[13], w[12], selector); - w[59] = hc_byte_perm (w[12], w[11], selector); - w[58] = hc_byte_perm (w[11], w[10], selector); - w[57] = hc_byte_perm (w[10], w[ 9], selector); - w[56] = hc_byte_perm (w[ 9], w[ 8], 
selector); - w[55] = hc_byte_perm (w[ 8], w[ 7], selector); - w[54] = hc_byte_perm (w[ 7], w[ 6], selector); - w[53] = hc_byte_perm (w[ 6], w[ 5], selector); - w[52] = hc_byte_perm (w[ 5], w[ 4], selector); - w[51] = hc_byte_perm (w[ 4], w[ 3], selector); - w[50] = hc_byte_perm (w[ 3], w[ 2], selector); - w[49] = hc_byte_perm (w[ 2], w[ 1], selector); - w[48] = hc_byte_perm (w[ 1], w[ 0], selector); - w[47] = hc_byte_perm (w[ 0], 0, selector); - w[46] = 0; - w[45] = 0; - w[44] = 0; - w[43] = 0; - w[42] = 0; - w[41] = 0; - w[40] = 0; - w[39] = 0; - w[38] = 0; - w[37] = 0; - w[36] = 0; - w[35] = 0; - w[34] = 0; - w[33] = 0; - w[32] = 0; - w[31] = 0; - w[30] = 0; - w[29] = 0; - w[28] = 0; - w[27] = 0; - w[26] = 0; - w[25] = 0; - w[24] = 0; - w[23] = 0; - w[22] = 0; - w[21] = 0; - w[20] = 0; - w[19] = 0; - w[18] = 0; - w[17] = 0; - w[16] = 0; - w[15] = 0; - w[14] = 0; - w[13] = 0; - w[12] = 0; - w[11] = 0; - w[10] = 0; - w[ 9] = 0; - w[ 8] = 0; - w[ 7] = 0; - w[ 6] = 0; - w[ 5] = 0; - w[ 4] = 0; - w[ 3] = 0; - w[ 2] = 0; - w[ 1] = 0; - w[ 0] = 0; + case 5: + w0[1] &= 0x000000ff; + w0[2] = 0; + w0[3] = 0; + w1[0] = 0; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; break; - case 48: - w[63] = hc_byte_perm (w[15], w[14], selector); - w[62] = hc_byte_perm (w[14], w[13], selector); - w[61] = hc_byte_perm (w[13], w[12], selector); - w[60] = hc_byte_perm (w[12], w[11], selector); - w[59] = hc_byte_perm (w[11], w[10], selector); - w[58] = hc_byte_perm (w[10], w[ 9], selector); - w[57] = hc_byte_perm (w[ 9], w[ 8], selector); - w[56] = hc_byte_perm (w[ 8], w[ 7], selector); - w[55] = hc_byte_perm (w[ 7], w[ 6], selector); - w[54] = hc_byte_perm (w[ 6], w[ 5], selector); - w[53] = hc_byte_perm (w[ 5], w[ 4], selector); - w[52] = hc_byte_perm (w[ 4], w[ 3], selector); - w[51] = hc_byte_perm (w[ 3], w[ 2], selector); - w[50] = hc_byte_perm (w[ 2], w[ 1], selector); - w[49] = 
hc_byte_perm (w[ 1], w[ 0], selector); - w[48] = hc_byte_perm (w[ 0], 0, selector); - w[47] = 0; - w[46] = 0; - w[45] = 0; - w[44] = 0; - w[43] = 0; - w[42] = 0; - w[41] = 0; - w[40] = 0; - w[39] = 0; - w[38] = 0; - w[37] = 0; - w[36] = 0; - w[35] = 0; - w[34] = 0; - w[33] = 0; - w[32] = 0; - w[31] = 0; - w[30] = 0; - w[29] = 0; - w[28] = 0; - w[27] = 0; - w[26] = 0; - w[25] = 0; - w[24] = 0; - w[23] = 0; - w[22] = 0; - w[21] = 0; - w[20] = 0; - w[19] = 0; - w[18] = 0; - w[17] = 0; - w[16] = 0; - w[15] = 0; - w[14] = 0; - w[13] = 0; - w[12] = 0; - w[11] = 0; - w[10] = 0; - w[ 9] = 0; - w[ 8] = 0; - w[ 7] = 0; - w[ 6] = 0; - w[ 5] = 0; - w[ 4] = 0; - w[ 3] = 0; - w[ 2] = 0; - w[ 1] = 0; - w[ 0] = 0; + case 6: + w0[1] &= 0x0000ffff; + w0[2] = 0; + w0[3] = 0; + w1[0] = 0; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; break; - case 49: - w[63] = hc_byte_perm (w[14], w[13], selector); - w[62] = hc_byte_perm (w[13], w[12], selector); - w[61] = hc_byte_perm (w[12], w[11], selector); - w[60] = hc_byte_perm (w[11], w[10], selector); - w[59] = hc_byte_perm (w[10], w[ 9], selector); - w[58] = hc_byte_perm (w[ 9], w[ 8], selector); - w[57] = hc_byte_perm (w[ 8], w[ 7], selector); - w[56] = hc_byte_perm (w[ 7], w[ 6], selector); - w[55] = hc_byte_perm (w[ 6], w[ 5], selector); - w[54] = hc_byte_perm (w[ 5], w[ 4], selector); - w[53] = hc_byte_perm (w[ 4], w[ 3], selector); - w[52] = hc_byte_perm (w[ 3], w[ 2], selector); - w[51] = hc_byte_perm (w[ 2], w[ 1], selector); - w[50] = hc_byte_perm (w[ 1], w[ 0], selector); - w[49] = hc_byte_perm (w[ 0], 0, selector); - w[48] = 0; - w[47] = 0; - w[46] = 0; - w[45] = 0; - w[44] = 0; - w[43] = 0; - w[42] = 0; - w[41] = 0; - w[40] = 0; - w[39] = 0; - w[38] = 0; - w[37] = 0; - w[36] = 0; - w[35] = 0; - w[34] = 0; - w[33] = 0; - w[32] = 0; - w[31] = 0; - w[30] = 0; - w[29] = 0; - w[28] = 0; - w[27] = 0; - w[26] = 0; - w[25] = 0; - w[24] = 0; - 
w[23] = 0; - w[22] = 0; - w[21] = 0; - w[20] = 0; - w[19] = 0; - w[18] = 0; - w[17] = 0; - w[16] = 0; - w[15] = 0; - w[14] = 0; - w[13] = 0; - w[12] = 0; - w[11] = 0; - w[10] = 0; - w[ 9] = 0; - w[ 8] = 0; - w[ 7] = 0; - w[ 6] = 0; - w[ 5] = 0; - w[ 4] = 0; - w[ 3] = 0; - w[ 2] = 0; - w[ 1] = 0; - w[ 0] = 0; + case 7: + w0[1] &= 0x00ffffff; + w0[2] = 0; + w0[3] = 0; + w1[0] = 0; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + break; + + case 8: + w0[2] = 0; + w0[3] = 0; + w1[0] = 0; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; break; - case 50: - w[63] = hc_byte_perm (w[13], w[12], selector); - w[62] = hc_byte_perm (w[12], w[11], selector); - w[61] = hc_byte_perm (w[11], w[10], selector); - w[60] = hc_byte_perm (w[10], w[ 9], selector); - w[59] = hc_byte_perm (w[ 9], w[ 8], selector); - w[58] = hc_byte_perm (w[ 8], w[ 7], selector); - w[57] = hc_byte_perm (w[ 7], w[ 6], selector); - w[56] = hc_byte_perm (w[ 6], w[ 5], selector); - w[55] = hc_byte_perm (w[ 5], w[ 4], selector); - w[54] = hc_byte_perm (w[ 4], w[ 3], selector); - w[53] = hc_byte_perm (w[ 3], w[ 2], selector); - w[52] = hc_byte_perm (w[ 2], w[ 1], selector); - w[51] = hc_byte_perm (w[ 1], w[ 0], selector); - w[50] = hc_byte_perm (w[ 0], 0, selector); - w[49] = 0; - w[48] = 0; - w[47] = 0; - w[46] = 0; - w[45] = 0; - w[44] = 0; - w[43] = 0; - w[42] = 0; - w[41] = 0; - w[40] = 0; - w[39] = 0; - w[38] = 0; - w[37] = 0; - w[36] = 0; - w[35] = 0; - w[34] = 0; - w[33] = 0; - w[32] = 0; - w[31] = 0; - w[30] = 0; - w[29] = 0; - w[28] = 0; - w[27] = 0; - w[26] = 0; - w[25] = 0; - w[24] = 0; - w[23] = 0; - w[22] = 0; - w[21] = 0; - w[20] = 0; - w[19] = 0; - w[18] = 0; - w[17] = 0; - w[16] = 0; - w[15] = 0; - w[14] = 0; - w[13] = 0; - w[12] = 0; - w[11] = 0; - w[10] = 0; - w[ 9] = 0; - w[ 8] = 0; - w[ 7] = 0; - w[ 6] = 
0; - w[ 5] = 0; - w[ 4] = 0; - w[ 3] = 0; - w[ 2] = 0; - w[ 1] = 0; - w[ 0] = 0; + case 9: + w0[2] &= 0x000000ff; + w0[3] = 0; + w1[0] = 0; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; break; - case 51: - w[63] = hc_byte_perm (w[12], w[11], selector); - w[62] = hc_byte_perm (w[11], w[10], selector); - w[61] = hc_byte_perm (w[10], w[ 9], selector); - w[60] = hc_byte_perm (w[ 9], w[ 8], selector); - w[59] = hc_byte_perm (w[ 8], w[ 7], selector); - w[58] = hc_byte_perm (w[ 7], w[ 6], selector); - w[57] = hc_byte_perm (w[ 6], w[ 5], selector); - w[56] = hc_byte_perm (w[ 5], w[ 4], selector); - w[55] = hc_byte_perm (w[ 4], w[ 3], selector); - w[54] = hc_byte_perm (w[ 3], w[ 2], selector); - w[53] = hc_byte_perm (w[ 2], w[ 1], selector); - w[52] = hc_byte_perm (w[ 1], w[ 0], selector); - w[51] = hc_byte_perm (w[ 0], 0, selector); - w[50] = 0; - w[49] = 0; - w[48] = 0; - w[47] = 0; - w[46] = 0; - w[45] = 0; - w[44] = 0; - w[43] = 0; - w[42] = 0; - w[41] = 0; - w[40] = 0; - w[39] = 0; - w[38] = 0; - w[37] = 0; - w[36] = 0; - w[35] = 0; - w[34] = 0; - w[33] = 0; - w[32] = 0; - w[31] = 0; - w[30] = 0; - w[29] = 0; - w[28] = 0; - w[27] = 0; - w[26] = 0; - w[25] = 0; - w[24] = 0; - w[23] = 0; - w[22] = 0; - w[21] = 0; - w[20] = 0; - w[19] = 0; - w[18] = 0; - w[17] = 0; - w[16] = 0; - w[15] = 0; - w[14] = 0; - w[13] = 0; - w[12] = 0; - w[11] = 0; - w[10] = 0; - w[ 9] = 0; - w[ 8] = 0; - w[ 7] = 0; - w[ 6] = 0; - w[ 5] = 0; - w[ 4] = 0; - w[ 3] = 0; - w[ 2] = 0; - w[ 1] = 0; - w[ 0] = 0; + case 10: + w0[2] &= 0x0000ffff; + w0[3] = 0; + w1[0] = 0; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; break; - case 52: - w[63] = hc_byte_perm (w[11], w[10], selector); - w[62] = hc_byte_perm (w[10], w[ 9], selector); - w[61] = hc_byte_perm (w[ 9], w[ 8], selector); - w[60] = hc_byte_perm (w[ 8], w[ 
7], selector); - w[59] = hc_byte_perm (w[ 7], w[ 6], selector); - w[58] = hc_byte_perm (w[ 6], w[ 5], selector); - w[57] = hc_byte_perm (w[ 5], w[ 4], selector); - w[56] = hc_byte_perm (w[ 4], w[ 3], selector); - w[55] = hc_byte_perm (w[ 3], w[ 2], selector); - w[54] = hc_byte_perm (w[ 2], w[ 1], selector); - w[53] = hc_byte_perm (w[ 1], w[ 0], selector); - w[52] = hc_byte_perm (w[ 0], 0, selector); - w[51] = 0; - w[50] = 0; - w[49] = 0; - w[48] = 0; - w[47] = 0; - w[46] = 0; - w[45] = 0; - w[44] = 0; - w[43] = 0; - w[42] = 0; - w[41] = 0; - w[40] = 0; - w[39] = 0; - w[38] = 0; - w[37] = 0; - w[36] = 0; - w[35] = 0; - w[34] = 0; - w[33] = 0; - w[32] = 0; - w[31] = 0; - w[30] = 0; - w[29] = 0; - w[28] = 0; - w[27] = 0; - w[26] = 0; - w[25] = 0; - w[24] = 0; - w[23] = 0; - w[22] = 0; - w[21] = 0; - w[20] = 0; - w[19] = 0; - w[18] = 0; - w[17] = 0; - w[16] = 0; - w[15] = 0; - w[14] = 0; - w[13] = 0; - w[12] = 0; - w[11] = 0; - w[10] = 0; - w[ 9] = 0; - w[ 8] = 0; - w[ 7] = 0; - w[ 6] = 0; - w[ 5] = 0; - w[ 4] = 0; - w[ 3] = 0; - w[ 2] = 0; - w[ 1] = 0; - w[ 0] = 0; + case 11: + w0[2] &= 0x00ffffff; + w0[3] = 0; + w1[0] = 0; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + break; + + case 12: + w0[3] = 0; + w1[0] = 0; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; break; - case 53: - w[63] = hc_byte_perm (w[10], w[ 9], selector); - w[62] = hc_byte_perm (w[ 9], w[ 8], selector); - w[61] = hc_byte_perm (w[ 8], w[ 7], selector); - w[60] = hc_byte_perm (w[ 7], w[ 6], selector); - w[59] = hc_byte_perm (w[ 6], w[ 5], selector); - w[58] = hc_byte_perm (w[ 5], w[ 4], selector); - w[57] = hc_byte_perm (w[ 4], w[ 3], selector); - w[56] = hc_byte_perm (w[ 3], w[ 2], selector); - w[55] = hc_byte_perm (w[ 2], w[ 1], selector); - w[54] = hc_byte_perm (w[ 1], w[ 0], selector); - w[53] = 
hc_byte_perm (w[ 0], 0, selector); - w[52] = 0; - w[51] = 0; - w[50] = 0; - w[49] = 0; - w[48] = 0; - w[47] = 0; - w[46] = 0; - w[45] = 0; - w[44] = 0; - w[43] = 0; - w[42] = 0; - w[41] = 0; - w[40] = 0; - w[39] = 0; - w[38] = 0; - w[37] = 0; - w[36] = 0; - w[35] = 0; - w[34] = 0; - w[33] = 0; - w[32] = 0; - w[31] = 0; - w[30] = 0; - w[29] = 0; - w[28] = 0; - w[27] = 0; - w[26] = 0; - w[25] = 0; - w[24] = 0; - w[23] = 0; - w[22] = 0; - w[21] = 0; - w[20] = 0; - w[19] = 0; - w[18] = 0; - w[17] = 0; - w[16] = 0; - w[15] = 0; - w[14] = 0; - w[13] = 0; - w[12] = 0; - w[11] = 0; - w[10] = 0; - w[ 9] = 0; - w[ 8] = 0; - w[ 7] = 0; - w[ 6] = 0; - w[ 5] = 0; - w[ 4] = 0; - w[ 3] = 0; - w[ 2] = 0; - w[ 1] = 0; - w[ 0] = 0; + case 13: + w0[3] &= 0x000000ff; + w1[0] = 0; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; break; - case 54: - w[63] = hc_byte_perm (w[ 9], w[ 8], selector); - w[62] = hc_byte_perm (w[ 8], w[ 7], selector); - w[61] = hc_byte_perm (w[ 7], w[ 6], selector); - w[60] = hc_byte_perm (w[ 6], w[ 5], selector); - w[59] = hc_byte_perm (w[ 5], w[ 4], selector); - w[58] = hc_byte_perm (w[ 4], w[ 3], selector); - w[57] = hc_byte_perm (w[ 3], w[ 2], selector); - w[56] = hc_byte_perm (w[ 2], w[ 1], selector); - w[55] = hc_byte_perm (w[ 1], w[ 0], selector); - w[54] = hc_byte_perm (w[ 0], 0, selector); - w[53] = 0; - w[52] = 0; - w[51] = 0; - w[50] = 0; - w[49] = 0; - w[48] = 0; - w[47] = 0; - w[46] = 0; - w[45] = 0; - w[44] = 0; - w[43] = 0; - w[42] = 0; - w[41] = 0; - w[40] = 0; - w[39] = 0; - w[38] = 0; - w[37] = 0; - w[36] = 0; - w[35] = 0; - w[34] = 0; - w[33] = 0; - w[32] = 0; - w[31] = 0; - w[30] = 0; - w[29] = 0; - w[28] = 0; - w[27] = 0; - w[26] = 0; - w[25] = 0; - w[24] = 0; - w[23] = 0; - w[22] = 0; - w[21] = 0; - w[20] = 0; - w[19] = 0; - w[18] = 0; - w[17] = 0; - w[16] = 0; - w[15] = 0; - w[14] = 0; - w[13] = 0; - w[12] = 0; - w[11] = 0; - w[10] = 0; - w[ 9] = 
0; - w[ 8] = 0; - w[ 7] = 0; - w[ 6] = 0; - w[ 5] = 0; - w[ 4] = 0; - w[ 3] = 0; - w[ 2] = 0; - w[ 1] = 0; - w[ 0] = 0; + case 14: + w0[3] &= 0x0000ffff; + w1[0] = 0; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; break; - case 55: - w[63] = hc_byte_perm (w[ 8], w[ 7], selector); - w[62] = hc_byte_perm (w[ 7], w[ 6], selector); - w[61] = hc_byte_perm (w[ 6], w[ 5], selector); - w[60] = hc_byte_perm (w[ 5], w[ 4], selector); - w[59] = hc_byte_perm (w[ 4], w[ 3], selector); - w[58] = hc_byte_perm (w[ 3], w[ 2], selector); - w[57] = hc_byte_perm (w[ 2], w[ 1], selector); - w[56] = hc_byte_perm (w[ 1], w[ 0], selector); - w[55] = hc_byte_perm (w[ 0], 0, selector); - w[54] = 0; - w[53] = 0; - w[52] = 0; - w[51] = 0; - w[50] = 0; - w[49] = 0; - w[48] = 0; - w[47] = 0; - w[46] = 0; - w[45] = 0; - w[44] = 0; - w[43] = 0; - w[42] = 0; - w[41] = 0; - w[40] = 0; - w[39] = 0; - w[38] = 0; - w[37] = 0; - w[36] = 0; - w[35] = 0; - w[34] = 0; - w[33] = 0; - w[32] = 0; - w[31] = 0; - w[30] = 0; - w[29] = 0; - w[28] = 0; - w[27] = 0; - w[26] = 0; - w[25] = 0; - w[24] = 0; - w[23] = 0; - w[22] = 0; - w[21] = 0; - w[20] = 0; - w[19] = 0; - w[18] = 0; - w[17] = 0; - w[16] = 0; - w[15] = 0; - w[14] = 0; - w[13] = 0; - w[12] = 0; - w[11] = 0; - w[10] = 0; - w[ 9] = 0; - w[ 8] = 0; - w[ 7] = 0; - w[ 6] = 0; - w[ 5] = 0; - w[ 4] = 0; - w[ 3] = 0; - w[ 2] = 0; - w[ 1] = 0; - w[ 0] = 0; + case 15: + w0[3] &= 0x00ffffff; + w1[0] = 0; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + break; + + case 16: + w1[0] = 0; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; break; - case 56: - w[63] = hc_byte_perm (w[ 7], w[ 6], selector); - w[62] = hc_byte_perm (w[ 6], w[ 5], selector); - w[61] = hc_byte_perm (w[ 5], w[ 
4], selector); - w[60] = hc_byte_perm (w[ 4], w[ 3], selector); - w[59] = hc_byte_perm (w[ 3], w[ 2], selector); - w[58] = hc_byte_perm (w[ 2], w[ 1], selector); - w[57] = hc_byte_perm (w[ 1], w[ 0], selector); - w[56] = hc_byte_perm (w[ 0], 0, selector); - w[55] = 0; - w[54] = 0; - w[53] = 0; - w[52] = 0; - w[51] = 0; - w[50] = 0; - w[49] = 0; - w[48] = 0; - w[47] = 0; - w[46] = 0; - w[45] = 0; - w[44] = 0; - w[43] = 0; - w[42] = 0; - w[41] = 0; - w[40] = 0; - w[39] = 0; - w[38] = 0; - w[37] = 0; - w[36] = 0; - w[35] = 0; - w[34] = 0; - w[33] = 0; - w[32] = 0; - w[31] = 0; - w[30] = 0; - w[29] = 0; - w[28] = 0; - w[27] = 0; - w[26] = 0; - w[25] = 0; - w[24] = 0; - w[23] = 0; - w[22] = 0; - w[21] = 0; - w[20] = 0; - w[19] = 0; - w[18] = 0; - w[17] = 0; - w[16] = 0; - w[15] = 0; - w[14] = 0; - w[13] = 0; - w[12] = 0; - w[11] = 0; - w[10] = 0; - w[ 9] = 0; - w[ 8] = 0; - w[ 7] = 0; - w[ 6] = 0; - w[ 5] = 0; - w[ 4] = 0; - w[ 3] = 0; - w[ 2] = 0; - w[ 1] = 0; - w[ 0] = 0; + case 17: + w1[0] &= 0x000000ff; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; break; - case 57: - w[63] = hc_byte_perm (w[ 6], w[ 5], selector); - w[62] = hc_byte_perm (w[ 5], w[ 4], selector); - w[61] = hc_byte_perm (w[ 4], w[ 3], selector); - w[60] = hc_byte_perm (w[ 3], w[ 2], selector); - w[59] = hc_byte_perm (w[ 2], w[ 1], selector); - w[58] = hc_byte_perm (w[ 1], w[ 0], selector); - w[57] = hc_byte_perm (w[ 0], 0, selector); - w[56] = 0; - w[55] = 0; - w[54] = 0; - w[53] = 0; - w[52] = 0; - w[51] = 0; - w[50] = 0; - w[49] = 0; - w[48] = 0; - w[47] = 0; - w[46] = 0; - w[45] = 0; - w[44] = 0; - w[43] = 0; - w[42] = 0; - w[41] = 0; - w[40] = 0; - w[39] = 0; - w[38] = 0; - w[37] = 0; - w[36] = 0; - w[35] = 0; - w[34] = 0; - w[33] = 0; - w[32] = 0; - w[31] = 0; - w[30] = 0; - w[29] = 0; - w[28] = 0; - w[27] = 0; - w[26] = 0; - w[25] = 0; - w[24] = 0; - w[23] = 0; - w[22] = 0; - w[21] = 0; - w[20] = 0; 
- w[19] = 0; - w[18] = 0; - w[17] = 0; - w[16] = 0; - w[15] = 0; - w[14] = 0; - w[13] = 0; - w[12] = 0; - w[11] = 0; - w[10] = 0; - w[ 9] = 0; - w[ 8] = 0; - w[ 7] = 0; - w[ 6] = 0; - w[ 5] = 0; - w[ 4] = 0; - w[ 3] = 0; - w[ 2] = 0; - w[ 1] = 0; - w[ 0] = 0; + case 18: + w1[0] &= 0x0000ffff; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; break; - case 58: - w[63] = hc_byte_perm (w[ 5], w[ 4], selector); - w[62] = hc_byte_perm (w[ 4], w[ 3], selector); - w[61] = hc_byte_perm (w[ 3], w[ 2], selector); - w[60] = hc_byte_perm (w[ 2], w[ 1], selector); - w[59] = hc_byte_perm (w[ 1], w[ 0], selector); - w[58] = hc_byte_perm (w[ 0], 0, selector); - w[57] = 0; - w[56] = 0; - w[55] = 0; - w[54] = 0; - w[53] = 0; - w[52] = 0; - w[51] = 0; - w[50] = 0; - w[49] = 0; - w[48] = 0; - w[47] = 0; - w[46] = 0; - w[45] = 0; - w[44] = 0; - w[43] = 0; - w[42] = 0; - w[41] = 0; - w[40] = 0; - w[39] = 0; - w[38] = 0; - w[37] = 0; - w[36] = 0; - w[35] = 0; - w[34] = 0; - w[33] = 0; - w[32] = 0; - w[31] = 0; - w[30] = 0; - w[29] = 0; - w[28] = 0; - w[27] = 0; - w[26] = 0; - w[25] = 0; - w[24] = 0; - w[23] = 0; - w[22] = 0; - w[21] = 0; - w[20] = 0; - w[19] = 0; - w[18] = 0; - w[17] = 0; - w[16] = 0; - w[15] = 0; - w[14] = 0; - w[13] = 0; - w[12] = 0; - w[11] = 0; - w[10] = 0; - w[ 9] = 0; - w[ 8] = 0; - w[ 7] = 0; - w[ 6] = 0; - w[ 5] = 0; - w[ 4] = 0; - w[ 3] = 0; - w[ 2] = 0; - w[ 1] = 0; - w[ 0] = 0; + case 19: + w1[0] &= 0x00ffffff; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + break; + + case 20: + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; break; - case 59: - w[63] = hc_byte_perm (w[ 4], w[ 3], selector); - w[62] = hc_byte_perm (w[ 3], w[ 2], selector); - w[61] = hc_byte_perm (w[ 2], w[ 1], 
selector); - w[60] = hc_byte_perm (w[ 1], w[ 0], selector); - w[59] = hc_byte_perm (w[ 0], 0, selector); - w[58] = 0; - w[57] = 0; - w[56] = 0; - w[55] = 0; - w[54] = 0; - w[53] = 0; - w[52] = 0; - w[51] = 0; - w[50] = 0; - w[49] = 0; - w[48] = 0; - w[47] = 0; - w[46] = 0; - w[45] = 0; - w[44] = 0; - w[43] = 0; - w[42] = 0; - w[41] = 0; - w[40] = 0; - w[39] = 0; - w[38] = 0; - w[37] = 0; - w[36] = 0; - w[35] = 0; - w[34] = 0; - w[33] = 0; - w[32] = 0; - w[31] = 0; - w[30] = 0; - w[29] = 0; - w[28] = 0; - w[27] = 0; - w[26] = 0; - w[25] = 0; - w[24] = 0; - w[23] = 0; - w[22] = 0; - w[21] = 0; - w[20] = 0; - w[19] = 0; - w[18] = 0; - w[17] = 0; - w[16] = 0; - w[15] = 0; - w[14] = 0; - w[13] = 0; - w[12] = 0; - w[11] = 0; - w[10] = 0; - w[ 9] = 0; - w[ 8] = 0; - w[ 7] = 0; - w[ 6] = 0; - w[ 5] = 0; - w[ 4] = 0; - w[ 3] = 0; - w[ 2] = 0; - w[ 1] = 0; - w[ 0] = 0; + case 21: + w1[1] &= 0x000000ff; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; break; - case 60: - w[63] = hc_byte_perm (w[ 3], w[ 2], selector); - w[62] = hc_byte_perm (w[ 2], w[ 1], selector); - w[61] = hc_byte_perm (w[ 1], w[ 0], selector); - w[60] = hc_byte_perm (w[ 0], 0, selector); - w[59] = 0; - w[58] = 0; - w[57] = 0; - w[56] = 0; - w[55] = 0; - w[54] = 0; - w[53] = 0; - w[52] = 0; - w[51] = 0; - w[50] = 0; - w[49] = 0; - w[48] = 0; - w[47] = 0; - w[46] = 0; - w[45] = 0; - w[44] = 0; - w[43] = 0; - w[42] = 0; - w[41] = 0; - w[40] = 0; - w[39] = 0; - w[38] = 0; - w[37] = 0; - w[36] = 0; - w[35] = 0; - w[34] = 0; - w[33] = 0; - w[32] = 0; - w[31] = 0; - w[30] = 0; - w[29] = 0; - w[28] = 0; - w[27] = 0; - w[26] = 0; - w[25] = 0; - w[24] = 0; - w[23] = 0; - w[22] = 0; - w[21] = 0; - w[20] = 0; - w[19] = 0; - w[18] = 0; - w[17] = 0; - w[16] = 0; - w[15] = 0; - w[14] = 0; - w[13] = 0; - w[12] = 0; - w[11] = 0; - w[10] = 0; - w[ 9] = 0; - w[ 8] = 0; - w[ 7] = 0; - w[ 6] = 0; - w[ 5] = 0; - w[ 4] = 0; - w[ 3] = 0; - w[ 2] = 0; 
- w[ 1] = 0; - w[ 0] = 0; + case 22: + w1[1] &= 0x0000ffff; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; break; - case 61: - w[63] = hc_byte_perm (w[ 2], w[ 1], selector); - w[62] = hc_byte_perm (w[ 1], w[ 0], selector); - w[61] = hc_byte_perm (w[ 0], 0, selector); - w[60] = 0; - w[59] = 0; - w[58] = 0; - w[57] = 0; - w[56] = 0; - w[55] = 0; - w[54] = 0; - w[53] = 0; - w[52] = 0; - w[51] = 0; - w[50] = 0; - w[49] = 0; - w[48] = 0; - w[47] = 0; - w[46] = 0; - w[45] = 0; - w[44] = 0; - w[43] = 0; - w[42] = 0; - w[41] = 0; - w[40] = 0; - w[39] = 0; - w[38] = 0; - w[37] = 0; - w[36] = 0; - w[35] = 0; - w[34] = 0; - w[33] = 0; - w[32] = 0; - w[31] = 0; - w[30] = 0; - w[29] = 0; - w[28] = 0; - w[27] = 0; - w[26] = 0; - w[25] = 0; - w[24] = 0; - w[23] = 0; - w[22] = 0; - w[21] = 0; - w[20] = 0; - w[19] = 0; - w[18] = 0; - w[17] = 0; - w[16] = 0; - w[15] = 0; - w[14] = 0; - w[13] = 0; - w[12] = 0; - w[11] = 0; - w[10] = 0; - w[ 9] = 0; - w[ 8] = 0; - w[ 7] = 0; - w[ 6] = 0; - w[ 5] = 0; - w[ 4] = 0; - w[ 3] = 0; - w[ 2] = 0; - w[ 1] = 0; - w[ 0] = 0; + case 23: + w1[1] &= 0x00ffffff; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + break; + + case 24: + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + break; + + case 25: + w1[2] &= 0x000000ff; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; break; - case 62: - w[63] = hc_byte_perm (w[ 1], w[ 0], selector); - w[62] = hc_byte_perm (w[ 0], 0, selector); - w[61] = 0; - w[60] = 0; - w[59] = 0; - w[58] = 0; - w[57] = 0; - w[56] = 0; - w[55] = 0; - w[54] = 0; - w[53] = 0; - w[52] = 0; - w[51] = 0; - w[50] = 0; - w[49] = 0; - w[48] = 0; - w[47] = 0; - w[46] = 0; - w[45] = 0; - w[44] = 0; - w[43] = 0; - w[42] = 0; - 
w[41] = 0; - w[40] = 0; - w[39] = 0; - w[38] = 0; - w[37] = 0; - w[36] = 0; - w[35] = 0; - w[34] = 0; - w[33] = 0; - w[32] = 0; - w[31] = 0; - w[30] = 0; - w[29] = 0; - w[28] = 0; - w[27] = 0; - w[26] = 0; - w[25] = 0; - w[24] = 0; - w[23] = 0; - w[22] = 0; - w[21] = 0; - w[20] = 0; - w[19] = 0; - w[18] = 0; - w[17] = 0; - w[16] = 0; - w[15] = 0; - w[14] = 0; - w[13] = 0; - w[12] = 0; - w[11] = 0; - w[10] = 0; - w[ 9] = 0; - w[ 8] = 0; - w[ 7] = 0; - w[ 6] = 0; - w[ 5] = 0; - w[ 4] = 0; - w[ 3] = 0; - w[ 2] = 0; - w[ 1] = 0; - w[ 0] = 0; + case 26: + w1[2] &= 0x0000ffff; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; break; - case 63: - w[63] = hc_byte_perm (w[ 0], 0, selector); - w[62] = 0; - w[61] = 0; - w[60] = 0; - w[59] = 0; - w[58] = 0; - w[57] = 0; - w[56] = 0; - w[55] = 0; - w[54] = 0; - w[53] = 0; - w[52] = 0; - w[51] = 0; - w[50] = 0; - w[49] = 0; - w[48] = 0; - w[47] = 0; - w[46] = 0; - w[45] = 0; - w[44] = 0; - w[43] = 0; - w[42] = 0; - w[41] = 0; - w[40] = 0; - w[39] = 0; - w[38] = 0; - w[37] = 0; - w[36] = 0; - w[35] = 0; - w[34] = 0; - w[33] = 0; - w[32] = 0; - w[31] = 0; - w[30] = 0; - w[29] = 0; - w[28] = 0; - w[27] = 0; - w[26] = 0; - w[25] = 0; - w[24] = 0; - w[23] = 0; - w[22] = 0; - w[21] = 0; - w[20] = 0; - w[19] = 0; - w[18] = 0; - w[17] = 0; - w[16] = 0; - w[15] = 0; - w[14] = 0; - w[13] = 0; - w[12] = 0; - w[11] = 0; - w[10] = 0; - w[ 9] = 0; - w[ 8] = 0; - w[ 7] = 0; - w[ 6] = 0; - w[ 5] = 0; - w[ 4] = 0; - w[ 3] = 0; - w[ 2] = 0; - w[ 1] = 0; - w[ 0] = 0; + case 27: + w1[2] &= 0x00ffffff; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; break; - } - #endif -} -/** - * vector functions as scalar (for outer loop usage) - */ + case 28: + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; -DECLSPEC void truncate_block_4x4_le_S (u32 *w0, const 
u32 len) -{ - switch (len) - { - case 0: - w0[0] = 0; - w0[1] = 0; - w0[2] = 0; - w0[3] = 0; + break; + + case 29: + w1[3] &= 0x000000ff; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; break; - case 1: - w0[0] &= 0x000000ff; - w0[1] = 0; - w0[2] = 0; - w0[3] = 0; + case 30: + w1[3] &= 0x0000ffff; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; break; - case 2: - w0[0] &= 0x0000ffff; - w0[1] = 0; - w0[2] = 0; - w0[3] = 0; + case 31: + w1[3] &= 0x00ffffff; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + break; + + case 32: + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + break; + + case 33: + w2[0] &= 0x000000ff; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + break; + + case 34: + w2[0] &= 0x0000ffff; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; break; - case 3: - w0[0] &= 0x00ffffff; - w0[1] = 0; - w0[2] = 0; - w0[3] = 0; + case 35: + w2[0] &= 0x00ffffff; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; break; - case 4: - w0[1] = 0; - w0[2] = 0; - w0[3] = 0; + case 36: + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; break; - case 5: - w0[1] &= 0x000000ff; - w0[2] = 0; - w0[3] = 0; + case 37: + w2[1] &= 0x000000ff; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; break; - case 6: - w0[1] &= 0x0000ffff; - w0[2] = 0; - w0[3] = 0; + case 38: + w2[1] &= 0x0000ffff; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; break; - case 7: - w0[1] &= 0x00ffffff; - w0[2] = 0; - w0[3] = 0; + case 39: + w2[1] &= 0x00ffffff; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; break; - case 8: - 
w0[2] = 0; - w0[3] = 0; + case 40: + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; break; - case 9: - w0[2] &= 0x000000ff; - w0[3] = 0; + case 41: + w2[2] &= 0x000000ff; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; break; - case 10: - w0[2] &= 0x0000ffff; - w0[3] = 0; + case 42: + w2[2] &= 0x0000ffff; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; break; - case 11: - w0[2] &= 0x00ffffff; - w0[3] = 0; + case 43: + w2[2] &= 0x00ffffff; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; break; - case 12: - w0[3] = 0; + case 44: + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; break; - case 13: - w0[3] &= 0x000000ff; + case 45: + w2[3] &= 0x000000ff; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; break; - case 14: - w0[3] &= 0x0000ffff; + case 46: + w2[3] &= 0x0000ffff; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; break; - case 15: - w0[3] &= 0x00ffffff; + case 47: + w2[3] &= 0x00ffffff; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; break; - } -} -DECLSPEC void truncate_block_4x4_be_S (u32 *w0, const u32 len) -{ - switch (len) - { - case 0: - w0[0] = 0; - w0[1] = 0; - w0[2] = 0; - w0[3] = 0; + case 48: + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; break; - case 1: - w0[0] &= 0xff000000; - w0[1] = 0; - w0[2] = 0; - w0[3] = 0; + case 49: + w3[0] &= 0x000000ff; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; break; - case 2: - w0[0] &= 0xffff0000; - w0[1] = 0; - w0[2] = 0; - w0[3] = 0; + case 50: + w3[0] &= 0x0000ffff; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; break; - case 3: - w0[0] &= 0xffffff00; - w0[1] = 0; - w0[2] = 0; - w0[3] = 0; + case 51: + w3[0] &= 0x00ffffff; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; break; - case 4: - w0[1] = 0; - w0[2] = 0; - w0[3] = 0; + case 52: + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; break; - case 5: - w0[1] &= 0xff000000; - w0[2] = 0; - w0[3] = 0; + case 53: + w3[1] &= 0x000000ff; + w3[2] = 0; + w3[3] = 0; break; - case 6: - w0[1] &= 
0xffff0000; - w0[2] = 0; - w0[3] = 0; + case 54: + w3[1] &= 0x0000ffff; + w3[2] = 0; + w3[3] = 0; break; - case 7: - w0[1] &= 0xffffff00; - w0[2] = 0; - w0[3] = 0; + case 55: + w3[1] &= 0x00ffffff; + w3[2] = 0; + w3[3] = 0; break; - case 8: - w0[2] = 0; - w0[3] = 0; + case 56: + w3[2] = 0; + w3[3] = 0; break; - case 9: - w0[2] &= 0xff000000; - w0[3] = 0; + case 57: + w3[2] &= 0x000000ff; + w3[3] = 0; break; - case 10: - w0[2] &= 0xffff0000; - w0[3] = 0; + case 58: + w3[2] &= 0x0000ffff; + w3[3] = 0; break; - case 11: - w0[2] &= 0xffffff00; - w0[3] = 0; + case 59: + w3[2] &= 0x00ffffff; + w3[3] = 0; break; - case 12: - w0[3] = 0; + case 60: + w3[3] = 0; break; - case 13: - w0[3] &= 0xff000000; + case 61: + w3[3] &= 0x000000ff; break; - case 14: - w0[3] &= 0xffff0000; + case 62: + w3[3] &= 0x0000ffff; break; - case 15: - w0[3] &= 0xffffff00; + case 63: + w3[3] &= 0x00ffffff; break; } } -DECLSPEC void truncate_block_16x4_le_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, const u32 len) +DECLSPEC void truncate_block_16x4_be_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, const u32 len) { switch (len) { @@ -30920,7 +35114,7 @@ DECLSPEC void truncate_block_16x4_le_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, cons break; case 1: - w0[0] &= 0x000000ff; + w0[0] &= 0xff000000; w0[1] = 0; w0[2] = 0; w0[3] = 0; @@ -30940,7 +35134,7 @@ DECLSPEC void truncate_block_16x4_le_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, cons break; case 2: - w0[0] &= 0x0000ffff; + w0[0] &= 0xffff0000; w0[1] = 0; w0[2] = 0; w0[3] = 0; @@ -30960,7 +35154,7 @@ DECLSPEC void truncate_block_16x4_le_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, cons break; case 3: - w0[0] &= 0x00ffffff; + w0[0] &= 0xffffff00; w0[1] = 0; w0[2] = 0; w0[3] = 0; @@ -30999,7 +35193,7 @@ DECLSPEC void truncate_block_16x4_le_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, cons break; case 5: - w0[1] &= 0x000000ff; + w0[1] &= 0xff000000; w0[2] = 0; w0[3] = 0; w1[0] = 0; @@ -31018,7 +35212,7 @@ DECLSPEC void truncate_block_16x4_le_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, cons 
break; case 6: - w0[1] &= 0x0000ffff; + w0[1] &= 0xffff0000; w0[2] = 0; w0[3] = 0; w1[0] = 0; @@ -31037,7 +35231,7 @@ DECLSPEC void truncate_block_16x4_le_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, cons break; case 7: - w0[1] &= 0x00ffffff; + w0[1] &= 0xffffff00; w0[2] = 0; w0[3] = 0; w1[0] = 0; @@ -31074,7 +35268,7 @@ DECLSPEC void truncate_block_16x4_le_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, cons break; case 9: - w0[2] &= 0x000000ff; + w0[2] &= 0xff000000; w0[3] = 0; w1[0] = 0; w1[1] = 0; @@ -31092,7 +35286,7 @@ DECLSPEC void truncate_block_16x4_le_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, cons break; case 10: - w0[2] &= 0x0000ffff; + w0[2] &= 0xffff0000; w0[3] = 0; w1[0] = 0; w1[1] = 0; @@ -31110,7 +35304,7 @@ DECLSPEC void truncate_block_16x4_le_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, cons break; case 11: - w0[2] &= 0x00ffffff; + w0[2] &= 0xffffff00; w0[3] = 0; w1[0] = 0; w1[1] = 0; @@ -31145,7 +35339,7 @@ DECLSPEC void truncate_block_16x4_le_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, cons break; case 13: - w0[3] &= 0x000000ff; + w0[3] &= 0xff000000; w1[0] = 0; w1[1] = 0; w1[2] = 0; @@ -31162,7 +35356,7 @@ DECLSPEC void truncate_block_16x4_le_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, cons break; case 14: - w0[3] &= 0x0000ffff; + w0[3] &= 0xffff0000; w1[0] = 0; w1[1] = 0; w1[2] = 0; @@ -31179,7 +35373,7 @@ DECLSPEC void truncate_block_16x4_le_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, cons break; case 15: - w0[3] &= 0x00ffffff; + w0[3] &= 0xffffff00; w1[0] = 0; w1[1] = 0; w1[2] = 0; @@ -31212,7 +35406,7 @@ DECLSPEC void truncate_block_16x4_le_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, cons break; case 17: - w1[0] &= 0x000000ff; + w1[0] &= 0xff000000; w1[1] = 0; w1[2] = 0; w1[3] = 0; @@ -31228,7 +35422,7 @@ DECLSPEC void truncate_block_16x4_le_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, cons break; case 18: - w1[0] &= 0x0000ffff; + w1[0] &= 0xffff0000; w1[1] = 0; w1[2] = 0; w1[3] = 0; @@ -31244,7 +35438,7 @@ DECLSPEC void truncate_block_16x4_le_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, cons 
break; case 19: - w1[0] &= 0x00ffffff; + w1[0] &= 0xffffff00; w1[1] = 0; w1[2] = 0; w1[3] = 0; @@ -31275,7 +35469,7 @@ DECLSPEC void truncate_block_16x4_le_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, cons break; case 21: - w1[1] &= 0x000000ff; + w1[1] &= 0xff000000; w1[2] = 0; w1[3] = 0; w2[0] = 0; @@ -31290,7 +35484,7 @@ DECLSPEC void truncate_block_16x4_le_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, cons break; case 22: - w1[1] &= 0x0000ffff; + w1[1] &= 0xffff0000; w1[2] = 0; w1[3] = 0; w2[0] = 0; @@ -31305,7 +35499,7 @@ DECLSPEC void truncate_block_16x4_le_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, cons break; case 23: - w1[1] &= 0x00ffffff; + w1[1] &= 0xffffff00; w1[2] = 0; w1[3] = 0; w2[0] = 0; @@ -31334,7 +35528,7 @@ DECLSPEC void truncate_block_16x4_le_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, cons break; case 25: - w1[2] &= 0x000000ff; + w1[2] &= 0xff000000; w1[3] = 0; w2[0] = 0; w2[1] = 0; @@ -31348,7 +35542,7 @@ DECLSPEC void truncate_block_16x4_le_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, cons break; case 26: - w1[2] &= 0x0000ffff; + w1[2] &= 0xffff0000; w1[3] = 0; w2[0] = 0; w2[1] = 0; @@ -31362,7 +35556,7 @@ DECLSPEC void truncate_block_16x4_le_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, cons break; case 27: - w1[2] &= 0x00ffffff; + w1[2] &= 0xffffff00; w1[3] = 0; w2[0] = 0; w2[1] = 0; @@ -31389,7 +35583,7 @@ DECLSPEC void truncate_block_16x4_le_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, cons break; case 29: - w1[3] &= 0x000000ff; + w1[3] &= 0xff000000; w2[0] = 0; w2[1] = 0; w2[2] = 0; @@ -31402,7 +35596,7 @@ DECLSPEC void truncate_block_16x4_le_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, cons break; case 30: - w1[3] &= 0x0000ffff; + w1[3] &= 0xffff0000; w2[0] = 0; w2[1] = 0; w2[2] = 0; @@ -31415,7 +35609,7 @@ DECLSPEC void truncate_block_16x4_le_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, cons break; case 31: - w1[3] &= 0x00ffffff; + w1[3] &= 0xffffff00; w2[0] = 0; w2[1] = 0; w2[2] = 0; @@ -31440,7 +35634,7 @@ DECLSPEC void truncate_block_16x4_le_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, cons 
break; case 33: - w2[0] &= 0x000000ff; + w2[0] &= 0xff000000; w2[1] = 0; w2[2] = 0; w2[3] = 0; @@ -31452,7 +35646,7 @@ DECLSPEC void truncate_block_16x4_le_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, cons break; case 34: - w2[0] &= 0x0000ffff; + w2[0] &= 0xffff0000; w2[1] = 0; w2[2] = 0; w2[3] = 0; @@ -31464,7 +35658,7 @@ DECLSPEC void truncate_block_16x4_le_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, cons break; case 35: - w2[0] &= 0x00ffffff; + w2[0] &= 0xffffff00; w2[1] = 0; w2[2] = 0; w2[3] = 0; @@ -31487,7 +35681,7 @@ DECLSPEC void truncate_block_16x4_le_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, cons break; case 37: - w2[1] &= 0x000000ff; + w2[1] &= 0xff000000; w2[2] = 0; w2[3] = 0; w3[0] = 0; @@ -31498,7 +35692,7 @@ DECLSPEC void truncate_block_16x4_le_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, cons break; case 38: - w2[1] &= 0x0000ffff; + w2[1] &= 0xffff0000; w2[2] = 0; w2[3] = 0; w3[0] = 0; @@ -31509,7 +35703,7 @@ DECLSPEC void truncate_block_16x4_le_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, cons break; case 39: - w2[1] &= 0x00ffffff; + w2[1] &= 0xffffff00; w2[2] = 0; w2[3] = 0; w3[0] = 0; @@ -31530,7 +35724,7 @@ DECLSPEC void truncate_block_16x4_le_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, cons break; case 41: - w2[2] &= 0x000000ff; + w2[2] &= 0xff000000; w2[3] = 0; w3[0] = 0; w3[1] = 0; @@ -31540,7 +35734,7 @@ DECLSPEC void truncate_block_16x4_le_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, cons break; case 42: - w2[2] &= 0x0000ffff; + w2[2] &= 0xffff0000; w2[3] = 0; w3[0] = 0; w3[1] = 0; @@ -31550,7 +35744,7 @@ DECLSPEC void truncate_block_16x4_le_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, cons break; case 43: - w2[2] &= 0x00ffffff; + w2[2] &= 0xffffff00; w2[3] = 0; w3[0] = 0; w3[1] = 0; @@ -31569,7 +35763,7 @@ DECLSPEC void truncate_block_16x4_le_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, cons break; case 45: - w2[3] &= 0x000000ff; + w2[3] &= 0xff000000; w3[0] = 0; w3[1] = 0; w3[2] = 0; @@ -31578,7 +35772,7 @@ DECLSPEC void truncate_block_16x4_le_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, cons 
break; case 46: - w2[3] &= 0x0000ffff; + w2[3] &= 0xffff0000; w3[0] = 0; w3[1] = 0; w3[2] = 0; @@ -31587,7 +35781,7 @@ DECLSPEC void truncate_block_16x4_le_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, cons break; case 47: - w2[3] &= 0x00ffffff; + w2[3] &= 0xffffff00; w3[0] = 0; w3[1] = 0; w3[2] = 0; @@ -31604,7 +35798,7 @@ DECLSPEC void truncate_block_16x4_le_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, cons break; case 49: - w3[0] &= 0x000000ff; + w3[0] &= 0xff000000; w3[1] = 0; w3[2] = 0; w3[3] = 0; @@ -31612,7 +35806,7 @@ DECLSPEC void truncate_block_16x4_le_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, cons break; case 50: - w3[0] &= 0x0000ffff; + w3[0] &= 0xffff0000; w3[1] = 0; w3[2] = 0; w3[3] = 0; @@ -31620,7 +35814,7 @@ DECLSPEC void truncate_block_16x4_le_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, cons break; case 51: - w3[0] &= 0x00ffffff; + w3[0] &= 0xffffff00; w3[1] = 0; w3[2] = 0; w3[3] = 0; @@ -31635,21 +35829,21 @@ DECLSPEC void truncate_block_16x4_le_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, cons break; case 53: - w3[1] &= 0x000000ff; + w3[1] &= 0xff000000; w3[2] = 0; w3[3] = 0; break; case 54: - w3[1] &= 0x0000ffff; + w3[1] &= 0xffff0000; w3[2] = 0; w3[3] = 0; break; case 55: - w3[1] &= 0x00ffffff; + w3[1] &= 0xffffff00; w3[2] = 0; w3[3] = 0; @@ -31662,1191 +35856,2315 @@ DECLSPEC void truncate_block_16x4_le_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, cons break; case 57: - w3[2] &= 0x000000ff; + w3[2] &= 0xff000000; w3[3] = 0; break; case 58: - w3[2] &= 0x0000ffff; + w3[2] &= 0xffff0000; + w3[3] = 0; + + break; + + case 59: + w3[2] &= 0xffffff00; + w3[3] = 0; + + break; + + case 60: w3[3] = 0; - break; + break; + + case 61: + w3[3] &= 0xff000000; + + break; + + case 62: + w3[3] &= 0xffff0000; + + break; + + case 63: + w3[3] &= 0xffffff00; + + break; + } +} + +DECLSPEC void set_mark_1x4_S (u32 *v, const u32 offset) +{ + const u32 c = (offset & 15) / 4; + const u32 r = 0xff << ((offset & 3) * 8); + + v[0] = (c == 0) ? r : 0; + v[1] = (c == 1) ? r : 0; + v[2] = (c == 2) ? 
r : 0; + v[3] = (c == 3) ? r : 0; +} + +DECLSPEC void append_helper_1x4_S (u32 *r, const u32 v, const u32 *m) +{ + r[0] |= v & m[0]; + r[1] |= v & m[1]; + r[2] |= v & m[2]; + r[3] |= v & m[3]; +} + +DECLSPEC void append_0x01_2x4_S (u32 *w0, u32 *w1, const u32 offset) +{ + u32 v[4]; + + set_mark_1x4_S (v, offset); + + const u32 offset16 = offset / 16; + + append_helper_1x4_S (w0, ((offset16 == 0) ? 0x01010101 : 0), v); + append_helper_1x4_S (w1, ((offset16 == 1) ? 0x01010101 : 0), v); +} + +DECLSPEC void append_0x06_2x4_S (u32 *w0, u32 *w1, const u32 offset) +{ + u32 v[4]; + + set_mark_1x4_S (v, offset); + + const u32 offset16 = offset / 16; + + append_helper_1x4_S (w0, ((offset16 == 0) ? 0x06060606 : 0), v); + append_helper_1x4_S (w1, ((offset16 == 1) ? 0x06060606 : 0), v); +} + +DECLSPEC void append_0x01_4x4_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, const u32 offset) +{ + u32 v[4]; + + set_mark_1x4_S (v, offset); + + const u32 offset16 = offset / 16; + + append_helper_1x4_S (w0, ((offset16 == 0) ? 0x01010101 : 0), v); + append_helper_1x4_S (w1, ((offset16 == 1) ? 0x01010101 : 0), v); + append_helper_1x4_S (w2, ((offset16 == 2) ? 0x01010101 : 0), v); + append_helper_1x4_S (w3, ((offset16 == 3) ? 0x01010101 : 0), v); +} + +DECLSPEC void append_0x80_1x4_S (u32 *w0, const u32 offset) +{ + u32 v[4]; + + set_mark_1x4_S (v, offset); + + append_helper_1x4_S (w0, 0x80808080, v); +} + +DECLSPEC void append_0x80_2x4_S (u32 *w0, u32 *w1, const u32 offset) +{ + u32 v[4]; + + set_mark_1x4_S (v, offset); + + const u32 offset16 = offset / 16; + + append_helper_1x4_S (w0, ((offset16 == 0) ? 0x80808080 : 0), v); + append_helper_1x4_S (w1, ((offset16 == 1) ? 0x80808080 : 0), v); +} + +DECLSPEC void append_0x80_3x4_S (u32 *w0, u32 *w1, u32 *w2, const u32 offset) +{ + u32 v[4]; + + set_mark_1x4_S (v, offset); + + const u32 offset16 = offset / 16; + + append_helper_1x4_S (w0, ((offset16 == 0) ? 0x80808080 : 0), v); + append_helper_1x4_S (w1, ((offset16 == 1) ? 
0x80808080 : 0), v); + append_helper_1x4_S (w2, ((offset16 == 2) ? 0x80808080 : 0), v); +} + +DECLSPEC void append_0x80_4x4_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, const u32 offset) +{ + u32 v[4]; + + set_mark_1x4_S (v, offset); + + const u32 offset16 = offset / 16; + + append_helper_1x4_S (w0, ((offset16 == 0) ? 0x80808080 : 0), v); + append_helper_1x4_S (w1, ((offset16 == 1) ? 0x80808080 : 0), v); + append_helper_1x4_S (w2, ((offset16 == 2) ? 0x80808080 : 0), v); + append_helper_1x4_S (w3, ((offset16 == 3) ? 0x80808080 : 0), v); +} + +DECLSPEC void append_0x80_8x4_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, u32 *w4, u32 *w5, u32 *w6, u32 *w7, const u32 offset) +{ + u32 v[4]; + + set_mark_1x4_S (v, offset); + + const u32 offset16 = offset / 16; + + append_helper_1x4_S (w0, ((offset16 == 0) ? 0x80808080 : 0), v); + append_helper_1x4_S (w1, ((offset16 == 1) ? 0x80808080 : 0), v); + append_helper_1x4_S (w2, ((offset16 == 2) ? 0x80808080 : 0), v); + append_helper_1x4_S (w3, ((offset16 == 3) ? 0x80808080 : 0), v); + append_helper_1x4_S (w4, ((offset16 == 4) ? 0x80808080 : 0), v); + append_helper_1x4_S (w5, ((offset16 == 5) ? 0x80808080 : 0), v); + append_helper_1x4_S (w6, ((offset16 == 6) ? 0x80808080 : 0), v); + append_helper_1x4_S (w7, ((offset16 == 7) ? 
0x80808080 : 0), v); +} + +DECLSPEC void make_utf16be_S (const u32 *in, u32 *out1, u32 *out2) +{ + #if defined IS_NV + + out2[3] = hc_byte_perm_S (in[3], 0, 0x3727); + out2[2] = hc_byte_perm_S (in[3], 0, 0x1707); + out2[1] = hc_byte_perm_S (in[2], 0, 0x3727); + out2[0] = hc_byte_perm_S (in[2], 0, 0x1707); + out1[3] = hc_byte_perm_S (in[1], 0, 0x3727); + out1[2] = hc_byte_perm_S (in[1], 0, 0x1707); + out1[1] = hc_byte_perm_S (in[0], 0, 0x3727); + out1[0] = hc_byte_perm_S (in[0], 0, 0x1707); + + #elif defined IS_AMD && HAS_VPERM + + out2[3] = hc_byte_perm_S (in[3], 0, 0x03070207); + out2[2] = hc_byte_perm_S (in[3], 0, 0x01070007); + out2[1] = hc_byte_perm_S (in[2], 0, 0x03070207); + out2[0] = hc_byte_perm_S (in[2], 0, 0x01070007); + out1[3] = hc_byte_perm_S (in[1], 0, 0x03070207); + out1[2] = hc_byte_perm_S (in[1], 0, 0x01070007); + out1[1] = hc_byte_perm_S (in[0], 0, 0x03070207); + out1[0] = hc_byte_perm_S (in[0], 0, 0x01070007); - case 59: - w3[2] &= 0x00ffffff; - w3[3] = 0; + #else - break; + out2[3] = ((in[3] >> 0) & 0xFF000000) | ((in[3] >> 8) & 0x0000FF00); + out2[2] = ((in[3] << 16) & 0xFF000000) | ((in[3] << 8) & 0x0000FF00); + out2[1] = ((in[2] >> 0) & 0xFF000000) | ((in[2] >> 8) & 0x0000FF00); + out2[0] = ((in[2] << 16) & 0xFF000000) | ((in[2] << 8) & 0x0000FF00); + out1[3] = ((in[1] >> 0) & 0xFF000000) | ((in[1] >> 8) & 0x0000FF00); + out1[2] = ((in[1] << 16) & 0xFF000000) | ((in[1] << 8) & 0x0000FF00); + out1[1] = ((in[0] >> 0) & 0xFF000000) | ((in[0] >> 8) & 0x0000FF00); + out1[0] = ((in[0] << 16) & 0xFF000000) | ((in[0] << 8) & 0x0000FF00); - case 60: - w3[3] = 0; + #endif +} - break; +DECLSPEC void make_utf16le_S (const u32 *in, u32 *out1, u32 *out2) +{ + #if defined IS_NV - case 61: - w3[3] &= 0x000000ff; + out2[3] = hc_byte_perm_S (in[3], 0, 0x7372); + out2[2] = hc_byte_perm_S (in[3], 0, 0x7170); + out2[1] = hc_byte_perm_S (in[2], 0, 0x7372); + out2[0] = hc_byte_perm_S (in[2], 0, 0x7170); + out1[3] = hc_byte_perm_S (in[1], 0, 0x7372); + out1[2] = 
hc_byte_perm_S (in[1], 0, 0x7170); + out1[1] = hc_byte_perm_S (in[0], 0, 0x7372); + out1[0] = hc_byte_perm_S (in[0], 0, 0x7170); - break; + #elif defined IS_AMD && HAS_VPERM - case 62: - w3[3] &= 0x0000ffff; + out2[3] = hc_byte_perm_S (in[3], 0, 0x07030702); + out2[2] = hc_byte_perm_S (in[3], 0, 0x07010700); + out2[1] = hc_byte_perm_S (in[2], 0, 0x07030702); + out2[0] = hc_byte_perm_S (in[2], 0, 0x07010700); + out1[3] = hc_byte_perm_S (in[1], 0, 0x07030702); + out1[2] = hc_byte_perm_S (in[1], 0, 0x07010700); + out1[1] = hc_byte_perm_S (in[0], 0, 0x07030702); + out1[0] = hc_byte_perm_S (in[0], 0, 0x07010700); - break; + #else - case 63: - w3[3] &= 0x00ffffff; + out2[3] = ((in[3] >> 8) & 0x00FF0000) | ((in[3] >> 16) & 0x000000FF); + out2[2] = ((in[3] << 8) & 0x00FF0000) | ((in[3] >> 0) & 0x000000FF); + out2[1] = ((in[2] >> 8) & 0x00FF0000) | ((in[2] >> 16) & 0x000000FF); + out2[0] = ((in[2] << 8) & 0x00FF0000) | ((in[2] >> 0) & 0x000000FF); + out1[3] = ((in[1] >> 8) & 0x00FF0000) | ((in[1] >> 16) & 0x000000FF); + out1[2] = ((in[1] << 8) & 0x00FF0000) | ((in[1] >> 0) & 0x000000FF); + out1[1] = ((in[0] >> 8) & 0x00FF0000) | ((in[0] >> 16) & 0x000000FF); + out1[0] = ((in[0] << 8) & 0x00FF0000) | ((in[0] >> 0) & 0x000000FF); - break; - } + #endif } -DECLSPEC void truncate_block_16x4_be_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, const u32 len) +DECLSPEC void undo_utf16be_S (const u32 *in1, const u32 *in2, u32 *out) { - switch (len) - { - case 0: - w0[0] = 0; - w0[1] = 0; - w0[2] = 0; - w0[3] = 0; - w1[0] = 0; - w1[1] = 0; - w1[2] = 0; - w1[3] = 0; - w2[0] = 0; - w2[1] = 0; - w2[2] = 0; - w2[3] = 0; - w3[0] = 0; - w3[1] = 0; - w3[2] = 0; - w3[3] = 0; + #if defined IS_NV - break; + out[0] = hc_byte_perm_S (in1[0], in1[1], 0x4602); + out[1] = hc_byte_perm_S (in1[2], in1[3], 0x4602); + out[2] = hc_byte_perm_S (in2[0], in2[1], 0x4602); + out[3] = hc_byte_perm_S (in2[2], in2[3], 0x4602); - case 1: - w0[0] &= 0xff000000; - w0[1] = 0; - w0[2] = 0; - w0[3] = 0; - w1[0] = 0; - w1[1] = 
0; - w1[2] = 0; - w1[3] = 0; - w2[0] = 0; - w2[1] = 0; - w2[2] = 0; - w2[3] = 0; - w3[0] = 0; - w3[1] = 0; - w3[2] = 0; - w3[3] = 0; + #elif defined IS_AMD && HAS_VPERM - break; + out[0] = hc_byte_perm_S (in1[0], in1[1], 0x04060002); + out[1] = hc_byte_perm_S (in1[2], in1[3], 0x04060002); + out[2] = hc_byte_perm_S (in2[0], in2[1], 0x04060002); + out[3] = hc_byte_perm_S (in2[2], in2[3], 0x04060002); - case 2: - w0[0] &= 0xffff0000; - w0[1] = 0; - w0[2] = 0; - w0[3] = 0; - w1[0] = 0; - w1[1] = 0; - w1[2] = 0; - w1[3] = 0; - w2[0] = 0; - w2[1] = 0; - w2[2] = 0; - w2[3] = 0; - w3[0] = 0; - w3[1] = 0; - w3[2] = 0; - w3[3] = 0; + #else - break; + out[0] = ((in1[0] & 0x0000ff00) >> 8) | ((in1[0] & 0xff000000) >> 16) + | ((in1[1] & 0x0000ff00) << 8) | ((in1[1] & 0xff000000) << 0); + out[1] = ((in1[2] & 0x0000ff00) >> 8) | ((in1[2] & 0xff000000) >> 16) + | ((in1[3] & 0x0000ff00) << 8) | ((in1[3] & 0xff000000) << 0); + out[2] = ((in2[0] & 0x0000ff00) >> 8) | ((in2[0] & 0xff000000) >> 16) + | ((in2[1] & 0x0000ff00) << 8) | ((in2[1] & 0xff000000) << 0); + out[3] = ((in2[2] & 0x0000ff00) >> 8) | ((in2[2] & 0xff000000) >> 16) + | ((in2[3] & 0x0000ff00) << 8) | ((in2[3] & 0xff000000) << 0); - case 3: - w0[0] &= 0xffffff00; - w0[1] = 0; - w0[2] = 0; - w0[3] = 0; - w1[0] = 0; - w1[1] = 0; - w1[2] = 0; - w1[3] = 0; - w2[0] = 0; - w2[1] = 0; - w2[2] = 0; - w2[3] = 0; - w3[0] = 0; - w3[1] = 0; - w3[2] = 0; - w3[3] = 0; + #endif +} - break; +DECLSPEC void undo_utf16le_S (const u32 *in1, const u32 *in2, u32 *out) +{ + #if defined IS_NV - case 4: - w0[1] = 0; - w0[2] = 0; - w0[3] = 0; - w1[0] = 0; - w1[1] = 0; - w1[2] = 0; - w1[3] = 0; - w2[0] = 0; - w2[1] = 0; - w2[2] = 0; - w2[3] = 0; - w3[0] = 0; - w3[1] = 0; - w3[2] = 0; - w3[3] = 0; + out[0] = hc_byte_perm_S (in1[0], in1[1], 0x6420); + out[1] = hc_byte_perm_S (in1[2], in1[3], 0x6420); + out[2] = hc_byte_perm_S (in2[0], in2[1], 0x6420); + out[3] = hc_byte_perm_S (in2[2], in2[3], 0x6420); - break; + #elif defined IS_AMD && HAS_VPERM - 
case 5: - w0[1] &= 0xff000000; - w0[2] = 0; - w0[3] = 0; - w1[0] = 0; - w1[1] = 0; - w1[2] = 0; - w1[3] = 0; - w2[0] = 0; - w2[1] = 0; - w2[2] = 0; - w2[3] = 0; - w3[0] = 0; - w3[1] = 0; - w3[2] = 0; - w3[3] = 0; + out[0] = hc_byte_perm_S (in1[0], in1[1], 0x06040200); + out[1] = hc_byte_perm_S (in1[2], in1[3], 0x06040200); + out[2] = hc_byte_perm_S (in2[0], in2[1], 0x06040200); + out[3] = hc_byte_perm_S (in2[2], in2[3], 0x06040200); - break; + #else - case 6: - w0[1] &= 0xffff0000; - w0[2] = 0; - w0[3] = 0; - w1[0] = 0; - w1[1] = 0; - w1[2] = 0; - w1[3] = 0; - w2[0] = 0; - w2[1] = 0; - w2[2] = 0; - w2[3] = 0; - w3[0] = 0; - w3[1] = 0; - w3[2] = 0; - w3[3] = 0; + out[0] = ((in1[0] & 0x000000ff) >> 0) | ((in1[0] & 0x00ff0000) >> 8) + | ((in1[1] & 0x000000ff) << 16) | ((in1[1] & 0x00ff0000) << 8); + out[1] = ((in1[2] & 0x000000ff) >> 0) | ((in1[2] & 0x00ff0000) >> 8) + | ((in1[3] & 0x000000ff) << 16) | ((in1[3] & 0x00ff0000) << 8); + out[2] = ((in2[0] & 0x000000ff) >> 0) | ((in2[0] & 0x00ff0000) >> 8) + | ((in2[1] & 0x000000ff) << 16) | ((in2[1] & 0x00ff0000) << 8); + out[3] = ((in2[2] & 0x000000ff) >> 0) | ((in2[2] & 0x00ff0000) >> 8) + | ((in2[3] & 0x000000ff) << 16) | ((in2[3] & 0x00ff0000) << 8); + + #endif +} + +DECLSPEC void switch_buffer_by_offset_le_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, const u32 offset) +{ + const int offset_switch = offset / 4; + + #if (defined IS_AMD && HAS_VPERM == 0) || defined IS_GENERIC + switch (offset_switch) + { + case 0: + w3[3] = hc_bytealign_S (w3[2], w3[3], offset); + w3[2] = hc_bytealign_S (w3[1], w3[2], offset); + w3[1] = hc_bytealign_S (w3[0], w3[1], offset); + w3[0] = hc_bytealign_S (w2[3], w3[0], offset); + w2[3] = hc_bytealign_S (w2[2], w2[3], offset); + w2[2] = hc_bytealign_S (w2[1], w2[2], offset); + w2[1] = hc_bytealign_S (w2[0], w2[1], offset); + w2[0] = hc_bytealign_S (w1[3], w2[0], offset); + w1[3] = hc_bytealign_S (w1[2], w1[3], offset); + w1[2] = hc_bytealign_S (w1[1], w1[2], offset); + w1[1] = hc_bytealign_S 
(w1[0], w1[1], offset); + w1[0] = hc_bytealign_S (w0[3], w1[0], offset); + w0[3] = hc_bytealign_S (w0[2], w0[3], offset); + w0[2] = hc_bytealign_S (w0[1], w0[2], offset); + w0[1] = hc_bytealign_S (w0[0], w0[1], offset); + w0[0] = hc_bytealign_S ( 0, w0[0], offset); break; - case 7: - w0[1] &= 0xffffff00; - w0[2] = 0; - w0[3] = 0; - w1[0] = 0; - w1[1] = 0; - w1[2] = 0; - w1[3] = 0; - w2[0] = 0; - w2[1] = 0; - w2[2] = 0; - w2[3] = 0; - w3[0] = 0; - w3[1] = 0; - w3[2] = 0; - w3[3] = 0; + case 1: + w3[3] = hc_bytealign_S (w3[1], w3[2], offset); + w3[2] = hc_bytealign_S (w3[0], w3[1], offset); + w3[1] = hc_bytealign_S (w2[3], w3[0], offset); + w3[0] = hc_bytealign_S (w2[2], w2[3], offset); + w2[3] = hc_bytealign_S (w2[1], w2[2], offset); + w2[2] = hc_bytealign_S (w2[0], w2[1], offset); + w2[1] = hc_bytealign_S (w1[3], w2[0], offset); + w2[0] = hc_bytealign_S (w1[2], w1[3], offset); + w1[3] = hc_bytealign_S (w1[1], w1[2], offset); + w1[2] = hc_bytealign_S (w1[0], w1[1], offset); + w1[1] = hc_bytealign_S (w0[3], w1[0], offset); + w1[0] = hc_bytealign_S (w0[2], w0[3], offset); + w0[3] = hc_bytealign_S (w0[1], w0[2], offset); + w0[2] = hc_bytealign_S (w0[0], w0[1], offset); + w0[1] = hc_bytealign_S ( 0, w0[0], offset); + w0[0] = 0; break; - case 8: - w0[2] = 0; - w0[3] = 0; - w1[0] = 0; - w1[1] = 0; - w1[2] = 0; - w1[3] = 0; - w2[0] = 0; - w2[1] = 0; - w2[2] = 0; - w2[3] = 0; - w3[0] = 0; - w3[1] = 0; - w3[2] = 0; - w3[3] = 0; + case 2: + w3[3] = hc_bytealign_S (w3[0], w3[1], offset); + w3[2] = hc_bytealign_S (w2[3], w3[0], offset); + w3[1] = hc_bytealign_S (w2[2], w2[3], offset); + w3[0] = hc_bytealign_S (w2[1], w2[2], offset); + w2[3] = hc_bytealign_S (w2[0], w2[1], offset); + w2[2] = hc_bytealign_S (w1[3], w2[0], offset); + w2[1] = hc_bytealign_S (w1[2], w1[3], offset); + w2[0] = hc_bytealign_S (w1[1], w1[2], offset); + w1[3] = hc_bytealign_S (w1[0], w1[1], offset); + w1[2] = hc_bytealign_S (w0[3], w1[0], offset); + w1[1] = hc_bytealign_S (w0[2], w0[3], offset); + w1[0] 
= hc_bytealign_S (w0[1], w0[2], offset); + w0[3] = hc_bytealign_S (w0[0], w0[1], offset); + w0[2] = hc_bytealign_S ( 0, w0[0], offset); + w0[1] = 0; + w0[0] = 0; break; - case 9: - w0[2] &= 0xff000000; - w0[3] = 0; - w1[0] = 0; - w1[1] = 0; - w1[2] = 0; - w1[3] = 0; - w2[0] = 0; - w2[1] = 0; - w2[2] = 0; - w2[3] = 0; - w3[0] = 0; - w3[1] = 0; - w3[2] = 0; - w3[3] = 0; + case 3: + w3[3] = hc_bytealign_S (w2[3], w3[0], offset); + w3[2] = hc_bytealign_S (w2[2], w2[3], offset); + w3[1] = hc_bytealign_S (w2[1], w2[2], offset); + w3[0] = hc_bytealign_S (w2[0], w2[1], offset); + w2[3] = hc_bytealign_S (w1[3], w2[0], offset); + w2[2] = hc_bytealign_S (w1[2], w1[3], offset); + w2[1] = hc_bytealign_S (w1[1], w1[2], offset); + w2[0] = hc_bytealign_S (w1[0], w1[1], offset); + w1[3] = hc_bytealign_S (w0[3], w1[0], offset); + w1[2] = hc_bytealign_S (w0[2], w0[3], offset); + w1[1] = hc_bytealign_S (w0[1], w0[2], offset); + w1[0] = hc_bytealign_S (w0[0], w0[1], offset); + w0[3] = hc_bytealign_S ( 0, w0[0], offset); + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; break; - case 10: - w0[2] &= 0xffff0000; - w0[3] = 0; - w1[0] = 0; - w1[1] = 0; - w1[2] = 0; - w1[3] = 0; - w2[0] = 0; - w2[1] = 0; - w2[2] = 0; - w2[3] = 0; - w3[0] = 0; - w3[1] = 0; - w3[2] = 0; - w3[3] = 0; + case 4: + w3[3] = hc_bytealign_S (w2[2], w2[3], offset); + w3[2] = hc_bytealign_S (w2[1], w2[2], offset); + w3[1] = hc_bytealign_S (w2[0], w2[1], offset); + w3[0] = hc_bytealign_S (w1[3], w2[0], offset); + w2[3] = hc_bytealign_S (w1[2], w1[3], offset); + w2[2] = hc_bytealign_S (w1[1], w1[2], offset); + w2[1] = hc_bytealign_S (w1[0], w1[1], offset); + w2[0] = hc_bytealign_S (w0[3], w1[0], offset); + w1[3] = hc_bytealign_S (w0[2], w0[3], offset); + w1[2] = hc_bytealign_S (w0[1], w0[2], offset); + w1[1] = hc_bytealign_S (w0[0], w0[1], offset); + w1[0] = hc_bytealign_S ( 0, w0[0], offset); + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; break; - case 11: - w0[2] &= 0xffffff00; - w0[3] = 0; - w1[0] = 0; - w1[1] = 0; - w1[2] 
= 0; - w1[3] = 0; - w2[0] = 0; - w2[1] = 0; - w2[2] = 0; - w2[3] = 0; - w3[0] = 0; - w3[1] = 0; - w3[2] = 0; - w3[3] = 0; + case 5: + w3[3] = hc_bytealign_S (w2[1], w2[2], offset); + w3[2] = hc_bytealign_S (w2[0], w2[1], offset); + w3[1] = hc_bytealign_S (w1[3], w2[0], offset); + w3[0] = hc_bytealign_S (w1[2], w1[3], offset); + w2[3] = hc_bytealign_S (w1[1], w1[2], offset); + w2[2] = hc_bytealign_S (w1[0], w1[1], offset); + w2[1] = hc_bytealign_S (w0[3], w1[0], offset); + w2[0] = hc_bytealign_S (w0[2], w0[3], offset); + w1[3] = hc_bytealign_S (w0[1], w0[2], offset); + w1[2] = hc_bytealign_S (w0[0], w0[1], offset); + w1[1] = hc_bytealign_S ( 0, w0[0], offset); + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; break; - case 12: - w0[3] = 0; - w1[0] = 0; - w1[1] = 0; - w1[2] = 0; - w1[3] = 0; - w2[0] = 0; - w2[1] = 0; - w2[2] = 0; - w2[3] = 0; - w3[0] = 0; - w3[1] = 0; - w3[2] = 0; - w3[3] = 0; + case 6: + w3[3] = hc_bytealign_S (w2[0], w2[1], offset); + w3[2] = hc_bytealign_S (w1[3], w2[0], offset); + w3[1] = hc_bytealign_S (w1[2], w1[3], offset); + w3[0] = hc_bytealign_S (w1[1], w1[2], offset); + w2[3] = hc_bytealign_S (w1[0], w1[1], offset); + w2[2] = hc_bytealign_S (w0[3], w1[0], offset); + w2[1] = hc_bytealign_S (w0[2], w0[3], offset); + w2[0] = hc_bytealign_S (w0[1], w0[2], offset); + w1[3] = hc_bytealign_S (w0[0], w0[1], offset); + w1[2] = hc_bytealign_S ( 0, w0[0], offset); + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; break; - case 13: - w0[3] &= 0xff000000; - w1[0] = 0; - w1[1] = 0; - w1[2] = 0; - w1[3] = 0; - w2[0] = 0; - w2[1] = 0; - w2[2] = 0; - w2[3] = 0; - w3[0] = 0; - w3[1] = 0; - w3[2] = 0; - w3[3] = 0; + case 7: + w3[3] = hc_bytealign_S (w1[3], w2[0], offset); + w3[2] = hc_bytealign_S (w1[2], w1[3], offset); + w3[1] = hc_bytealign_S (w1[1], w1[2], offset); + w3[0] = hc_bytealign_S (w1[0], w1[1], offset); + w2[3] = hc_bytealign_S (w0[3], w1[0], offset); + w2[2] = hc_bytealign_S (w0[2], w0[3], offset); + 
w2[1] = hc_bytealign_S (w0[1], w0[2], offset); + w2[0] = hc_bytealign_S (w0[0], w0[1], offset); + w1[3] = hc_bytealign_S ( 0, w0[0], offset); + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; break; - case 14: - w0[3] &= 0xffff0000; - w1[0] = 0; - w1[1] = 0; - w1[2] = 0; - w1[3] = 0; - w2[0] = 0; - w2[1] = 0; - w2[2] = 0; - w2[3] = 0; - w3[0] = 0; - w3[1] = 0; - w3[2] = 0; - w3[3] = 0; + case 8: + w3[3] = hc_bytealign_S (w1[2], w1[3], offset); + w3[2] = hc_bytealign_S (w1[1], w1[2], offset); + w3[1] = hc_bytealign_S (w1[0], w1[1], offset); + w3[0] = hc_bytealign_S (w0[3], w1[0], offset); + w2[3] = hc_bytealign_S (w0[2], w0[3], offset); + w2[2] = hc_bytealign_S (w0[1], w0[2], offset); + w2[1] = hc_bytealign_S (w0[0], w0[1], offset); + w2[0] = hc_bytealign_S ( 0, w0[0], offset); + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; break; - case 15: - w0[3] &= 0xffffff00; - w1[0] = 0; - w1[1] = 0; - w1[2] = 0; - w1[3] = 0; - w2[0] = 0; - w2[1] = 0; - w2[2] = 0; - w2[3] = 0; - w3[0] = 0; - w3[1] = 0; - w3[2] = 0; - w3[3] = 0; + case 9: + w3[3] = hc_bytealign_S (w1[1], w1[2], offset); + w3[2] = hc_bytealign_S (w1[0], w1[1], offset); + w3[1] = hc_bytealign_S (w0[3], w1[0], offset); + w3[0] = hc_bytealign_S (w0[2], w0[3], offset); + w2[3] = hc_bytealign_S (w0[1], w0[2], offset); + w2[2] = hc_bytealign_S (w0[0], w0[1], offset); + w2[1] = hc_bytealign_S ( 0, w0[0], offset); + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; break; - case 16: - w1[0] = 0; - w1[1] = 0; - w1[2] = 0; - w1[3] = 0; - w2[0] = 0; - w2[1] = 0; - w2[2] = 0; - w2[3] = 0; - w3[0] = 0; - w3[1] = 0; - w3[2] = 0; - w3[3] = 0; + case 10: + w3[3] = hc_bytealign_S (w1[0], w1[1], offset); + w3[2] = hc_bytealign_S (w0[3], w1[0], offset); + w3[1] = hc_bytealign_S (w0[2], w0[3], offset); + w3[0] = hc_bytealign_S (w0[1], w0[2], offset); + w2[3] = 
hc_bytealign_S (w0[0], w0[1], offset); + w2[2] = hc_bytealign_S ( 0, w0[0], offset); + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; break; - case 17: - w1[0] &= 0xff000000; - w1[1] = 0; - w1[2] = 0; - w1[3] = 0; - w2[0] = 0; - w2[1] = 0; - w2[2] = 0; - w2[3] = 0; - w3[0] = 0; - w3[1] = 0; - w3[2] = 0; - w3[3] = 0; + case 11: + w3[3] = hc_bytealign_S (w0[3], w1[0], offset); + w3[2] = hc_bytealign_S (w0[2], w0[3], offset); + w3[1] = hc_bytealign_S (w0[1], w0[2], offset); + w3[0] = hc_bytealign_S (w0[0], w0[1], offset); + w2[3] = hc_bytealign_S ( 0, w0[0], offset); + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; break; - case 18: - w1[0] &= 0xffff0000; - w1[1] = 0; - w1[2] = 0; - w1[3] = 0; - w2[0] = 0; - w2[1] = 0; - w2[2] = 0; - w2[3] = 0; - w3[0] = 0; - w3[1] = 0; - w3[2] = 0; - w3[3] = 0; + case 12: + w3[3] = hc_bytealign_S (w0[2], w0[3], offset); + w3[2] = hc_bytealign_S (w0[1], w0[2], offset); + w3[1] = hc_bytealign_S (w0[0], w0[1], offset); + w3[0] = hc_bytealign_S ( 0, w0[0], offset); + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; break; - case 19: - w1[0] &= 0xffffff00; - w1[1] = 0; - w1[2] = 0; - w1[3] = 0; - w2[0] = 0; - w2[1] = 0; - w2[2] = 0; - w2[3] = 0; - w3[0] = 0; - w3[1] = 0; - w3[2] = 0; - w3[3] = 0; + case 13: + w3[3] = hc_bytealign_S (w0[1], w0[2], offset); + w3[2] = hc_bytealign_S (w0[0], w0[1], offset); + w3[1] = hc_bytealign_S ( 0, w0[0], offset); + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; break; - case 20: - w1[1] = 0; - w1[2] = 0; - w1[3] = 0; - w2[0] = 0; - w2[1] = 0; - w2[2] = 0; - w2[3] = 0; - w3[0] = 0; - w3[1] = 0; - w3[2] = 0; - 
w3[3] = 0; + case 14: + w3[3] = hc_bytealign_S (w0[0], w0[1], offset); + w3[2] = hc_bytealign_S ( 0, w0[0], offset); + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; break; - case 21: - w1[1] &= 0xff000000; - w1[2] = 0; - w1[3] = 0; - w2[0] = 0; - w2[1] = 0; - w2[2] = 0; - w2[3] = 0; - w3[0] = 0; - w3[1] = 0; - w3[2] = 0; - w3[3] = 0; + case 15: + w3[3] = hc_bytealign_S ( 0, w0[0], offset); + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; break; + } + #endif - case 22: - w1[1] &= 0xffff0000; - w1[2] = 0; - w1[3] = 0; - w2[0] = 0; - w2[1] = 0; - w2[2] = 0; - w2[3] = 0; - w3[0] = 0; - w3[1] = 0; - w3[2] = 0; - w3[3] = 0; + #if (defined IS_AMD && HAS_VPERM == 1) || defined IS_NV - break; + const int offset_mod_4 = offset & 3; - case 23: - w1[1] &= 0xffffff00; - w1[2] = 0; - w1[3] = 0; - w2[0] = 0; - w2[1] = 0; - w2[2] = 0; - w2[3] = 0; - w3[0] = 0; - w3[1] = 0; - w3[2] = 0; - w3[3] = 0; + const int offset_minus_4 = 4 - offset_mod_4; + + #if defined IS_NV + const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; + #endif + + #if defined IS_AMD + const int selector = 0x0706050403020100 >> (offset_minus_4 * 8); + #endif + + switch (offset_switch) + { + case 0: + w3[3] = hc_byte_perm_S (w3[2], w3[3], selector); + w3[2] = hc_byte_perm_S (w3[1], w3[2], selector); + w3[1] = hc_byte_perm_S (w3[0], w3[1], selector); + w3[0] = hc_byte_perm_S (w2[3], w3[0], selector); + w2[3] = hc_byte_perm_S (w2[2], w2[3], selector); + w2[2] = hc_byte_perm_S (w2[1], w2[2], selector); + w2[1] = hc_byte_perm_S (w2[0], w2[1], selector); + w2[0] = hc_byte_perm_S (w1[3], w2[0], selector); + w1[3] = hc_byte_perm_S (w1[2], w1[3], selector); + w1[2] = hc_byte_perm_S (w1[1], w1[2], selector); + w1[1] = hc_byte_perm_S 
(w1[0], w1[1], selector); + w1[0] = hc_byte_perm_S (w0[3], w1[0], selector); + w0[3] = hc_byte_perm_S (w0[2], w0[3], selector); + w0[2] = hc_byte_perm_S (w0[1], w0[2], selector); + w0[1] = hc_byte_perm_S (w0[0], w0[1], selector); + w0[0] = hc_byte_perm_S ( 0, w0[0], selector); break; - case 24: - w1[2] = 0; - w1[3] = 0; - w2[0] = 0; - w2[1] = 0; - w2[2] = 0; - w2[3] = 0; - w3[0] = 0; - w3[1] = 0; - w3[2] = 0; - w3[3] = 0; + case 1: + w3[3] = hc_byte_perm_S (w3[1], w3[2], selector); + w3[2] = hc_byte_perm_S (w3[0], w3[1], selector); + w3[1] = hc_byte_perm_S (w2[3], w3[0], selector); + w3[0] = hc_byte_perm_S (w2[2], w2[3], selector); + w2[3] = hc_byte_perm_S (w2[1], w2[2], selector); + w2[2] = hc_byte_perm_S (w2[0], w2[1], selector); + w2[1] = hc_byte_perm_S (w1[3], w2[0], selector); + w2[0] = hc_byte_perm_S (w1[2], w1[3], selector); + w1[3] = hc_byte_perm_S (w1[1], w1[2], selector); + w1[2] = hc_byte_perm_S (w1[0], w1[1], selector); + w1[1] = hc_byte_perm_S (w0[3], w1[0], selector); + w1[0] = hc_byte_perm_S (w0[2], w0[3], selector); + w0[3] = hc_byte_perm_S (w0[1], w0[2], selector); + w0[2] = hc_byte_perm_S (w0[0], w0[1], selector); + w0[1] = hc_byte_perm_S ( 0, w0[0], selector); + w0[0] = 0; break; - case 25: - w1[2] &= 0xff000000; - w1[3] = 0; - w2[0] = 0; - w2[1] = 0; - w2[2] = 0; - w2[3] = 0; - w3[0] = 0; - w3[1] = 0; - w3[2] = 0; - w3[3] = 0; + case 2: + w3[3] = hc_byte_perm_S (w3[0], w3[1], selector); + w3[2] = hc_byte_perm_S (w2[3], w3[0], selector); + w3[1] = hc_byte_perm_S (w2[2], w2[3], selector); + w3[0] = hc_byte_perm_S (w2[1], w2[2], selector); + w2[3] = hc_byte_perm_S (w2[0], w2[1], selector); + w2[2] = hc_byte_perm_S (w1[3], w2[0], selector); + w2[1] = hc_byte_perm_S (w1[2], w1[3], selector); + w2[0] = hc_byte_perm_S (w1[1], w1[2], selector); + w1[3] = hc_byte_perm_S (w1[0], w1[1], selector); + w1[2] = hc_byte_perm_S (w0[3], w1[0], selector); + w1[1] = hc_byte_perm_S (w0[2], w0[3], selector); + w1[0] = hc_byte_perm_S (w0[1], w0[2], selector); + w0[3] 
= hc_byte_perm_S (w0[0], w0[1], selector); + w0[2] = hc_byte_perm_S ( 0, w0[0], selector); + w0[1] = 0; + w0[0] = 0; break; - case 26: - w1[2] &= 0xffff0000; - w1[3] = 0; - w2[0] = 0; - w2[1] = 0; - w2[2] = 0; - w2[3] = 0; - w3[0] = 0; - w3[1] = 0; - w3[2] = 0; - w3[3] = 0; + case 3: + w3[3] = hc_byte_perm_S (w2[3], w3[0], selector); + w3[2] = hc_byte_perm_S (w2[2], w2[3], selector); + w3[1] = hc_byte_perm_S (w2[1], w2[2], selector); + w3[0] = hc_byte_perm_S (w2[0], w2[1], selector); + w2[3] = hc_byte_perm_S (w1[3], w2[0], selector); + w2[2] = hc_byte_perm_S (w1[2], w1[3], selector); + w2[1] = hc_byte_perm_S (w1[1], w1[2], selector); + w2[0] = hc_byte_perm_S (w1[0], w1[1], selector); + w1[3] = hc_byte_perm_S (w0[3], w1[0], selector); + w1[2] = hc_byte_perm_S (w0[2], w0[3], selector); + w1[1] = hc_byte_perm_S (w0[1], w0[2], selector); + w1[0] = hc_byte_perm_S (w0[0], w0[1], selector); + w0[3] = hc_byte_perm_S ( 0, w0[0], selector); + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; break; - case 27: - w1[2] &= 0xffffff00; - w1[3] = 0; - w2[0] = 0; - w2[1] = 0; - w2[2] = 0; - w2[3] = 0; - w3[0] = 0; - w3[1] = 0; - w3[2] = 0; - w3[3] = 0; + case 4: + w3[3] = hc_byte_perm_S (w2[2], w2[3], selector); + w3[2] = hc_byte_perm_S (w2[1], w2[2], selector); + w3[1] = hc_byte_perm_S (w2[0], w2[1], selector); + w3[0] = hc_byte_perm_S (w1[3], w2[0], selector); + w2[3] = hc_byte_perm_S (w1[2], w1[3], selector); + w2[2] = hc_byte_perm_S (w1[1], w1[2], selector); + w2[1] = hc_byte_perm_S (w1[0], w1[1], selector); + w2[0] = hc_byte_perm_S (w0[3], w1[0], selector); + w1[3] = hc_byte_perm_S (w0[2], w0[3], selector); + w1[2] = hc_byte_perm_S (w0[1], w0[2], selector); + w1[1] = hc_byte_perm_S (w0[0], w0[1], selector); + w1[0] = hc_byte_perm_S ( 0, w0[0], selector); + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; break; - case 28: - w1[3] = 0; - w2[0] = 0; - w2[1] = 0; - w2[2] = 0; - w2[3] = 0; - w3[0] = 0; - w3[1] = 0; - w3[2] = 0; - w3[3] = 0; + case 5: + w3[3] = hc_byte_perm_S (w2[1], w2[2], 
selector); + w3[2] = hc_byte_perm_S (w2[0], w2[1], selector); + w3[1] = hc_byte_perm_S (w1[3], w2[0], selector); + w3[0] = hc_byte_perm_S (w1[2], w1[3], selector); + w2[3] = hc_byte_perm_S (w1[1], w1[2], selector); + w2[2] = hc_byte_perm_S (w1[0], w1[1], selector); + w2[1] = hc_byte_perm_S (w0[3], w1[0], selector); + w2[0] = hc_byte_perm_S (w0[2], w0[3], selector); + w1[3] = hc_byte_perm_S (w0[1], w0[2], selector); + w1[2] = hc_byte_perm_S (w0[0], w0[1], selector); + w1[1] = hc_byte_perm_S ( 0, w0[0], selector); + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; break; - case 29: - w1[3] &= 0xff000000; - w2[0] = 0; - w2[1] = 0; - w2[2] = 0; - w2[3] = 0; - w3[0] = 0; - w3[1] = 0; - w3[2] = 0; - w3[3] = 0; + case 6: + w3[3] = hc_byte_perm_S (w2[0], w2[1], selector); + w3[2] = hc_byte_perm_S (w1[3], w2[0], selector); + w3[1] = hc_byte_perm_S (w1[2], w1[3], selector); + w3[0] = hc_byte_perm_S (w1[1], w1[2], selector); + w2[3] = hc_byte_perm_S (w1[0], w1[1], selector); + w2[2] = hc_byte_perm_S (w0[3], w1[0], selector); + w2[1] = hc_byte_perm_S (w0[2], w0[3], selector); + w2[0] = hc_byte_perm_S (w0[1], w0[2], selector); + w1[3] = hc_byte_perm_S (w0[0], w0[1], selector); + w1[2] = hc_byte_perm_S ( 0, w0[0], selector); + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; break; - case 30: - w1[3] &= 0xffff0000; - w2[0] = 0; - w2[1] = 0; - w2[2] = 0; - w2[3] = 0; - w3[0] = 0; - w3[1] = 0; - w3[2] = 0; - w3[3] = 0; + case 7: + w3[3] = hc_byte_perm_S (w1[3], w2[0], selector); + w3[2] = hc_byte_perm_S (w1[2], w1[3], selector); + w3[1] = hc_byte_perm_S (w1[1], w1[2], selector); + w3[0] = hc_byte_perm_S (w1[0], w1[1], selector); + w2[3] = hc_byte_perm_S (w0[3], w1[0], selector); + w2[2] = hc_byte_perm_S (w0[2], w0[3], selector); + w2[1] = hc_byte_perm_S (w0[1], w0[2], selector); + w2[0] = hc_byte_perm_S (w0[0], w0[1], selector); + w1[3] = hc_byte_perm_S ( 0, w0[0], selector); + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; 
+ w0[1] = 0; + w0[0] = 0; break; - case 31: - w1[3] &= 0xffffff00; - w2[0] = 0; - w2[1] = 0; - w2[2] = 0; - w2[3] = 0; - w3[0] = 0; - w3[1] = 0; - w3[2] = 0; - w3[3] = 0; + case 8: + w3[3] = hc_byte_perm_S (w1[2], w1[3], selector); + w3[2] = hc_byte_perm_S (w1[1], w1[2], selector); + w3[1] = hc_byte_perm_S (w1[0], w1[1], selector); + w3[0] = hc_byte_perm_S (w0[3], w1[0], selector); + w2[3] = hc_byte_perm_S (w0[2], w0[3], selector); + w2[2] = hc_byte_perm_S (w0[1], w0[2], selector); + w2[1] = hc_byte_perm_S (w0[0], w0[1], selector); + w2[0] = hc_byte_perm_S ( 0, w0[0], selector); + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; break; - case 32: - w2[0] = 0; - w2[1] = 0; - w2[2] = 0; - w2[3] = 0; - w3[0] = 0; - w3[1] = 0; - w3[2] = 0; - w3[3] = 0; + case 9: + w3[3] = hc_byte_perm_S (w1[1], w1[2], selector); + w3[2] = hc_byte_perm_S (w1[0], w1[1], selector); + w3[1] = hc_byte_perm_S (w0[3], w1[0], selector); + w3[0] = hc_byte_perm_S (w0[2], w0[3], selector); + w2[3] = hc_byte_perm_S (w0[1], w0[2], selector); + w2[2] = hc_byte_perm_S (w0[0], w0[1], selector); + w2[1] = hc_byte_perm_S ( 0, w0[0], selector); + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; break; - case 33: - w2[0] &= 0xff000000; - w2[1] = 0; - w2[2] = 0; - w2[3] = 0; - w3[0] = 0; - w3[1] = 0; - w3[2] = 0; - w3[3] = 0; + case 10: + w3[3] = hc_byte_perm_S (w1[0], w1[1], selector); + w3[2] = hc_byte_perm_S (w0[3], w1[0], selector); + w3[1] = hc_byte_perm_S (w0[2], w0[3], selector); + w3[0] = hc_byte_perm_S (w0[1], w0[2], selector); + w2[3] = hc_byte_perm_S (w0[0], w0[1], selector); + w2[2] = hc_byte_perm_S ( 0, w0[0], selector); + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; break; - case 34: - w2[0] &= 0xffff0000; - w2[1] = 0; - w2[2] = 0; - w2[3] = 0; - w3[0] = 0; - w3[1] = 0; - w3[2] = 0; - w3[3] = 
0; + case 11: + w3[3] = hc_byte_perm_S (w0[3], w1[0], selector); + w3[2] = hc_byte_perm_S (w0[2], w0[3], selector); + w3[1] = hc_byte_perm_S (w0[1], w0[2], selector); + w3[0] = hc_byte_perm_S (w0[0], w0[1], selector); + w2[3] = hc_byte_perm_S ( 0, w0[0], selector); + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; break; - case 35: - w2[0] &= 0xffffff00; - w2[1] = 0; - w2[2] = 0; - w2[3] = 0; - w3[0] = 0; - w3[1] = 0; - w3[2] = 0; - w3[3] = 0; + case 12: + w3[3] = hc_byte_perm_S (w0[2], w0[3], selector); + w3[2] = hc_byte_perm_S (w0[1], w0[2], selector); + w3[1] = hc_byte_perm_S (w0[0], w0[1], selector); + w3[0] = hc_byte_perm_S ( 0, w0[0], selector); + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; break; - case 36: - w2[1] = 0; - w2[2] = 0; - w2[3] = 0; - w3[0] = 0; - w3[1] = 0; - w3[2] = 0; - w3[3] = 0; + case 13: + w3[3] = hc_byte_perm_S (w0[1], w0[2], selector); + w3[2] = hc_byte_perm_S (w0[0], w0[1], selector); + w3[1] = hc_byte_perm_S ( 0, w0[0], selector); + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; break; - case 37: - w2[1] &= 0xff000000; - w2[2] = 0; - w2[3] = 0; - w3[0] = 0; - w3[1] = 0; - w3[2] = 0; - w3[3] = 0; + case 14: + w3[3] = hc_byte_perm_S (w0[0], w0[1], selector); + w3[2] = hc_byte_perm_S ( 0, w0[0], selector); + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; break; - case 38: - w2[1] &= 0xffff0000; - w2[2] = 0; - w2[3] = 0; - w3[0] = 0; - w3[1] = 0; - w3[2] = 0; - w3[3] = 0; + case 15: + w3[3] = hc_byte_perm_S ( 0, w0[0], selector); + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 
0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; break; + } + #endif +} - case 39: - w2[1] &= 0xffffff00; - w2[2] = 0; - w2[3] = 0; - w3[0] = 0; - w3[1] = 0; - w3[2] = 0; - w3[3] = 0; - - break; +DECLSPEC void switch_buffer_by_offset_carry_le_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, u32 *c0, u32 *c1, u32 *c2, u32 *c3, const u32 offset) +{ + const int offset_switch = offset / 4; - case 40: - w2[2] = 0; - w2[3] = 0; - w3[0] = 0; - w3[1] = 0; - w3[2] = 0; - w3[3] = 0; + #if defined IS_AMD || defined IS_GENERIC + switch (offset_switch) + { + case 0: + c0[0] = hc_bytealign_S (w3[3], 0, offset); + w3[3] = hc_bytealign_S (w3[2], w3[3], offset); + w3[2] = hc_bytealign_S (w3[1], w3[2], offset); + w3[1] = hc_bytealign_S (w3[0], w3[1], offset); + w3[0] = hc_bytealign_S (w2[3], w3[0], offset); + w2[3] = hc_bytealign_S (w2[2], w2[3], offset); + w2[2] = hc_bytealign_S (w2[1], w2[2], offset); + w2[1] = hc_bytealign_S (w2[0], w2[1], offset); + w2[0] = hc_bytealign_S (w1[3], w2[0], offset); + w1[3] = hc_bytealign_S (w1[2], w1[3], offset); + w1[2] = hc_bytealign_S (w1[1], w1[2], offset); + w1[1] = hc_bytealign_S (w1[0], w1[1], offset); + w1[0] = hc_bytealign_S (w0[3], w1[0], offset); + w0[3] = hc_bytealign_S (w0[2], w0[3], offset); + w0[2] = hc_bytealign_S (w0[1], w0[2], offset); + w0[1] = hc_bytealign_S (w0[0], w0[1], offset); + w0[0] = hc_bytealign_S ( 0, w0[0], offset); break; - case 41: - w2[2] &= 0xff000000; - w2[3] = 0; - w3[0] = 0; - w3[1] = 0; - w3[2] = 0; - w3[3] = 0; + case 1: + c0[1] = hc_bytealign_S (w3[3], 0, offset); + c0[0] = hc_bytealign_S (w3[2], w3[3], offset); + w3[3] = hc_bytealign_S (w3[1], w3[2], offset); + w3[2] = hc_bytealign_S (w3[0], w3[1], offset); + w3[1] = hc_bytealign_S (w2[3], w3[0], offset); + w3[0] = hc_bytealign_S (w2[2], w2[3], offset); + w2[3] = hc_bytealign_S (w2[1], w2[2], offset); + w2[2] = hc_bytealign_S (w2[0], w2[1], offset); + w2[1] = hc_bytealign_S (w1[3], 
w2[0], offset); + w2[0] = hc_bytealign_S (w1[2], w1[3], offset); + w1[3] = hc_bytealign_S (w1[1], w1[2], offset); + w1[2] = hc_bytealign_S (w1[0], w1[1], offset); + w1[1] = hc_bytealign_S (w0[3], w1[0], offset); + w1[0] = hc_bytealign_S (w0[2], w0[3], offset); + w0[3] = hc_bytealign_S (w0[1], w0[2], offset); + w0[2] = hc_bytealign_S (w0[0], w0[1], offset); + w0[1] = hc_bytealign_S ( 0, w0[0], offset); + w0[0] = 0; break; - case 42: - w2[2] &= 0xffff0000; - w2[3] = 0; - w3[0] = 0; - w3[1] = 0; - w3[2] = 0; - w3[3] = 0; + case 2: + c0[2] = hc_bytealign_S (w3[3], 0, offset); + c0[1] = hc_bytealign_S (w3[2], w3[3], offset); + c0[0] = hc_bytealign_S (w3[1], w3[2], offset); + w3[3] = hc_bytealign_S (w3[0], w3[1], offset); + w3[2] = hc_bytealign_S (w2[3], w3[0], offset); + w3[1] = hc_bytealign_S (w2[2], w2[3], offset); + w3[0] = hc_bytealign_S (w2[1], w2[2], offset); + w2[3] = hc_bytealign_S (w2[0], w2[1], offset); + w2[2] = hc_bytealign_S (w1[3], w2[0], offset); + w2[1] = hc_bytealign_S (w1[2], w1[3], offset); + w2[0] = hc_bytealign_S (w1[1], w1[2], offset); + w1[3] = hc_bytealign_S (w1[0], w1[1], offset); + w1[2] = hc_bytealign_S (w0[3], w1[0], offset); + w1[1] = hc_bytealign_S (w0[2], w0[3], offset); + w1[0] = hc_bytealign_S (w0[1], w0[2], offset); + w0[3] = hc_bytealign_S (w0[0], w0[1], offset); + w0[2] = hc_bytealign_S ( 0, w0[0], offset); + w0[1] = 0; + w0[0] = 0; break; - case 43: - w2[2] &= 0xffffff00; - w2[3] = 0; - w3[0] = 0; - w3[1] = 0; - w3[2] = 0; - w3[3] = 0; + case 3: + c0[3] = hc_bytealign_S (w3[3], 0, offset); + c0[2] = hc_bytealign_S (w3[2], w3[3], offset); + c0[1] = hc_bytealign_S (w3[1], w3[2], offset); + c0[0] = hc_bytealign_S (w3[0], w3[1], offset); + w3[3] = hc_bytealign_S (w2[3], w3[0], offset); + w3[2] = hc_bytealign_S (w2[2], w2[3], offset); + w3[1] = hc_bytealign_S (w2[1], w2[2], offset); + w3[0] = hc_bytealign_S (w2[0], w2[1], offset); + w2[3] = hc_bytealign_S (w1[3], w2[0], offset); + w2[2] = hc_bytealign_S (w1[2], w1[3], offset); + w2[1] = 
hc_bytealign_S (w1[1], w1[2], offset); + w2[0] = hc_bytealign_S (w1[0], w1[1], offset); + w1[3] = hc_bytealign_S (w0[3], w1[0], offset); + w1[2] = hc_bytealign_S (w0[2], w0[3], offset); + w1[1] = hc_bytealign_S (w0[1], w0[2], offset); + w1[0] = hc_bytealign_S (w0[0], w0[1], offset); + w0[3] = hc_bytealign_S ( 0, w0[0], offset); + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; break; - case 44: - w2[3] = 0; - w3[0] = 0; - w3[1] = 0; - w3[2] = 0; - w3[3] = 0; + case 4: + c1[0] = hc_bytealign_S (w3[3], 0, offset); + c0[3] = hc_bytealign_S (w3[2], w3[3], offset); + c0[2] = hc_bytealign_S (w3[1], w3[2], offset); + c0[1] = hc_bytealign_S (w3[0], w3[1], offset); + c0[0] = hc_bytealign_S (w2[3], w3[0], offset); + w3[3] = hc_bytealign_S (w2[2], w2[3], offset); + w3[2] = hc_bytealign_S (w2[1], w2[2], offset); + w3[1] = hc_bytealign_S (w2[0], w2[1], offset); + w3[0] = hc_bytealign_S (w1[3], w2[0], offset); + w2[3] = hc_bytealign_S (w1[2], w1[3], offset); + w2[2] = hc_bytealign_S (w1[1], w1[2], offset); + w2[1] = hc_bytealign_S (w1[0], w1[1], offset); + w2[0] = hc_bytealign_S (w0[3], w1[0], offset); + w1[3] = hc_bytealign_S (w0[2], w0[3], offset); + w1[2] = hc_bytealign_S (w0[1], w0[2], offset); + w1[1] = hc_bytealign_S (w0[0], w0[1], offset); + w1[0] = hc_bytealign_S ( 0, w0[0], offset); + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; break; - case 45: - w2[3] &= 0xff000000; - w3[0] = 0; - w3[1] = 0; - w3[2] = 0; - w3[3] = 0; + case 5: + c1[1] = hc_bytealign_S (w3[3], 0, offset); + c1[0] = hc_bytealign_S (w3[2], w3[3], offset); + c0[3] = hc_bytealign_S (w3[1], w3[2], offset); + c0[2] = hc_bytealign_S (w3[0], w3[1], offset); + c0[1] = hc_bytealign_S (w2[3], w3[0], offset); + c0[0] = hc_bytealign_S (w2[2], w2[3], offset); + w3[3] = hc_bytealign_S (w2[1], w2[2], offset); + w3[2] = hc_bytealign_S (w2[0], w2[1], offset); + w3[1] = hc_bytealign_S (w1[3], w2[0], offset); + w3[0] = hc_bytealign_S (w1[2], w1[3], offset); + w2[3] = hc_bytealign_S (w1[1], w1[2], offset); + w2[2] = 
hc_bytealign_S (w1[0], w1[1], offset); + w2[1] = hc_bytealign_S (w0[3], w1[0], offset); + w2[0] = hc_bytealign_S (w0[2], w0[3], offset); + w1[3] = hc_bytealign_S (w0[1], w0[2], offset); + w1[2] = hc_bytealign_S (w0[0], w0[1], offset); + w1[1] = hc_bytealign_S ( 0, w0[0], offset); + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; break; - case 46: - w2[3] &= 0xffff0000; - w3[0] = 0; - w3[1] = 0; - w3[2] = 0; - w3[3] = 0; + case 6: + c1[2] = hc_bytealign_S (w3[3], 0, offset); + c1[1] = hc_bytealign_S (w3[2], w3[3], offset); + c1[0] = hc_bytealign_S (w3[1], w3[2], offset); + c0[3] = hc_bytealign_S (w3[0], w3[1], offset); + c0[2] = hc_bytealign_S (w2[3], w3[0], offset); + c0[1] = hc_bytealign_S (w2[2], w2[3], offset); + c0[0] = hc_bytealign_S (w2[1], w2[2], offset); + w3[3] = hc_bytealign_S (w2[0], w2[1], offset); + w3[2] = hc_bytealign_S (w1[3], w2[0], offset); + w3[1] = hc_bytealign_S (w1[2], w1[3], offset); + w3[0] = hc_bytealign_S (w1[1], w1[2], offset); + w2[3] = hc_bytealign_S (w1[0], w1[1], offset); + w2[2] = hc_bytealign_S (w0[3], w1[0], offset); + w2[1] = hc_bytealign_S (w0[2], w0[3], offset); + w2[0] = hc_bytealign_S (w0[1], w0[2], offset); + w1[3] = hc_bytealign_S (w0[0], w0[1], offset); + w1[2] = hc_bytealign_S ( 0, w0[0], offset); + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; break; - case 47: - w2[3] &= 0xffffff00; - w3[0] = 0; - w3[1] = 0; - w3[2] = 0; - w3[3] = 0; + case 7: + c1[3] = hc_bytealign_S (w3[3], 0, offset); + c1[2] = hc_bytealign_S (w3[2], w3[3], offset); + c1[1] = hc_bytealign_S (w3[1], w3[2], offset); + c1[0] = hc_bytealign_S (w3[0], w3[1], offset); + c0[3] = hc_bytealign_S (w2[3], w3[0], offset); + c0[2] = hc_bytealign_S (w2[2], w2[3], offset); + c0[1] = hc_bytealign_S (w2[1], w2[2], offset); + c0[0] = hc_bytealign_S (w2[0], w2[1], offset); + w3[3] = hc_bytealign_S (w1[3], w2[0], offset); + w3[2] = hc_bytealign_S (w1[2], w1[3], offset); + w3[1] = hc_bytealign_S (w1[1], w1[2], offset); + w3[0] 
= hc_bytealign_S (w1[0], w1[1], offset); + w2[3] = hc_bytealign_S (w0[3], w1[0], offset); + w2[2] = hc_bytealign_S (w0[2], w0[3], offset); + w2[1] = hc_bytealign_S (w0[1], w0[2], offset); + w2[0] = hc_bytealign_S (w0[0], w0[1], offset); + w1[3] = hc_bytealign_S ( 0, w0[0], offset); + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; break; - case 48: - w3[0] = 0; - w3[1] = 0; - w3[2] = 0; - w3[3] = 0; + case 8: + c2[0] = hc_bytealign_S (w3[3], 0, offset); + c1[3] = hc_bytealign_S (w3[2], w3[3], offset); + c1[2] = hc_bytealign_S (w3[1], w3[2], offset); + c1[1] = hc_bytealign_S (w3[0], w3[1], offset); + c1[0] = hc_bytealign_S (w2[3], w3[0], offset); + c0[3] = hc_bytealign_S (w2[2], w2[3], offset); + c0[2] = hc_bytealign_S (w2[1], w2[2], offset); + c0[1] = hc_bytealign_S (w2[0], w2[1], offset); + c0[0] = hc_bytealign_S (w1[3], w2[0], offset); + w3[3] = hc_bytealign_S (w1[2], w1[3], offset); + w3[2] = hc_bytealign_S (w1[1], w1[2], offset); + w3[1] = hc_bytealign_S (w1[0], w1[1], offset); + w3[0] = hc_bytealign_S (w0[3], w1[0], offset); + w2[3] = hc_bytealign_S (w0[2], w0[3], offset); + w2[2] = hc_bytealign_S (w0[1], w0[2], offset); + w2[1] = hc_bytealign_S (w0[0], w0[1], offset); + w2[0] = hc_bytealign_S ( 0, w0[0], offset); + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; break; - case 49: - w3[0] &= 0xff000000; - w3[1] = 0; - w3[2] = 0; - w3[3] = 0; + case 9: + c2[1] = hc_bytealign_S (w3[3], 0, offset); + c2[0] = hc_bytealign_S (w3[2], w3[3], offset); + c1[3] = hc_bytealign_S (w3[1], w3[2], offset); + c1[2] = hc_bytealign_S (w3[0], w3[1], offset); + c1[1] = hc_bytealign_S (w2[3], w3[0], offset); + c1[0] = hc_bytealign_S (w2[2], w2[3], offset); + c0[3] = hc_bytealign_S (w2[1], w2[2], offset); + c0[2] = hc_bytealign_S (w2[0], w2[1], offset); + c0[1] = hc_bytealign_S (w1[3], w2[0], offset); + c0[0] = hc_bytealign_S (w1[2], w1[3], offset); + w3[3] = hc_bytealign_S (w1[1], w1[2], 
offset); + w3[2] = hc_bytealign_S (w1[0], w1[1], offset); + w3[1] = hc_bytealign_S (w0[3], w1[0], offset); + w3[0] = hc_bytealign_S (w0[2], w0[3], offset); + w2[3] = hc_bytealign_S (w0[1], w0[2], offset); + w2[2] = hc_bytealign_S (w0[0], w0[1], offset); + w2[1] = hc_bytealign_S ( 0, w0[0], offset); + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; break; - case 50: - w3[0] &= 0xffff0000; - w3[1] = 0; - w3[2] = 0; - w3[3] = 0; + case 10: + c2[2] = hc_bytealign_S (w3[3], 0, offset); + c2[1] = hc_bytealign_S (w3[2], w3[3], offset); + c2[0] = hc_bytealign_S (w3[1], w3[2], offset); + c1[3] = hc_bytealign_S (w3[0], w3[1], offset); + c1[2] = hc_bytealign_S (w2[3], w3[0], offset); + c1[1] = hc_bytealign_S (w2[2], w2[3], offset); + c1[0] = hc_bytealign_S (w2[1], w2[2], offset); + c0[3] = hc_bytealign_S (w2[0], w2[1], offset); + c0[2] = hc_bytealign_S (w1[3], w2[0], offset); + c0[1] = hc_bytealign_S (w1[2], w1[3], offset); + c0[0] = hc_bytealign_S (w1[1], w1[2], offset); + w3[3] = hc_bytealign_S (w1[0], w1[1], offset); + w3[2] = hc_bytealign_S (w0[3], w1[0], offset); + w3[1] = hc_bytealign_S (w0[2], w0[3], offset); + w3[0] = hc_bytealign_S (w0[1], w0[2], offset); + w2[3] = hc_bytealign_S (w0[0], w0[1], offset); + w2[2] = hc_bytealign_S ( 0, w0[0], offset); + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 11: + c2[3] = hc_bytealign_S (w3[3], 0, offset); + c2[2] = hc_bytealign_S (w3[2], w3[3], offset); + c2[1] = hc_bytealign_S (w3[1], w3[2], offset); + c2[0] = hc_bytealign_S (w3[0], w3[1], offset); + c1[3] = hc_bytealign_S (w2[3], w3[0], offset); + c1[2] = hc_bytealign_S (w2[2], w2[3], offset); + c1[1] = hc_bytealign_S (w2[1], w2[2], offset); + c1[0] = hc_bytealign_S (w2[0], w2[1], offset); + c0[3] = hc_bytealign_S (w1[3], w2[0], offset); + c0[2] = hc_bytealign_S (w1[2], w1[3], offset); + c0[1] = hc_bytealign_S 
(w1[1], w1[2], offset); + c0[0] = hc_bytealign_S (w1[0], w1[1], offset); + w3[3] = hc_bytealign_S (w0[3], w1[0], offset); + w3[2] = hc_bytealign_S (w0[2], w0[3], offset); + w3[1] = hc_bytealign_S (w0[1], w0[2], offset); + w3[0] = hc_bytealign_S (w0[0], w0[1], offset); + w2[3] = hc_bytealign_S ( 0, w0[0], offset); + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; break; - case 51: - w3[0] &= 0xffffff00; - w3[1] = 0; - w3[2] = 0; - w3[3] = 0; + case 12: + c3[0] = hc_bytealign_S (w3[3], 0, offset); + c2[3] = hc_bytealign_S (w3[2], w3[3], offset); + c2[2] = hc_bytealign_S (w3[1], w3[2], offset); + c2[1] = hc_bytealign_S (w3[0], w3[1], offset); + c2[0] = hc_bytealign_S (w2[3], w3[0], offset); + c1[3] = hc_bytealign_S (w2[2], w2[3], offset); + c1[2] = hc_bytealign_S (w2[1], w2[2], offset); + c1[1] = hc_bytealign_S (w2[0], w2[1], offset); + c1[0] = hc_bytealign_S (w1[3], w2[0], offset); + c0[3] = hc_bytealign_S (w1[2], w1[3], offset); + c0[2] = hc_bytealign_S (w1[1], w1[2], offset); + c0[1] = hc_bytealign_S (w1[0], w1[1], offset); + c0[0] = hc_bytealign_S (w0[3], w1[0], offset); + w3[3] = hc_bytealign_S (w0[2], w0[3], offset); + w3[2] = hc_bytealign_S (w0[1], w0[2], offset); + w3[1] = hc_bytealign_S (w0[0], w0[1], offset); + w3[0] = hc_bytealign_S ( 0, w0[0], offset); + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; break; - case 52: - w3[1] = 0; - w3[2] = 0; - w3[3] = 0; + case 13: + c3[1] = hc_bytealign_S (w3[3], 0, offset); + c3[0] = hc_bytealign_S (w3[2], w3[3], offset); + c2[3] = hc_bytealign_S (w3[1], w3[2], offset); + c2[2] = hc_bytealign_S (w3[0], w3[1], offset); + c2[1] = hc_bytealign_S (w2[3], w3[0], offset); + c2[0] = hc_bytealign_S (w2[2], w2[3], offset); + c1[3] = hc_bytealign_S (w2[1], w2[2], offset); + c1[2] = hc_bytealign_S (w2[0], w2[1], offset); + c1[1] = 
hc_bytealign_S (w1[3], w2[0], offset); + c1[0] = hc_bytealign_S (w1[2], w1[3], offset); + c0[3] = hc_bytealign_S (w1[1], w1[2], offset); + c0[2] = hc_bytealign_S (w1[0], w1[1], offset); + c0[1] = hc_bytealign_S (w0[3], w1[0], offset); + c0[0] = hc_bytealign_S (w0[2], w0[3], offset); + w3[3] = hc_bytealign_S (w0[1], w0[2], offset); + w3[2] = hc_bytealign_S (w0[0], w0[1], offset); + w3[1] = hc_bytealign_S ( 0, w0[0], offset); + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; break; - case 53: - w3[1] &= 0xff000000; - w3[2] = 0; - w3[3] = 0; + case 14: + c3[2] = hc_bytealign_S (w3[3], 0, offset); + c3[1] = hc_bytealign_S (w3[2], w3[3], offset); + c3[0] = hc_bytealign_S (w3[1], w3[2], offset); + c2[3] = hc_bytealign_S (w3[0], w3[1], offset); + c2[2] = hc_bytealign_S (w2[3], w3[0], offset); + c2[1] = hc_bytealign_S (w2[2], w2[3], offset); + c2[0] = hc_bytealign_S (w2[1], w2[2], offset); + c1[3] = hc_bytealign_S (w2[0], w2[1], offset); + c1[2] = hc_bytealign_S (w1[3], w2[0], offset); + c1[1] = hc_bytealign_S (w1[2], w1[3], offset); + c1[0] = hc_bytealign_S (w1[1], w1[2], offset); + c0[3] = hc_bytealign_S (w1[0], w1[1], offset); + c0[2] = hc_bytealign_S (w0[3], w1[0], offset); + c0[1] = hc_bytealign_S (w0[2], w0[3], offset); + c0[0] = hc_bytealign_S (w0[1], w0[2], offset); + w3[3] = hc_bytealign_S (w0[0], w0[1], offset); + w3[2] = hc_bytealign_S ( 0, w0[0], offset); + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; break; - case 54: - w3[1] &= 0xffff0000; - w3[2] = 0; - w3[3] = 0; + case 15: + c3[3] = hc_bytealign_S (w3[3], 0, offset); + c3[2] = hc_bytealign_S (w3[2], w3[3], offset); + c3[1] = hc_bytealign_S (w3[1], w3[2], offset); + c3[0] = hc_bytealign_S (w3[0], w3[1], offset); + c2[3] = hc_bytealign_S (w2[3], w3[0], 
offset); + c2[2] = hc_bytealign_S (w2[2], w2[3], offset); + c2[1] = hc_bytealign_S (w2[1], w2[2], offset); + c2[0] = hc_bytealign_S (w2[0], w2[1], offset); + c1[3] = hc_bytealign_S (w1[3], w2[0], offset); + c1[2] = hc_bytealign_S (w1[2], w1[3], offset); + c1[1] = hc_bytealign_S (w1[1], w1[2], offset); + c1[0] = hc_bytealign_S (w1[0], w1[1], offset); + c0[3] = hc_bytealign_S (w0[3], w1[0], offset); + c0[2] = hc_bytealign_S (w0[2], w0[3], offset); + c0[1] = hc_bytealign_S (w0[1], w0[2], offset); + c0[0] = hc_bytealign_S (w0[0], w0[1], offset); + w3[3] = hc_bytealign_S ( 0, w0[0], offset); + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; break; + } + #endif - case 55: - w3[1] &= 0xffffff00; - w3[2] = 0; - w3[3] = 0; + #ifdef IS_NV + // could be improved, too + switch (offset_switch) + { + case 0: + c0[0] = hc_bytealign_S (w3[3], 0, offset); + w3[3] = hc_bytealign_S (w3[2], w3[3], offset); + w3[2] = hc_bytealign_S (w3[1], w3[2], offset); + w3[1] = hc_bytealign_S (w3[0], w3[1], offset); + w3[0] = hc_bytealign_S (w2[3], w3[0], offset); + w2[3] = hc_bytealign_S (w2[2], w2[3], offset); + w2[2] = hc_bytealign_S (w2[1], w2[2], offset); + w2[1] = hc_bytealign_S (w2[0], w2[1], offset); + w2[0] = hc_bytealign_S (w1[3], w2[0], offset); + w1[3] = hc_bytealign_S (w1[2], w1[3], offset); + w1[2] = hc_bytealign_S (w1[1], w1[2], offset); + w1[1] = hc_bytealign_S (w1[0], w1[1], offset); + w1[0] = hc_bytealign_S (w0[3], w1[0], offset); + w0[3] = hc_bytealign_S (w0[2], w0[3], offset); + w0[2] = hc_bytealign_S (w0[1], w0[2], offset); + w0[1] = hc_bytealign_S (w0[0], w0[1], offset); + w0[0] = hc_bytealign_S ( 0, w0[0], offset); break; - case 56: - w3[2] = 0; - w3[3] = 0; + case 1: + c0[1] = hc_bytealign_S (w3[3], 0, offset); + c0[0] = hc_bytealign_S (w3[2], w3[3], offset); + w3[3] = hc_bytealign_S (w3[1], w3[2], offset); + w3[2] = hc_bytealign_S 
(w3[0], w3[1], offset); + w3[1] = hc_bytealign_S (w2[3], w3[0], offset); + w3[0] = hc_bytealign_S (w2[2], w2[3], offset); + w2[3] = hc_bytealign_S (w2[1], w2[2], offset); + w2[2] = hc_bytealign_S (w2[0], w2[1], offset); + w2[1] = hc_bytealign_S (w1[3], w2[0], offset); + w2[0] = hc_bytealign_S (w1[2], w1[3], offset); + w1[3] = hc_bytealign_S (w1[1], w1[2], offset); + w1[2] = hc_bytealign_S (w1[0], w1[1], offset); + w1[1] = hc_bytealign_S (w0[3], w1[0], offset); + w1[0] = hc_bytealign_S (w0[2], w0[3], offset); + w0[3] = hc_bytealign_S (w0[1], w0[2], offset); + w0[2] = hc_bytealign_S (w0[0], w0[1], offset); + w0[1] = hc_bytealign_S ( 0, w0[0], offset); + w0[0] = 0; break; - case 57: - w3[2] &= 0xff000000; - w3[3] = 0; + case 2: + c0[2] = hc_bytealign_S (w3[3], 0, offset); + c0[1] = hc_bytealign_S (w3[2], w3[3], offset); + c0[0] = hc_bytealign_S (w3[1], w3[2], offset); + w3[3] = hc_bytealign_S (w3[0], w3[1], offset); + w3[2] = hc_bytealign_S (w2[3], w3[0], offset); + w3[1] = hc_bytealign_S (w2[2], w2[3], offset); + w3[0] = hc_bytealign_S (w2[1], w2[2], offset); + w2[3] = hc_bytealign_S (w2[0], w2[1], offset); + w2[2] = hc_bytealign_S (w1[3], w2[0], offset); + w2[1] = hc_bytealign_S (w1[2], w1[3], offset); + w2[0] = hc_bytealign_S (w1[1], w1[2], offset); + w1[3] = hc_bytealign_S (w1[0], w1[1], offset); + w1[2] = hc_bytealign_S (w0[3], w1[0], offset); + w1[1] = hc_bytealign_S (w0[2], w0[3], offset); + w1[0] = hc_bytealign_S (w0[1], w0[2], offset); + w0[3] = hc_bytealign_S (w0[0], w0[1], offset); + w0[2] = hc_bytealign_S ( 0, w0[0], offset); + w0[1] = 0; + w0[0] = 0; break; - case 58: - w3[2] &= 0xffff0000; - w3[3] = 0; + case 3: + c0[3] = hc_bytealign_S (w3[3], 0, offset); + c0[2] = hc_bytealign_S (w3[2], w3[3], offset); + c0[1] = hc_bytealign_S (w3[1], w3[2], offset); + c0[0] = hc_bytealign_S (w3[0], w3[1], offset); + w3[3] = hc_bytealign_S (w2[3], w3[0], offset); + w3[2] = hc_bytealign_S (w2[2], w2[3], offset); + w3[1] = hc_bytealign_S (w2[1], w2[2], offset); + w3[0] = 
hc_bytealign_S (w2[0], w2[1], offset); + w2[3] = hc_bytealign_S (w1[3], w2[0], offset); + w2[2] = hc_bytealign_S (w1[2], w1[3], offset); + w2[1] = hc_bytealign_S (w1[1], w1[2], offset); + w2[0] = hc_bytealign_S (w1[0], w1[1], offset); + w1[3] = hc_bytealign_S (w0[3], w1[0], offset); + w1[2] = hc_bytealign_S (w0[2], w0[3], offset); + w1[1] = hc_bytealign_S (w0[1], w0[2], offset); + w1[0] = hc_bytealign_S (w0[0], w0[1], offset); + w0[3] = hc_bytealign_S ( 0, w0[0], offset); + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; break; - case 59: - w3[2] &= 0xffffff00; - w3[3] = 0; + case 4: + c1[0] = hc_bytealign_S (w3[3], 0, offset); + c0[3] = hc_bytealign_S (w3[2], w3[3], offset); + c0[2] = hc_bytealign_S (w3[1], w3[2], offset); + c0[1] = hc_bytealign_S (w3[0], w3[1], offset); + c0[0] = hc_bytealign_S (w2[3], w3[0], offset); + w3[3] = hc_bytealign_S (w2[2], w2[3], offset); + w3[2] = hc_bytealign_S (w2[1], w2[2], offset); + w3[1] = hc_bytealign_S (w2[0], w2[1], offset); + w3[0] = hc_bytealign_S (w1[3], w2[0], offset); + w2[3] = hc_bytealign_S (w1[2], w1[3], offset); + w2[2] = hc_bytealign_S (w1[1], w1[2], offset); + w2[1] = hc_bytealign_S (w1[0], w1[1], offset); + w2[0] = hc_bytealign_S (w0[3], w1[0], offset); + w1[3] = hc_bytealign_S (w0[2], w0[3], offset); + w1[2] = hc_bytealign_S (w0[1], w0[2], offset); + w1[1] = hc_bytealign_S (w0[0], w0[1], offset); + w1[0] = hc_bytealign_S ( 0, w0[0], offset); + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; break; - case 60: - w3[3] = 0; + case 5: + c1[1] = hc_bytealign_S (w3[3], 0, offset); + c1[0] = hc_bytealign_S (w3[2], w3[3], offset); + c0[3] = hc_bytealign_S (w3[1], w3[2], offset); + c0[2] = hc_bytealign_S (w3[0], w3[1], offset); + c0[1] = hc_bytealign_S (w2[3], w3[0], offset); + c0[0] = hc_bytealign_S (w2[2], w2[3], offset); + w3[3] = hc_bytealign_S (w2[1], w2[2], offset); + w3[2] = hc_bytealign_S (w2[0], w2[1], offset); + w3[1] = hc_bytealign_S (w1[3], w2[0], offset); + w3[0] = hc_bytealign_S (w1[2], w1[3], offset); + w2[3] = 
hc_bytealign_S (w1[1], w1[2], offset); + w2[2] = hc_bytealign_S (w1[0], w1[1], offset); + w2[1] = hc_bytealign_S (w0[3], w1[0], offset); + w2[0] = hc_bytealign_S (w0[2], w0[3], offset); + w1[3] = hc_bytealign_S (w0[1], w0[2], offset); + w1[2] = hc_bytealign_S (w0[0], w0[1], offset); + w1[1] = hc_bytealign_S ( 0, w0[0], offset); + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; break; - case 61: - w3[3] &= 0xff000000; + case 6: + c1[2] = hc_bytealign_S (w3[3], 0, offset); + c1[1] = hc_bytealign_S (w3[2], w3[3], offset); + c1[0] = hc_bytealign_S (w3[1], w3[2], offset); + c0[3] = hc_bytealign_S (w3[0], w3[1], offset); + c0[2] = hc_bytealign_S (w2[3], w3[0], offset); + c0[1] = hc_bytealign_S (w2[2], w2[3], offset); + c0[0] = hc_bytealign_S (w2[1], w2[2], offset); + w3[3] = hc_bytealign_S (w2[0], w2[1], offset); + w3[2] = hc_bytealign_S (w1[3], w2[0], offset); + w3[1] = hc_bytealign_S (w1[2], w1[3], offset); + w3[0] = hc_bytealign_S (w1[1], w1[2], offset); + w2[3] = hc_bytealign_S (w1[0], w1[1], offset); + w2[2] = hc_bytealign_S (w0[3], w1[0], offset); + w2[1] = hc_bytealign_S (w0[2], w0[3], offset); + w2[0] = hc_bytealign_S (w0[1], w0[2], offset); + w1[3] = hc_bytealign_S (w0[0], w0[1], offset); + w1[2] = hc_bytealign_S ( 0, w0[0], offset); + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; break; - case 62: - w3[3] &= 0xffff0000; + case 7: + c1[3] = hc_bytealign_S (w3[3], 0, offset); + c1[2] = hc_bytealign_S (w3[2], w3[3], offset); + c1[1] = hc_bytealign_S (w3[1], w3[2], offset); + c1[0] = hc_bytealign_S (w3[0], w3[1], offset); + c0[3] = hc_bytealign_S (w2[3], w3[0], offset); + c0[2] = hc_bytealign_S (w2[2], w2[3], offset); + c0[1] = hc_bytealign_S (w2[1], w2[2], offset); + c0[0] = hc_bytealign_S (w2[0], w2[1], offset); + w3[3] = hc_bytealign_S (w1[3], w2[0], offset); + w3[2] = hc_bytealign_S (w1[2], w1[3], offset); + w3[1] = hc_bytealign_S (w1[1], w1[2], offset); + w3[0] = hc_bytealign_S (w1[0], w1[1], offset); + w2[3] = 
hc_bytealign_S (w0[3], w1[0], offset); + w2[2] = hc_bytealign_S (w0[2], w0[3], offset); + w2[1] = hc_bytealign_S (w0[1], w0[2], offset); + w2[0] = hc_bytealign_S (w0[0], w0[1], offset); + w1[3] = hc_bytealign_S ( 0, w0[0], offset); + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; break; - case 63: - w3[3] &= 0xffffff00; + case 8: + c2[0] = hc_bytealign_S (w3[3], 0, offset); + c1[3] = hc_bytealign_S (w3[2], w3[3], offset); + c1[2] = hc_bytealign_S (w3[1], w3[2], offset); + c1[1] = hc_bytealign_S (w3[0], w3[1], offset); + c1[0] = hc_bytealign_S (w2[3], w3[0], offset); + c0[3] = hc_bytealign_S (w2[2], w2[3], offset); + c0[2] = hc_bytealign_S (w2[1], w2[2], offset); + c0[1] = hc_bytealign_S (w2[0], w2[1], offset); + c0[0] = hc_bytealign_S (w1[3], w2[0], offset); + w3[3] = hc_bytealign_S (w1[2], w1[3], offset); + w3[2] = hc_bytealign_S (w1[1], w1[2], offset); + w3[1] = hc_bytealign_S (w1[0], w1[1], offset); + w3[0] = hc_bytealign_S (w0[3], w1[0], offset); + w2[3] = hc_bytealign_S (w0[2], w0[3], offset); + w2[2] = hc_bytealign_S (w0[1], w0[2], offset); + w2[1] = hc_bytealign_S (w0[0], w0[1], offset); + w2[0] = hc_bytealign_S ( 0, w0[0], offset); + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; break; - } -} - -DECLSPEC void set_mark_1x4_S (u32 *v, const u32 offset) -{ - const u32 c = (offset & 15) / 4; - const u32 r = 0xff << ((offset & 3) * 8); - - v[0] = (c == 0) ? r : 0; - v[1] = (c == 1) ? r : 0; - v[2] = (c == 2) ? r : 0; - v[3] = (c == 3) ? r : 0; -} - -DECLSPEC void append_helper_1x4_S (u32 *r, const u32 v, const u32 *m) -{ - r[0] |= v & m[0]; - r[1] |= v & m[1]; - r[2] |= v & m[2]; - r[3] |= v & m[3]; -} - -DECLSPEC void append_0x01_2x4_S (u32 *w0, u32 *w1, const u32 offset) -{ - u32 v[4]; - - set_mark_1x4_S (v, offset); - - const u32 offset16 = offset / 16; - - append_helper_1x4_S (w0, ((offset16 == 0) ? 
0x01010101 : 0), v); - append_helper_1x4_S (w1, ((offset16 == 1) ? 0x01010101 : 0), v); -} - -DECLSPEC void append_0x06_2x4_S (u32 *w0, u32 *w1, const u32 offset) -{ - u32 v[4]; - - set_mark_1x4_S (v, offset); - - const u32 offset16 = offset / 16; - - append_helper_1x4_S (w0, ((offset16 == 0) ? 0x06060606 : 0), v); - append_helper_1x4_S (w1, ((offset16 == 1) ? 0x06060606 : 0), v); -} - -DECLSPEC void append_0x01_4x4_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, const u32 offset) -{ - u32 v[4]; - set_mark_1x4_S (v, offset); + case 9: + c2[1] = hc_bytealign_S (w3[3], 0, offset); + c2[0] = hc_bytealign_S (w3[2], w3[3], offset); + c1[3] = hc_bytealign_S (w3[1], w3[2], offset); + c1[2] = hc_bytealign_S (w3[0], w3[1], offset); + c1[1] = hc_bytealign_S (w2[3], w3[0], offset); + c1[0] = hc_bytealign_S (w2[2], w2[3], offset); + c0[3] = hc_bytealign_S (w2[1], w2[2], offset); + c0[2] = hc_bytealign_S (w2[0], w2[1], offset); + c0[1] = hc_bytealign_S (w1[3], w2[0], offset); + c0[0] = hc_bytealign_S (w1[2], w1[3], offset); + w3[3] = hc_bytealign_S (w1[1], w1[2], offset); + w3[2] = hc_bytealign_S (w1[0], w1[1], offset); + w3[1] = hc_bytealign_S (w0[3], w1[0], offset); + w3[0] = hc_bytealign_S (w0[2], w0[3], offset); + w2[3] = hc_bytealign_S (w0[1], w0[2], offset); + w2[2] = hc_bytealign_S (w0[0], w0[1], offset); + w2[1] = hc_bytealign_S ( 0, w0[0], offset); + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; - const u32 offset16 = offset / 16; + break; - append_helper_1x4_S (w0, ((offset16 == 0) ? 0x01010101 : 0), v); - append_helper_1x4_S (w1, ((offset16 == 1) ? 0x01010101 : 0), v); - append_helper_1x4_S (w2, ((offset16 == 2) ? 0x01010101 : 0), v); - append_helper_1x4_S (w3, ((offset16 == 3) ? 
0x01010101 : 0), v); -} + case 10: + c2[2] = hc_bytealign_S (w3[3], 0, offset); + c2[1] = hc_bytealign_S (w3[2], w3[3], offset); + c2[0] = hc_bytealign_S (w3[1], w3[2], offset); + c1[3] = hc_bytealign_S (w3[0], w3[1], offset); + c1[2] = hc_bytealign_S (w2[3], w3[0], offset); + c1[1] = hc_bytealign_S (w2[2], w2[3], offset); + c1[0] = hc_bytealign_S (w2[1], w2[2], offset); + c0[3] = hc_bytealign_S (w2[0], w2[1], offset); + c0[2] = hc_bytealign_S (w1[3], w2[0], offset); + c0[1] = hc_bytealign_S (w1[2], w1[3], offset); + c0[0] = hc_bytealign_S (w1[1], w1[2], offset); + w3[3] = hc_bytealign_S (w1[0], w1[1], offset); + w3[2] = hc_bytealign_S (w0[3], w1[0], offset); + w3[1] = hc_bytealign_S (w0[2], w0[3], offset); + w3[0] = hc_bytealign_S (w0[1], w0[2], offset); + w2[3] = hc_bytealign_S (w0[0], w0[1], offset); + w2[2] = hc_bytealign_S ( 0, w0[0], offset); + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; -DECLSPEC void append_0x80_1x4_S (u32 *w0, const u32 offset) -{ - u32 v[4]; + break; - set_mark_1x4_S (v, offset); + case 11: + c2[3] = hc_bytealign_S (w3[3], 0, offset); + c2[2] = hc_bytealign_S (w3[2], w3[3], offset); + c2[1] = hc_bytealign_S (w3[1], w3[2], offset); + c2[0] = hc_bytealign_S (w3[0], w3[1], offset); + c1[3] = hc_bytealign_S (w2[3], w3[0], offset); + c1[2] = hc_bytealign_S (w2[2], w2[3], offset); + c1[1] = hc_bytealign_S (w2[1], w2[2], offset); + c1[0] = hc_bytealign_S (w2[0], w2[1], offset); + c0[3] = hc_bytealign_S (w1[3], w2[0], offset); + c0[2] = hc_bytealign_S (w1[2], w1[3], offset); + c0[1] = hc_bytealign_S (w1[1], w1[2], offset); + c0[0] = hc_bytealign_S (w1[0], w1[1], offset); + w3[3] = hc_bytealign_S (w0[3], w1[0], offset); + w3[2] = hc_bytealign_S (w0[2], w0[3], offset); + w3[1] = hc_bytealign_S (w0[1], w0[2], offset); + w3[0] = hc_bytealign_S (w0[0], w0[1], offset); + w2[3] = hc_bytealign_S ( 0, w0[0], offset); + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + 
w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; - append_helper_1x4_S (w0, 0x80808080, v); -} + break; -DECLSPEC void append_0x80_2x4_S (u32 *w0, u32 *w1, const u32 offset) -{ - u32 v[4]; + case 12: + c3[0] = hc_bytealign_S (w3[3], 0, offset); + c2[3] = hc_bytealign_S (w3[2], w3[3], offset); + c2[2] = hc_bytealign_S (w3[1], w3[2], offset); + c2[1] = hc_bytealign_S (w3[0], w3[1], offset); + c2[0] = hc_bytealign_S (w2[3], w3[0], offset); + c1[3] = hc_bytealign_S (w2[2], w2[3], offset); + c1[2] = hc_bytealign_S (w2[1], w2[2], offset); + c1[1] = hc_bytealign_S (w2[0], w2[1], offset); + c1[0] = hc_bytealign_S (w1[3], w2[0], offset); + c0[3] = hc_bytealign_S (w1[2], w1[3], offset); + c0[2] = hc_bytealign_S (w1[1], w1[2], offset); + c0[1] = hc_bytealign_S (w1[0], w1[1], offset); + c0[0] = hc_bytealign_S (w0[3], w1[0], offset); + w3[3] = hc_bytealign_S (w0[2], w0[3], offset); + w3[2] = hc_bytealign_S (w0[1], w0[2], offset); + w3[1] = hc_bytealign_S (w0[0], w0[1], offset); + w3[0] = hc_bytealign_S ( 0, w0[0], offset); + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; - set_mark_1x4_S (v, offset); + break; - const u32 offset16 = offset / 16; + case 13: + c3[1] = hc_bytealign_S (w3[3], 0, offset); + c3[0] = hc_bytealign_S (w3[2], w3[3], offset); + c2[3] = hc_bytealign_S (w3[1], w3[2], offset); + c2[2] = hc_bytealign_S (w3[0], w3[1], offset); + c2[1] = hc_bytealign_S (w2[3], w3[0], offset); + c2[0] = hc_bytealign_S (w2[2], w2[3], offset); + c1[3] = hc_bytealign_S (w2[1], w2[2], offset); + c1[2] = hc_bytealign_S (w2[0], w2[1], offset); + c1[1] = hc_bytealign_S (w1[3], w2[0], offset); + c1[0] = hc_bytealign_S (w1[2], w1[3], offset); + c0[3] = hc_bytealign_S (w1[1], w1[2], offset); + c0[2] = hc_bytealign_S (w1[0], w1[1], offset); + c0[1] = hc_bytealign_S (w0[3], w1[0], offset); + c0[0] = hc_bytealign_S (w0[2], w0[3], offset); + w3[3] = 
hc_bytealign_S (w0[1], w0[2], offset); + w3[2] = hc_bytealign_S (w0[0], w0[1], offset); + w3[1] = hc_bytealign_S ( 0, w0[0], offset); + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; - append_helper_1x4_S (w0, ((offset16 == 0) ? 0x80808080 : 0), v); - append_helper_1x4_S (w1, ((offset16 == 1) ? 0x80808080 : 0), v); -} + break; -DECLSPEC void append_0x80_3x4_S (u32 *w0, u32 *w1, u32 *w2, const u32 offset) -{ - u32 v[4]; + case 14: + c3[2] = hc_bytealign_S (w3[3], 0, offset); + c3[1] = hc_bytealign_S (w3[2], w3[3], offset); + c3[0] = hc_bytealign_S (w3[1], w3[2], offset); + c2[3] = hc_bytealign_S (w3[0], w3[1], offset); + c2[2] = hc_bytealign_S (w2[3], w3[0], offset); + c2[1] = hc_bytealign_S (w2[2], w2[3], offset); + c2[0] = hc_bytealign_S (w2[1], w2[2], offset); + c1[3] = hc_bytealign_S (w2[0], w2[1], offset); + c1[2] = hc_bytealign_S (w1[3], w2[0], offset); + c1[1] = hc_bytealign_S (w1[2], w1[3], offset); + c1[0] = hc_bytealign_S (w1[1], w1[2], offset); + c0[3] = hc_bytealign_S (w1[0], w1[1], offset); + c0[2] = hc_bytealign_S (w0[3], w1[0], offset); + c0[1] = hc_bytealign_S (w0[2], w0[3], offset); + c0[0] = hc_bytealign_S (w0[1], w0[2], offset); + w3[3] = hc_bytealign_S (w0[0], w0[1], offset); + w3[2] = hc_bytealign_S ( 0, w0[0], offset); + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; - set_mark_1x4_S (v, offset); + break; - const u32 offset16 = offset / 16; + case 15: + c3[3] = hc_bytealign_S (w3[3], 0, offset); + c3[2] = hc_bytealign_S (w3[2], w3[3], offset); + c3[1] = hc_bytealign_S (w3[1], w3[2], offset); + c3[0] = hc_bytealign_S (w3[0], w3[1], offset); + c2[3] = hc_bytealign_S (w2[3], w3[0], offset); + c2[2] = hc_bytealign_S (w2[2], w2[3], offset); + c2[1] = hc_bytealign_S (w2[1], w2[2], offset); + c2[0] = 
hc_bytealign_S (w2[0], w2[1], offset); + c1[3] = hc_bytealign_S (w1[3], w2[0], offset); + c1[2] = hc_bytealign_S (w1[2], w1[3], offset); + c1[1] = hc_bytealign_S (w1[1], w1[2], offset); + c1[0] = hc_bytealign_S (w1[0], w1[1], offset); + c0[3] = hc_bytealign_S (w0[3], w1[0], offset); + c0[2] = hc_bytealign_S (w0[2], w0[3], offset); + c0[1] = hc_bytealign_S (w0[1], w0[2], offset); + c0[0] = hc_bytealign_S (w0[0], w0[1], offset); + w3[3] = hc_bytealign_S ( 0, w0[0], offset); + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; - append_helper_1x4_S (w0, ((offset16 == 0) ? 0x80808080 : 0), v); - append_helper_1x4_S (w1, ((offset16 == 1) ? 0x80808080 : 0), v); - append_helper_1x4_S (w2, ((offset16 == 2) ? 0x80808080 : 0), v); + break; + } + #endif } -DECLSPEC void append_0x80_4x4_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, const u32 offset) +DECLSPEC void switch_buffer_by_offset_be_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, const u32 offset) { - u32 v[4]; - - set_mark_1x4_S (v, offset); - - const u32 offset16 = offset / 16; + const int offset_switch = offset / 4; - append_helper_1x4_S (w0, ((offset16 == 0) ? 0x80808080 : 0), v); - append_helper_1x4_S (w1, ((offset16 == 1) ? 0x80808080 : 0), v); - append_helper_1x4_S (w2, ((offset16 == 2) ? 0x80808080 : 0), v); - append_helper_1x4_S (w3, ((offset16 == 3) ? 
0x80808080 : 0), v); -} + #if (defined IS_AMD && HAS_VPERM == 0) || defined IS_GENERIC + switch (offset_switch) + { + case 0: + w3[3] = hc_bytealign_be_S (w3[2], w3[3], offset); + w3[2] = hc_bytealign_be_S (w3[1], w3[2], offset); + w3[1] = hc_bytealign_be_S (w3[0], w3[1], offset); + w3[0] = hc_bytealign_be_S (w2[3], w3[0], offset); + w2[3] = hc_bytealign_be_S (w2[2], w2[3], offset); + w2[2] = hc_bytealign_be_S (w2[1], w2[2], offset); + w2[1] = hc_bytealign_be_S (w2[0], w2[1], offset); + w2[0] = hc_bytealign_be_S (w1[3], w2[0], offset); + w1[3] = hc_bytealign_be_S (w1[2], w1[3], offset); + w1[2] = hc_bytealign_be_S (w1[1], w1[2], offset); + w1[1] = hc_bytealign_be_S (w1[0], w1[1], offset); + w1[0] = hc_bytealign_be_S (w0[3], w1[0], offset); + w0[3] = hc_bytealign_be_S (w0[2], w0[3], offset); + w0[2] = hc_bytealign_be_S (w0[1], w0[2], offset); + w0[1] = hc_bytealign_be_S (w0[0], w0[1], offset); + w0[0] = hc_bytealign_be_S ( 0, w0[0], offset); -DECLSPEC void append_0x80_8x4_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, u32 *w4, u32 *w5, u32 *w6, u32 *w7, const u32 offset) -{ - u32 v[4]; + break; - set_mark_1x4_S (v, offset); + case 1: + w3[3] = hc_bytealign_be_S (w3[1], w3[2], offset); + w3[2] = hc_bytealign_be_S (w3[0], w3[1], offset); + w3[1] = hc_bytealign_be_S (w2[3], w3[0], offset); + w3[0] = hc_bytealign_be_S (w2[2], w2[3], offset); + w2[3] = hc_bytealign_be_S (w2[1], w2[2], offset); + w2[2] = hc_bytealign_be_S (w2[0], w2[1], offset); + w2[1] = hc_bytealign_be_S (w1[3], w2[0], offset); + w2[0] = hc_bytealign_be_S (w1[2], w1[3], offset); + w1[3] = hc_bytealign_be_S (w1[1], w1[2], offset); + w1[2] = hc_bytealign_be_S (w1[0], w1[1], offset); + w1[1] = hc_bytealign_be_S (w0[3], w1[0], offset); + w1[0] = hc_bytealign_be_S (w0[2], w0[3], offset); + w0[3] = hc_bytealign_be_S (w0[1], w0[2], offset); + w0[2] = hc_bytealign_be_S (w0[0], w0[1], offset); + w0[1] = hc_bytealign_be_S ( 0, w0[0], offset); + w0[0] = 0; - const u32 offset16 = offset / 16; + break; - append_helper_1x4_S 
(w0, ((offset16 == 0) ? 0x80808080 : 0), v); - append_helper_1x4_S (w1, ((offset16 == 1) ? 0x80808080 : 0), v); - append_helper_1x4_S (w2, ((offset16 == 2) ? 0x80808080 : 0), v); - append_helper_1x4_S (w3, ((offset16 == 3) ? 0x80808080 : 0), v); - append_helper_1x4_S (w4, ((offset16 == 4) ? 0x80808080 : 0), v); - append_helper_1x4_S (w5, ((offset16 == 5) ? 0x80808080 : 0), v); - append_helper_1x4_S (w6, ((offset16 == 6) ? 0x80808080 : 0), v); - append_helper_1x4_S (w7, ((offset16 == 7) ? 0x80808080 : 0), v); -} + case 2: + w3[3] = hc_bytealign_be_S (w3[0], w3[1], offset); + w3[2] = hc_bytealign_be_S (w2[3], w3[0], offset); + w3[1] = hc_bytealign_be_S (w2[2], w2[3], offset); + w3[0] = hc_bytealign_be_S (w2[1], w2[2], offset); + w2[3] = hc_bytealign_be_S (w2[0], w2[1], offset); + w2[2] = hc_bytealign_be_S (w1[3], w2[0], offset); + w2[1] = hc_bytealign_be_S (w1[2], w1[3], offset); + w2[0] = hc_bytealign_be_S (w1[1], w1[2], offset); + w1[3] = hc_bytealign_be_S (w1[0], w1[1], offset); + w1[2] = hc_bytealign_be_S (w0[3], w1[0], offset); + w1[1] = hc_bytealign_be_S (w0[2], w0[3], offset); + w1[0] = hc_bytealign_be_S (w0[1], w0[2], offset); + w0[3] = hc_bytealign_be_S (w0[0], w0[1], offset); + w0[2] = hc_bytealign_be_S ( 0, w0[0], offset); + w0[1] = 0; + w0[0] = 0; -DECLSPEC void make_utf16be_S (const u32 *in, u32 *out1, u32 *out2) -{ - #if defined IS_NV + break; - out2[3] = hc_byte_perm_S (in[3], 0, 0x3727); - out2[2] = hc_byte_perm_S (in[3], 0, 0x1707); - out2[1] = hc_byte_perm_S (in[2], 0, 0x3727); - out2[0] = hc_byte_perm_S (in[2], 0, 0x1707); - out1[3] = hc_byte_perm_S (in[1], 0, 0x3727); - out1[2] = hc_byte_perm_S (in[1], 0, 0x1707); - out1[1] = hc_byte_perm_S (in[0], 0, 0x3727); - out1[0] = hc_byte_perm_S (in[0], 0, 0x1707); + case 3: + w3[3] = hc_bytealign_be_S (w2[3], w3[0], offset); + w3[2] = hc_bytealign_be_S (w2[2], w2[3], offset); + w3[1] = hc_bytealign_be_S (w2[1], w2[2], offset); + w3[0] = hc_bytealign_be_S (w2[0], w2[1], offset); + w2[3] = hc_bytealign_be_S 
(w1[3], w2[0], offset); + w2[2] = hc_bytealign_be_S (w1[2], w1[3], offset); + w2[1] = hc_bytealign_be_S (w1[1], w1[2], offset); + w2[0] = hc_bytealign_be_S (w1[0], w1[1], offset); + w1[3] = hc_bytealign_be_S (w0[3], w1[0], offset); + w1[2] = hc_bytealign_be_S (w0[2], w0[3], offset); + w1[1] = hc_bytealign_be_S (w0[1], w0[2], offset); + w1[0] = hc_bytealign_be_S (w0[0], w0[1], offset); + w0[3] = hc_bytealign_be_S ( 0, w0[0], offset); + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; - #elif defined IS_AMD && HAS_VPERM + break; - out2[3] = hc_byte_perm_S (in[3], 0, 0x03070207); - out2[2] = hc_byte_perm_S (in[3], 0, 0x01070007); - out2[1] = hc_byte_perm_S (in[2], 0, 0x03070207); - out2[0] = hc_byte_perm_S (in[2], 0, 0x01070007); - out1[3] = hc_byte_perm_S (in[1], 0, 0x03070207); - out1[2] = hc_byte_perm_S (in[1], 0, 0x01070007); - out1[1] = hc_byte_perm_S (in[0], 0, 0x03070207); - out1[0] = hc_byte_perm_S (in[0], 0, 0x01070007); + case 4: + w3[3] = hc_bytealign_be_S (w2[2], w2[3], offset); + w3[2] = hc_bytealign_be_S (w2[1], w2[2], offset); + w3[1] = hc_bytealign_be_S (w2[0], w2[1], offset); + w3[0] = hc_bytealign_be_S (w1[3], w2[0], offset); + w2[3] = hc_bytealign_be_S (w1[2], w1[3], offset); + w2[2] = hc_bytealign_be_S (w1[1], w1[2], offset); + w2[1] = hc_bytealign_be_S (w1[0], w1[1], offset); + w2[0] = hc_bytealign_be_S (w0[3], w1[0], offset); + w1[3] = hc_bytealign_be_S (w0[2], w0[3], offset); + w1[2] = hc_bytealign_be_S (w0[1], w0[2], offset); + w1[1] = hc_bytealign_be_S (w0[0], w0[1], offset); + w1[0] = hc_bytealign_be_S ( 0, w0[0], offset); + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; - #else + break; - out2[3] = ((in[3] >> 0) & 0xFF000000) | ((in[3] >> 8) & 0x0000FF00); - out2[2] = ((in[3] << 16) & 0xFF000000) | ((in[3] << 8) & 0x0000FF00); - out2[1] = ((in[2] >> 0) & 0xFF000000) | ((in[2] >> 8) & 0x0000FF00); - out2[0] = ((in[2] << 16) & 0xFF000000) | ((in[2] << 8) & 0x0000FF00); - out1[3] = ((in[1] >> 0) & 0xFF000000) | ((in[1] >> 8) & 0x0000FF00); - out1[2] = 
((in[1] << 16) & 0xFF000000) | ((in[1] << 8) & 0x0000FF00); - out1[1] = ((in[0] >> 0) & 0xFF000000) | ((in[0] >> 8) & 0x0000FF00); - out1[0] = ((in[0] << 16) & 0xFF000000) | ((in[0] << 8) & 0x0000FF00); + case 5: + w3[3] = hc_bytealign_be_S (w2[1], w2[2], offset); + w3[2] = hc_bytealign_be_S (w2[0], w2[1], offset); + w3[1] = hc_bytealign_be_S (w1[3], w2[0], offset); + w3[0] = hc_bytealign_be_S (w1[2], w1[3], offset); + w2[3] = hc_bytealign_be_S (w1[1], w1[2], offset); + w2[2] = hc_bytealign_be_S (w1[0], w1[1], offset); + w2[1] = hc_bytealign_be_S (w0[3], w1[0], offset); + w2[0] = hc_bytealign_be_S (w0[2], w0[3], offset); + w1[3] = hc_bytealign_be_S (w0[1], w0[2], offset); + w1[2] = hc_bytealign_be_S (w0[0], w0[1], offset); + w1[1] = hc_bytealign_be_S ( 0, w0[0], offset); + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; - #endif -} + break; -DECLSPEC void make_utf16le_S (const u32 *in, u32 *out1, u32 *out2) -{ - #if defined IS_NV + case 6: + w3[3] = hc_bytealign_be_S (w2[0], w2[1], offset); + w3[2] = hc_bytealign_be_S (w1[3], w2[0], offset); + w3[1] = hc_bytealign_be_S (w1[2], w1[3], offset); + w3[0] = hc_bytealign_be_S (w1[1], w1[2], offset); + w2[3] = hc_bytealign_be_S (w1[0], w1[1], offset); + w2[2] = hc_bytealign_be_S (w0[3], w1[0], offset); + w2[1] = hc_bytealign_be_S (w0[2], w0[3], offset); + w2[0] = hc_bytealign_be_S (w0[1], w0[2], offset); + w1[3] = hc_bytealign_be_S (w0[0], w0[1], offset); + w1[2] = hc_bytealign_be_S ( 0, w0[0], offset); + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; - out2[3] = hc_byte_perm_S (in[3], 0, 0x7372); - out2[2] = hc_byte_perm_S (in[3], 0, 0x7170); - out2[1] = hc_byte_perm_S (in[2], 0, 0x7372); - out2[0] = hc_byte_perm_S (in[2], 0, 0x7170); - out1[3] = hc_byte_perm_S (in[1], 0, 0x7372); - out1[2] = hc_byte_perm_S (in[1], 0, 0x7170); - out1[1] = hc_byte_perm_S (in[0], 0, 0x7372); - out1[0] = hc_byte_perm_S (in[0], 0, 0x7170); + break; - #elif defined IS_AMD && HAS_VPERM + case 7: + 
w3[3] = hc_bytealign_be_S (w1[3], w2[0], offset); + w3[2] = hc_bytealign_be_S (w1[2], w1[3], offset); + w3[1] = hc_bytealign_be_S (w1[1], w1[2], offset); + w3[0] = hc_bytealign_be_S (w1[0], w1[1], offset); + w2[3] = hc_bytealign_be_S (w0[3], w1[0], offset); + w2[2] = hc_bytealign_be_S (w0[2], w0[3], offset); + w2[1] = hc_bytealign_be_S (w0[1], w0[2], offset); + w2[0] = hc_bytealign_be_S (w0[0], w0[1], offset); + w1[3] = hc_bytealign_be_S ( 0, w0[0], offset); + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; - out2[3] = hc_byte_perm_S (in[3], 0, 0x07030702); - out2[2] = hc_byte_perm_S (in[3], 0, 0x07010700); - out2[1] = hc_byte_perm_S (in[2], 0, 0x07030702); - out2[0] = hc_byte_perm_S (in[2], 0, 0x07010700); - out1[3] = hc_byte_perm_S (in[1], 0, 0x07030702); - out1[2] = hc_byte_perm_S (in[1], 0, 0x07010700); - out1[1] = hc_byte_perm_S (in[0], 0, 0x07030702); - out1[0] = hc_byte_perm_S (in[0], 0, 0x07010700); + break; - #else + case 8: + w3[3] = hc_bytealign_be_S (w1[2], w1[3], offset); + w3[2] = hc_bytealign_be_S (w1[1], w1[2], offset); + w3[1] = hc_bytealign_be_S (w1[0], w1[1], offset); + w3[0] = hc_bytealign_be_S (w0[3], w1[0], offset); + w2[3] = hc_bytealign_be_S (w0[2], w0[3], offset); + w2[2] = hc_bytealign_be_S (w0[1], w0[2], offset); + w2[1] = hc_bytealign_be_S (w0[0], w0[1], offset); + w2[0] = hc_bytealign_be_S ( 0, w0[0], offset); + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; - out2[3] = ((in[3] >> 8) & 0x00FF0000) | ((in[3] >> 16) & 0x000000FF); - out2[2] = ((in[3] << 8) & 0x00FF0000) | ((in[3] >> 0) & 0x000000FF); - out2[1] = ((in[2] >> 8) & 0x00FF0000) | ((in[2] >> 16) & 0x000000FF); - out2[0] = ((in[2] << 8) & 0x00FF0000) | ((in[2] >> 0) & 0x000000FF); - out1[3] = ((in[1] >> 8) & 0x00FF0000) | ((in[1] >> 16) & 0x000000FF); - out1[2] = ((in[1] << 8) & 0x00FF0000) | ((in[1] >> 0) & 0x000000FF); - out1[1] = ((in[0] >> 8) & 0x00FF0000) | ((in[0] >> 16) & 
0x000000FF); - out1[0] = ((in[0] << 8) & 0x00FF0000) | ((in[0] >> 0) & 0x000000FF); + break; - #endif -} + case 9: + w3[3] = hc_bytealign_be_S (w1[1], w1[2], offset); + w3[2] = hc_bytealign_be_S (w1[0], w1[1], offset); + w3[1] = hc_bytealign_be_S (w0[3], w1[0], offset); + w3[0] = hc_bytealign_be_S (w0[2], w0[3], offset); + w2[3] = hc_bytealign_be_S (w0[1], w0[2], offset); + w2[2] = hc_bytealign_be_S (w0[0], w0[1], offset); + w2[1] = hc_bytealign_be_S ( 0, w0[0], offset); + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; -DECLSPEC void undo_utf16be_S (const u32 *in1, const u32 *in2, u32 *out) -{ - #if defined IS_NV + break; - out[0] = hc_byte_perm_S (in1[0], in1[1], 0x4602); - out[1] = hc_byte_perm_S (in1[2], in1[3], 0x4602); - out[2] = hc_byte_perm_S (in2[0], in2[1], 0x4602); - out[3] = hc_byte_perm_S (in2[2], in2[3], 0x4602); + case 10: + w3[3] = hc_bytealign_be_S (w1[0], w1[1], offset); + w3[2] = hc_bytealign_be_S (w0[3], w1[0], offset); + w3[1] = hc_bytealign_be_S (w0[2], w0[3], offset); + w3[0] = hc_bytealign_be_S (w0[1], w0[2], offset); + w2[3] = hc_bytealign_be_S (w0[0], w0[1], offset); + w2[2] = hc_bytealign_be_S ( 0, w0[0], offset); + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; - #elif defined IS_AMD && HAS_VPERM + break; - out[0] = hc_byte_perm_S (in1[0], in1[1], 0x04060002); - out[1] = hc_byte_perm_S (in1[2], in1[3], 0x04060002); - out[2] = hc_byte_perm_S (in2[0], in2[1], 0x04060002); - out[3] = hc_byte_perm_S (in2[2], in2[3], 0x04060002); + case 11: + w3[3] = hc_bytealign_be_S (w0[3], w1[0], offset); + w3[2] = hc_bytealign_be_S (w0[2], w0[3], offset); + w3[1] = hc_bytealign_be_S (w0[1], w0[2], offset); + w3[0] = hc_bytealign_be_S (w0[0], w0[1], offset); + w2[3] = hc_bytealign_be_S ( 0, w0[0], offset); + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + 
w0[2] = 0; + w0[1] = 0; + w0[0] = 0; - #else + break; + + case 12: + w3[3] = hc_bytealign_be_S (w0[2], w0[3], offset); + w3[2] = hc_bytealign_be_S (w0[1], w0[2], offset); + w3[1] = hc_bytealign_be_S (w0[0], w0[1], offset); + w3[0] = hc_bytealign_be_S ( 0, w0[0], offset); + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; - out[0] = ((in1[0] & 0x0000ff00) >> 8) | ((in1[0] & 0xff000000) >> 16) - | ((in1[1] & 0x0000ff00) << 8) | ((in1[1] & 0xff000000) << 0); - out[1] = ((in1[2] & 0x0000ff00) >> 8) | ((in1[2] & 0xff000000) >> 16) - | ((in1[3] & 0x0000ff00) << 8) | ((in1[3] & 0xff000000) << 0); - out[2] = ((in2[0] & 0x0000ff00) >> 8) | ((in2[0] & 0xff000000) >> 16) - | ((in2[1] & 0x0000ff00) << 8) | ((in2[1] & 0xff000000) << 0); - out[3] = ((in2[2] & 0x0000ff00) >> 8) | ((in2[2] & 0xff000000) >> 16) - | ((in2[3] & 0x0000ff00) << 8) | ((in2[3] & 0xff000000) << 0); + break; - #endif -} + case 13: + w3[3] = hc_bytealign_be_S (w0[1], w0[2], offset); + w3[2] = hc_bytealign_be_S (w0[0], w0[1], offset); + w3[1] = hc_bytealign_be_S ( 0, w0[0], offset); + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; -DECLSPEC void undo_utf16le_S (const u32 *in1, const u32 *in2, u32 *out) -{ - #if defined IS_NV + break; - out[0] = hc_byte_perm_S (in1[0], in1[1], 0x6420); - out[1] = hc_byte_perm_S (in1[2], in1[3], 0x6420); - out[2] = hc_byte_perm_S (in2[0], in2[1], 0x6420); - out[3] = hc_byte_perm_S (in2[2], in2[3], 0x6420); + case 14: + w3[3] = hc_bytealign_be_S (w0[0], w0[1], offset); + w3[2] = hc_bytealign_be_S ( 0, w0[0], offset); + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; - #elif defined IS_AMD && HAS_VPERM + break; - out[0] = 
hc_byte_perm_S (in1[0], in1[1], 0x06040200); - out[1] = hc_byte_perm_S (in1[2], in1[3], 0x06040200); - out[2] = hc_byte_perm_S (in2[0], in2[1], 0x06040200); - out[3] = hc_byte_perm_S (in2[2], in2[3], 0x06040200); + case 15: + w3[3] = hc_bytealign_be_S ( 0, w0[0], offset); + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; - #else + break; + } + #endif - out[0] = ((in1[0] & 0x000000ff) >> 0) | ((in1[0] & 0x00ff0000) >> 8) - | ((in1[1] & 0x000000ff) << 16) | ((in1[1] & 0x00ff0000) << 8); - out[1] = ((in1[2] & 0x000000ff) >> 0) | ((in1[2] & 0x00ff0000) >> 8) - | ((in1[3] & 0x000000ff) << 16) | ((in1[3] & 0x00ff0000) << 8); - out[2] = ((in2[0] & 0x000000ff) >> 0) | ((in2[0] & 0x00ff0000) >> 8) - | ((in2[1] & 0x000000ff) << 16) | ((in2[1] & 0x00ff0000) << 8); - out[3] = ((in2[2] & 0x000000ff) >> 0) | ((in2[2] & 0x00ff0000) >> 8) - | ((in2[3] & 0x000000ff) << 16) | ((in2[3] & 0x00ff0000) << 8); + #if (defined IS_AMD && HAS_VPERM == 1) || defined IS_NV + #if defined IS_NV + const int selector = (0x76543210 >> ((offset & 3) * 4)) & 0xffff; #endif -} -DECLSPEC void switch_buffer_by_offset_le_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, const u32 offset) -{ - const int offset_switch = offset / 4; + #if defined IS_AMD + const int selector = 0x0706050403020100 >> ((offset & 3) * 8); + #endif - #if (defined IS_AMD && HAS_VPERM == 0) || defined IS_GENERIC switch (offset_switch) { case 0: - w3[3] = hc_bytealign_S (w3[2], w3[3], offset); - w3[2] = hc_bytealign_S (w3[1], w3[2], offset); - w3[1] = hc_bytealign_S (w3[0], w3[1], offset); - w3[0] = hc_bytealign_S (w2[3], w3[0], offset); - w2[3] = hc_bytealign_S (w2[2], w2[3], offset); - w2[2] = hc_bytealign_S (w2[1], w2[2], offset); - w2[1] = hc_bytealign_S (w2[0], w2[1], offset); - w2[0] = hc_bytealign_S (w1[3], w2[0], offset); - w1[3] = hc_bytealign_S (w1[2], w1[3], offset); - w1[2] = 
hc_bytealign_S (w1[1], w1[2], offset); - w1[1] = hc_bytealign_S (w1[0], w1[1], offset); - w1[0] = hc_bytealign_S (w0[3], w1[0], offset); - w0[3] = hc_bytealign_S (w0[2], w0[3], offset); - w0[2] = hc_bytealign_S (w0[1], w0[2], offset); - w0[1] = hc_bytealign_S (w0[0], w0[1], offset); - w0[0] = hc_bytealign_S ( 0, w0[0], offset); + w3[3] = hc_byte_perm_S (w3[3], w3[2], selector); + w3[2] = hc_byte_perm_S (w3[2], w3[1], selector); + w3[1] = hc_byte_perm_S (w3[1], w3[0], selector); + w3[0] = hc_byte_perm_S (w3[0], w2[3], selector); + w2[3] = hc_byte_perm_S (w2[3], w2[2], selector); + w2[2] = hc_byte_perm_S (w2[2], w2[1], selector); + w2[1] = hc_byte_perm_S (w2[1], w2[0], selector); + w2[0] = hc_byte_perm_S (w2[0], w1[3], selector); + w1[3] = hc_byte_perm_S (w1[3], w1[2], selector); + w1[2] = hc_byte_perm_S (w1[2], w1[1], selector); + w1[1] = hc_byte_perm_S (w1[1], w1[0], selector); + w1[0] = hc_byte_perm_S (w1[0], w0[3], selector); + w0[3] = hc_byte_perm_S (w0[3], w0[2], selector); + w0[2] = hc_byte_perm_S (w0[2], w0[1], selector); + w0[1] = hc_byte_perm_S (w0[1], w0[0], selector); + w0[0] = hc_byte_perm_S (w0[0], 0, selector); break; case 1: - w3[3] = hc_bytealign_S (w3[1], w3[2], offset); - w3[2] = hc_bytealign_S (w3[0], w3[1], offset); - w3[1] = hc_bytealign_S (w2[3], w3[0], offset); - w3[0] = hc_bytealign_S (w2[2], w2[3], offset); - w2[3] = hc_bytealign_S (w2[1], w2[2], offset); - w2[2] = hc_bytealign_S (w2[0], w2[1], offset); - w2[1] = hc_bytealign_S (w1[3], w2[0], offset); - w2[0] = hc_bytealign_S (w1[2], w1[3], offset); - w1[3] = hc_bytealign_S (w1[1], w1[2], offset); - w1[2] = hc_bytealign_S (w1[0], w1[1], offset); - w1[1] = hc_bytealign_S (w0[3], w1[0], offset); - w1[0] = hc_bytealign_S (w0[2], w0[3], offset); - w0[3] = hc_bytealign_S (w0[1], w0[2], offset); - w0[2] = hc_bytealign_S (w0[0], w0[1], offset); - w0[1] = hc_bytealign_S ( 0, w0[0], offset); + w3[3] = hc_byte_perm_S (w3[2], w3[1], selector); + w3[2] = hc_byte_perm_S (w3[1], w3[0], selector); + w3[1] 
= hc_byte_perm_S (w3[0], w2[3], selector); + w3[0] = hc_byte_perm_S (w2[3], w2[2], selector); + w2[3] = hc_byte_perm_S (w2[2], w2[1], selector); + w2[2] = hc_byte_perm_S (w2[1], w2[0], selector); + w2[1] = hc_byte_perm_S (w2[0], w1[3], selector); + w2[0] = hc_byte_perm_S (w1[3], w1[2], selector); + w1[3] = hc_byte_perm_S (w1[2], w1[1], selector); + w1[2] = hc_byte_perm_S (w1[1], w1[0], selector); + w1[1] = hc_byte_perm_S (w1[0], w0[3], selector); + w1[0] = hc_byte_perm_S (w0[3], w0[2], selector); + w0[3] = hc_byte_perm_S (w0[2], w0[1], selector); + w0[2] = hc_byte_perm_S (w0[1], w0[0], selector); + w0[1] = hc_byte_perm_S (w0[0], 0, selector); w0[0] = 0; break; case 2: - w3[3] = hc_bytealign_S (w3[0], w3[1], offset); - w3[2] = hc_bytealign_S (w2[3], w3[0], offset); - w3[1] = hc_bytealign_S (w2[2], w2[3], offset); - w3[0] = hc_bytealign_S (w2[1], w2[2], offset); - w2[3] = hc_bytealign_S (w2[0], w2[1], offset); - w2[2] = hc_bytealign_S (w1[3], w2[0], offset); - w2[1] = hc_bytealign_S (w1[2], w1[3], offset); - w2[0] = hc_bytealign_S (w1[1], w1[2], offset); - w1[3] = hc_bytealign_S (w1[0], w1[1], offset); - w1[2] = hc_bytealign_S (w0[3], w1[0], offset); - w1[1] = hc_bytealign_S (w0[2], w0[3], offset); - w1[0] = hc_bytealign_S (w0[1], w0[2], offset); - w0[3] = hc_bytealign_S (w0[0], w0[1], offset); - w0[2] = hc_bytealign_S ( 0, w0[0], offset); + w3[3] = hc_byte_perm_S (w3[1], w3[0], selector); + w3[2] = hc_byte_perm_S (w3[0], w2[3], selector); + w3[1] = hc_byte_perm_S (w2[3], w2[2], selector); + w3[0] = hc_byte_perm_S (w2[2], w2[1], selector); + w2[3] = hc_byte_perm_S (w2[1], w2[0], selector); + w2[2] = hc_byte_perm_S (w2[0], w1[3], selector); + w2[1] = hc_byte_perm_S (w1[3], w1[2], selector); + w2[0] = hc_byte_perm_S (w1[2], w1[1], selector); + w1[3] = hc_byte_perm_S (w1[1], w1[0], selector); + w1[2] = hc_byte_perm_S (w1[0], w0[3], selector); + w1[1] = hc_byte_perm_S (w0[3], w0[2], selector); + w1[0] = hc_byte_perm_S (w0[2], w0[1], selector); + w0[3] = hc_byte_perm_S 
(w0[1], w0[0], selector); + w0[2] = hc_byte_perm_S (w0[0], 0, selector); w0[1] = 0; w0[0] = 0; break; case 3: - w3[3] = hc_bytealign_S (w2[3], w3[0], offset); - w3[2] = hc_bytealign_S (w2[2], w2[3], offset); - w3[1] = hc_bytealign_S (w2[1], w2[2], offset); - w3[0] = hc_bytealign_S (w2[0], w2[1], offset); - w2[3] = hc_bytealign_S (w1[3], w2[0], offset); - w2[2] = hc_bytealign_S (w1[2], w1[3], offset); - w2[1] = hc_bytealign_S (w1[1], w1[2], offset); - w2[0] = hc_bytealign_S (w1[0], w1[1], offset); - w1[3] = hc_bytealign_S (w0[3], w1[0], offset); - w1[2] = hc_bytealign_S (w0[2], w0[3], offset); - w1[1] = hc_bytealign_S (w0[1], w0[2], offset); - w1[0] = hc_bytealign_S (w0[0], w0[1], offset); - w0[3] = hc_bytealign_S ( 0, w0[0], offset); + w3[3] = hc_byte_perm_S (w3[0], w2[3], selector); + w3[2] = hc_byte_perm_S (w2[3], w2[2], selector); + w3[1] = hc_byte_perm_S (w2[2], w2[1], selector); + w3[0] = hc_byte_perm_S (w2[1], w2[0], selector); + w2[3] = hc_byte_perm_S (w2[0], w1[3], selector); + w2[2] = hc_byte_perm_S (w1[3], w1[2], selector); + w2[1] = hc_byte_perm_S (w1[2], w1[1], selector); + w2[0] = hc_byte_perm_S (w1[1], w1[0], selector); + w1[3] = hc_byte_perm_S (w1[0], w0[3], selector); + w1[2] = hc_byte_perm_S (w0[3], w0[2], selector); + w1[1] = hc_byte_perm_S (w0[2], w0[1], selector); + w1[0] = hc_byte_perm_S (w0[1], w0[0], selector); + w0[3] = hc_byte_perm_S (w0[0], 0, selector); w0[2] = 0; w0[1] = 0; w0[0] = 0; @@ -32854,37 +38172,37 @@ DECLSPEC void switch_buffer_by_offset_le_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, break; case 4: - w3[3] = hc_bytealign_S (w2[2], w2[3], offset); - w3[2] = hc_bytealign_S (w2[1], w2[2], offset); - w3[1] = hc_bytealign_S (w2[0], w2[1], offset); - w3[0] = hc_bytealign_S (w1[3], w2[0], offset); - w2[3] = hc_bytealign_S (w1[2], w1[3], offset); - w2[2] = hc_bytealign_S (w1[1], w1[2], offset); - w2[1] = hc_bytealign_S (w1[0], w1[1], offset); - w2[0] = hc_bytealign_S (w0[3], w1[0], offset); - w1[3] = hc_bytealign_S (w0[2], w0[3], offset); - 
w1[2] = hc_bytealign_S (w0[1], w0[2], offset); - w1[1] = hc_bytealign_S (w0[0], w0[1], offset); - w1[0] = hc_bytealign_S ( 0, w0[0], offset); + w3[3] = hc_byte_perm_S (w2[3], w2[2], selector); + w3[2] = hc_byte_perm_S (w2[2], w2[1], selector); + w3[1] = hc_byte_perm_S (w2[1], w2[0], selector); + w3[0] = hc_byte_perm_S (w2[0], w1[3], selector); + w2[3] = hc_byte_perm_S (w1[3], w1[2], selector); + w2[2] = hc_byte_perm_S (w1[2], w1[1], selector); + w2[1] = hc_byte_perm_S (w1[1], w1[0], selector); + w2[0] = hc_byte_perm_S (w1[0], w0[3], selector); + w1[3] = hc_byte_perm_S (w0[3], w0[2], selector); + w1[2] = hc_byte_perm_S (w0[2], w0[1], selector); + w1[1] = hc_byte_perm_S (w0[1], w0[0], selector); + w1[0] = hc_byte_perm_S (w0[0], 0, selector); w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; - - break; - - case 5: - w3[3] = hc_bytealign_S (w2[1], w2[2], offset); - w3[2] = hc_bytealign_S (w2[0], w2[1], offset); - w3[1] = hc_bytealign_S (w1[3], w2[0], offset); - w3[0] = hc_bytealign_S (w1[2], w1[3], offset); - w2[3] = hc_bytealign_S (w1[1], w1[2], offset); - w2[2] = hc_bytealign_S (w1[0], w1[1], offset); - w2[1] = hc_bytealign_S (w0[3], w1[0], offset); - w2[0] = hc_bytealign_S (w0[2], w0[3], offset); - w1[3] = hc_bytealign_S (w0[1], w0[2], offset); - w1[2] = hc_bytealign_S (w0[0], w0[1], offset); - w1[1] = hc_bytealign_S ( 0, w0[0], offset); + + break; + + case 5: + w3[3] = hc_byte_perm_S (w2[2], w2[1], selector); + w3[2] = hc_byte_perm_S (w2[1], w2[0], selector); + w3[1] = hc_byte_perm_S (w2[0], w1[3], selector); + w3[0] = hc_byte_perm_S (w1[3], w1[2], selector); + w2[3] = hc_byte_perm_S (w1[2], w1[1], selector); + w2[2] = hc_byte_perm_S (w1[1], w1[0], selector); + w2[1] = hc_byte_perm_S (w1[0], w0[3], selector); + w2[0] = hc_byte_perm_S (w0[3], w0[2], selector); + w1[3] = hc_byte_perm_S (w0[2], w0[1], selector); + w1[2] = hc_byte_perm_S (w0[1], w0[0], selector); + w1[1] = hc_byte_perm_S (w0[0], 0, selector); w1[0] = 0; w0[3] = 0; w0[2] = 0; @@ -32894,16 +38212,16 @@ 
DECLSPEC void switch_buffer_by_offset_le_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, break; case 6: - w3[3] = hc_bytealign_S (w2[0], w2[1], offset); - w3[2] = hc_bytealign_S (w1[3], w2[0], offset); - w3[1] = hc_bytealign_S (w1[2], w1[3], offset); - w3[0] = hc_bytealign_S (w1[1], w1[2], offset); - w2[3] = hc_bytealign_S (w1[0], w1[1], offset); - w2[2] = hc_bytealign_S (w0[3], w1[0], offset); - w2[1] = hc_bytealign_S (w0[2], w0[3], offset); - w2[0] = hc_bytealign_S (w0[1], w0[2], offset); - w1[3] = hc_bytealign_S (w0[0], w0[1], offset); - w1[2] = hc_bytealign_S ( 0, w0[0], offset); + w3[3] = hc_byte_perm_S (w2[1], w2[0], selector); + w3[2] = hc_byte_perm_S (w2[0], w1[3], selector); + w3[1] = hc_byte_perm_S (w1[3], w1[2], selector); + w3[0] = hc_byte_perm_S (w1[2], w1[1], selector); + w2[3] = hc_byte_perm_S (w1[1], w1[0], selector); + w2[2] = hc_byte_perm_S (w1[0], w0[3], selector); + w2[1] = hc_byte_perm_S (w0[3], w0[2], selector); + w2[0] = hc_byte_perm_S (w0[2], w0[1], selector); + w1[3] = hc_byte_perm_S (w0[1], w0[0], selector); + w1[2] = hc_byte_perm_S (w0[0], 0, selector); w1[1] = 0; w1[0] = 0; w0[3] = 0; @@ -32914,15 +38232,15 @@ DECLSPEC void switch_buffer_by_offset_le_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, break; case 7: - w3[3] = hc_bytealign_S (w1[3], w2[0], offset); - w3[2] = hc_bytealign_S (w1[2], w1[3], offset); - w3[1] = hc_bytealign_S (w1[1], w1[2], offset); - w3[0] = hc_bytealign_S (w1[0], w1[1], offset); - w2[3] = hc_bytealign_S (w0[3], w1[0], offset); - w2[2] = hc_bytealign_S (w0[2], w0[3], offset); - w2[1] = hc_bytealign_S (w0[1], w0[2], offset); - w2[0] = hc_bytealign_S (w0[0], w0[1], offset); - w1[3] = hc_bytealign_S ( 0, w0[0], offset); + w3[3] = hc_byte_perm_S (w2[0], w1[3], selector); + w3[2] = hc_byte_perm_S (w1[3], w1[2], selector); + w3[1] = hc_byte_perm_S (w1[2], w1[1], selector); + w3[0] = hc_byte_perm_S (w1[1], w1[0], selector); + w2[3] = hc_byte_perm_S (w1[0], w0[3], selector); + w2[2] = hc_byte_perm_S (w0[3], w0[2], selector); + w2[1] = 
hc_byte_perm_S (w0[2], w0[1], selector); + w2[0] = hc_byte_perm_S (w0[1], w0[0], selector); + w1[3] = hc_byte_perm_S (w0[0], 0, selector); w1[2] = 0; w1[1] = 0; w1[0] = 0; @@ -32934,14 +38252,14 @@ DECLSPEC void switch_buffer_by_offset_le_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, break; case 8: - w3[3] = hc_bytealign_S (w1[2], w1[3], offset); - w3[2] = hc_bytealign_S (w1[1], w1[2], offset); - w3[1] = hc_bytealign_S (w1[0], w1[1], offset); - w3[0] = hc_bytealign_S (w0[3], w1[0], offset); - w2[3] = hc_bytealign_S (w0[2], w0[3], offset); - w2[2] = hc_bytealign_S (w0[1], w0[2], offset); - w2[1] = hc_bytealign_S (w0[0], w0[1], offset); - w2[0] = hc_bytealign_S ( 0, w0[0], offset); + w3[3] = hc_byte_perm_S (w1[3], w1[2], selector); + w3[2] = hc_byte_perm_S (w1[2], w1[1], selector); + w3[1] = hc_byte_perm_S (w1[1], w1[0], selector); + w3[0] = hc_byte_perm_S (w1[0], w0[3], selector); + w2[3] = hc_byte_perm_S (w0[3], w0[2], selector); + w2[2] = hc_byte_perm_S (w0[2], w0[1], selector); + w2[1] = hc_byte_perm_S (w0[1], w0[0], selector); + w2[0] = hc_byte_perm_S (w0[0], 0, selector); w1[3] = 0; w1[2] = 0; w1[1] = 0; @@ -32954,13 +38272,13 @@ DECLSPEC void switch_buffer_by_offset_le_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, break; case 9: - w3[3] = hc_bytealign_S (w1[1], w1[2], offset); - w3[2] = hc_bytealign_S (w1[0], w1[1], offset); - w3[1] = hc_bytealign_S (w0[3], w1[0], offset); - w3[0] = hc_bytealign_S (w0[2], w0[3], offset); - w2[3] = hc_bytealign_S (w0[1], w0[2], offset); - w2[2] = hc_bytealign_S (w0[0], w0[1], offset); - w2[1] = hc_bytealign_S ( 0, w0[0], offset); + w3[3] = hc_byte_perm_S (w1[2], w1[1], selector); + w3[2] = hc_byte_perm_S (w1[1], w1[0], selector); + w3[1] = hc_byte_perm_S (w1[0], w0[3], selector); + w3[0] = hc_byte_perm_S (w0[3], w0[2], selector); + w2[3] = hc_byte_perm_S (w0[2], w0[1], selector); + w2[2] = hc_byte_perm_S (w0[1], w0[0], selector); + w2[1] = hc_byte_perm_S (w0[0], 0, selector); w2[0] = 0; w1[3] = 0; w1[2] = 0; @@ -32974,12 +38292,12 @@ 
DECLSPEC void switch_buffer_by_offset_le_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, break; case 10: - w3[3] = hc_bytealign_S (w1[0], w1[1], offset); - w3[2] = hc_bytealign_S (w0[3], w1[0], offset); - w3[1] = hc_bytealign_S (w0[2], w0[3], offset); - w3[0] = hc_bytealign_S (w0[1], w0[2], offset); - w2[3] = hc_bytealign_S (w0[0], w0[1], offset); - w2[2] = hc_bytealign_S ( 0, w0[0], offset); + w3[3] = hc_byte_perm_S (w1[1], w1[0], selector); + w3[2] = hc_byte_perm_S (w1[0], w0[3], selector); + w3[1] = hc_byte_perm_S (w0[3], w0[2], selector); + w3[0] = hc_byte_perm_S (w0[2], w0[1], selector); + w2[3] = hc_byte_perm_S (w0[1], w0[0], selector); + w2[2] = hc_byte_perm_S (w0[0], 0, selector); w2[1] = 0; w2[0] = 0; w1[3] = 0; @@ -32994,11 +38312,11 @@ DECLSPEC void switch_buffer_by_offset_le_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, break; case 11: - w3[3] = hc_bytealign_S (w0[3], w1[0], offset); - w3[2] = hc_bytealign_S (w0[2], w0[3], offset); - w3[1] = hc_bytealign_S (w0[1], w0[2], offset); - w3[0] = hc_bytealign_S (w0[0], w0[1], offset); - w2[3] = hc_bytealign_S ( 0, w0[0], offset); + w3[3] = hc_byte_perm_S (w1[0], w0[3], selector); + w3[2] = hc_byte_perm_S (w0[3], w0[2], selector); + w3[1] = hc_byte_perm_S (w0[2], w0[1], selector); + w3[0] = hc_byte_perm_S (w0[1], w0[0], selector); + w2[3] = hc_byte_perm_S (w0[0], 0, selector); w2[2] = 0; w2[1] = 0; w2[0] = 0; @@ -33014,10 +38332,10 @@ DECLSPEC void switch_buffer_by_offset_le_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, break; case 12: - w3[3] = hc_bytealign_S (w0[2], w0[3], offset); - w3[2] = hc_bytealign_S (w0[1], w0[2], offset); - w3[1] = hc_bytealign_S (w0[0], w0[1], offset); - w3[0] = hc_bytealign_S ( 0, w0[0], offset); + w3[3] = hc_byte_perm_S (w0[3], w0[2], selector); + w3[2] = hc_byte_perm_S (w0[2], w0[1], selector); + w3[1] = hc_byte_perm_S (w0[1], w0[0], selector); + w3[0] = hc_byte_perm_S (w0[0], 0, selector); w2[3] = 0; w2[2] = 0; w2[1] = 0; @@ -33034,9 +38352,9 @@ DECLSPEC void switch_buffer_by_offset_le_S (u32 *w0, u32 
*w1, u32 *w2, u32 *w3, break; case 13: - w3[3] = hc_bytealign_S (w0[1], w0[2], offset); - w3[2] = hc_bytealign_S (w0[0], w0[1], offset); - w3[1] = hc_bytealign_S ( 0, w0[0], offset); + w3[3] = hc_byte_perm_S (w0[2], w0[1], selector); + w3[2] = hc_byte_perm_S (w0[1], w0[0], selector); + w3[1] = hc_byte_perm_S (w0[0], 0, selector); w3[0] = 0; w2[3] = 0; w2[2] = 0; @@ -33054,8 +38372,8 @@ DECLSPEC void switch_buffer_by_offset_le_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, break; case 14: - w3[3] = hc_bytealign_S (w0[0], w0[1], offset); - w3[2] = hc_bytealign_S ( 0, w0[0], offset); + w3[3] = hc_byte_perm_S (w0[1], w0[0], selector); + w3[2] = hc_byte_perm_S (w0[0], 0, selector); w3[1] = 0; w3[0] = 0; w2[3] = 0; @@ -33074,7 +38392,7 @@ DECLSPEC void switch_buffer_by_offset_le_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, break; case 15: - w3[3] = hc_bytealign_S ( 0, w0[0], offset); + w3[3] = hc_byte_perm_S (w0[0], 0, selector); w3[2] = 0; w3[1] = 0; w3[0] = 0; @@ -33094,97 +38412,99 @@ DECLSPEC void switch_buffer_by_offset_le_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, break; } #endif +} - #if (defined IS_AMD && HAS_VPERM == 1) || defined IS_NV - - const int offset_mod_4 = offset & 3; - - const int offset_minus_4 = 4 - offset_mod_4; - - #if defined IS_NV - const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; - #endif - - #if defined IS_AMD - const int selector = 0x0706050403020100 >> (offset_minus_4 * 8); - #endif +DECLSPEC void switch_buffer_by_offset_carry_be_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, u32 *c0, u32 *c1, u32 *c2, u32 *c3, const u32 offset) +{ + const int offset_switch = offset / 4; + #if (defined IS_AMD && HAS_VPERM == 0) || defined IS_GENERIC switch (offset_switch) { case 0: - w3[3] = hc_byte_perm_S (w3[2], w3[3], selector); - w3[2] = hc_byte_perm_S (w3[1], w3[2], selector); - w3[1] = hc_byte_perm_S (w3[0], w3[1], selector); - w3[0] = hc_byte_perm_S (w2[3], w3[0], selector); - w2[3] = hc_byte_perm_S (w2[2], w2[3], selector); - w2[2] = hc_byte_perm_S (w2[1], 
w2[2], selector); - w2[1] = hc_byte_perm_S (w2[0], w2[1], selector); - w2[0] = hc_byte_perm_S (w1[3], w2[0], selector); - w1[3] = hc_byte_perm_S (w1[2], w1[3], selector); - w1[2] = hc_byte_perm_S (w1[1], w1[2], selector); - w1[1] = hc_byte_perm_S (w1[0], w1[1], selector); - w1[0] = hc_byte_perm_S (w0[3], w1[0], selector); - w0[3] = hc_byte_perm_S (w0[2], w0[3], selector); - w0[2] = hc_byte_perm_S (w0[1], w0[2], selector); - w0[1] = hc_byte_perm_S (w0[0], w0[1], selector); - w0[0] = hc_byte_perm_S ( 0, w0[0], selector); + c0[0] = hc_bytealign_be_S (w3[3], 0, offset); + w3[3] = hc_bytealign_be_S (w3[2], w3[3], offset); + w3[2] = hc_bytealign_be_S (w3[1], w3[2], offset); + w3[1] = hc_bytealign_be_S (w3[0], w3[1], offset); + w3[0] = hc_bytealign_be_S (w2[3], w3[0], offset); + w2[3] = hc_bytealign_be_S (w2[2], w2[3], offset); + w2[2] = hc_bytealign_be_S (w2[1], w2[2], offset); + w2[1] = hc_bytealign_be_S (w2[0], w2[1], offset); + w2[0] = hc_bytealign_be_S (w1[3], w2[0], offset); + w1[3] = hc_bytealign_be_S (w1[2], w1[3], offset); + w1[2] = hc_bytealign_be_S (w1[1], w1[2], offset); + w1[1] = hc_bytealign_be_S (w1[0], w1[1], offset); + w1[0] = hc_bytealign_be_S (w0[3], w1[0], offset); + w0[3] = hc_bytealign_be_S (w0[2], w0[3], offset); + w0[2] = hc_bytealign_be_S (w0[1], w0[2], offset); + w0[1] = hc_bytealign_be_S (w0[0], w0[1], offset); + w0[0] = hc_bytealign_be_S ( 0, w0[0], offset); break; case 1: - w3[3] = hc_byte_perm_S (w3[1], w3[2], selector); - w3[2] = hc_byte_perm_S (w3[0], w3[1], selector); - w3[1] = hc_byte_perm_S (w2[3], w3[0], selector); - w3[0] = hc_byte_perm_S (w2[2], w2[3], selector); - w2[3] = hc_byte_perm_S (w2[1], w2[2], selector); - w2[2] = hc_byte_perm_S (w2[0], w2[1], selector); - w2[1] = hc_byte_perm_S (w1[3], w2[0], selector); - w2[0] = hc_byte_perm_S (w1[2], w1[3], selector); - w1[3] = hc_byte_perm_S (w1[1], w1[2], selector); - w1[2] = hc_byte_perm_S (w1[0], w1[1], selector); - w1[1] = hc_byte_perm_S (w0[3], w1[0], selector); - w1[0] = 
hc_byte_perm_S (w0[2], w0[3], selector); - w0[3] = hc_byte_perm_S (w0[1], w0[2], selector); - w0[2] = hc_byte_perm_S (w0[0], w0[1], selector); - w0[1] = hc_byte_perm_S ( 0, w0[0], selector); + c0[1] = hc_bytealign_be_S (w3[3], 0, offset); + c0[0] = hc_bytealign_be_S (w3[2], w3[3], offset); + w3[3] = hc_bytealign_be_S (w3[1], w3[2], offset); + w3[2] = hc_bytealign_be_S (w3[0], w3[1], offset); + w3[1] = hc_bytealign_be_S (w2[3], w3[0], offset); + w3[0] = hc_bytealign_be_S (w2[2], w2[3], offset); + w2[3] = hc_bytealign_be_S (w2[1], w2[2], offset); + w2[2] = hc_bytealign_be_S (w2[0], w2[1], offset); + w2[1] = hc_bytealign_be_S (w1[3], w2[0], offset); + w2[0] = hc_bytealign_be_S (w1[2], w1[3], offset); + w1[3] = hc_bytealign_be_S (w1[1], w1[2], offset); + w1[2] = hc_bytealign_be_S (w1[0], w1[1], offset); + w1[1] = hc_bytealign_be_S (w0[3], w1[0], offset); + w1[0] = hc_bytealign_be_S (w0[2], w0[3], offset); + w0[3] = hc_bytealign_be_S (w0[1], w0[2], offset); + w0[2] = hc_bytealign_be_S (w0[0], w0[1], offset); + w0[1] = hc_bytealign_be_S ( 0, w0[0], offset); w0[0] = 0; break; case 2: - w3[3] = hc_byte_perm_S (w3[0], w3[1], selector); - w3[2] = hc_byte_perm_S (w2[3], w3[0], selector); - w3[1] = hc_byte_perm_S (w2[2], w2[3], selector); - w3[0] = hc_byte_perm_S (w2[1], w2[2], selector); - w2[3] = hc_byte_perm_S (w2[0], w2[1], selector); - w2[2] = hc_byte_perm_S (w1[3], w2[0], selector); - w2[1] = hc_byte_perm_S (w1[2], w1[3], selector); - w2[0] = hc_byte_perm_S (w1[1], w1[2], selector); - w1[3] = hc_byte_perm_S (w1[0], w1[1], selector); - w1[2] = hc_byte_perm_S (w0[3], w1[0], selector); - w1[1] = hc_byte_perm_S (w0[2], w0[3], selector); - w1[0] = hc_byte_perm_S (w0[1], w0[2], selector); - w0[3] = hc_byte_perm_S (w0[0], w0[1], selector); - w0[2] = hc_byte_perm_S ( 0, w0[0], selector); - w0[1] = 0; - w0[0] = 0; - - break; - - case 3: - w3[3] = hc_byte_perm_S (w2[3], w3[0], selector); - w3[2] = hc_byte_perm_S (w2[2], w2[3], selector); - w3[1] = hc_byte_perm_S (w2[1], w2[2], 
selector); - w3[0] = hc_byte_perm_S (w2[0], w2[1], selector); - w2[3] = hc_byte_perm_S (w1[3], w2[0], selector); - w2[2] = hc_byte_perm_S (w1[2], w1[3], selector); - w2[1] = hc_byte_perm_S (w1[1], w1[2], selector); - w2[0] = hc_byte_perm_S (w1[0], w1[1], selector); - w1[3] = hc_byte_perm_S (w0[3], w1[0], selector); - w1[2] = hc_byte_perm_S (w0[2], w0[3], selector); - w1[1] = hc_byte_perm_S (w0[1], w0[2], selector); - w1[0] = hc_byte_perm_S (w0[0], w0[1], selector); - w0[3] = hc_byte_perm_S ( 0, w0[0], selector); + c0[2] = hc_bytealign_be_S (w3[3], 0, offset); + c0[1] = hc_bytealign_be_S (w3[2], w3[3], offset); + c0[0] = hc_bytealign_be_S (w3[1], w3[2], offset); + w3[3] = hc_bytealign_be_S (w3[0], w3[1], offset); + w3[2] = hc_bytealign_be_S (w2[3], w3[0], offset); + w3[1] = hc_bytealign_be_S (w2[2], w2[3], offset); + w3[0] = hc_bytealign_be_S (w2[1], w2[2], offset); + w2[3] = hc_bytealign_be_S (w2[0], w2[1], offset); + w2[2] = hc_bytealign_be_S (w1[3], w2[0], offset); + w2[1] = hc_bytealign_be_S (w1[2], w1[3], offset); + w2[0] = hc_bytealign_be_S (w1[1], w1[2], offset); + w1[3] = hc_bytealign_be_S (w1[0], w1[1], offset); + w1[2] = hc_bytealign_be_S (w0[3], w1[0], offset); + w1[1] = hc_bytealign_be_S (w0[2], w0[3], offset); + w1[0] = hc_bytealign_be_S (w0[1], w0[2], offset); + w0[3] = hc_bytealign_be_S (w0[0], w0[1], offset); + w0[2] = hc_bytealign_be_S ( 0, w0[0], offset); + w0[1] = 0; + w0[0] = 0; + + break; + + case 3: + c0[3] = hc_bytealign_be_S (w3[3], 0, offset); + c0[2] = hc_bytealign_be_S (w3[2], w3[3], offset); + c0[1] = hc_bytealign_be_S (w3[1], w3[2], offset); + c0[0] = hc_bytealign_be_S (w3[0], w3[1], offset); + w3[3] = hc_bytealign_be_S (w2[3], w3[0], offset); + w3[2] = hc_bytealign_be_S (w2[2], w2[3], offset); + w3[1] = hc_bytealign_be_S (w2[1], w2[2], offset); + w3[0] = hc_bytealign_be_S (w2[0], w2[1], offset); + w2[3] = hc_bytealign_be_S (w1[3], w2[0], offset); + w2[2] = hc_bytealign_be_S (w1[2], w1[3], offset); + w2[1] = hc_bytealign_be_S (w1[1], 
w1[2], offset); + w2[0] = hc_bytealign_be_S (w1[0], w1[1], offset); + w1[3] = hc_bytealign_be_S (w0[3], w1[0], offset); + w1[2] = hc_bytealign_be_S (w0[2], w0[3], offset); + w1[1] = hc_bytealign_be_S (w0[1], w0[2], offset); + w1[0] = hc_bytealign_be_S (w0[0], w0[1], offset); + w0[3] = hc_bytealign_be_S ( 0, w0[0], offset); w0[2] = 0; w0[1] = 0; w0[0] = 0; @@ -33192,18 +38512,23 @@ DECLSPEC void switch_buffer_by_offset_le_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, break; case 4: - w3[3] = hc_byte_perm_S (w2[2], w2[3], selector); - w3[2] = hc_byte_perm_S (w2[1], w2[2], selector); - w3[1] = hc_byte_perm_S (w2[0], w2[1], selector); - w3[0] = hc_byte_perm_S (w1[3], w2[0], selector); - w2[3] = hc_byte_perm_S (w1[2], w1[3], selector); - w2[2] = hc_byte_perm_S (w1[1], w1[2], selector); - w2[1] = hc_byte_perm_S (w1[0], w1[1], selector); - w2[0] = hc_byte_perm_S (w0[3], w1[0], selector); - w1[3] = hc_byte_perm_S (w0[2], w0[3], selector); - w1[2] = hc_byte_perm_S (w0[1], w0[2], selector); - w1[1] = hc_byte_perm_S (w0[0], w0[1], selector); - w1[0] = hc_byte_perm_S ( 0, w0[0], selector); + c1[0] = hc_bytealign_be_S (w3[3], 0, offset); + c0[3] = hc_bytealign_be_S (w3[2], w3[3], offset); + c0[2] = hc_bytealign_be_S (w3[1], w3[2], offset); + c0[1] = hc_bytealign_be_S (w3[0], w3[1], offset); + c0[0] = hc_bytealign_be_S (w2[3], w3[0], offset); + w3[3] = hc_bytealign_be_S (w2[2], w2[3], offset); + w3[2] = hc_bytealign_be_S (w2[1], w2[2], offset); + w3[1] = hc_bytealign_be_S (w2[0], w2[1], offset); + w3[0] = hc_bytealign_be_S (w1[3], w2[0], offset); + w2[3] = hc_bytealign_be_S (w1[2], w1[3], offset); + w2[2] = hc_bytealign_be_S (w1[1], w1[2], offset); + w2[1] = hc_bytealign_be_S (w1[0], w1[1], offset); + w2[0] = hc_bytealign_be_S (w0[3], w1[0], offset); + w1[3] = hc_bytealign_be_S (w0[2], w0[3], offset); + w1[2] = hc_bytealign_be_S (w0[1], w0[2], offset); + w1[1] = hc_bytealign_be_S (w0[0], w0[1], offset); + w1[0] = hc_bytealign_be_S ( 0, w0[0], offset); w0[3] = 0; w0[2] = 0; w0[1] = 0; 
@@ -33212,17 +38537,23 @@ DECLSPEC void switch_buffer_by_offset_le_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, break; case 5: - w3[3] = hc_byte_perm_S (w2[1], w2[2], selector); - w3[2] = hc_byte_perm_S (w2[0], w2[1], selector); - w3[1] = hc_byte_perm_S (w1[3], w2[0], selector); - w3[0] = hc_byte_perm_S (w1[2], w1[3], selector); - w2[3] = hc_byte_perm_S (w1[1], w1[2], selector); - w2[2] = hc_byte_perm_S (w1[0], w1[1], selector); - w2[1] = hc_byte_perm_S (w0[3], w1[0], selector); - w2[0] = hc_byte_perm_S (w0[2], w0[3], selector); - w1[3] = hc_byte_perm_S (w0[1], w0[2], selector); - w1[2] = hc_byte_perm_S (w0[0], w0[1], selector); - w1[1] = hc_byte_perm_S ( 0, w0[0], selector); + c1[1] = hc_bytealign_be_S (w3[3], 0, offset); + c1[0] = hc_bytealign_be_S (w3[2], w3[3], offset); + c0[3] = hc_bytealign_be_S (w3[1], w3[2], offset); + c0[2] = hc_bytealign_be_S (w3[0], w3[1], offset); + c0[1] = hc_bytealign_be_S (w2[3], w3[0], offset); + c0[0] = hc_bytealign_be_S (w2[2], w2[3], offset); + w3[3] = hc_bytealign_be_S (w2[1], w2[2], offset); + w3[2] = hc_bytealign_be_S (w2[0], w2[1], offset); + w3[1] = hc_bytealign_be_S (w1[3], w2[0], offset); + w3[0] = hc_bytealign_be_S (w1[2], w1[3], offset); + w2[3] = hc_bytealign_be_S (w1[1], w1[2], offset); + w2[2] = hc_bytealign_be_S (w1[0], w1[1], offset); + w2[1] = hc_bytealign_be_S (w0[3], w1[0], offset); + w2[0] = hc_bytealign_be_S (w0[2], w0[3], offset); + w1[3] = hc_bytealign_be_S (w0[1], w0[2], offset); + w1[2] = hc_bytealign_be_S (w0[0], w0[1], offset); + w1[1] = hc_bytealign_be_S ( 0, w0[0], offset); w1[0] = 0; w0[3] = 0; w0[2] = 0; @@ -33232,16 +38563,23 @@ DECLSPEC void switch_buffer_by_offset_le_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, break; case 6: - w3[3] = hc_byte_perm_S (w2[0], w2[1], selector); - w3[2] = hc_byte_perm_S (w1[3], w2[0], selector); - w3[1] = hc_byte_perm_S (w1[2], w1[3], selector); - w3[0] = hc_byte_perm_S (w1[1], w1[2], selector); - w2[3] = hc_byte_perm_S (w1[0], w1[1], selector); - w2[2] = hc_byte_perm_S (w0[3], 
w1[0], selector); - w2[1] = hc_byte_perm_S (w0[2], w0[3], selector); - w2[0] = hc_byte_perm_S (w0[1], w0[2], selector); - w1[3] = hc_byte_perm_S (w0[0], w0[1], selector); - w1[2] = hc_byte_perm_S ( 0, w0[0], selector); + c1[2] = hc_bytealign_be_S (w3[3], 0, offset); + c1[1] = hc_bytealign_be_S (w3[2], w3[3], offset); + c1[0] = hc_bytealign_be_S (w3[1], w3[2], offset); + c0[3] = hc_bytealign_be_S (w3[0], w3[1], offset); + c0[2] = hc_bytealign_be_S (w2[3], w3[0], offset); + c0[1] = hc_bytealign_be_S (w2[2], w2[3], offset); + c0[0] = hc_bytealign_be_S (w2[1], w2[2], offset); + w3[3] = hc_bytealign_be_S (w2[0], w2[1], offset); + w3[2] = hc_bytealign_be_S (w1[3], w2[0], offset); + w3[1] = hc_bytealign_be_S (w1[2], w1[3], offset); + w3[0] = hc_bytealign_be_S (w1[1], w1[2], offset); + w2[3] = hc_bytealign_be_S (w1[0], w1[1], offset); + w2[2] = hc_bytealign_be_S (w0[3], w1[0], offset); + w2[1] = hc_bytealign_be_S (w0[2], w0[3], offset); + w2[0] = hc_bytealign_be_S (w0[1], w0[2], offset); + w1[3] = hc_bytealign_be_S (w0[0], w0[1], offset); + w1[2] = hc_bytealign_be_S ( 0, w0[0], offset); w1[1] = 0; w1[0] = 0; w0[3] = 0; @@ -33252,15 +38590,23 @@ DECLSPEC void switch_buffer_by_offset_le_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, break; case 7: - w3[3] = hc_byte_perm_S (w1[3], w2[0], selector); - w3[2] = hc_byte_perm_S (w1[2], w1[3], selector); - w3[1] = hc_byte_perm_S (w1[1], w1[2], selector); - w3[0] = hc_byte_perm_S (w1[0], w1[1], selector); - w2[3] = hc_byte_perm_S (w0[3], w1[0], selector); - w2[2] = hc_byte_perm_S (w0[2], w0[3], selector); - w2[1] = hc_byte_perm_S (w0[1], w0[2], selector); - w2[0] = hc_byte_perm_S (w0[0], w0[1], selector); - w1[3] = hc_byte_perm_S ( 0, w0[0], selector); + c1[3] = hc_bytealign_be_S (w3[3], 0, offset); + c1[2] = hc_bytealign_be_S (w3[2], w3[3], offset); + c1[1] = hc_bytealign_be_S (w3[1], w3[2], offset); + c1[0] = hc_bytealign_be_S (w3[0], w3[1], offset); + c0[3] = hc_bytealign_be_S (w2[3], w3[0], offset); + c0[2] = hc_bytealign_be_S (w2[2], 
w2[3], offset); + c0[1] = hc_bytealign_be_S (w2[1], w2[2], offset); + c0[0] = hc_bytealign_be_S (w2[0], w2[1], offset); + w3[3] = hc_bytealign_be_S (w1[3], w2[0], offset); + w3[2] = hc_bytealign_be_S (w1[2], w1[3], offset); + w3[1] = hc_bytealign_be_S (w1[1], w1[2], offset); + w3[0] = hc_bytealign_be_S (w1[0], w1[1], offset); + w2[3] = hc_bytealign_be_S (w0[3], w1[0], offset); + w2[2] = hc_bytealign_be_S (w0[2], w0[3], offset); + w2[1] = hc_bytealign_be_S (w0[1], w0[2], offset); + w2[0] = hc_bytealign_be_S (w0[0], w0[1], offset); + w1[3] = hc_bytealign_be_S ( 0, w0[0], offset); w1[2] = 0; w1[1] = 0; w1[0] = 0; @@ -33272,14 +38618,23 @@ DECLSPEC void switch_buffer_by_offset_le_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, break; case 8: - w3[3] = hc_byte_perm_S (w1[2], w1[3], selector); - w3[2] = hc_byte_perm_S (w1[1], w1[2], selector); - w3[1] = hc_byte_perm_S (w1[0], w1[1], selector); - w3[0] = hc_byte_perm_S (w0[3], w1[0], selector); - w2[3] = hc_byte_perm_S (w0[2], w0[3], selector); - w2[2] = hc_byte_perm_S (w0[1], w0[2], selector); - w2[1] = hc_byte_perm_S (w0[0], w0[1], selector); - w2[0] = hc_byte_perm_S ( 0, w0[0], selector); + c2[0] = hc_bytealign_be_S (w3[3], 0, offset); + c1[3] = hc_bytealign_be_S (w3[2], w3[3], offset); + c1[2] = hc_bytealign_be_S (w3[1], w3[2], offset); + c1[1] = hc_bytealign_be_S (w3[0], w3[1], offset); + c1[0] = hc_bytealign_be_S (w2[3], w3[0], offset); + c0[3] = hc_bytealign_be_S (w2[2], w2[3], offset); + c0[2] = hc_bytealign_be_S (w2[1], w2[2], offset); + c0[1] = hc_bytealign_be_S (w2[0], w2[1], offset); + c0[0] = hc_bytealign_be_S (w1[3], w2[0], offset); + w3[3] = hc_bytealign_be_S (w1[2], w1[3], offset); + w3[2] = hc_bytealign_be_S (w1[1], w1[2], offset); + w3[1] = hc_bytealign_be_S (w1[0], w1[1], offset); + w3[0] = hc_bytealign_be_S (w0[3], w1[0], offset); + w2[3] = hc_bytealign_be_S (w0[2], w0[3], offset); + w2[2] = hc_bytealign_be_S (w0[1], w0[2], offset); + w2[1] = hc_bytealign_be_S (w0[0], w0[1], offset); + w2[0] = hc_bytealign_be_S 
( 0, w0[0], offset); w1[3] = 0; w1[2] = 0; w1[1] = 0; @@ -33292,13 +38647,23 @@ DECLSPEC void switch_buffer_by_offset_le_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, break; case 9: - w3[3] = hc_byte_perm_S (w1[1], w1[2], selector); - w3[2] = hc_byte_perm_S (w1[0], w1[1], selector); - w3[1] = hc_byte_perm_S (w0[3], w1[0], selector); - w3[0] = hc_byte_perm_S (w0[2], w0[3], selector); - w2[3] = hc_byte_perm_S (w0[1], w0[2], selector); - w2[2] = hc_byte_perm_S (w0[0], w0[1], selector); - w2[1] = hc_byte_perm_S ( 0, w0[0], selector); + c2[1] = hc_bytealign_be_S (w3[3], 0, offset); + c2[0] = hc_bytealign_be_S (w3[2], w3[3], offset); + c1[3] = hc_bytealign_be_S (w3[1], w3[2], offset); + c1[2] = hc_bytealign_be_S (w3[0], w3[1], offset); + c1[1] = hc_bytealign_be_S (w2[3], w3[0], offset); + c1[0] = hc_bytealign_be_S (w2[2], w2[3], offset); + c0[3] = hc_bytealign_be_S (w2[1], w2[2], offset); + c0[2] = hc_bytealign_be_S (w2[0], w2[1], offset); + c0[1] = hc_bytealign_be_S (w1[3], w2[0], offset); + c0[0] = hc_bytealign_be_S (w1[2], w1[3], offset); + w3[3] = hc_bytealign_be_S (w1[1], w1[2], offset); + w3[2] = hc_bytealign_be_S (w1[0], w1[1], offset); + w3[1] = hc_bytealign_be_S (w0[3], w1[0], offset); + w3[0] = hc_bytealign_be_S (w0[2], w0[3], offset); + w2[3] = hc_bytealign_be_S (w0[1], w0[2], offset); + w2[2] = hc_bytealign_be_S (w0[0], w0[1], offset); + w2[1] = hc_bytealign_be_S ( 0, w0[0], offset); w2[0] = 0; w1[3] = 0; w1[2] = 0; @@ -33312,12 +38677,23 @@ DECLSPEC void switch_buffer_by_offset_le_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, break; case 10: - w3[3] = hc_byte_perm_S (w1[0], w1[1], selector); - w3[2] = hc_byte_perm_S (w0[3], w1[0], selector); - w3[1] = hc_byte_perm_S (w0[2], w0[3], selector); - w3[0] = hc_byte_perm_S (w0[1], w0[2], selector); - w2[3] = hc_byte_perm_S (w0[0], w0[1], selector); - w2[2] = hc_byte_perm_S ( 0, w0[0], selector); + c2[2] = hc_bytealign_be_S (w3[3], 0, offset); + c2[1] = hc_bytealign_be_S (w3[2], w3[3], offset); + c2[0] = hc_bytealign_be_S (w3[1], 
w3[2], offset); + c1[3] = hc_bytealign_be_S (w3[0], w3[1], offset); + c1[2] = hc_bytealign_be_S (w2[3], w3[0], offset); + c1[1] = hc_bytealign_be_S (w2[2], w2[3], offset); + c1[0] = hc_bytealign_be_S (w2[1], w2[2], offset); + c0[3] = hc_bytealign_be_S (w2[0], w2[1], offset); + c0[2] = hc_bytealign_be_S (w1[3], w2[0], offset); + c0[1] = hc_bytealign_be_S (w1[2], w1[3], offset); + c0[0] = hc_bytealign_be_S (w1[1], w1[2], offset); + w3[3] = hc_bytealign_be_S (w1[0], w1[1], offset); + w3[2] = hc_bytealign_be_S (w0[3], w1[0], offset); + w3[1] = hc_bytealign_be_S (w0[2], w0[3], offset); + w3[0] = hc_bytealign_be_S (w0[1], w0[2], offset); + w2[3] = hc_bytealign_be_S (w0[0], w0[1], offset); + w2[2] = hc_bytealign_be_S ( 0, w0[0], offset); w2[1] = 0; w2[0] = 0; w1[3] = 0; @@ -33332,11 +38708,23 @@ DECLSPEC void switch_buffer_by_offset_le_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, break; case 11: - w3[3] = hc_byte_perm_S (w0[3], w1[0], selector); - w3[2] = hc_byte_perm_S (w0[2], w0[3], selector); - w3[1] = hc_byte_perm_S (w0[1], w0[2], selector); - w3[0] = hc_byte_perm_S (w0[0], w0[1], selector); - w2[3] = hc_byte_perm_S ( 0, w0[0], selector); + c2[3] = hc_bytealign_be_S (w3[3], 0, offset); + c2[2] = hc_bytealign_be_S (w3[2], w3[3], offset); + c2[1] = hc_bytealign_be_S (w3[1], w3[2], offset); + c2[0] = hc_bytealign_be_S (w3[0], w3[1], offset); + c1[3] = hc_bytealign_be_S (w2[3], w3[0], offset); + c1[2] = hc_bytealign_be_S (w2[2], w2[3], offset); + c1[1] = hc_bytealign_be_S (w2[1], w2[2], offset); + c1[0] = hc_bytealign_be_S (w2[0], w2[1], offset); + c0[3] = hc_bytealign_be_S (w1[3], w2[0], offset); + c0[2] = hc_bytealign_be_S (w1[2], w1[3], offset); + c0[1] = hc_bytealign_be_S (w1[1], w1[2], offset); + c0[0] = hc_bytealign_be_S (w1[0], w1[1], offset); + w3[3] = hc_bytealign_be_S (w0[3], w1[0], offset); + w3[2] = hc_bytealign_be_S (w0[2], w0[3], offset); + w3[1] = hc_bytealign_be_S (w0[1], w0[2], offset); + w3[0] = hc_bytealign_be_S (w0[0], w0[1], offset); + w2[3] = 
hc_bytealign_be_S ( 0, w0[0], offset); w2[2] = 0; w2[1] = 0; w2[0] = 0; @@ -33352,10 +38740,23 @@ DECLSPEC void switch_buffer_by_offset_le_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, break; case 12: - w3[3] = hc_byte_perm_S (w0[2], w0[3], selector); - w3[2] = hc_byte_perm_S (w0[1], w0[2], selector); - w3[1] = hc_byte_perm_S (w0[0], w0[1], selector); - w3[0] = hc_byte_perm_S ( 0, w0[0], selector); + c3[0] = hc_bytealign_be_S (w3[3], 0, offset); + c2[3] = hc_bytealign_be_S (w3[2], w3[3], offset); + c2[2] = hc_bytealign_be_S (w3[1], w3[2], offset); + c2[1] = hc_bytealign_be_S (w3[0], w3[1], offset); + c2[0] = hc_bytealign_be_S (w2[3], w3[0], offset); + c1[3] = hc_bytealign_be_S (w2[2], w2[3], offset); + c1[2] = hc_bytealign_be_S (w2[1], w2[2], offset); + c1[1] = hc_bytealign_be_S (w2[0], w2[1], offset); + c1[0] = hc_bytealign_be_S (w1[3], w2[0], offset); + c0[3] = hc_bytealign_be_S (w1[2], w1[3], offset); + c0[2] = hc_bytealign_be_S (w1[1], w1[2], offset); + c0[1] = hc_bytealign_be_S (w1[0], w1[1], offset); + c0[0] = hc_bytealign_be_S (w0[3], w1[0], offset); + w3[3] = hc_bytealign_be_S (w0[2], w0[3], offset); + w3[2] = hc_bytealign_be_S (w0[1], w0[2], offset); + w3[1] = hc_bytealign_be_S (w0[0], w0[1], offset); + w3[0] = hc_bytealign_be_S ( 0, w0[0], offset); w2[3] = 0; w2[2] = 0; w2[1] = 0; @@ -33372,9 +38773,23 @@ DECLSPEC void switch_buffer_by_offset_le_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, break; case 13: - w3[3] = hc_byte_perm_S (w0[1], w0[2], selector); - w3[2] = hc_byte_perm_S (w0[0], w0[1], selector); - w3[1] = hc_byte_perm_S ( 0, w0[0], selector); + c3[1] = hc_bytealign_be_S (w3[3], 0, offset); + c3[0] = hc_bytealign_be_S (w3[2], w3[3], offset); + c2[3] = hc_bytealign_be_S (w3[1], w3[2], offset); + c2[2] = hc_bytealign_be_S (w3[0], w3[1], offset); + c2[1] = hc_bytealign_be_S (w2[3], w3[0], offset); + c2[0] = hc_bytealign_be_S (w2[2], w2[3], offset); + c1[3] = hc_bytealign_be_S (w2[1], w2[2], offset); + c1[2] = hc_bytealign_be_S (w2[0], w2[1], offset); + c1[1] = 
hc_bytealign_be_S (w1[3], w2[0], offset); + c1[0] = hc_bytealign_be_S (w1[2], w1[3], offset); + c0[3] = hc_bytealign_be_S (w1[1], w1[2], offset); + c0[2] = hc_bytealign_be_S (w1[0], w1[1], offset); + c0[1] = hc_bytealign_be_S (w0[3], w1[0], offset); + c0[0] = hc_bytealign_be_S (w0[2], w0[3], offset); + w3[3] = hc_bytealign_be_S (w0[1], w0[2], offset); + w3[2] = hc_bytealign_be_S (w0[0], w0[1], offset); + w3[1] = hc_bytealign_be_S ( 0, w0[0], offset); w3[0] = 0; w2[3] = 0; w2[2] = 0; @@ -33392,8 +38807,23 @@ DECLSPEC void switch_buffer_by_offset_le_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, break; case 14: - w3[3] = hc_byte_perm_S (w0[0], w0[1], selector); - w3[2] = hc_byte_perm_S ( 0, w0[0], selector); + c3[2] = hc_bytealign_be_S (w3[3], 0, offset); + c3[1] = hc_bytealign_be_S (w3[2], w3[3], offset); + c3[0] = hc_bytealign_be_S (w3[1], w3[2], offset); + c2[3] = hc_bytealign_be_S (w3[0], w3[1], offset); + c2[2] = hc_bytealign_be_S (w2[3], w3[0], offset); + c2[1] = hc_bytealign_be_S (w2[2], w2[3], offset); + c2[0] = hc_bytealign_be_S (w2[1], w2[2], offset); + c1[3] = hc_bytealign_be_S (w2[0], w2[1], offset); + c1[2] = hc_bytealign_be_S (w1[3], w2[0], offset); + c1[1] = hc_bytealign_be_S (w1[2], w1[3], offset); + c1[0] = hc_bytealign_be_S (w1[1], w1[2], offset); + c0[3] = hc_bytealign_be_S (w1[0], w1[1], offset); + c0[2] = hc_bytealign_be_S (w0[3], w1[0], offset); + c0[1] = hc_bytealign_be_S (w0[2], w0[3], offset); + c0[0] = hc_bytealign_be_S (w0[1], w0[2], offset); + w3[3] = hc_bytealign_be_S (w0[0], w0[1], offset); + w3[2] = hc_bytealign_be_S ( 0, w0[0], offset); w3[1] = 0; w3[0] = 0; w2[3] = 0; @@ -33412,7 +38842,23 @@ DECLSPEC void switch_buffer_by_offset_le_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, break; case 15: - w3[3] = hc_byte_perm_S ( 0, w0[0], selector); + c3[3] = hc_bytealign_be_S (w3[3], 0, offset); + c3[2] = hc_bytealign_be_S (w3[2], w3[3], offset); + c3[1] = hc_bytealign_be_S (w3[1], w3[2], offset); + c3[0] = hc_bytealign_be_S (w3[0], w3[1], offset); + c2[3] 
= hc_bytealign_be_S (w2[3], w3[0], offset); + c2[2] = hc_bytealign_be_S (w2[2], w2[3], offset); + c2[1] = hc_bytealign_be_S (w2[1], w2[2], offset); + c2[0] = hc_bytealign_be_S (w2[0], w2[1], offset); + c1[3] = hc_bytealign_be_S (w1[3], w2[0], offset); + c1[2] = hc_bytealign_be_S (w1[2], w1[3], offset); + c1[1] = hc_bytealign_be_S (w1[1], w1[2], offset); + c1[0] = hc_bytealign_be_S (w1[0], w1[1], offset); + c0[3] = hc_bytealign_be_S (w0[3], w1[0], offset); + c0[2] = hc_bytealign_be_S (w0[2], w0[3], offset); + c0[1] = hc_bytealign_be_S (w0[1], w0[2], offset); + c0[0] = hc_bytealign_be_S (w0[0], w0[1], offset); + w3[3] = hc_bytealign_be_S ( 0, w0[0], offset); w3[2] = 0; w3[1] = 0; w3[0] = 0; @@ -33432,99 +38878,103 @@ DECLSPEC void switch_buffer_by_offset_le_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, break; } #endif -} -DECLSPEC void switch_buffer_by_offset_carry_le_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, u32 *c0, u32 *c1, u32 *c2, u32 *c3, const u32 offset) -{ - const int offset_switch = offset / 4; + #if (defined IS_AMD && HAS_VPERM == 1) || defined IS_NV + + #if defined IS_NV + const int selector = (0x76543210 >> ((offset & 3) * 4)) & 0xffff; + #endif + + #if defined IS_AMD + const int selector = 0x0706050403020100 >> ((offset & 3) * 8); + #endif - #if defined IS_AMD || defined IS_GENERIC switch (offset_switch) { case 0: - c0[0] = hc_bytealign_S (w3[3], 0, offset); - w3[3] = hc_bytealign_S (w3[2], w3[3], offset); - w3[2] = hc_bytealign_S (w3[1], w3[2], offset); - w3[1] = hc_bytealign_S (w3[0], w3[1], offset); - w3[0] = hc_bytealign_S (w2[3], w3[0], offset); - w2[3] = hc_bytealign_S (w2[2], w2[3], offset); - w2[2] = hc_bytealign_S (w2[1], w2[2], offset); - w2[1] = hc_bytealign_S (w2[0], w2[1], offset); - w2[0] = hc_bytealign_S (w1[3], w2[0], offset); - w1[3] = hc_bytealign_S (w1[2], w1[3], offset); - w1[2] = hc_bytealign_S (w1[1], w1[2], offset); - w1[1] = hc_bytealign_S (w1[0], w1[1], offset); - w1[0] = hc_bytealign_S (w0[3], w1[0], offset); - w0[3] = hc_bytealign_S 
(w0[2], w0[3], offset); - w0[2] = hc_bytealign_S (w0[1], w0[2], offset); - w0[1] = hc_bytealign_S (w0[0], w0[1], offset); - w0[0] = hc_bytealign_S ( 0, w0[0], offset); + c0[0] = hc_byte_perm_S ( 0, w3[3], selector); + w3[3] = hc_byte_perm_S (w3[3], w3[2], selector); + w3[2] = hc_byte_perm_S (w3[2], w3[1], selector); + w3[1] = hc_byte_perm_S (w3[1], w3[0], selector); + w3[0] = hc_byte_perm_S (w3[0], w2[3], selector); + w2[3] = hc_byte_perm_S (w2[3], w2[2], selector); + w2[2] = hc_byte_perm_S (w2[2], w2[1], selector); + w2[1] = hc_byte_perm_S (w2[1], w2[0], selector); + w2[0] = hc_byte_perm_S (w2[0], w1[3], selector); + w1[3] = hc_byte_perm_S (w1[3], w1[2], selector); + w1[2] = hc_byte_perm_S (w1[2], w1[1], selector); + w1[1] = hc_byte_perm_S (w1[1], w1[0], selector); + w1[0] = hc_byte_perm_S (w1[0], w0[3], selector); + w0[3] = hc_byte_perm_S (w0[3], w0[2], selector); + w0[2] = hc_byte_perm_S (w0[2], w0[1], selector); + w0[1] = hc_byte_perm_S (w0[1], w0[0], selector); + w0[0] = hc_byte_perm_S (w0[0], 0, selector); break; case 1: - c0[1] = hc_bytealign_S (w3[3], 0, offset); - c0[0] = hc_bytealign_S (w3[2], w3[3], offset); - w3[3] = hc_bytealign_S (w3[1], w3[2], offset); - w3[2] = hc_bytealign_S (w3[0], w3[1], offset); - w3[1] = hc_bytealign_S (w2[3], w3[0], offset); - w3[0] = hc_bytealign_S (w2[2], w2[3], offset); - w2[3] = hc_bytealign_S (w2[1], w2[2], offset); - w2[2] = hc_bytealign_S (w2[0], w2[1], offset); - w2[1] = hc_bytealign_S (w1[3], w2[0], offset); - w2[0] = hc_bytealign_S (w1[2], w1[3], offset); - w1[3] = hc_bytealign_S (w1[1], w1[2], offset); - w1[2] = hc_bytealign_S (w1[0], w1[1], offset); - w1[1] = hc_bytealign_S (w0[3], w1[0], offset); - w1[0] = hc_bytealign_S (w0[2], w0[3], offset); - w0[3] = hc_bytealign_S (w0[1], w0[2], offset); - w0[2] = hc_bytealign_S (w0[0], w0[1], offset); - w0[1] = hc_bytealign_S ( 0, w0[0], offset); + c0[1] = hc_byte_perm_S ( 0, w3[3], selector); + c0[0] = hc_byte_perm_S (w3[3], w3[2], selector); + w3[3] = hc_byte_perm_S 
(w3[2], w3[1], selector); + w3[2] = hc_byte_perm_S (w3[1], w3[0], selector); + w3[1] = hc_byte_perm_S (w3[0], w2[3], selector); + w3[0] = hc_byte_perm_S (w2[3], w2[2], selector); + w2[3] = hc_byte_perm_S (w2[2], w2[1], selector); + w2[2] = hc_byte_perm_S (w2[1], w2[0], selector); + w2[1] = hc_byte_perm_S (w2[0], w1[3], selector); + w2[0] = hc_byte_perm_S (w1[3], w1[2], selector); + w1[3] = hc_byte_perm_S (w1[2], w1[1], selector); + w1[2] = hc_byte_perm_S (w1[1], w1[0], selector); + w1[1] = hc_byte_perm_S (w1[0], w0[3], selector); + w1[0] = hc_byte_perm_S (w0[3], w0[2], selector); + w0[3] = hc_byte_perm_S (w0[2], w0[1], selector); + w0[2] = hc_byte_perm_S (w0[1], w0[0], selector); + w0[1] = hc_byte_perm_S (w0[0], 0, selector); w0[0] = 0; break; case 2: - c0[2] = hc_bytealign_S (w3[3], 0, offset); - c0[1] = hc_bytealign_S (w3[2], w3[3], offset); - c0[0] = hc_bytealign_S (w3[1], w3[2], offset); - w3[3] = hc_bytealign_S (w3[0], w3[1], offset); - w3[2] = hc_bytealign_S (w2[3], w3[0], offset); - w3[1] = hc_bytealign_S (w2[2], w2[3], offset); - w3[0] = hc_bytealign_S (w2[1], w2[2], offset); - w2[3] = hc_bytealign_S (w2[0], w2[1], offset); - w2[2] = hc_bytealign_S (w1[3], w2[0], offset); - w2[1] = hc_bytealign_S (w1[2], w1[3], offset); - w2[0] = hc_bytealign_S (w1[1], w1[2], offset); - w1[3] = hc_bytealign_S (w1[0], w1[1], offset); - w1[2] = hc_bytealign_S (w0[3], w1[0], offset); - w1[1] = hc_bytealign_S (w0[2], w0[3], offset); - w1[0] = hc_bytealign_S (w0[1], w0[2], offset); - w0[3] = hc_bytealign_S (w0[0], w0[1], offset); - w0[2] = hc_bytealign_S ( 0, w0[0], offset); + c0[2] = hc_byte_perm_S ( 0, w3[3], selector); + c0[1] = hc_byte_perm_S (w3[3], w3[2], selector); + c0[0] = hc_byte_perm_S (w3[2], w3[1], selector); + w3[3] = hc_byte_perm_S (w3[1], w3[0], selector); + w3[2] = hc_byte_perm_S (w3[0], w2[3], selector); + w3[1] = hc_byte_perm_S (w2[3], w2[2], selector); + w3[0] = hc_byte_perm_S (w2[2], w2[1], selector); + w2[3] = hc_byte_perm_S (w2[1], w2[0], selector); + 
w2[2] = hc_byte_perm_S (w2[0], w1[3], selector); + w2[1] = hc_byte_perm_S (w1[3], w1[2], selector); + w2[0] = hc_byte_perm_S (w1[2], w1[1], selector); + w1[3] = hc_byte_perm_S (w1[1], w1[0], selector); + w1[2] = hc_byte_perm_S (w1[0], w0[3], selector); + w1[1] = hc_byte_perm_S (w0[3], w0[2], selector); + w1[0] = hc_byte_perm_S (w0[2], w0[1], selector); + w0[3] = hc_byte_perm_S (w0[1], w0[0], selector); + w0[2] = hc_byte_perm_S (w0[0], 0, selector); w0[1] = 0; w0[0] = 0; break; case 3: - c0[3] = hc_bytealign_S (w3[3], 0, offset); - c0[2] = hc_bytealign_S (w3[2], w3[3], offset); - c0[1] = hc_bytealign_S (w3[1], w3[2], offset); - c0[0] = hc_bytealign_S (w3[0], w3[1], offset); - w3[3] = hc_bytealign_S (w2[3], w3[0], offset); - w3[2] = hc_bytealign_S (w2[2], w2[3], offset); - w3[1] = hc_bytealign_S (w2[1], w2[2], offset); - w3[0] = hc_bytealign_S (w2[0], w2[1], offset); - w2[3] = hc_bytealign_S (w1[3], w2[0], offset); - w2[2] = hc_bytealign_S (w1[2], w1[3], offset); - w2[1] = hc_bytealign_S (w1[1], w1[2], offset); - w2[0] = hc_bytealign_S (w1[0], w1[1], offset); - w1[3] = hc_bytealign_S (w0[3], w1[0], offset); - w1[2] = hc_bytealign_S (w0[2], w0[3], offset); - w1[1] = hc_bytealign_S (w0[1], w0[2], offset); - w1[0] = hc_bytealign_S (w0[0], w0[1], offset); - w0[3] = hc_bytealign_S ( 0, w0[0], offset); + c0[3] = hc_byte_perm_S ( 0, w3[3], selector); + c0[2] = hc_byte_perm_S (w3[3], w3[2], selector); + c0[1] = hc_byte_perm_S (w3[2], w3[1], selector); + c0[0] = hc_byte_perm_S (w3[1], w3[0], selector); + w3[3] = hc_byte_perm_S (w3[0], w2[3], selector); + w3[2] = hc_byte_perm_S (w2[3], w2[2], selector); + w3[1] = hc_byte_perm_S (w2[2], w2[1], selector); + w3[0] = hc_byte_perm_S (w2[1], w2[0], selector); + w2[3] = hc_byte_perm_S (w2[0], w1[3], selector); + w2[2] = hc_byte_perm_S (w1[3], w1[2], selector); + w2[1] = hc_byte_perm_S (w1[2], w1[1], selector); + w2[0] = hc_byte_perm_S (w1[1], w1[0], selector); + w1[3] = hc_byte_perm_S (w1[0], w0[3], selector); + w1[2] = 
hc_byte_perm_S (w0[3], w0[2], selector); + w1[1] = hc_byte_perm_S (w0[2], w0[1], selector); + w1[0] = hc_byte_perm_S (w0[1], w0[0], selector); + w0[3] = hc_byte_perm_S (w0[0], 0, selector); w0[2] = 0; w0[1] = 0; w0[0] = 0; @@ -33532,48 +38982,48 @@ DECLSPEC void switch_buffer_by_offset_carry_le_S (u32 *w0, u32 *w1, u32 *w2, u32 break; case 4: - c1[0] = hc_bytealign_S (w3[3], 0, offset); - c0[3] = hc_bytealign_S (w3[2], w3[3], offset); - c0[2] = hc_bytealign_S (w3[1], w3[2], offset); - c0[1] = hc_bytealign_S (w3[0], w3[1], offset); - c0[0] = hc_bytealign_S (w2[3], w3[0], offset); - w3[3] = hc_bytealign_S (w2[2], w2[3], offset); - w3[2] = hc_bytealign_S (w2[1], w2[2], offset); - w3[1] = hc_bytealign_S (w2[0], w2[1], offset); - w3[0] = hc_bytealign_S (w1[3], w2[0], offset); - w2[3] = hc_bytealign_S (w1[2], w1[3], offset); - w2[2] = hc_bytealign_S (w1[1], w1[2], offset); - w2[1] = hc_bytealign_S (w1[0], w1[1], offset); - w2[0] = hc_bytealign_S (w0[3], w1[0], offset); - w1[3] = hc_bytealign_S (w0[2], w0[3], offset); - w1[2] = hc_bytealign_S (w0[1], w0[2], offset); - w1[1] = hc_bytealign_S (w0[0], w0[1], offset); - w1[0] = hc_bytealign_S ( 0, w0[0], offset); + c1[0] = hc_byte_perm_S ( 0, w3[3], selector); + c0[3] = hc_byte_perm_S (w3[3], w3[2], selector); + c0[2] = hc_byte_perm_S (w3[2], w3[1], selector); + c0[1] = hc_byte_perm_S (w3[1], w3[0], selector); + c0[0] = hc_byte_perm_S (w3[0], w2[3], selector); + w3[3] = hc_byte_perm_S (w2[3], w2[2], selector); + w3[2] = hc_byte_perm_S (w2[2], w2[1], selector); + w3[1] = hc_byte_perm_S (w2[1], w2[0], selector); + w3[0] = hc_byte_perm_S (w2[0], w1[3], selector); + w2[3] = hc_byte_perm_S (w1[3], w1[2], selector); + w2[2] = hc_byte_perm_S (w1[2], w1[1], selector); + w2[1] = hc_byte_perm_S (w1[1], w1[0], selector); + w2[0] = hc_byte_perm_S (w1[0], w0[3], selector); + w1[3] = hc_byte_perm_S (w0[3], w0[2], selector); + w1[2] = hc_byte_perm_S (w0[2], w0[1], selector); + w1[1] = hc_byte_perm_S (w0[1], w0[0], selector); + w1[0] = 
hc_byte_perm_S (w0[0], 0, selector); w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; - - break; - - case 5: - c1[1] = hc_bytealign_S (w3[3], 0, offset); - c1[0] = hc_bytealign_S (w3[2], w3[3], offset); - c0[3] = hc_bytealign_S (w3[1], w3[2], offset); - c0[2] = hc_bytealign_S (w3[0], w3[1], offset); - c0[1] = hc_bytealign_S (w2[3], w3[0], offset); - c0[0] = hc_bytealign_S (w2[2], w2[3], offset); - w3[3] = hc_bytealign_S (w2[1], w2[2], offset); - w3[2] = hc_bytealign_S (w2[0], w2[1], offset); - w3[1] = hc_bytealign_S (w1[3], w2[0], offset); - w3[0] = hc_bytealign_S (w1[2], w1[3], offset); - w2[3] = hc_bytealign_S (w1[1], w1[2], offset); - w2[2] = hc_bytealign_S (w1[0], w1[1], offset); - w2[1] = hc_bytealign_S (w0[3], w1[0], offset); - w2[0] = hc_bytealign_S (w0[2], w0[3], offset); - w1[3] = hc_bytealign_S (w0[1], w0[2], offset); - w1[2] = hc_bytealign_S (w0[0], w0[1], offset); - w1[1] = hc_bytealign_S ( 0, w0[0], offset); + + break; + + case 5: + c1[1] = hc_byte_perm_S ( 0, w3[3], selector); + c1[0] = hc_byte_perm_S (w3[3], w3[2], selector); + c0[3] = hc_byte_perm_S (w3[2], w3[1], selector); + c0[2] = hc_byte_perm_S (w3[1], w3[0], selector); + c0[1] = hc_byte_perm_S (w3[0], w2[3], selector); + c0[0] = hc_byte_perm_S (w2[3], w2[2], selector); + w3[3] = hc_byte_perm_S (w2[2], w2[1], selector); + w3[2] = hc_byte_perm_S (w2[1], w2[0], selector); + w3[1] = hc_byte_perm_S (w2[0], w1[3], selector); + w3[0] = hc_byte_perm_S (w1[3], w1[2], selector); + w2[3] = hc_byte_perm_S (w1[2], w1[1], selector); + w2[2] = hc_byte_perm_S (w1[1], w1[0], selector); + w2[1] = hc_byte_perm_S (w1[0], w0[3], selector); + w2[0] = hc_byte_perm_S (w0[3], w0[2], selector); + w1[3] = hc_byte_perm_S (w0[2], w0[1], selector); + w1[2] = hc_byte_perm_S (w0[1], w0[0], selector); + w1[1] = hc_byte_perm_S (w0[0], 0, selector); w1[0] = 0; w0[3] = 0; w0[2] = 0; @@ -33583,23 +39033,23 @@ DECLSPEC void switch_buffer_by_offset_carry_le_S (u32 *w0, u32 *w1, u32 *w2, u32 break; case 6: - c1[2] = hc_bytealign_S 
(w3[3], 0, offset); - c1[1] = hc_bytealign_S (w3[2], w3[3], offset); - c1[0] = hc_bytealign_S (w3[1], w3[2], offset); - c0[3] = hc_bytealign_S (w3[0], w3[1], offset); - c0[2] = hc_bytealign_S (w2[3], w3[0], offset); - c0[1] = hc_bytealign_S (w2[2], w2[3], offset); - c0[0] = hc_bytealign_S (w2[1], w2[2], offset); - w3[3] = hc_bytealign_S (w2[0], w2[1], offset); - w3[2] = hc_bytealign_S (w1[3], w2[0], offset); - w3[1] = hc_bytealign_S (w1[2], w1[3], offset); - w3[0] = hc_bytealign_S (w1[1], w1[2], offset); - w2[3] = hc_bytealign_S (w1[0], w1[1], offset); - w2[2] = hc_bytealign_S (w0[3], w1[0], offset); - w2[1] = hc_bytealign_S (w0[2], w0[3], offset); - w2[0] = hc_bytealign_S (w0[1], w0[2], offset); - w1[3] = hc_bytealign_S (w0[0], w0[1], offset); - w1[2] = hc_bytealign_S ( 0, w0[0], offset); + c1[2] = hc_byte_perm_S ( 0, w3[3], selector); + c1[1] = hc_byte_perm_S (w3[3], w3[2], selector); + c1[0] = hc_byte_perm_S (w3[2], w3[1], selector); + c0[3] = hc_byte_perm_S (w3[1], w3[0], selector); + c0[2] = hc_byte_perm_S (w3[0], w2[3], selector); + c0[1] = hc_byte_perm_S (w2[3], w2[2], selector); + c0[0] = hc_byte_perm_S (w2[2], w2[1], selector); + w3[3] = hc_byte_perm_S (w2[1], w2[0], selector); + w3[2] = hc_byte_perm_S (w2[0], w1[3], selector); + w3[1] = hc_byte_perm_S (w1[3], w1[2], selector); + w3[0] = hc_byte_perm_S (w1[2], w1[1], selector); + w2[3] = hc_byte_perm_S (w1[1], w1[0], selector); + w2[2] = hc_byte_perm_S (w1[0], w0[3], selector); + w2[1] = hc_byte_perm_S (w0[3], w0[2], selector); + w2[0] = hc_byte_perm_S (w0[2], w0[1], selector); + w1[3] = hc_byte_perm_S (w0[1], w0[0], selector); + w1[2] = hc_byte_perm_S (w0[0], 0, selector); w1[1] = 0; w1[0] = 0; w0[3] = 0; @@ -33610,23 +39060,23 @@ DECLSPEC void switch_buffer_by_offset_carry_le_S (u32 *w0, u32 *w1, u32 *w2, u32 break; case 7: - c1[3] = hc_bytealign_S (w3[3], 0, offset); - c1[2] = hc_bytealign_S (w3[2], w3[3], offset); - c1[1] = hc_bytealign_S (w3[1], w3[2], offset); - c1[0] = hc_bytealign_S (w3[0], w3[1], 
offset); - c0[3] = hc_bytealign_S (w2[3], w3[0], offset); - c0[2] = hc_bytealign_S (w2[2], w2[3], offset); - c0[1] = hc_bytealign_S (w2[1], w2[2], offset); - c0[0] = hc_bytealign_S (w2[0], w2[1], offset); - w3[3] = hc_bytealign_S (w1[3], w2[0], offset); - w3[2] = hc_bytealign_S (w1[2], w1[3], offset); - w3[1] = hc_bytealign_S (w1[1], w1[2], offset); - w3[0] = hc_bytealign_S (w1[0], w1[1], offset); - w2[3] = hc_bytealign_S (w0[3], w1[0], offset); - w2[2] = hc_bytealign_S (w0[2], w0[3], offset); - w2[1] = hc_bytealign_S (w0[1], w0[2], offset); - w2[0] = hc_bytealign_S (w0[0], w0[1], offset); - w1[3] = hc_bytealign_S ( 0, w0[0], offset); + c1[3] = hc_byte_perm_S ( 0, w3[3], selector); + c1[2] = hc_byte_perm_S (w3[3], w3[2], selector); + c1[1] = hc_byte_perm_S (w3[2], w3[1], selector); + c1[0] = hc_byte_perm_S (w3[1], w3[0], selector); + c0[3] = hc_byte_perm_S (w3[0], w2[3], selector); + c0[2] = hc_byte_perm_S (w2[3], w2[2], selector); + c0[1] = hc_byte_perm_S (w2[2], w2[1], selector); + c0[0] = hc_byte_perm_S (w2[1], w2[0], selector); + w3[3] = hc_byte_perm_S (w2[0], w1[3], selector); + w3[2] = hc_byte_perm_S (w1[3], w1[2], selector); + w3[1] = hc_byte_perm_S (w1[2], w1[1], selector); + w3[0] = hc_byte_perm_S (w1[1], w1[0], selector); + w2[3] = hc_byte_perm_S (w1[0], w0[3], selector); + w2[2] = hc_byte_perm_S (w0[3], w0[2], selector); + w2[1] = hc_byte_perm_S (w0[2], w0[1], selector); + w2[0] = hc_byte_perm_S (w0[1], w0[0], selector); + w1[3] = hc_byte_perm_S (w0[0], 0, selector); w1[2] = 0; w1[1] = 0; w1[0] = 0; @@ -33638,23 +39088,23 @@ DECLSPEC void switch_buffer_by_offset_carry_le_S (u32 *w0, u32 *w1, u32 *w2, u32 break; case 8: - c2[0] = hc_bytealign_S (w3[3], 0, offset); - c1[3] = hc_bytealign_S (w3[2], w3[3], offset); - c1[2] = hc_bytealign_S (w3[1], w3[2], offset); - c1[1] = hc_bytealign_S (w3[0], w3[1], offset); - c1[0] = hc_bytealign_S (w2[3], w3[0], offset); - c0[3] = hc_bytealign_S (w2[2], w2[3], offset); - c0[2] = hc_bytealign_S (w2[1], w2[2], offset); - 
c0[1] = hc_bytealign_S (w2[0], w2[1], offset); - c0[0] = hc_bytealign_S (w1[3], w2[0], offset); - w3[3] = hc_bytealign_S (w1[2], w1[3], offset); - w3[2] = hc_bytealign_S (w1[1], w1[2], offset); - w3[1] = hc_bytealign_S (w1[0], w1[1], offset); - w3[0] = hc_bytealign_S (w0[3], w1[0], offset); - w2[3] = hc_bytealign_S (w0[2], w0[3], offset); - w2[2] = hc_bytealign_S (w0[1], w0[2], offset); - w2[1] = hc_bytealign_S (w0[0], w0[1], offset); - w2[0] = hc_bytealign_S ( 0, w0[0], offset); + c2[0] = hc_byte_perm_S ( 0, w3[3], selector); + c1[3] = hc_byte_perm_S (w3[3], w3[2], selector); + c1[2] = hc_byte_perm_S (w3[2], w3[1], selector); + c1[1] = hc_byte_perm_S (w3[1], w3[0], selector); + c1[0] = hc_byte_perm_S (w3[0], w2[3], selector); + c0[3] = hc_byte_perm_S (w2[3], w2[2], selector); + c0[2] = hc_byte_perm_S (w2[2], w2[1], selector); + c0[1] = hc_byte_perm_S (w2[1], w2[0], selector); + c0[0] = hc_byte_perm_S (w2[0], w1[3], selector); + w3[3] = hc_byte_perm_S (w1[3], w1[2], selector); + w3[2] = hc_byte_perm_S (w1[2], w1[1], selector); + w3[1] = hc_byte_perm_S (w1[1], w1[0], selector); + w3[0] = hc_byte_perm_S (w1[0], w0[3], selector); + w2[3] = hc_byte_perm_S (w0[3], w0[2], selector); + w2[2] = hc_byte_perm_S (w0[2], w0[1], selector); + w2[1] = hc_byte_perm_S (w0[1], w0[0], selector); + w2[0] = hc_byte_perm_S (w0[0], 0, selector); w1[3] = 0; w1[2] = 0; w1[1] = 0; @@ -33667,23 +39117,23 @@ DECLSPEC void switch_buffer_by_offset_carry_le_S (u32 *w0, u32 *w1, u32 *w2, u32 break; case 9: - c2[1] = hc_bytealign_S (w3[3], 0, offset); - c2[0] = hc_bytealign_S (w3[2], w3[3], offset); - c1[3] = hc_bytealign_S (w3[1], w3[2], offset); - c1[2] = hc_bytealign_S (w3[0], w3[1], offset); - c1[1] = hc_bytealign_S (w2[3], w3[0], offset); - c1[0] = hc_bytealign_S (w2[2], w2[3], offset); - c0[3] = hc_bytealign_S (w2[1], w2[2], offset); - c0[2] = hc_bytealign_S (w2[0], w2[1], offset); - c0[1] = hc_bytealign_S (w1[3], w2[0], offset); - c0[0] = hc_bytealign_S (w1[2], w1[3], offset); - w3[3] = 
hc_bytealign_S (w1[1], w1[2], offset); - w3[2] = hc_bytealign_S (w1[0], w1[1], offset); - w3[1] = hc_bytealign_S (w0[3], w1[0], offset); - w3[0] = hc_bytealign_S (w0[2], w0[3], offset); - w2[3] = hc_bytealign_S (w0[1], w0[2], offset); - w2[2] = hc_bytealign_S (w0[0], w0[1], offset); - w2[1] = hc_bytealign_S ( 0, w0[0], offset); + c2[1] = hc_byte_perm_S ( 0, w3[3], selector); + c2[0] = hc_byte_perm_S (w3[3], w3[2], selector); + c1[3] = hc_byte_perm_S (w3[2], w3[1], selector); + c1[2] = hc_byte_perm_S (w3[1], w3[0], selector); + c1[1] = hc_byte_perm_S (w3[0], w2[3], selector); + c1[0] = hc_byte_perm_S (w2[3], w2[2], selector); + c0[3] = hc_byte_perm_S (w2[2], w2[1], selector); + c0[2] = hc_byte_perm_S (w2[1], w2[0], selector); + c0[1] = hc_byte_perm_S (w2[0], w1[3], selector); + c0[0] = hc_byte_perm_S (w1[3], w1[2], selector); + w3[3] = hc_byte_perm_S (w1[2], w1[1], selector); + w3[2] = hc_byte_perm_S (w1[1], w1[0], selector); + w3[1] = hc_byte_perm_S (w1[0], w0[3], selector); + w3[0] = hc_byte_perm_S (w0[3], w0[2], selector); + w2[3] = hc_byte_perm_S (w0[2], w0[1], selector); + w2[2] = hc_byte_perm_S (w0[1], w0[0], selector); + w2[1] = hc_byte_perm_S (w0[0], 0, selector); w2[0] = 0; w1[3] = 0; w1[2] = 0; @@ -33697,23 +39147,23 @@ DECLSPEC void switch_buffer_by_offset_carry_le_S (u32 *w0, u32 *w1, u32 *w2, u32 break; case 10: - c2[2] = hc_bytealign_S (w3[3], 0, offset); - c2[1] = hc_bytealign_S (w3[2], w3[3], offset); - c2[0] = hc_bytealign_S (w3[1], w3[2], offset); - c1[3] = hc_bytealign_S (w3[0], w3[1], offset); - c1[2] = hc_bytealign_S (w2[3], w3[0], offset); - c1[1] = hc_bytealign_S (w2[2], w2[3], offset); - c1[0] = hc_bytealign_S (w2[1], w2[2], offset); - c0[3] = hc_bytealign_S (w2[0], w2[1], offset); - c0[2] = hc_bytealign_S (w1[3], w2[0], offset); - c0[1] = hc_bytealign_S (w1[2], w1[3], offset); - c0[0] = hc_bytealign_S (w1[1], w1[2], offset); - w3[3] = hc_bytealign_S (w1[0], w1[1], offset); - w3[2] = hc_bytealign_S (w0[3], w1[0], offset); - w3[1] = 
hc_bytealign_S (w0[2], w0[3], offset); - w3[0] = hc_bytealign_S (w0[1], w0[2], offset); - w2[3] = hc_bytealign_S (w0[0], w0[1], offset); - w2[2] = hc_bytealign_S ( 0, w0[0], offset); + c2[2] = hc_byte_perm_S ( 0, w3[3], selector); + c2[1] = hc_byte_perm_S (w3[3], w3[2], selector); + c2[0] = hc_byte_perm_S (w3[2], w3[1], selector); + c1[3] = hc_byte_perm_S (w3[1], w3[0], selector); + c1[2] = hc_byte_perm_S (w3[0], w2[3], selector); + c1[1] = hc_byte_perm_S (w2[3], w2[2], selector); + c1[0] = hc_byte_perm_S (w2[2], w2[1], selector); + c0[3] = hc_byte_perm_S (w2[1], w2[0], selector); + c0[2] = hc_byte_perm_S (w2[0], w1[3], selector); + c0[1] = hc_byte_perm_S (w1[3], w1[2], selector); + c0[0] = hc_byte_perm_S (w1[2], w1[1], selector); + w3[3] = hc_byte_perm_S (w1[1], w1[0], selector); + w3[2] = hc_byte_perm_S (w1[0], w0[3], selector); + w3[1] = hc_byte_perm_S (w0[3], w0[2], selector); + w3[0] = hc_byte_perm_S (w0[2], w0[1], selector); + w2[3] = hc_byte_perm_S (w0[1], w0[0], selector); + w2[2] = hc_byte_perm_S (w0[0], 0, selector); w2[1] = 0; w2[0] = 0; w1[3] = 0; @@ -33728,55 +39178,55 @@ DECLSPEC void switch_buffer_by_offset_carry_le_S (u32 *w0, u32 *w1, u32 *w2, u32 break; case 11: - c2[3] = hc_bytealign_S (w3[3], 0, offset); - c2[2] = hc_bytealign_S (w3[2], w3[3], offset); - c2[1] = hc_bytealign_S (w3[1], w3[2], offset); - c2[0] = hc_bytealign_S (w3[0], w3[1], offset); - c1[3] = hc_bytealign_S (w2[3], w3[0], offset); - c1[2] = hc_bytealign_S (w2[2], w2[3], offset); - c1[1] = hc_bytealign_S (w2[1], w2[2], offset); - c1[0] = hc_bytealign_S (w2[0], w2[1], offset); - c0[3] = hc_bytealign_S (w1[3], w2[0], offset); - c0[2] = hc_bytealign_S (w1[2], w1[3], offset); - c0[1] = hc_bytealign_S (w1[1], w1[2], offset); - c0[0] = hc_bytealign_S (w1[0], w1[1], offset); - w3[3] = hc_bytealign_S (w0[3], w1[0], offset); - w3[2] = hc_bytealign_S (w0[2], w0[3], offset); - w3[1] = hc_bytealign_S (w0[1], w0[2], offset); - w3[0] = hc_bytealign_S (w0[0], w0[1], offset); - w2[3] = 
hc_bytealign_S ( 0, w0[0], offset); + c2[3] = hc_byte_perm_S ( 0, w3[3], selector); + c2[2] = hc_byte_perm_S (w3[3], w3[2], selector); + c2[1] = hc_byte_perm_S (w3[2], w3[1], selector); + c2[0] = hc_byte_perm_S (w3[1], w3[0], selector); + c1[3] = hc_byte_perm_S (w3[0], w2[3], selector); + c1[2] = hc_byte_perm_S (w2[3], w2[2], selector); + c1[1] = hc_byte_perm_S (w2[2], w2[1], selector); + c1[0] = hc_byte_perm_S (w2[1], w2[0], selector); + c0[3] = hc_byte_perm_S (w2[0], w1[3], selector); + c0[2] = hc_byte_perm_S (w1[3], w1[2], selector); + c0[1] = hc_byte_perm_S (w1[2], w1[1], selector); + c0[0] = hc_byte_perm_S (w1[1], w1[0], selector); + w3[3] = hc_byte_perm_S (w1[0], w0[3], selector); + w3[2] = hc_byte_perm_S (w0[3], w0[2], selector); + w3[1] = hc_byte_perm_S (w0[2], w0[1], selector); + w3[0] = hc_byte_perm_S (w0[1], w0[0], selector); + w2[3] = hc_byte_perm_S (w0[0], 0, selector); w2[2] = 0; - w2[1] = 0; - w2[0] = 0; - w1[3] = 0; - w1[2] = 0; - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; - - break; - - case 12: - c3[0] = hc_bytealign_S (w3[3], 0, offset); - c2[3] = hc_bytealign_S (w3[2], w3[3], offset); - c2[2] = hc_bytealign_S (w3[1], w3[2], offset); - c2[1] = hc_bytealign_S (w3[0], w3[1], offset); - c2[0] = hc_bytealign_S (w2[3], w3[0], offset); - c1[3] = hc_bytealign_S (w2[2], w2[3], offset); - c1[2] = hc_bytealign_S (w2[1], w2[2], offset); - c1[1] = hc_bytealign_S (w2[0], w2[1], offset); - c1[0] = hc_bytealign_S (w1[3], w2[0], offset); - c0[3] = hc_bytealign_S (w1[2], w1[3], offset); - c0[2] = hc_bytealign_S (w1[1], w1[2], offset); - c0[1] = hc_bytealign_S (w1[0], w1[1], offset); - c0[0] = hc_bytealign_S (w0[3], w1[0], offset); - w3[3] = hc_bytealign_S (w0[2], w0[3], offset); - w3[2] = hc_bytealign_S (w0[1], w0[2], offset); - w3[1] = hc_bytealign_S (w0[0], w0[1], offset); - w3[0] = hc_bytealign_S ( 0, w0[0], offset); + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] 
= 0; + w0[0] = 0; + + break; + + case 12: + c3[0] = hc_byte_perm_S ( 0, w3[3], selector); + c2[3] = hc_byte_perm_S (w3[3], w3[2], selector); + c2[2] = hc_byte_perm_S (w3[2], w3[1], selector); + c2[1] = hc_byte_perm_S (w3[1], w3[0], selector); + c2[0] = hc_byte_perm_S (w3[0], w2[3], selector); + c1[3] = hc_byte_perm_S (w2[3], w2[2], selector); + c1[2] = hc_byte_perm_S (w2[2], w2[1], selector); + c1[1] = hc_byte_perm_S (w2[1], w2[0], selector); + c1[0] = hc_byte_perm_S (w2[0], w1[3], selector); + c0[3] = hc_byte_perm_S (w1[3], w1[2], selector); + c0[2] = hc_byte_perm_S (w1[2], w1[1], selector); + c0[1] = hc_byte_perm_S (w1[1], w1[0], selector); + c0[0] = hc_byte_perm_S (w1[0], w0[3], selector); + w3[3] = hc_byte_perm_S (w0[3], w0[2], selector); + w3[2] = hc_byte_perm_S (w0[2], w0[1], selector); + w3[1] = hc_byte_perm_S (w0[1], w0[0], selector); + w3[0] = hc_byte_perm_S (w0[0], 0, selector); w2[3] = 0; w2[2] = 0; w2[1] = 0; @@ -33793,23 +39243,23 @@ DECLSPEC void switch_buffer_by_offset_carry_le_S (u32 *w0, u32 *w1, u32 *w2, u32 break; case 13: - c3[1] = hc_bytealign_S (w3[3], 0, offset); - c3[0] = hc_bytealign_S (w3[2], w3[3], offset); - c2[3] = hc_bytealign_S (w3[1], w3[2], offset); - c2[2] = hc_bytealign_S (w3[0], w3[1], offset); - c2[1] = hc_bytealign_S (w2[3], w3[0], offset); - c2[0] = hc_bytealign_S (w2[2], w2[3], offset); - c1[3] = hc_bytealign_S (w2[1], w2[2], offset); - c1[2] = hc_bytealign_S (w2[0], w2[1], offset); - c1[1] = hc_bytealign_S (w1[3], w2[0], offset); - c1[0] = hc_bytealign_S (w1[2], w1[3], offset); - c0[3] = hc_bytealign_S (w1[1], w1[2], offset); - c0[2] = hc_bytealign_S (w1[0], w1[1], offset); - c0[1] = hc_bytealign_S (w0[3], w1[0], offset); - c0[0] = hc_bytealign_S (w0[2], w0[3], offset); - w3[3] = hc_bytealign_S (w0[1], w0[2], offset); - w3[2] = hc_bytealign_S (w0[0], w0[1], offset); - w3[1] = hc_bytealign_S ( 0, w0[0], offset); + c3[1] = hc_byte_perm_S ( 0, w3[3], selector); + c3[0] = hc_byte_perm_S (w3[3], w3[2], selector); + c2[3] = 
hc_byte_perm_S (w3[2], w3[1], selector); + c2[2] = hc_byte_perm_S (w3[1], w3[0], selector); + c2[1] = hc_byte_perm_S (w3[0], w2[3], selector); + c2[0] = hc_byte_perm_S (w2[3], w2[2], selector); + c1[3] = hc_byte_perm_S (w2[2], w2[1], selector); + c1[2] = hc_byte_perm_S (w2[1], w2[0], selector); + c1[1] = hc_byte_perm_S (w2[0], w1[3], selector); + c1[0] = hc_byte_perm_S (w1[3], w1[2], selector); + c0[3] = hc_byte_perm_S (w1[2], w1[1], selector); + c0[2] = hc_byte_perm_S (w1[1], w1[0], selector); + c0[1] = hc_byte_perm_S (w1[0], w0[3], selector); + c0[0] = hc_byte_perm_S (w0[3], w0[2], selector); + w3[3] = hc_byte_perm_S (w0[2], w0[1], selector); + w3[2] = hc_byte_perm_S (w0[1], w0[0], selector); + w3[1] = hc_byte_perm_S (w0[0], 0, selector); w3[0] = 0; w2[3] = 0; w2[2] = 0; @@ -33827,23 +39277,23 @@ DECLSPEC void switch_buffer_by_offset_carry_le_S (u32 *w0, u32 *w1, u32 *w2, u32 break; case 14: - c3[2] = hc_bytealign_S (w3[3], 0, offset); - c3[1] = hc_bytealign_S (w3[2], w3[3], offset); - c3[0] = hc_bytealign_S (w3[1], w3[2], offset); - c2[3] = hc_bytealign_S (w3[0], w3[1], offset); - c2[2] = hc_bytealign_S (w2[3], w3[0], offset); - c2[1] = hc_bytealign_S (w2[2], w2[3], offset); - c2[0] = hc_bytealign_S (w2[1], w2[2], offset); - c1[3] = hc_bytealign_S (w2[0], w2[1], offset); - c1[2] = hc_bytealign_S (w1[3], w2[0], offset); - c1[1] = hc_bytealign_S (w1[2], w1[3], offset); - c1[0] = hc_bytealign_S (w1[1], w1[2], offset); - c0[3] = hc_bytealign_S (w1[0], w1[1], offset); - c0[2] = hc_bytealign_S (w0[3], w1[0], offset); - c0[1] = hc_bytealign_S (w0[2], w0[3], offset); - c0[0] = hc_bytealign_S (w0[1], w0[2], offset); - w3[3] = hc_bytealign_S (w0[0], w0[1], offset); - w3[2] = hc_bytealign_S ( 0, w0[0], offset); + c3[2] = hc_byte_perm_S ( 0, w3[3], selector); + c3[1] = hc_byte_perm_S (w3[3], w3[2], selector); + c3[0] = hc_byte_perm_S (w3[2], w3[1], selector); + c2[3] = hc_byte_perm_S (w3[1], w3[0], selector); + c2[2] = hc_byte_perm_S (w3[0], w2[3], selector); + c2[1] = 
hc_byte_perm_S (w2[3], w2[2], selector); + c2[0] = hc_byte_perm_S (w2[2], w2[1], selector); + c1[3] = hc_byte_perm_S (w2[1], w2[0], selector); + c1[2] = hc_byte_perm_S (w2[0], w1[3], selector); + c1[1] = hc_byte_perm_S (w1[3], w1[2], selector); + c1[0] = hc_byte_perm_S (w1[2], w1[1], selector); + c0[3] = hc_byte_perm_S (w1[1], w1[0], selector); + c0[2] = hc_byte_perm_S (w1[0], w0[3], selector); + c0[1] = hc_byte_perm_S (w0[3], w0[2], selector); + c0[0] = hc_byte_perm_S (w0[2], w0[1], selector); + w3[3] = hc_byte_perm_S (w0[1], w0[0], selector); + w3[2] = hc_byte_perm_S (w0[0], 0, selector); w3[1] = 0; w3[0] = 0; w2[3] = 0; @@ -33862,23 +39312,23 @@ DECLSPEC void switch_buffer_by_offset_carry_le_S (u32 *w0, u32 *w1, u32 *w2, u32 break; case 15: - c3[3] = hc_bytealign_S (w3[3], 0, offset); - c3[2] = hc_bytealign_S (w3[2], w3[3], offset); - c3[1] = hc_bytealign_S (w3[1], w3[2], offset); - c3[0] = hc_bytealign_S (w3[0], w3[1], offset); - c2[3] = hc_bytealign_S (w2[3], w3[0], offset); - c2[2] = hc_bytealign_S (w2[2], w2[3], offset); - c2[1] = hc_bytealign_S (w2[1], w2[2], offset); - c2[0] = hc_bytealign_S (w2[0], w2[1], offset); - c1[3] = hc_bytealign_S (w1[3], w2[0], offset); - c1[2] = hc_bytealign_S (w1[2], w1[3], offset); - c1[1] = hc_bytealign_S (w1[1], w1[2], offset); - c1[0] = hc_bytealign_S (w1[0], w1[1], offset); - c0[3] = hc_bytealign_S (w0[3], w1[0], offset); - c0[2] = hc_bytealign_S (w0[2], w0[3], offset); - c0[1] = hc_bytealign_S (w0[1], w0[2], offset); - c0[0] = hc_bytealign_S (w0[0], w0[1], offset); - w3[3] = hc_bytealign_S ( 0, w0[0], offset); + c3[3] = hc_byte_perm_S ( 0, w3[3], selector); + c3[2] = hc_byte_perm_S (w3[3], w3[2], selector); + c3[1] = hc_byte_perm_S (w3[2], w3[1], selector); + c3[0] = hc_byte_perm_S (w3[1], w3[0], selector); + c2[3] = hc_byte_perm_S (w3[0], w2[3], selector); + c2[2] = hc_byte_perm_S (w2[3], w2[2], selector); + c2[1] = hc_byte_perm_S (w2[2], w2[1], selector); + c2[0] = hc_byte_perm_S (w2[1], w2[0], selector); + c1[3] = 
hc_byte_perm_S (w2[0], w1[3], selector); + c1[2] = hc_byte_perm_S (w1[3], w1[2], selector); + c1[1] = hc_byte_perm_S (w1[2], w1[1], selector); + c1[0] = hc_byte_perm_S (w1[1], w1[0], selector); + c0[3] = hc_byte_perm_S (w1[0], w0[3], selector); + c0[2] = hc_byte_perm_S (w0[3], w0[2], selector); + c0[1] = hc_byte_perm_S (w0[2], w0[1], selector); + c0[0] = hc_byte_perm_S (w0[1], w0[0], selector); + w3[3] = hc_byte_perm_S (w0[0], 0, selector); w3[2] = 0; w3[1] = 0; w3[0] = 0; @@ -33898,13 +39348,32 @@ DECLSPEC void switch_buffer_by_offset_carry_le_S (u32 *w0, u32 *w1, u32 *w2, u32 break; } #endif +} - #ifdef IS_NV - // could be improved, too +DECLSPEC void switch_buffer_by_offset_8x4_le_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, u32 *w4, u32 *w5, u32 *w6, u32 *w7, const u32 offset) +{ + const int offset_switch = offset / 4; + + #if (defined IS_AMD && HAS_VPERM == 0) || defined IS_GENERIC switch (offset_switch) { - case 0: - c0[0] = hc_bytealign_S (w3[3], 0, offset); + case 0: + w7[3] = hc_bytealign_S (w7[2], w7[3], offset); + w7[2] = hc_bytealign_S (w7[1], w7[2], offset); + w7[1] = hc_bytealign_S (w7[0], w7[1], offset); + w7[0] = hc_bytealign_S (w6[3], w7[0], offset); + w6[3] = hc_bytealign_S (w6[2], w6[3], offset); + w6[2] = hc_bytealign_S (w6[1], w6[2], offset); + w6[1] = hc_bytealign_S (w6[0], w6[1], offset); + w6[0] = hc_bytealign_S (w5[3], w6[0], offset); + w5[3] = hc_bytealign_S (w5[2], w5[3], offset); + w5[2] = hc_bytealign_S (w5[1], w5[2], offset); + w5[1] = hc_bytealign_S (w5[0], w5[1], offset); + w5[0] = hc_bytealign_S (w4[3], w5[0], offset); + w4[3] = hc_bytealign_S (w4[2], w4[3], offset); + w4[2] = hc_bytealign_S (w4[1], w4[2], offset); + w4[1] = hc_bytealign_S (w4[0], w4[1], offset); + w4[0] = hc_bytealign_S (w3[3], w4[0], offset); w3[3] = hc_bytealign_S (w3[2], w3[3], offset); w3[2] = hc_bytealign_S (w3[1], w3[2], offset); w3[1] = hc_bytealign_S (w3[0], w3[1], offset); @@ -33924,9 +39393,23 @@ DECLSPEC void switch_buffer_by_offset_carry_le_S (u32 *w0, u32 
*w1, u32 *w2, u32 break; - case 1: - c0[1] = hc_bytealign_S (w3[3], 0, offset); - c0[0] = hc_bytealign_S (w3[2], w3[3], offset); + case 1: + w7[3] = hc_bytealign_S (w7[1], w7[2], offset); + w7[2] = hc_bytealign_S (w7[0], w7[1], offset); + w7[1] = hc_bytealign_S (w6[3], w7[0], offset); + w7[0] = hc_bytealign_S (w6[2], w6[3], offset); + w6[3] = hc_bytealign_S (w6[1], w6[2], offset); + w6[2] = hc_bytealign_S (w6[0], w6[1], offset); + w6[1] = hc_bytealign_S (w5[3], w6[0], offset); + w6[0] = hc_bytealign_S (w5[2], w5[3], offset); + w5[3] = hc_bytealign_S (w5[1], w5[2], offset); + w5[2] = hc_bytealign_S (w5[0], w5[1], offset); + w5[1] = hc_bytealign_S (w4[3], w5[0], offset); + w5[0] = hc_bytealign_S (w4[2], w4[3], offset); + w4[3] = hc_bytealign_S (w4[1], w4[2], offset); + w4[2] = hc_bytealign_S (w4[0], w4[1], offset); + w4[1] = hc_bytealign_S (w3[3], w4[0], offset); + w4[0] = hc_bytealign_S (w3[2], w3[3], offset); w3[3] = hc_bytealign_S (w3[1], w3[2], offset); w3[2] = hc_bytealign_S (w3[0], w3[1], offset); w3[1] = hc_bytealign_S (w2[3], w3[0], offset); @@ -33946,10 +39429,23 @@ DECLSPEC void switch_buffer_by_offset_carry_le_S (u32 *w0, u32 *w1, u32 *w2, u32 break; - case 2: - c0[2] = hc_bytealign_S (w3[3], 0, offset); - c0[1] = hc_bytealign_S (w3[2], w3[3], offset); - c0[0] = hc_bytealign_S (w3[1], w3[2], offset); + case 2: + w7[3] = hc_bytealign_S (w7[0], w7[1], offset); + w7[2] = hc_bytealign_S (w6[3], w7[0], offset); + w7[1] = hc_bytealign_S (w6[2], w6[3], offset); + w7[0] = hc_bytealign_S (w6[1], w6[2], offset); + w6[3] = hc_bytealign_S (w6[0], w6[1], offset); + w6[2] = hc_bytealign_S (w5[3], w6[0], offset); + w6[1] = hc_bytealign_S (w5[2], w5[3], offset); + w6[0] = hc_bytealign_S (w5[1], w5[2], offset); + w5[3] = hc_bytealign_S (w5[0], w5[1], offset); + w5[2] = hc_bytealign_S (w4[3], w5[0], offset); + w5[1] = hc_bytealign_S (w4[2], w4[3], offset); + w5[0] = hc_bytealign_S (w4[1], w4[2], offset); + w4[3] = hc_bytealign_S (w4[0], w4[1], offset); + w4[2] = 
hc_bytealign_S (w3[3], w4[0], offset); + w4[1] = hc_bytealign_S (w3[2], w3[3], offset); + w4[0] = hc_bytealign_S (w3[1], w3[2], offset); w3[3] = hc_bytealign_S (w3[0], w3[1], offset); w3[2] = hc_bytealign_S (w2[3], w3[0], offset); w3[1] = hc_bytealign_S (w2[2], w2[3], offset); @@ -33969,11 +39465,23 @@ DECLSPEC void switch_buffer_by_offset_carry_le_S (u32 *w0, u32 *w1, u32 *w2, u32 break; - case 3: - c0[3] = hc_bytealign_S (w3[3], 0, offset); - c0[2] = hc_bytealign_S (w3[2], w3[3], offset); - c0[1] = hc_bytealign_S (w3[1], w3[2], offset); - c0[0] = hc_bytealign_S (w3[0], w3[1], offset); + case 3: + w7[3] = hc_bytealign_S (w6[3], w7[0], offset); + w7[2] = hc_bytealign_S (w6[2], w6[3], offset); + w7[1] = hc_bytealign_S (w6[1], w6[2], offset); + w7[0] = hc_bytealign_S (w6[0], w6[1], offset); + w6[3] = hc_bytealign_S (w5[3], w6[0], offset); + w6[2] = hc_bytealign_S (w5[2], w5[3], offset); + w6[1] = hc_bytealign_S (w5[1], w5[2], offset); + w6[0] = hc_bytealign_S (w5[0], w5[1], offset); + w5[3] = hc_bytealign_S (w4[3], w5[0], offset); + w5[2] = hc_bytealign_S (w4[2], w4[3], offset); + w5[1] = hc_bytealign_S (w4[1], w4[2], offset); + w5[0] = hc_bytealign_S (w4[0], w4[1], offset); + w4[3] = hc_bytealign_S (w3[3], w4[0], offset); + w4[2] = hc_bytealign_S (w3[2], w3[3], offset); + w4[1] = hc_bytealign_S (w3[1], w3[2], offset); + w4[0] = hc_bytealign_S (w3[0], w3[1], offset); w3[3] = hc_bytealign_S (w2[3], w3[0], offset); w3[2] = hc_bytealign_S (w2[2], w2[3], offset); w3[1] = hc_bytealign_S (w2[1], w2[2], offset); @@ -33993,12 +39501,23 @@ DECLSPEC void switch_buffer_by_offset_carry_le_S (u32 *w0, u32 *w1, u32 *w2, u32 break; - case 4: - c1[0] = hc_bytealign_S (w3[3], 0, offset); - c0[3] = hc_bytealign_S (w3[2], w3[3], offset); - c0[2] = hc_bytealign_S (w3[1], w3[2], offset); - c0[1] = hc_bytealign_S (w3[0], w3[1], offset); - c0[0] = hc_bytealign_S (w2[3], w3[0], offset); + case 4: + w7[3] = hc_bytealign_S (w6[2], w6[3], offset); + w7[2] = hc_bytealign_S (w6[1], w6[2], 
offset); + w7[1] = hc_bytealign_S (w6[0], w6[1], offset); + w7[0] = hc_bytealign_S (w5[3], w6[0], offset); + w6[3] = hc_bytealign_S (w5[2], w5[3], offset); + w6[2] = hc_bytealign_S (w5[1], w5[2], offset); + w6[1] = hc_bytealign_S (w5[0], w5[1], offset); + w6[0] = hc_bytealign_S (w4[3], w5[0], offset); + w5[3] = hc_bytealign_S (w4[2], w4[3], offset); + w5[2] = hc_bytealign_S (w4[1], w4[2], offset); + w5[1] = hc_bytealign_S (w4[0], w4[1], offset); + w5[0] = hc_bytealign_S (w3[3], w4[0], offset); + w4[3] = hc_bytealign_S (w3[2], w3[3], offset); + w4[2] = hc_bytealign_S (w3[1], w3[2], offset); + w4[1] = hc_bytealign_S (w3[0], w3[1], offset); + w4[0] = hc_bytealign_S (w2[3], w3[0], offset); w3[3] = hc_bytealign_S (w2[2], w2[3], offset); w3[2] = hc_bytealign_S (w2[1], w2[2], offset); w3[1] = hc_bytealign_S (w2[0], w2[1], offset); @@ -34018,13 +39537,23 @@ DECLSPEC void switch_buffer_by_offset_carry_le_S (u32 *w0, u32 *w1, u32 *w2, u32 break; - case 5: - c1[1] = hc_bytealign_S (w3[3], 0, offset); - c1[0] = hc_bytealign_S (w3[2], w3[3], offset); - c0[3] = hc_bytealign_S (w3[1], w3[2], offset); - c0[2] = hc_bytealign_S (w3[0], w3[1], offset); - c0[1] = hc_bytealign_S (w2[3], w3[0], offset); - c0[0] = hc_bytealign_S (w2[2], w2[3], offset); + case 5: + w7[3] = hc_bytealign_S (w6[1], w6[2], offset); + w7[2] = hc_bytealign_S (w6[0], w6[1], offset); + w7[1] = hc_bytealign_S (w5[3], w6[0], offset); + w7[0] = hc_bytealign_S (w5[2], w5[3], offset); + w6[3] = hc_bytealign_S (w5[1], w5[2], offset); + w6[2] = hc_bytealign_S (w5[0], w5[1], offset); + w6[1] = hc_bytealign_S (w4[3], w5[0], offset); + w6[0] = hc_bytealign_S (w4[2], w4[3], offset); + w5[3] = hc_bytealign_S (w4[1], w4[2], offset); + w5[2] = hc_bytealign_S (w4[0], w4[1], offset); + w5[1] = hc_bytealign_S (w3[3], w4[0], offset); + w5[0] = hc_bytealign_S (w3[2], w3[3], offset); + w4[3] = hc_bytealign_S (w3[1], w3[2], offset); + w4[2] = hc_bytealign_S (w3[0], w3[1], offset); + w4[1] = hc_bytealign_S (w2[3], w3[0], offset); + 
w4[0] = hc_bytealign_S (w2[2], w2[3], offset); w3[3] = hc_bytealign_S (w2[1], w2[2], offset); w3[2] = hc_bytealign_S (w2[0], w2[1], offset); w3[1] = hc_bytealign_S (w1[3], w2[0], offset); @@ -34044,14 +39573,23 @@ DECLSPEC void switch_buffer_by_offset_carry_le_S (u32 *w0, u32 *w1, u32 *w2, u32 break; - case 6: - c1[2] = hc_bytealign_S (w3[3], 0, offset); - c1[1] = hc_bytealign_S (w3[2], w3[3], offset); - c1[0] = hc_bytealign_S (w3[1], w3[2], offset); - c0[3] = hc_bytealign_S (w3[0], w3[1], offset); - c0[2] = hc_bytealign_S (w2[3], w3[0], offset); - c0[1] = hc_bytealign_S (w2[2], w2[3], offset); - c0[0] = hc_bytealign_S (w2[1], w2[2], offset); + case 6: + w7[3] = hc_bytealign_S (w6[0], w6[1], offset); + w7[2] = hc_bytealign_S (w5[3], w6[0], offset); + w7[1] = hc_bytealign_S (w5[2], w5[3], offset); + w7[0] = hc_bytealign_S (w5[1], w5[2], offset); + w6[3] = hc_bytealign_S (w5[0], w5[1], offset); + w6[2] = hc_bytealign_S (w4[3], w5[0], offset); + w6[1] = hc_bytealign_S (w4[2], w4[3], offset); + w6[0] = hc_bytealign_S (w4[1], w4[2], offset); + w5[3] = hc_bytealign_S (w4[0], w4[1], offset); + w5[2] = hc_bytealign_S (w3[3], w4[0], offset); + w5[1] = hc_bytealign_S (w3[2], w3[3], offset); + w5[0] = hc_bytealign_S (w3[1], w3[2], offset); + w4[3] = hc_bytealign_S (w3[0], w3[1], offset); + w4[2] = hc_bytealign_S (w2[3], w3[0], offset); + w4[1] = hc_bytealign_S (w2[2], w2[3], offset); + w4[0] = hc_bytealign_S (w2[1], w2[2], offset); w3[3] = hc_bytealign_S (w2[0], w2[1], offset); w3[2] = hc_bytealign_S (w1[3], w2[0], offset); w3[1] = hc_bytealign_S (w1[2], w1[3], offset); @@ -34071,15 +39609,23 @@ DECLSPEC void switch_buffer_by_offset_carry_le_S (u32 *w0, u32 *w1, u32 *w2, u32 break; - case 7: - c1[3] = hc_bytealign_S (w3[3], 0, offset); - c1[2] = hc_bytealign_S (w3[2], w3[3], offset); - c1[1] = hc_bytealign_S (w3[1], w3[2], offset); - c1[0] = hc_bytealign_S (w3[0], w3[1], offset); - c0[3] = hc_bytealign_S (w2[3], w3[0], offset); - c0[2] = hc_bytealign_S (w2[2], w2[3], offset); 
- c0[1] = hc_bytealign_S (w2[1], w2[2], offset); - c0[0] = hc_bytealign_S (w2[0], w2[1], offset); + case 7: + w7[3] = hc_bytealign_S (w5[3], w6[0], offset); + w7[2] = hc_bytealign_S (w5[2], w5[3], offset); + w7[1] = hc_bytealign_S (w5[1], w5[2], offset); + w7[0] = hc_bytealign_S (w5[0], w5[1], offset); + w6[3] = hc_bytealign_S (w4[3], w5[0], offset); + w6[2] = hc_bytealign_S (w4[2], w4[3], offset); + w6[1] = hc_bytealign_S (w4[1], w4[2], offset); + w6[0] = hc_bytealign_S (w4[0], w4[1], offset); + w5[3] = hc_bytealign_S (w3[3], w4[0], offset); + w5[2] = hc_bytealign_S (w3[2], w3[3], offset); + w5[1] = hc_bytealign_S (w3[1], w3[2], offset); + w5[0] = hc_bytealign_S (w3[0], w3[1], offset); + w4[3] = hc_bytealign_S (w2[3], w3[0], offset); + w4[2] = hc_bytealign_S (w2[2], w2[3], offset); + w4[1] = hc_bytealign_S (w2[1], w2[2], offset); + w4[0] = hc_bytealign_S (w2[0], w2[1], offset); w3[3] = hc_bytealign_S (w1[3], w2[0], offset); w3[2] = hc_bytealign_S (w1[2], w1[3], offset); w3[1] = hc_bytealign_S (w1[1], w1[2], offset); @@ -34099,184 +39645,31 @@ DECLSPEC void switch_buffer_by_offset_carry_le_S (u32 *w0, u32 *w1, u32 *w2, u32 break; - case 8: - c2[0] = hc_bytealign_S (w3[3], 0, offset); - c1[3] = hc_bytealign_S (w3[2], w3[3], offset); - c1[2] = hc_bytealign_S (w3[1], w3[2], offset); - c1[1] = hc_bytealign_S (w3[0], w3[1], offset); - c1[0] = hc_bytealign_S (w2[3], w3[0], offset); - c0[3] = hc_bytealign_S (w2[2], w2[3], offset); - c0[2] = hc_bytealign_S (w2[1], w2[2], offset); - c0[1] = hc_bytealign_S (w2[0], w2[1], offset); - c0[0] = hc_bytealign_S (w1[3], w2[0], offset); + case 8: + w7[3] = hc_bytealign_S (w5[2], w5[3], offset); + w7[2] = hc_bytealign_S (w5[1], w5[2], offset); + w7[1] = hc_bytealign_S (w5[0], w5[1], offset); + w7[0] = hc_bytealign_S (w4[3], w5[0], offset); + w6[3] = hc_bytealign_S (w4[2], w4[3], offset); + w6[2] = hc_bytealign_S (w4[1], w4[2], offset); + w6[1] = hc_bytealign_S (w4[0], w4[1], offset); + w6[0] = hc_bytealign_S (w3[3], w4[0], offset); + 
w5[3] = hc_bytealign_S (w3[2], w3[3], offset); + w5[2] = hc_bytealign_S (w3[1], w3[2], offset); + w5[1] = hc_bytealign_S (w3[0], w3[1], offset); + w5[0] = hc_bytealign_S (w2[3], w3[0], offset); + w4[3] = hc_bytealign_S (w2[2], w2[3], offset); + w4[2] = hc_bytealign_S (w2[1], w2[2], offset); + w4[1] = hc_bytealign_S (w2[0], w2[1], offset); + w4[0] = hc_bytealign_S (w1[3], w2[0], offset); w3[3] = hc_bytealign_S (w1[2], w1[3], offset); w3[2] = hc_bytealign_S (w1[1], w1[2], offset); w3[1] = hc_bytealign_S (w1[0], w1[1], offset); w3[0] = hc_bytealign_S (w0[3], w1[0], offset); - w2[3] = hc_bytealign_S (w0[2], w0[3], offset); - w2[2] = hc_bytealign_S (w0[1], w0[2], offset); - w2[1] = hc_bytealign_S (w0[0], w0[1], offset); - w2[0] = hc_bytealign_S ( 0, w0[0], offset); - w1[3] = 0; - w1[2] = 0; - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; - - break; - - case 9: - c2[1] = hc_bytealign_S (w3[3], 0, offset); - c2[0] = hc_bytealign_S (w3[2], w3[3], offset); - c1[3] = hc_bytealign_S (w3[1], w3[2], offset); - c1[2] = hc_bytealign_S (w3[0], w3[1], offset); - c1[1] = hc_bytealign_S (w2[3], w3[0], offset); - c1[0] = hc_bytealign_S (w2[2], w2[3], offset); - c0[3] = hc_bytealign_S (w2[1], w2[2], offset); - c0[2] = hc_bytealign_S (w2[0], w2[1], offset); - c0[1] = hc_bytealign_S (w1[3], w2[0], offset); - c0[0] = hc_bytealign_S (w1[2], w1[3], offset); - w3[3] = hc_bytealign_S (w1[1], w1[2], offset); - w3[2] = hc_bytealign_S (w1[0], w1[1], offset); - w3[1] = hc_bytealign_S (w0[3], w1[0], offset); - w3[0] = hc_bytealign_S (w0[2], w0[3], offset); - w2[3] = hc_bytealign_S (w0[1], w0[2], offset); - w2[2] = hc_bytealign_S (w0[0], w0[1], offset); - w2[1] = hc_bytealign_S ( 0, w0[0], offset); - w2[0] = 0; - w1[3] = 0; - w1[2] = 0; - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; - - break; - - case 10: - c2[2] = hc_bytealign_S (w3[3], 0, offset); - c2[1] = hc_bytealign_S (w3[2], w3[3], offset); - c2[0] = hc_bytealign_S (w3[1], w3[2], 
offset); - c1[3] = hc_bytealign_S (w3[0], w3[1], offset); - c1[2] = hc_bytealign_S (w2[3], w3[0], offset); - c1[1] = hc_bytealign_S (w2[2], w2[3], offset); - c1[0] = hc_bytealign_S (w2[1], w2[2], offset); - c0[3] = hc_bytealign_S (w2[0], w2[1], offset); - c0[2] = hc_bytealign_S (w1[3], w2[0], offset); - c0[1] = hc_bytealign_S (w1[2], w1[3], offset); - c0[0] = hc_bytealign_S (w1[1], w1[2], offset); - w3[3] = hc_bytealign_S (w1[0], w1[1], offset); - w3[2] = hc_bytealign_S (w0[3], w1[0], offset); - w3[1] = hc_bytealign_S (w0[2], w0[3], offset); - w3[0] = hc_bytealign_S (w0[1], w0[2], offset); - w2[3] = hc_bytealign_S (w0[0], w0[1], offset); - w2[2] = hc_bytealign_S ( 0, w0[0], offset); - w2[1] = 0; - w2[0] = 0; - w1[3] = 0; - w1[2] = 0; - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; - - break; - - case 11: - c2[3] = hc_bytealign_S (w3[3], 0, offset); - c2[2] = hc_bytealign_S (w3[2], w3[3], offset); - c2[1] = hc_bytealign_S (w3[1], w3[2], offset); - c2[0] = hc_bytealign_S (w3[0], w3[1], offset); - c1[3] = hc_bytealign_S (w2[3], w3[0], offset); - c1[2] = hc_bytealign_S (w2[2], w2[3], offset); - c1[1] = hc_bytealign_S (w2[1], w2[2], offset); - c1[0] = hc_bytealign_S (w2[0], w2[1], offset); - c0[3] = hc_bytealign_S (w1[3], w2[0], offset); - c0[2] = hc_bytealign_S (w1[2], w1[3], offset); - c0[1] = hc_bytealign_S (w1[1], w1[2], offset); - c0[0] = hc_bytealign_S (w1[0], w1[1], offset); - w3[3] = hc_bytealign_S (w0[3], w1[0], offset); - w3[2] = hc_bytealign_S (w0[2], w0[3], offset); - w3[1] = hc_bytealign_S (w0[1], w0[2], offset); - w3[0] = hc_bytealign_S (w0[0], w0[1], offset); - w2[3] = hc_bytealign_S ( 0, w0[0], offset); - w2[2] = 0; - w2[1] = 0; - w2[0] = 0; - w1[3] = 0; - w1[2] = 0; - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; - - break; - - case 12: - c3[0] = hc_bytealign_S (w3[3], 0, offset); - c2[3] = hc_bytealign_S (w3[2], w3[3], offset); - c2[2] = hc_bytealign_S (w3[1], w3[2], offset); - c2[1] = 
hc_bytealign_S (w3[0], w3[1], offset); - c2[0] = hc_bytealign_S (w2[3], w3[0], offset); - c1[3] = hc_bytealign_S (w2[2], w2[3], offset); - c1[2] = hc_bytealign_S (w2[1], w2[2], offset); - c1[1] = hc_bytealign_S (w2[0], w2[1], offset); - c1[0] = hc_bytealign_S (w1[3], w2[0], offset); - c0[3] = hc_bytealign_S (w1[2], w1[3], offset); - c0[2] = hc_bytealign_S (w1[1], w1[2], offset); - c0[1] = hc_bytealign_S (w1[0], w1[1], offset); - c0[0] = hc_bytealign_S (w0[3], w1[0], offset); - w3[3] = hc_bytealign_S (w0[2], w0[3], offset); - w3[2] = hc_bytealign_S (w0[1], w0[2], offset); - w3[1] = hc_bytealign_S (w0[0], w0[1], offset); - w3[0] = hc_bytealign_S ( 0, w0[0], offset); - w2[3] = 0; - w2[2] = 0; - w2[1] = 0; - w2[0] = 0; - w1[3] = 0; - w1[2] = 0; - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; - - break; - - case 13: - c3[1] = hc_bytealign_S (w3[3], 0, offset); - c3[0] = hc_bytealign_S (w3[2], w3[3], offset); - c2[3] = hc_bytealign_S (w3[1], w3[2], offset); - c2[2] = hc_bytealign_S (w3[0], w3[1], offset); - c2[1] = hc_bytealign_S (w2[3], w3[0], offset); - c2[0] = hc_bytealign_S (w2[2], w2[3], offset); - c1[3] = hc_bytealign_S (w2[1], w2[2], offset); - c1[2] = hc_bytealign_S (w2[0], w2[1], offset); - c1[1] = hc_bytealign_S (w1[3], w2[0], offset); - c1[0] = hc_bytealign_S (w1[2], w1[3], offset); - c0[3] = hc_bytealign_S (w1[1], w1[2], offset); - c0[2] = hc_bytealign_S (w1[0], w1[1], offset); - c0[1] = hc_bytealign_S (w0[3], w1[0], offset); - c0[0] = hc_bytealign_S (w0[2], w0[3], offset); - w3[3] = hc_bytealign_S (w0[1], w0[2], offset); - w3[2] = hc_bytealign_S (w0[0], w0[1], offset); - w3[1] = hc_bytealign_S ( 0, w0[0], offset); - w3[0] = 0; - w2[3] = 0; - w2[2] = 0; - w2[1] = 0; - w2[0] = 0; + w2[3] = hc_bytealign_S (w0[2], w0[3], offset); + w2[2] = hc_bytealign_S (w0[1], w0[2], offset); + w2[1] = hc_bytealign_S (w0[0], w0[1], offset); + w2[0] = hc_bytealign_S ( 0, w0[0], offset); w1[3] = 0; w1[2] = 0; w1[1] = 0; @@ -34288,29 +39681,30 @@ 
DECLSPEC void switch_buffer_by_offset_carry_le_S (u32 *w0, u32 *w1, u32 *w2, u32 break; - case 14: - c3[2] = hc_bytealign_S (w3[3], 0, offset); - c3[1] = hc_bytealign_S (w3[2], w3[3], offset); - c3[0] = hc_bytealign_S (w3[1], w3[2], offset); - c2[3] = hc_bytealign_S (w3[0], w3[1], offset); - c2[2] = hc_bytealign_S (w2[3], w3[0], offset); - c2[1] = hc_bytealign_S (w2[2], w2[3], offset); - c2[0] = hc_bytealign_S (w2[1], w2[2], offset); - c1[3] = hc_bytealign_S (w2[0], w2[1], offset); - c1[2] = hc_bytealign_S (w1[3], w2[0], offset); - c1[1] = hc_bytealign_S (w1[2], w1[3], offset); - c1[0] = hc_bytealign_S (w1[1], w1[2], offset); - c0[3] = hc_bytealign_S (w1[0], w1[1], offset); - c0[2] = hc_bytealign_S (w0[3], w1[0], offset); - c0[1] = hc_bytealign_S (w0[2], w0[3], offset); - c0[0] = hc_bytealign_S (w0[1], w0[2], offset); - w3[3] = hc_bytealign_S (w0[0], w0[1], offset); - w3[2] = hc_bytealign_S ( 0, w0[0], offset); - w3[1] = 0; - w3[0] = 0; - w2[3] = 0; - w2[2] = 0; - w2[1] = 0; + case 9: + w7[3] = hc_bytealign_S (w5[1], w5[2], offset); + w7[2] = hc_bytealign_S (w5[0], w5[1], offset); + w7[1] = hc_bytealign_S (w4[3], w5[0], offset); + w7[0] = hc_bytealign_S (w4[2], w4[3], offset); + w6[3] = hc_bytealign_S (w4[1], w4[2], offset); + w6[2] = hc_bytealign_S (w4[0], w4[1], offset); + w6[1] = hc_bytealign_S (w3[3], w4[0], offset); + w6[0] = hc_bytealign_S (w3[2], w3[3], offset); + w5[3] = hc_bytealign_S (w3[1], w3[2], offset); + w5[2] = hc_bytealign_S (w3[0], w3[1], offset); + w5[1] = hc_bytealign_S (w2[3], w3[0], offset); + w5[0] = hc_bytealign_S (w2[2], w2[3], offset); + w4[3] = hc_bytealign_S (w2[1], w2[2], offset); + w4[2] = hc_bytealign_S (w2[0], w2[1], offset); + w4[1] = hc_bytealign_S (w1[3], w2[0], offset); + w4[0] = hc_bytealign_S (w1[2], w1[3], offset); + w3[3] = hc_bytealign_S (w1[1], w1[2], offset); + w3[2] = hc_bytealign_S (w1[0], w1[1], offset); + w3[1] = hc_bytealign_S (w0[3], w1[0], offset); + w3[0] = hc_bytealign_S (w0[2], w0[3], offset); + w2[3] = 
hc_bytealign_S (w0[1], w0[2], offset); + w2[2] = hc_bytealign_S (w0[0], w0[1], offset); + w2[1] = hc_bytealign_S ( 0, w0[0], offset); w2[0] = 0; w1[3] = 0; w1[2] = 0; @@ -34323,29 +39717,29 @@ DECLSPEC void switch_buffer_by_offset_carry_le_S (u32 *w0, u32 *w1, u32 *w2, u32 break; - case 15: - c3[3] = hc_bytealign_S (w3[3], 0, offset); - c3[2] = hc_bytealign_S (w3[2], w3[3], offset); - c3[1] = hc_bytealign_S (w3[1], w3[2], offset); - c3[0] = hc_bytealign_S (w3[0], w3[1], offset); - c2[3] = hc_bytealign_S (w2[3], w3[0], offset); - c2[2] = hc_bytealign_S (w2[2], w2[3], offset); - c2[1] = hc_bytealign_S (w2[1], w2[2], offset); - c2[0] = hc_bytealign_S (w2[0], w2[1], offset); - c1[3] = hc_bytealign_S (w1[3], w2[0], offset); - c1[2] = hc_bytealign_S (w1[2], w1[3], offset); - c1[1] = hc_bytealign_S (w1[1], w1[2], offset); - c1[0] = hc_bytealign_S (w1[0], w1[1], offset); - c0[3] = hc_bytealign_S (w0[3], w1[0], offset); - c0[2] = hc_bytealign_S (w0[2], w0[3], offset); - c0[1] = hc_bytealign_S (w0[1], w0[2], offset); - c0[0] = hc_bytealign_S (w0[0], w0[1], offset); - w3[3] = hc_bytealign_S ( 0, w0[0], offset); - w3[2] = 0; - w3[1] = 0; - w3[0] = 0; - w2[3] = 0; - w2[2] = 0; + case 10: + w7[3] = hc_bytealign_S (w5[0], w5[1], offset); + w7[2] = hc_bytealign_S (w4[3], w5[0], offset); + w7[1] = hc_bytealign_S (w4[2], w4[3], offset); + w7[0] = hc_bytealign_S (w4[1], w4[2], offset); + w6[3] = hc_bytealign_S (w4[0], w4[1], offset); + w6[2] = hc_bytealign_S (w3[3], w4[0], offset); + w6[1] = hc_bytealign_S (w3[2], w3[3], offset); + w6[0] = hc_bytealign_S (w3[1], w3[2], offset); + w5[3] = hc_bytealign_S (w3[0], w3[1], offset); + w5[2] = hc_bytealign_S (w2[3], w3[0], offset); + w5[1] = hc_bytealign_S (w2[2], w2[3], offset); + w5[0] = hc_bytealign_S (w2[1], w2[2], offset); + w4[3] = hc_bytealign_S (w2[0], w2[1], offset); + w4[2] = hc_bytealign_S (w1[3], w2[0], offset); + w4[1] = hc_bytealign_S (w1[2], w1[3], offset); + w4[0] = hc_bytealign_S (w1[1], w1[2], offset); + w3[3] = 
hc_bytealign_S (w1[0], w1[1], offset); + w3[2] = hc_bytealign_S (w0[3], w1[0], offset); + w3[1] = hc_bytealign_S (w0[2], w0[3], offset); + w3[0] = hc_bytealign_S (w0[1], w0[2], offset); + w2[3] = hc_bytealign_S (w0[0], w0[1], offset); + w2[2] = hc_bytealign_S ( 0, w0[0], offset); w2[1] = 0; w2[0] = 0; w1[3] = 0; @@ -34357,111 +39751,37 @@ DECLSPEC void switch_buffer_by_offset_carry_le_S (u32 *w0, u32 *w1, u32 *w2, u32 w0[1] = 0; w0[0] = 0; - break; - } - #endif -} - -DECLSPEC void switch_buffer_by_offset_be_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, const u32 offset) -{ - const int offset_switch = offset / 4; - - #if (defined IS_AMD && HAS_VPERM == 0) || defined IS_GENERIC - switch (offset_switch) - { - case 0: - w3[3] = hc_bytealign_be_S (w3[2], w3[3], offset); - w3[2] = hc_bytealign_be_S (w3[1], w3[2], offset); - w3[1] = hc_bytealign_be_S (w3[0], w3[1], offset); - w3[0] = hc_bytealign_be_S (w2[3], w3[0], offset); - w2[3] = hc_bytealign_be_S (w2[2], w2[3], offset); - w2[2] = hc_bytealign_be_S (w2[1], w2[2], offset); - w2[1] = hc_bytealign_be_S (w2[0], w2[1], offset); - w2[0] = hc_bytealign_be_S (w1[3], w2[0], offset); - w1[3] = hc_bytealign_be_S (w1[2], w1[3], offset); - w1[2] = hc_bytealign_be_S (w1[1], w1[2], offset); - w1[1] = hc_bytealign_be_S (w1[0], w1[1], offset); - w1[0] = hc_bytealign_be_S (w0[3], w1[0], offset); - w0[3] = hc_bytealign_be_S (w0[2], w0[3], offset); - w0[2] = hc_bytealign_be_S (w0[1], w0[2], offset); - w0[1] = hc_bytealign_be_S (w0[0], w0[1], offset); - w0[0] = hc_bytealign_be_S ( 0, w0[0], offset); - - break; - - case 1: - w3[3] = hc_bytealign_be_S (w3[1], w3[2], offset); - w3[2] = hc_bytealign_be_S (w3[0], w3[1], offset); - w3[1] = hc_bytealign_be_S (w2[3], w3[0], offset); - w3[0] = hc_bytealign_be_S (w2[2], w2[3], offset); - w2[3] = hc_bytealign_be_S (w2[1], w2[2], offset); - w2[2] = hc_bytealign_be_S (w2[0], w2[1], offset); - w2[1] = hc_bytealign_be_S (w1[3], w2[0], offset); - w2[0] = hc_bytealign_be_S (w1[2], w1[3], offset); - w1[3] = 
hc_bytealign_be_S (w1[1], w1[2], offset); - w1[2] = hc_bytealign_be_S (w1[0], w1[1], offset); - w1[1] = hc_bytealign_be_S (w0[3], w1[0], offset); - w1[0] = hc_bytealign_be_S (w0[2], w0[3], offset); - w0[3] = hc_bytealign_be_S (w0[1], w0[2], offset); - w0[2] = hc_bytealign_be_S (w0[0], w0[1], offset); - w0[1] = hc_bytealign_be_S ( 0, w0[0], offset); - w0[0] = 0; - - break; - - case 2: - w3[3] = hc_bytealign_be_S (w3[0], w3[1], offset); - w3[2] = hc_bytealign_be_S (w2[3], w3[0], offset); - w3[1] = hc_bytealign_be_S (w2[2], w2[3], offset); - w3[0] = hc_bytealign_be_S (w2[1], w2[2], offset); - w2[3] = hc_bytealign_be_S (w2[0], w2[1], offset); - w2[2] = hc_bytealign_be_S (w1[3], w2[0], offset); - w2[1] = hc_bytealign_be_S (w1[2], w1[3], offset); - w2[0] = hc_bytealign_be_S (w1[1], w1[2], offset); - w1[3] = hc_bytealign_be_S (w1[0], w1[1], offset); - w1[2] = hc_bytealign_be_S (w0[3], w1[0], offset); - w1[1] = hc_bytealign_be_S (w0[2], w0[3], offset); - w1[0] = hc_bytealign_be_S (w0[1], w0[2], offset); - w0[3] = hc_bytealign_be_S (w0[0], w0[1], offset); - w0[2] = hc_bytealign_be_S ( 0, w0[0], offset); - w0[1] = 0; - w0[0] = 0; - - break; - - case 3: - w3[3] = hc_bytealign_be_S (w2[3], w3[0], offset); - w3[2] = hc_bytealign_be_S (w2[2], w2[3], offset); - w3[1] = hc_bytealign_be_S (w2[1], w2[2], offset); - w3[0] = hc_bytealign_be_S (w2[0], w2[1], offset); - w2[3] = hc_bytealign_be_S (w1[3], w2[0], offset); - w2[2] = hc_bytealign_be_S (w1[2], w1[3], offset); - w2[1] = hc_bytealign_be_S (w1[1], w1[2], offset); - w2[0] = hc_bytealign_be_S (w1[0], w1[1], offset); - w1[3] = hc_bytealign_be_S (w0[3], w1[0], offset); - w1[2] = hc_bytealign_be_S (w0[2], w0[3], offset); - w1[1] = hc_bytealign_be_S (w0[1], w0[2], offset); - w1[0] = hc_bytealign_be_S (w0[0], w0[1], offset); - w0[3] = hc_bytealign_be_S ( 0, w0[0], offset); - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; - break; - case 4: - w3[3] = hc_bytealign_be_S (w2[2], w2[3], offset); - w3[2] = hc_bytealign_be_S (w2[1], w2[2], offset); - 
w3[1] = hc_bytealign_be_S (w2[0], w2[1], offset); - w3[0] = hc_bytealign_be_S (w1[3], w2[0], offset); - w2[3] = hc_bytealign_be_S (w1[2], w1[3], offset); - w2[2] = hc_bytealign_be_S (w1[1], w1[2], offset); - w2[1] = hc_bytealign_be_S (w1[0], w1[1], offset); - w2[0] = hc_bytealign_be_S (w0[3], w1[0], offset); - w1[3] = hc_bytealign_be_S (w0[2], w0[3], offset); - w1[2] = hc_bytealign_be_S (w0[1], w0[2], offset); - w1[1] = hc_bytealign_be_S (w0[0], w0[1], offset); - w1[0] = hc_bytealign_be_S ( 0, w0[0], offset); + case 11: + w7[3] = hc_bytealign_S (w4[3], w5[0], offset); + w7[2] = hc_bytealign_S (w4[2], w4[3], offset); + w7[1] = hc_bytealign_S (w4[1], w4[2], offset); + w7[0] = hc_bytealign_S (w4[0], w4[1], offset); + w6[3] = hc_bytealign_S (w3[3], w4[0], offset); + w6[2] = hc_bytealign_S (w3[2], w3[3], offset); + w6[1] = hc_bytealign_S (w3[1], w3[2], offset); + w6[0] = hc_bytealign_S (w3[0], w3[1], offset); + w5[3] = hc_bytealign_S (w2[3], w3[0], offset); + w5[2] = hc_bytealign_S (w2[2], w2[3], offset); + w5[1] = hc_bytealign_S (w2[1], w2[2], offset); + w5[0] = hc_bytealign_S (w2[0], w2[1], offset); + w4[3] = hc_bytealign_S (w1[3], w2[0], offset); + w4[2] = hc_bytealign_S (w1[2], w1[3], offset); + w4[1] = hc_bytealign_S (w1[1], w1[2], offset); + w4[0] = hc_bytealign_S (w1[0], w1[1], offset); + w3[3] = hc_bytealign_S (w0[3], w1[0], offset); + w3[2] = hc_bytealign_S (w0[2], w0[3], offset); + w3[1] = hc_bytealign_S (w0[1], w0[2], offset); + w3[0] = hc_bytealign_S (w0[0], w0[1], offset); + w2[3] = hc_bytealign_S ( 0, w0[0], offset); + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; @@ -34469,18 +39789,34 @@ DECLSPEC void switch_buffer_by_offset_be_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, break; - case 5: - w3[3] = hc_bytealign_be_S (w2[1], w2[2], offset); - w3[2] = hc_bytealign_be_S (w2[0], w2[1], offset); - w3[1] = hc_bytealign_be_S (w1[3], w2[0], offset); - w3[0] = hc_bytealign_be_S (w1[2], w1[3], 
offset); - w2[3] = hc_bytealign_be_S (w1[1], w1[2], offset); - w2[2] = hc_bytealign_be_S (w1[0], w1[1], offset); - w2[1] = hc_bytealign_be_S (w0[3], w1[0], offset); - w2[0] = hc_bytealign_be_S (w0[2], w0[3], offset); - w1[3] = hc_bytealign_be_S (w0[1], w0[2], offset); - w1[2] = hc_bytealign_be_S (w0[0], w0[1], offset); - w1[1] = hc_bytealign_be_S ( 0, w0[0], offset); + case 12: + w7[3] = hc_bytealign_S (w4[2], w4[3], offset); + w7[2] = hc_bytealign_S (w4[1], w4[2], offset); + w7[1] = hc_bytealign_S (w4[0], w4[1], offset); + w7[0] = hc_bytealign_S (w3[3], w4[0], offset); + w6[3] = hc_bytealign_S (w3[2], w3[3], offset); + w6[2] = hc_bytealign_S (w3[1], w3[2], offset); + w6[1] = hc_bytealign_S (w3[0], w3[1], offset); + w6[0] = hc_bytealign_S (w2[3], w3[0], offset); + w5[3] = hc_bytealign_S (w2[2], w2[3], offset); + w5[2] = hc_bytealign_S (w2[1], w2[2], offset); + w5[1] = hc_bytealign_S (w2[0], w2[1], offset); + w5[0] = hc_bytealign_S (w1[3], w2[0], offset); + w4[3] = hc_bytealign_S (w1[2], w1[3], offset); + w4[2] = hc_bytealign_S (w1[1], w1[2], offset); + w4[1] = hc_bytealign_S (w1[0], w1[1], offset); + w4[0] = hc_bytealign_S (w0[3], w1[0], offset); + w3[3] = hc_bytealign_S (w0[2], w0[3], offset); + w3[2] = hc_bytealign_S (w0[1], w0[2], offset); + w3[1] = hc_bytealign_S (w0[0], w0[1], offset); + w3[0] = hc_bytealign_S ( 0, w0[0], offset); + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; @@ -34489,17 +39825,33 @@ DECLSPEC void switch_buffer_by_offset_be_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, break; - case 6: - w3[3] = hc_bytealign_be_S (w2[0], w2[1], offset); - w3[2] = hc_bytealign_be_S (w1[3], w2[0], offset); - w3[1] = hc_bytealign_be_S (w1[2], w1[3], offset); - w3[0] = hc_bytealign_be_S (w1[1], w1[2], offset); - w2[3] = hc_bytealign_be_S (w1[0], w1[1], offset); - w2[2] = hc_bytealign_be_S (w0[3], w1[0], offset); - w2[1] = hc_bytealign_be_S (w0[2], w0[3], offset); - w2[0] = hc_bytealign_be_S 
(w0[1], w0[2], offset); - w1[3] = hc_bytealign_be_S (w0[0], w0[1], offset); - w1[2] = hc_bytealign_be_S ( 0, w0[0], offset); + case 13: + w7[3] = hc_bytealign_S (w4[1], w4[2], offset); + w7[2] = hc_bytealign_S (w4[0], w4[1], offset); + w7[1] = hc_bytealign_S (w3[3], w4[0], offset); + w7[0] = hc_bytealign_S (w3[2], w3[3], offset); + w6[3] = hc_bytealign_S (w3[1], w3[2], offset); + w6[2] = hc_bytealign_S (w3[0], w3[1], offset); + w6[1] = hc_bytealign_S (w2[3], w3[0], offset); + w6[0] = hc_bytealign_S (w2[2], w2[3], offset); + w5[3] = hc_bytealign_S (w2[1], w2[2], offset); + w5[2] = hc_bytealign_S (w2[0], w2[1], offset); + w5[1] = hc_bytealign_S (w1[3], w2[0], offset); + w5[0] = hc_bytealign_S (w1[2], w1[3], offset); + w4[3] = hc_bytealign_S (w1[1], w1[2], offset); + w4[2] = hc_bytealign_S (w1[0], w1[1], offset); + w4[1] = hc_bytealign_S (w0[3], w1[0], offset); + w4[0] = hc_bytealign_S (w0[2], w0[3], offset); + w3[3] = hc_bytealign_S (w0[1], w0[2], offset); + w3[2] = hc_bytealign_S (w0[0], w0[1], offset); + w3[1] = hc_bytealign_S ( 0, w0[0], offset); + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; @@ -34509,16 +39861,32 @@ DECLSPEC void switch_buffer_by_offset_be_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, break; - case 7: - w3[3] = hc_bytealign_be_S (w1[3], w2[0], offset); - w3[2] = hc_bytealign_be_S (w1[2], w1[3], offset); - w3[1] = hc_bytealign_be_S (w1[1], w1[2], offset); - w3[0] = hc_bytealign_be_S (w1[0], w1[1], offset); - w2[3] = hc_bytealign_be_S (w0[3], w1[0], offset); - w2[2] = hc_bytealign_be_S (w0[2], w0[3], offset); - w2[1] = hc_bytealign_be_S (w0[1], w0[2], offset); - w2[0] = hc_bytealign_be_S (w0[0], w0[1], offset); - w1[3] = hc_bytealign_be_S ( 0, w0[0], offset); + case 14: + w7[3] = hc_bytealign_S (w4[0], w4[1], offset); + w7[2] = hc_bytealign_S (w3[3], w4[0], offset); + w7[1] = hc_bytealign_S (w3[2], w3[3], offset); + w7[0] = hc_bytealign_S (w3[1], w3[2], offset); + w6[3] = 
hc_bytealign_S (w3[0], w3[1], offset); + w6[2] = hc_bytealign_S (w2[3], w3[0], offset); + w6[1] = hc_bytealign_S (w2[2], w2[3], offset); + w6[0] = hc_bytealign_S (w2[1], w2[2], offset); + w5[3] = hc_bytealign_S (w2[0], w2[1], offset); + w5[2] = hc_bytealign_S (w1[3], w2[0], offset); + w5[1] = hc_bytealign_S (w1[2], w1[3], offset); + w5[0] = hc_bytealign_S (w1[1], w1[2], offset); + w4[3] = hc_bytealign_S (w1[0], w1[1], offset); + w4[2] = hc_bytealign_S (w0[3], w1[0], offset); + w4[1] = hc_bytealign_S (w0[2], w0[3], offset); + w4[0] = hc_bytealign_S (w0[1], w0[2], offset); + w3[3] = hc_bytealign_S (w0[0], w0[1], offset); + w3[2] = hc_bytealign_S ( 0, w0[0], offset); + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; @@ -34529,15 +39897,31 @@ DECLSPEC void switch_buffer_by_offset_be_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, break; - case 8: - w3[3] = hc_bytealign_be_S (w1[2], w1[3], offset); - w3[2] = hc_bytealign_be_S (w1[1], w1[2], offset); - w3[1] = hc_bytealign_be_S (w1[0], w1[1], offset); - w3[0] = hc_bytealign_be_S (w0[3], w1[0], offset); - w2[3] = hc_bytealign_be_S (w0[2], w0[3], offset); - w2[2] = hc_bytealign_be_S (w0[1], w0[2], offset); - w2[1] = hc_bytealign_be_S (w0[0], w0[1], offset); - w2[0] = hc_bytealign_be_S ( 0, w0[0], offset); + case 15: + w7[3] = hc_bytealign_S (w3[3], w4[0], offset); + w7[2] = hc_bytealign_S (w3[2], w3[3], offset); + w7[1] = hc_bytealign_S (w3[1], w3[2], offset); + w7[0] = hc_bytealign_S (w3[0], w3[1], offset); + w6[3] = hc_bytealign_S (w2[3], w3[0], offset); + w6[2] = hc_bytealign_S (w2[2], w2[3], offset); + w6[1] = hc_bytealign_S (w2[1], w2[2], offset); + w6[0] = hc_bytealign_S (w2[0], w2[1], offset); + w5[3] = hc_bytealign_S (w1[3], w2[0], offset); + w5[2] = hc_bytealign_S (w1[2], w1[3], offset); + w5[1] = hc_bytealign_S (w1[1], w1[2], offset); + w5[0] = hc_bytealign_S (w1[0], w1[1], offset); + w4[3] = hc_bytealign_S (w0[3], w1[0], offset); + w4[2] = 
hc_bytealign_S (w0[2], w0[3], offset); + w4[1] = hc_bytealign_S (w0[1], w0[2], offset); + w4[0] = hc_bytealign_S (w0[0], w0[1], offset); + w3[3] = hc_bytealign_S ( 0, w0[0], offset); + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; @@ -34549,14 +39933,30 @@ DECLSPEC void switch_buffer_by_offset_be_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, break; - case 9: - w3[3] = hc_bytealign_be_S (w1[1], w1[2], offset); - w3[2] = hc_bytealign_be_S (w1[0], w1[1], offset); - w3[1] = hc_bytealign_be_S (w0[3], w1[0], offset); - w3[0] = hc_bytealign_be_S (w0[2], w0[3], offset); - w2[3] = hc_bytealign_be_S (w0[1], w0[2], offset); - w2[2] = hc_bytealign_be_S (w0[0], w0[1], offset); - w2[1] = hc_bytealign_be_S ( 0, w0[0], offset); + case 16: + w7[3] = hc_bytealign_S (w3[2], w3[3], offset); + w7[2] = hc_bytealign_S (w3[1], w3[2], offset); + w7[1] = hc_bytealign_S (w3[0], w3[1], offset); + w7[0] = hc_bytealign_S (w2[3], w3[0], offset); + w6[3] = hc_bytealign_S (w2[2], w2[3], offset); + w6[2] = hc_bytealign_S (w2[1], w2[2], offset); + w6[1] = hc_bytealign_S (w2[0], w2[1], offset); + w6[0] = hc_bytealign_S (w1[3], w2[0], offset); + w5[3] = hc_bytealign_S (w1[2], w1[3], offset); + w5[2] = hc_bytealign_S (w1[1], w1[2], offset); + w5[1] = hc_bytealign_S (w1[0], w1[1], offset); + w5[0] = hc_bytealign_S (w0[3], w1[0], offset); + w4[3] = hc_bytealign_S (w0[2], w0[3], offset); + w4[2] = hc_bytealign_S (w0[1], w0[2], offset); + w4[1] = hc_bytealign_S (w0[0], w0[1], offset); + w4[0] = hc_bytealign_S ( 0, w0[0], offset); + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; @@ -34569,13 +39969,29 @@ DECLSPEC void switch_buffer_by_offset_be_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, break; - case 10: - w3[3] = hc_bytealign_be_S (w1[0], w1[1], offset); - w3[2] = hc_bytealign_be_S (w0[3], w1[0], offset); - w3[1] = hc_bytealign_be_S (w0[2], w0[3], offset); - w3[0] = 
hc_bytealign_be_S (w0[1], w0[2], offset); - w2[3] = hc_bytealign_be_S (w0[0], w0[1], offset); - w2[2] = hc_bytealign_be_S ( 0, w0[0], offset); + case 17: + w7[3] = hc_bytealign_S (w3[1], w3[2], offset); + w7[2] = hc_bytealign_S (w3[0], w3[1], offset); + w7[1] = hc_bytealign_S (w2[3], w3[0], offset); + w7[0] = hc_bytealign_S (w2[2], w2[3], offset); + w6[3] = hc_bytealign_S (w2[1], w2[2], offset); + w6[2] = hc_bytealign_S (w2[0], w2[1], offset); + w6[1] = hc_bytealign_S (w1[3], w2[0], offset); + w6[0] = hc_bytealign_S (w1[2], w1[3], offset); + w5[3] = hc_bytealign_S (w1[1], w1[2], offset); + w5[2] = hc_bytealign_S (w1[0], w1[1], offset); + w5[1] = hc_bytealign_S (w0[3], w1[0], offset); + w5[0] = hc_bytealign_S (w0[2], w0[3], offset); + w4[3] = hc_bytealign_S (w0[1], w0[2], offset); + w4[2] = hc_bytealign_S (w0[0], w0[1], offset); + w4[1] = hc_bytealign_S ( 0, w0[0], offset); + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; @@ -34589,12 +40005,28 @@ DECLSPEC void switch_buffer_by_offset_be_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, break; - case 11: - w3[3] = hc_bytealign_be_S (w0[3], w1[0], offset); - w3[2] = hc_bytealign_be_S (w0[2], w0[3], offset); - w3[1] = hc_bytealign_be_S (w0[1], w0[2], offset); - w3[0] = hc_bytealign_be_S (w0[0], w0[1], offset); - w2[3] = hc_bytealign_be_S ( 0, w0[0], offset); + case 18: + w7[3] = hc_bytealign_S (w3[0], w3[1], offset); + w7[2] = hc_bytealign_S (w2[3], w3[0], offset); + w7[1] = hc_bytealign_S (w2[2], w2[3], offset); + w7[0] = hc_bytealign_S (w2[1], w2[2], offset); + w6[3] = hc_bytealign_S (w2[0], w2[1], offset); + w6[2] = hc_bytealign_S (w1[3], w2[0], offset); + w6[1] = hc_bytealign_S (w1[2], w1[3], offset); + w6[0] = hc_bytealign_S (w1[1], w1[2], offset); + w5[3] = hc_bytealign_S (w1[0], w1[1], offset); + w5[2] = hc_bytealign_S (w0[3], w1[0], offset); + w5[1] = hc_bytealign_S (w0[2], w0[3], offset); + w5[0] = hc_bytealign_S (w0[1], w0[2], offset); + w4[3] = 
hc_bytealign_S (w0[0], w0[1], offset); + w4[2] = hc_bytealign_S ( 0, w0[0], offset); + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; @@ -34609,11 +40041,27 @@ DECLSPEC void switch_buffer_by_offset_be_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, break; - case 12: - w3[3] = hc_bytealign_be_S (w0[2], w0[3], offset); - w3[2] = hc_bytealign_be_S (w0[1], w0[2], offset); - w3[1] = hc_bytealign_be_S (w0[0], w0[1], offset); - w3[0] = hc_bytealign_be_S ( 0, w0[0], offset); + case 19: + w7[3] = hc_bytealign_S (w2[3], w3[0], offset); + w7[2] = hc_bytealign_S (w2[2], w2[3], offset); + w7[1] = hc_bytealign_S (w2[1], w2[2], offset); + w7[0] = hc_bytealign_S (w2[0], w2[1], offset); + w6[3] = hc_bytealign_S (w1[3], w2[0], offset); + w6[2] = hc_bytealign_S (w1[2], w1[3], offset); + w6[1] = hc_bytealign_S (w1[1], w1[2], offset); + w6[0] = hc_bytealign_S (w1[0], w1[1], offset); + w5[3] = hc_bytealign_S (w0[3], w1[0], offset); + w5[2] = hc_bytealign_S (w0[2], w0[3], offset); + w5[1] = hc_bytealign_S (w0[1], w0[2], offset); + w5[0] = hc_bytealign_S (w0[0], w0[1], offset); + w4[3] = hc_bytealign_S ( 0, w0[0], offset); + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; @@ -34629,10 +40077,26 @@ DECLSPEC void switch_buffer_by_offset_be_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, break; - case 13: - w3[3] = hc_bytealign_be_S (w0[1], w0[2], offset); - w3[2] = hc_bytealign_be_S (w0[0], w0[1], offset); - w3[1] = hc_bytealign_be_S ( 0, w0[0], offset); + case 20: + w7[3] = hc_bytealign_S (w2[2], w2[3], offset); + w7[2] = hc_bytealign_S (w2[1], w2[2], offset); + w7[1] = hc_bytealign_S (w2[0], w2[1], offset); + w7[0] = hc_bytealign_S (w1[3], w2[0], offset); + w6[3] = hc_bytealign_S (w1[2], w1[3], offset); + w6[2] = hc_bytealign_S (w1[1], w1[2], offset); + w6[1] = hc_bytealign_S (w1[0], w1[1], offset); + w6[0] = hc_bytealign_S (w0[3], w1[0], offset); + w5[3] = 
hc_bytealign_S (w0[2], w0[3], offset); + w5[2] = hc_bytealign_S (w0[1], w0[2], offset); + w5[1] = hc_bytealign_S (w0[0], w0[1], offset); + w5[0] = hc_bytealign_S ( 0, w0[0], offset); + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; @@ -34649,9 +40113,25 @@ DECLSPEC void switch_buffer_by_offset_be_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, break; - case 14: - w3[3] = hc_bytealign_be_S (w0[0], w0[1], offset); - w3[2] = hc_bytealign_be_S ( 0, w0[0], offset); + case 21: + w7[3] = hc_bytealign_S (w2[1], w2[2], offset); + w7[2] = hc_bytealign_S (w2[0], w2[1], offset); + w7[1] = hc_bytealign_S (w1[3], w2[0], offset); + w7[0] = hc_bytealign_S (w1[2], w1[3], offset); + w6[3] = hc_bytealign_S (w1[1], w1[2], offset); + w6[2] = hc_bytealign_S (w1[0], w1[1], offset); + w6[1] = hc_bytealign_S (w0[3], w1[0], offset); + w6[0] = hc_bytealign_S (w0[2], w0[3], offset); + w5[3] = hc_bytealign_S (w0[1], w0[2], offset); + w5[2] = hc_bytealign_S (w0[0], w0[1], offset); + w5[1] = hc_bytealign_S ( 0, w0[0], offset); + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; @@ -34669,8 +40149,24 @@ DECLSPEC void switch_buffer_by_offset_be_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, break; - case 15: - w3[3] = hc_bytealign_be_S ( 0, w0[0], offset); + case 22: + w7[3] = hc_bytealign_S (w2[0], w2[1], offset); + w7[2] = hc_bytealign_S (w1[3], w2[0], offset); + w7[1] = hc_bytealign_S (w1[2], w1[3], offset); + w7[0] = hc_bytealign_S (w1[1], w1[2], offset); + w6[3] = hc_bytealign_S (w1[0], w1[1], offset); + w6[2] = hc_bytealign_S (w0[3], w1[0], offset); + w6[1] = hc_bytealign_S (w0[2], w0[3], offset); + w6[0] = hc_bytealign_S (w0[1], w0[2], offset); + w5[3] = hc_bytealign_S (w0[0], w0[1], offset); + w5[2] = hc_bytealign_S ( 0, w0[0], offset); + w5[1] = 0; + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; w3[2] = 0; w3[1] = 0; w3[0] = 0; @@ 
-34687,172 +40183,34 @@ DECLSPEC void switch_buffer_by_offset_be_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, w0[1] = 0; w0[0] = 0; - break; - } - #endif - - #if (defined IS_AMD && HAS_VPERM == 1) || defined IS_NV - - #if defined IS_NV - const int selector = (0x76543210 >> ((offset & 3) * 4)) & 0xffff; - #endif - - #if defined IS_AMD - const int selector = 0x0706050403020100 >> ((offset & 3) * 8); - #endif - - switch (offset_switch) - { - case 0: - w3[3] = hc_byte_perm_S (w3[3], w3[2], selector); - w3[2] = hc_byte_perm_S (w3[2], w3[1], selector); - w3[1] = hc_byte_perm_S (w3[1], w3[0], selector); - w3[0] = hc_byte_perm_S (w3[0], w2[3], selector); - w2[3] = hc_byte_perm_S (w2[3], w2[2], selector); - w2[2] = hc_byte_perm_S (w2[2], w2[1], selector); - w2[1] = hc_byte_perm_S (w2[1], w2[0], selector); - w2[0] = hc_byte_perm_S (w2[0], w1[3], selector); - w1[3] = hc_byte_perm_S (w1[3], w1[2], selector); - w1[2] = hc_byte_perm_S (w1[2], w1[1], selector); - w1[1] = hc_byte_perm_S (w1[1], w1[0], selector); - w1[0] = hc_byte_perm_S (w1[0], w0[3], selector); - w0[3] = hc_byte_perm_S (w0[3], w0[2], selector); - w0[2] = hc_byte_perm_S (w0[2], w0[1], selector); - w0[1] = hc_byte_perm_S (w0[1], w0[0], selector); - w0[0] = hc_byte_perm_S (w0[0], 0, selector); - - break; - - case 1: - w3[3] = hc_byte_perm_S (w3[2], w3[1], selector); - w3[2] = hc_byte_perm_S (w3[1], w3[0], selector); - w3[1] = hc_byte_perm_S (w3[0], w2[3], selector); - w3[0] = hc_byte_perm_S (w2[3], w2[2], selector); - w2[3] = hc_byte_perm_S (w2[2], w2[1], selector); - w2[2] = hc_byte_perm_S (w2[1], w2[0], selector); - w2[1] = hc_byte_perm_S (w2[0], w1[3], selector); - w2[0] = hc_byte_perm_S (w1[3], w1[2], selector); - w1[3] = hc_byte_perm_S (w1[2], w1[1], selector); - w1[2] = hc_byte_perm_S (w1[1], w1[0], selector); - w1[1] = hc_byte_perm_S (w1[0], w0[3], selector); - w1[0] = hc_byte_perm_S (w0[3], w0[2], selector); - w0[3] = hc_byte_perm_S (w0[2], w0[1], selector); - w0[2] = hc_byte_perm_S (w0[1], w0[0], selector); - 
w0[1] = hc_byte_perm_S (w0[0], 0, selector); - w0[0] = 0; - - break; - - case 2: - w3[3] = hc_byte_perm_S (w3[1], w3[0], selector); - w3[2] = hc_byte_perm_S (w3[0], w2[3], selector); - w3[1] = hc_byte_perm_S (w2[3], w2[2], selector); - w3[0] = hc_byte_perm_S (w2[2], w2[1], selector); - w2[3] = hc_byte_perm_S (w2[1], w2[0], selector); - w2[2] = hc_byte_perm_S (w2[0], w1[3], selector); - w2[1] = hc_byte_perm_S (w1[3], w1[2], selector); - w2[0] = hc_byte_perm_S (w1[2], w1[1], selector); - w1[3] = hc_byte_perm_S (w1[1], w1[0], selector); - w1[2] = hc_byte_perm_S (w1[0], w0[3], selector); - w1[1] = hc_byte_perm_S (w0[3], w0[2], selector); - w1[0] = hc_byte_perm_S (w0[2], w0[1], selector); - w0[3] = hc_byte_perm_S (w0[1], w0[0], selector); - w0[2] = hc_byte_perm_S (w0[0], 0, selector); - w0[1] = 0; - w0[0] = 0; - - break; - - case 3: - w3[3] = hc_byte_perm_S (w3[0], w2[3], selector); - w3[2] = hc_byte_perm_S (w2[3], w2[2], selector); - w3[1] = hc_byte_perm_S (w2[2], w2[1], selector); - w3[0] = hc_byte_perm_S (w2[1], w2[0], selector); - w2[3] = hc_byte_perm_S (w2[0], w1[3], selector); - w2[2] = hc_byte_perm_S (w1[3], w1[2], selector); - w2[1] = hc_byte_perm_S (w1[2], w1[1], selector); - w2[0] = hc_byte_perm_S (w1[1], w1[0], selector); - w1[3] = hc_byte_perm_S (w1[0], w0[3], selector); - w1[2] = hc_byte_perm_S (w0[3], w0[2], selector); - w1[1] = hc_byte_perm_S (w0[2], w0[1], selector); - w1[0] = hc_byte_perm_S (w0[1], w0[0], selector); - w0[3] = hc_byte_perm_S (w0[0], 0, selector); - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; - - break; - - case 4: - w3[3] = hc_byte_perm_S (w2[3], w2[2], selector); - w3[2] = hc_byte_perm_S (w2[2], w2[1], selector); - w3[1] = hc_byte_perm_S (w2[1], w2[0], selector); - w3[0] = hc_byte_perm_S (w2[0], w1[3], selector); - w2[3] = hc_byte_perm_S (w1[3], w1[2], selector); - w2[2] = hc_byte_perm_S (w1[2], w1[1], selector); - w2[1] = hc_byte_perm_S (w1[1], w1[0], selector); - w2[0] = hc_byte_perm_S (w1[0], w0[3], selector); - w1[3] = hc_byte_perm_S 
(w0[3], w0[2], selector); - w1[2] = hc_byte_perm_S (w0[2], w0[1], selector); - w1[1] = hc_byte_perm_S (w0[1], w0[0], selector); - w1[0] = hc_byte_perm_S (w0[0], 0, selector); - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; - - break; - - case 5: - w3[3] = hc_byte_perm_S (w2[2], w2[1], selector); - w3[2] = hc_byte_perm_S (w2[1], w2[0], selector); - w3[1] = hc_byte_perm_S (w2[0], w1[3], selector); - w3[0] = hc_byte_perm_S (w1[3], w1[2], selector); - w2[3] = hc_byte_perm_S (w1[2], w1[1], selector); - w2[2] = hc_byte_perm_S (w1[1], w1[0], selector); - w2[1] = hc_byte_perm_S (w1[0], w0[3], selector); - w2[0] = hc_byte_perm_S (w0[3], w0[2], selector); - w1[3] = hc_byte_perm_S (w0[2], w0[1], selector); - w1[2] = hc_byte_perm_S (w0[1], w0[0], selector); - w1[1] = hc_byte_perm_S (w0[0], 0, selector); - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; - - break; - - case 6: - w3[3] = hc_byte_perm_S (w2[1], w2[0], selector); - w3[2] = hc_byte_perm_S (w2[0], w1[3], selector); - w3[1] = hc_byte_perm_S (w1[3], w1[2], selector); - w3[0] = hc_byte_perm_S (w1[2], w1[1], selector); - w2[3] = hc_byte_perm_S (w1[1], w1[0], selector); - w2[2] = hc_byte_perm_S (w1[0], w0[3], selector); - w2[1] = hc_byte_perm_S (w0[3], w0[2], selector); - w2[0] = hc_byte_perm_S (w0[2], w0[1], selector); - w1[3] = hc_byte_perm_S (w0[1], w0[0], selector); - w1[2] = hc_byte_perm_S (w0[0], 0, selector); - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; - break; - case 7: - w3[3] = hc_byte_perm_S (w2[0], w1[3], selector); - w3[2] = hc_byte_perm_S (w1[3], w1[2], selector); - w3[1] = hc_byte_perm_S (w1[2], w1[1], selector); - w3[0] = hc_byte_perm_S (w1[1], w1[0], selector); - w2[3] = hc_byte_perm_S (w1[0], w0[3], selector); - w2[2] = hc_byte_perm_S (w0[3], w0[2], selector); - w2[1] = hc_byte_perm_S (w0[2], w0[1], selector); - w2[0] = hc_byte_perm_S (w0[1], w0[0], selector); - w1[3] = hc_byte_perm_S (w0[0], 0, selector); + case 23: + w7[3] = hc_bytealign_S (w1[3], 
w2[0], offset); + w7[2] = hc_bytealign_S (w1[2], w1[3], offset); + w7[1] = hc_bytealign_S (w1[1], w1[2], offset); + w7[0] = hc_bytealign_S (w1[0], w1[1], offset); + w6[3] = hc_bytealign_S (w0[3], w1[0], offset); + w6[2] = hc_bytealign_S (w0[2], w0[3], offset); + w6[1] = hc_bytealign_S (w0[1], w0[2], offset); + w6[0] = hc_bytealign_S (w0[0], w0[1], offset); + w5[3] = hc_bytealign_S ( 0, w0[0], offset); + w5[2] = 0; + w5[1] = 0; + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; @@ -34863,15 +40221,31 @@ DECLSPEC void switch_buffer_by_offset_be_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, break; - case 8: - w3[3] = hc_byte_perm_S (w1[3], w1[2], selector); - w3[2] = hc_byte_perm_S (w1[2], w1[1], selector); - w3[1] = hc_byte_perm_S (w1[1], w1[0], selector); - w3[0] = hc_byte_perm_S (w1[0], w0[3], selector); - w2[3] = hc_byte_perm_S (w0[3], w0[2], selector); - w2[2] = hc_byte_perm_S (w0[2], w0[1], selector); - w2[1] = hc_byte_perm_S (w0[1], w0[0], selector); - w2[0] = hc_byte_perm_S (w0[0], 0, selector); + case 24: + w7[3] = hc_bytealign_S (w1[2], w1[3], offset); + w7[2] = hc_bytealign_S (w1[1], w1[2], offset); + w7[1] = hc_bytealign_S (w1[0], w1[1], offset); + w7[0] = hc_bytealign_S (w0[3], w1[0], offset); + w6[3] = hc_bytealign_S (w0[2], w0[3], offset); + w6[2] = hc_bytealign_S (w0[1], w0[2], offset); + w6[1] = hc_bytealign_S (w0[0], w0[1], offset); + w6[0] = hc_bytealign_S ( 0, w0[0], offset); + w5[3] = 0; + w5[2] = 0; + w5[1] = 0; + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; @@ -34883,14 +40257,30 @@ DECLSPEC void switch_buffer_by_offset_be_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, break; - case 9: - w3[3] = hc_byte_perm_S (w1[2], w1[1], selector); - w3[2] = 
hc_byte_perm_S (w1[1], w1[0], selector); - w3[1] = hc_byte_perm_S (w1[0], w0[3], selector); - w3[0] = hc_byte_perm_S (w0[3], w0[2], selector); - w2[3] = hc_byte_perm_S (w0[2], w0[1], selector); - w2[2] = hc_byte_perm_S (w0[1], w0[0], selector); - w2[1] = hc_byte_perm_S (w0[0], 0, selector); + case 25: + w7[3] = hc_bytealign_S (w1[1], w1[2], offset); + w7[2] = hc_bytealign_S (w1[0], w1[1], offset); + w7[1] = hc_bytealign_S (w0[3], w1[0], offset); + w7[0] = hc_bytealign_S (w0[2], w0[3], offset); + w6[3] = hc_bytealign_S (w0[1], w0[2], offset); + w6[2] = hc_bytealign_S (w0[0], w0[1], offset); + w6[1] = hc_bytealign_S ( 0, w0[0], offset); + w6[0] = 0; + w5[3] = 0; + w5[2] = 0; + w5[1] = 0; + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; @@ -34903,13 +40293,29 @@ DECLSPEC void switch_buffer_by_offset_be_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, break; - case 10: - w3[3] = hc_byte_perm_S (w1[1], w1[0], selector); - w3[2] = hc_byte_perm_S (w1[0], w0[3], selector); - w3[1] = hc_byte_perm_S (w0[3], w0[2], selector); - w3[0] = hc_byte_perm_S (w0[2], w0[1], selector); - w2[3] = hc_byte_perm_S (w0[1], w0[0], selector); - w2[2] = hc_byte_perm_S (w0[0], 0, selector); + case 26: + w7[3] = hc_bytealign_S (w1[0], w1[1], offset); + w7[2] = hc_bytealign_S (w0[3], w1[0], offset); + w7[1] = hc_bytealign_S (w0[2], w0[3], offset); + w7[0] = hc_bytealign_S (w0[1], w0[2], offset); + w6[3] = hc_bytealign_S (w0[0], w0[1], offset); + w6[2] = hc_bytealign_S ( 0, w0[0], offset); + w6[1] = 0; + w6[0] = 0; + w5[3] = 0; + w5[2] = 0; + w5[1] = 0; + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; @@ -34923,12 +40329,28 @@ DECLSPEC void switch_buffer_by_offset_be_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, break; - case 11: - w3[3] = 
hc_byte_perm_S (w1[0], w0[3], selector); - w3[2] = hc_byte_perm_S (w0[3], w0[2], selector); - w3[1] = hc_byte_perm_S (w0[2], w0[1], selector); - w3[0] = hc_byte_perm_S (w0[1], w0[0], selector); - w2[3] = hc_byte_perm_S (w0[0], 0, selector); + case 27: + w7[3] = hc_bytealign_S (w0[3], w1[0], offset); + w7[2] = hc_bytealign_S (w0[2], w0[3], offset); + w7[1] = hc_bytealign_S (w0[1], w0[2], offset); + w7[0] = hc_bytealign_S (w0[0], w0[1], offset); + w6[3] = hc_bytealign_S ( 0, w0[0], offset); + w6[2] = 0; + w6[1] = 0; + w6[0] = 0; + w5[3] = 0; + w5[2] = 0; + w5[1] = 0; + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; @@ -34943,11 +40365,27 @@ DECLSPEC void switch_buffer_by_offset_be_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, break; - case 12: - w3[3] = hc_byte_perm_S (w0[3], w0[2], selector); - w3[2] = hc_byte_perm_S (w0[2], w0[1], selector); - w3[1] = hc_byte_perm_S (w0[1], w0[0], selector); - w3[0] = hc_byte_perm_S (w0[0], 0, selector); + case 28: + w7[3] = hc_bytealign_S (w0[2], w0[3], offset); + w7[2] = hc_bytealign_S (w0[1], w0[2], offset); + w7[1] = hc_bytealign_S (w0[0], w0[1], offset); + w7[0] = hc_bytealign_S ( 0, w0[0], offset); + w6[3] = 0; + w6[2] = 0; + w6[1] = 0; + w6[0] = 0; + w5[3] = 0; + w5[2] = 0; + w5[1] = 0; + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; @@ -34963,10 +40401,26 @@ DECLSPEC void switch_buffer_by_offset_be_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, break; - case 13: - w3[3] = hc_byte_perm_S (w0[2], w0[1], selector); - w3[2] = hc_byte_perm_S (w0[1], w0[0], selector); - w3[1] = hc_byte_perm_S (w0[0], 0, selector); + case 29: + w7[3] = hc_bytealign_S (w0[1], w0[2], offset); + w7[2] = hc_bytealign_S (w0[0], w0[1], offset); + w7[1] = hc_bytealign_S ( 0, w0[0], offset); + w7[0] = 0; + w6[3] = 0; + w6[2] = 0; + w6[1] = 0; + w6[0] = 0; + 
w5[3] = 0; + w5[2] = 0; + w5[1] = 0; + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; @@ -34983,9 +40437,25 @@ DECLSPEC void switch_buffer_by_offset_be_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, break; - case 14: - w3[3] = hc_byte_perm_S (w0[1], w0[0], selector); - w3[2] = hc_byte_perm_S (w0[0], 0, selector); + case 30: + w7[3] = hc_bytealign_S (w0[0], w0[1], offset); + w7[2] = hc_bytealign_S ( 0, w0[0], offset); + w7[1] = 0; + w7[0] = 0; + w6[3] = 0; + w6[2] = 0; + w6[1] = 0; + w6[0] = 0; + w5[3] = 0; + w5[2] = 0; + w5[1] = 0; + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; @@ -35003,8 +40473,24 @@ DECLSPEC void switch_buffer_by_offset_be_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, break; - case 15: - w3[3] = hc_byte_perm_S (w0[0], 0, selector); + case 31: + w7[3] = hc_bytealign_S ( 0, w0[0], offset); + w7[2] = 0; + w7[1] = 0; + w7[0] = 0; + w6[3] = 0; + w6[2] = 0; + w6[1] = 0; + w6[0] = 0; + w5[3] = 0; + w5[2] = 0; + w5[1] = 0; + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; w3[2] = 0; w3[1] = 0; w3[0] = 0; @@ -35024,201 +40510,294 @@ DECLSPEC void switch_buffer_by_offset_be_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, break; } #endif -} -DECLSPEC void switch_buffer_by_offset_carry_be_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, u32 *c0, u32 *c1, u32 *c2, u32 *c3, const u32 offset) -{ - const int offset_switch = offset / 4; + #if (defined IS_AMD && HAS_VPERM == 1) || defined IS_NV + + const int offset_mod_4 = offset & 3; + + const int offset_minus_4 = 4 - offset_mod_4; + + #if defined IS_NV + const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; + #endif + + #if defined IS_AMD + const int selector = 0x0706050403020100 >> (offset_minus_4 * 8); + #endif - #if (defined IS_AMD && HAS_VPERM == 0) || defined IS_GENERIC switch (offset_switch) { - case 0: - c0[0] = hc_bytealign_be_S 
(w3[3], 0, offset); - w3[3] = hc_bytealign_be_S (w3[2], w3[3], offset); - w3[2] = hc_bytealign_be_S (w3[1], w3[2], offset); - w3[1] = hc_bytealign_be_S (w3[0], w3[1], offset); - w3[0] = hc_bytealign_be_S (w2[3], w3[0], offset); - w2[3] = hc_bytealign_be_S (w2[2], w2[3], offset); - w2[2] = hc_bytealign_be_S (w2[1], w2[2], offset); - w2[1] = hc_bytealign_be_S (w2[0], w2[1], offset); - w2[0] = hc_bytealign_be_S (w1[3], w2[0], offset); - w1[3] = hc_bytealign_be_S (w1[2], w1[3], offset); - w1[2] = hc_bytealign_be_S (w1[1], w1[2], offset); - w1[1] = hc_bytealign_be_S (w1[0], w1[1], offset); - w1[0] = hc_bytealign_be_S (w0[3], w1[0], offset); - w0[3] = hc_bytealign_be_S (w0[2], w0[3], offset); - w0[2] = hc_bytealign_be_S (w0[1], w0[2], offset); - w0[1] = hc_bytealign_be_S (w0[0], w0[1], offset); - w0[0] = hc_bytealign_be_S ( 0, w0[0], offset); - + case 0: + w7[3] = hc_byte_perm_S (w7[2], w7[3], selector); + w7[2] = hc_byte_perm_S (w7[1], w7[2], selector); + w7[1] = hc_byte_perm_S (w7[0], w7[1], selector); + w7[0] = hc_byte_perm_S (w6[3], w7[0], selector); + w6[3] = hc_byte_perm_S (w6[2], w6[3], selector); + w6[2] = hc_byte_perm_S (w6[1], w6[2], selector); + w6[1] = hc_byte_perm_S (w6[0], w6[1], selector); + w6[0] = hc_byte_perm_S (w5[3], w6[0], selector); + w5[3] = hc_byte_perm_S (w5[2], w5[3], selector); + w5[2] = hc_byte_perm_S (w5[1], w5[2], selector); + w5[1] = hc_byte_perm_S (w5[0], w5[1], selector); + w5[0] = hc_byte_perm_S (w4[3], w5[0], selector); + w4[3] = hc_byte_perm_S (w4[2], w4[3], selector); + w4[2] = hc_byte_perm_S (w4[1], w4[2], selector); + w4[1] = hc_byte_perm_S (w4[0], w4[1], selector); + w4[0] = hc_byte_perm_S (w3[3], w4[0], selector); + w3[3] = hc_byte_perm_S (w3[2], w3[3], selector); + w3[2] = hc_byte_perm_S (w3[1], w3[2], selector); + w3[1] = hc_byte_perm_S (w3[0], w3[1], selector); + w3[0] = hc_byte_perm_S (w2[3], w3[0], selector); + w2[3] = hc_byte_perm_S (w2[2], w2[3], selector); + w2[2] = hc_byte_perm_S (w2[1], w2[2], selector); + w2[1] = 
hc_byte_perm_S (w2[0], w2[1], selector); + w2[0] = hc_byte_perm_S (w1[3], w2[0], selector); + w1[3] = hc_byte_perm_S (w1[2], w1[3], selector); + w1[2] = hc_byte_perm_S (w1[1], w1[2], selector); + w1[1] = hc_byte_perm_S (w1[0], w1[1], selector); + w1[0] = hc_byte_perm_S (w0[3], w1[0], selector); + w0[3] = hc_byte_perm_S (w0[2], w0[3], selector); + w0[2] = hc_byte_perm_S (w0[1], w0[2], selector); + w0[1] = hc_byte_perm_S (w0[0], w0[1], selector); + w0[0] = hc_byte_perm_S ( 0, w0[0], selector); break; - case 1: - c0[1] = hc_bytealign_be_S (w3[3], 0, offset); - c0[0] = hc_bytealign_be_S (w3[2], w3[3], offset); - w3[3] = hc_bytealign_be_S (w3[1], w3[2], offset); - w3[2] = hc_bytealign_be_S (w3[0], w3[1], offset); - w3[1] = hc_bytealign_be_S (w2[3], w3[0], offset); - w3[0] = hc_bytealign_be_S (w2[2], w2[3], offset); - w2[3] = hc_bytealign_be_S (w2[1], w2[2], offset); - w2[2] = hc_bytealign_be_S (w2[0], w2[1], offset); - w2[1] = hc_bytealign_be_S (w1[3], w2[0], offset); - w2[0] = hc_bytealign_be_S (w1[2], w1[3], offset); - w1[3] = hc_bytealign_be_S (w1[1], w1[2], offset); - w1[2] = hc_bytealign_be_S (w1[0], w1[1], offset); - w1[1] = hc_bytealign_be_S (w0[3], w1[0], offset); - w1[0] = hc_bytealign_be_S (w0[2], w0[3], offset); - w0[3] = hc_bytealign_be_S (w0[1], w0[2], offset); - w0[2] = hc_bytealign_be_S (w0[0], w0[1], offset); - w0[1] = hc_bytealign_be_S ( 0, w0[0], offset); + case 1: + w7[3] = hc_byte_perm_S (w7[1], w7[2], selector); + w7[2] = hc_byte_perm_S (w7[0], w7[1], selector); + w7[1] = hc_byte_perm_S (w6[3], w7[0], selector); + w7[0] = hc_byte_perm_S (w6[2], w6[3], selector); + w6[3] = hc_byte_perm_S (w6[1], w6[2], selector); + w6[2] = hc_byte_perm_S (w6[0], w6[1], selector); + w6[1] = hc_byte_perm_S (w5[3], w6[0], selector); + w6[0] = hc_byte_perm_S (w5[2], w5[3], selector); + w5[3] = hc_byte_perm_S (w5[1], w5[2], selector); + w5[2] = hc_byte_perm_S (w5[0], w5[1], selector); + w5[1] = hc_byte_perm_S (w4[3], w5[0], selector); + w5[0] = hc_byte_perm_S (w4[2], 
w4[3], selector); + w4[3] = hc_byte_perm_S (w4[1], w4[2], selector); + w4[2] = hc_byte_perm_S (w4[0], w4[1], selector); + w4[1] = hc_byte_perm_S (w3[3], w4[0], selector); + w4[0] = hc_byte_perm_S (w3[2], w3[3], selector); + w3[3] = hc_byte_perm_S (w3[1], w3[2], selector); + w3[2] = hc_byte_perm_S (w3[0], w3[1], selector); + w3[1] = hc_byte_perm_S (w2[3], w3[0], selector); + w3[0] = hc_byte_perm_S (w2[2], w2[3], selector); + w2[3] = hc_byte_perm_S (w2[1], w2[2], selector); + w2[2] = hc_byte_perm_S (w2[0], w2[1], selector); + w2[1] = hc_byte_perm_S (w1[3], w2[0], selector); + w2[0] = hc_byte_perm_S (w1[2], w1[3], selector); + w1[3] = hc_byte_perm_S (w1[1], w1[2], selector); + w1[2] = hc_byte_perm_S (w1[0], w1[1], selector); + w1[1] = hc_byte_perm_S (w0[3], w1[0], selector); + w1[0] = hc_byte_perm_S (w0[2], w0[3], selector); + w0[3] = hc_byte_perm_S (w0[1], w0[2], selector); + w0[2] = hc_byte_perm_S (w0[0], w0[1], selector); + w0[1] = hc_byte_perm_S ( 0, w0[0], selector); w0[0] = 0; - break; - case 2: - c0[2] = hc_bytealign_be_S (w3[3], 0, offset); - c0[1] = hc_bytealign_be_S (w3[2], w3[3], offset); - c0[0] = hc_bytealign_be_S (w3[1], w3[2], offset); - w3[3] = hc_bytealign_be_S (w3[0], w3[1], offset); - w3[2] = hc_bytealign_be_S (w2[3], w3[0], offset); - w3[1] = hc_bytealign_be_S (w2[2], w2[3], offset); - w3[0] = hc_bytealign_be_S (w2[1], w2[2], offset); - w2[3] = hc_bytealign_be_S (w2[0], w2[1], offset); - w2[2] = hc_bytealign_be_S (w1[3], w2[0], offset); - w2[1] = hc_bytealign_be_S (w1[2], w1[3], offset); - w2[0] = hc_bytealign_be_S (w1[1], w1[2], offset); - w1[3] = hc_bytealign_be_S (w1[0], w1[1], offset); - w1[2] = hc_bytealign_be_S (w0[3], w1[0], offset); - w1[1] = hc_bytealign_be_S (w0[2], w0[3], offset); - w1[0] = hc_bytealign_be_S (w0[1], w0[2], offset); - w0[3] = hc_bytealign_be_S (w0[0], w0[1], offset); - w0[2] = hc_bytealign_be_S ( 0, w0[0], offset); + case 2: + w7[3] = hc_byte_perm_S (w7[0], w7[1], selector); + w7[2] = hc_byte_perm_S (w6[3], w7[0], 
selector); + w7[1] = hc_byte_perm_S (w6[2], w6[3], selector); + w7[0] = hc_byte_perm_S (w6[1], w6[2], selector); + w6[3] = hc_byte_perm_S (w6[0], w6[1], selector); + w6[2] = hc_byte_perm_S (w5[3], w6[0], selector); + w6[1] = hc_byte_perm_S (w5[2], w5[3], selector); + w6[0] = hc_byte_perm_S (w5[1], w5[2], selector); + w5[3] = hc_byte_perm_S (w5[0], w5[1], selector); + w5[2] = hc_byte_perm_S (w4[3], w5[0], selector); + w5[1] = hc_byte_perm_S (w4[2], w4[3], selector); + w5[0] = hc_byte_perm_S (w4[1], w4[2], selector); + w4[3] = hc_byte_perm_S (w4[0], w4[1], selector); + w4[2] = hc_byte_perm_S (w3[3], w4[0], selector); + w4[1] = hc_byte_perm_S (w3[2], w3[3], selector); + w4[0] = hc_byte_perm_S (w3[1], w3[2], selector); + w3[3] = hc_byte_perm_S (w3[0], w3[1], selector); + w3[2] = hc_byte_perm_S (w2[3], w3[0], selector); + w3[1] = hc_byte_perm_S (w2[2], w2[3], selector); + w3[0] = hc_byte_perm_S (w2[1], w2[2], selector); + w2[3] = hc_byte_perm_S (w2[0], w2[1], selector); + w2[2] = hc_byte_perm_S (w1[3], w2[0], selector); + w2[1] = hc_byte_perm_S (w1[2], w1[3], selector); + w2[0] = hc_byte_perm_S (w1[1], w1[2], selector); + w1[3] = hc_byte_perm_S (w1[0], w1[1], selector); + w1[2] = hc_byte_perm_S (w0[3], w1[0], selector); + w1[1] = hc_byte_perm_S (w0[2], w0[3], selector); + w1[0] = hc_byte_perm_S (w0[1], w0[2], selector); + w0[3] = hc_byte_perm_S (w0[0], w0[1], selector); + w0[2] = hc_byte_perm_S ( 0, w0[0], selector); w0[1] = 0; w0[0] = 0; - break; - case 3: - c0[3] = hc_bytealign_be_S (w3[3], 0, offset); - c0[2] = hc_bytealign_be_S (w3[2], w3[3], offset); - c0[1] = hc_bytealign_be_S (w3[1], w3[2], offset); - c0[0] = hc_bytealign_be_S (w3[0], w3[1], offset); - w3[3] = hc_bytealign_be_S (w2[3], w3[0], offset); - w3[2] = hc_bytealign_be_S (w2[2], w2[3], offset); - w3[1] = hc_bytealign_be_S (w2[1], w2[2], offset); - w3[0] = hc_bytealign_be_S (w2[0], w2[1], offset); - w2[3] = hc_bytealign_be_S (w1[3], w2[0], offset); - w2[2] = hc_bytealign_be_S (w1[2], w1[3], offset); - 
w2[1] = hc_bytealign_be_S (w1[1], w1[2], offset); - w2[0] = hc_bytealign_be_S (w1[0], w1[1], offset); - w1[3] = hc_bytealign_be_S (w0[3], w1[0], offset); - w1[2] = hc_bytealign_be_S (w0[2], w0[3], offset); - w1[1] = hc_bytealign_be_S (w0[1], w0[2], offset); - w1[0] = hc_bytealign_be_S (w0[0], w0[1], offset); - w0[3] = hc_bytealign_be_S ( 0, w0[0], offset); + case 3: + w7[3] = hc_byte_perm_S (w6[3], w7[0], selector); + w7[2] = hc_byte_perm_S (w6[2], w6[3], selector); + w7[1] = hc_byte_perm_S (w6[1], w6[2], selector); + w7[0] = hc_byte_perm_S (w6[0], w6[1], selector); + w6[3] = hc_byte_perm_S (w5[3], w6[0], selector); + w6[2] = hc_byte_perm_S (w5[2], w5[3], selector); + w6[1] = hc_byte_perm_S (w5[1], w5[2], selector); + w6[0] = hc_byte_perm_S (w5[0], w5[1], selector); + w5[3] = hc_byte_perm_S (w4[3], w5[0], selector); + w5[2] = hc_byte_perm_S (w4[2], w4[3], selector); + w5[1] = hc_byte_perm_S (w4[1], w4[2], selector); + w5[0] = hc_byte_perm_S (w4[0], w4[1], selector); + w4[3] = hc_byte_perm_S (w3[3], w4[0], selector); + w4[2] = hc_byte_perm_S (w3[2], w3[3], selector); + w4[1] = hc_byte_perm_S (w3[1], w3[2], selector); + w4[0] = hc_byte_perm_S (w3[0], w3[1], selector); + w3[3] = hc_byte_perm_S (w2[3], w3[0], selector); + w3[2] = hc_byte_perm_S (w2[2], w2[3], selector); + w3[1] = hc_byte_perm_S (w2[1], w2[2], selector); + w3[0] = hc_byte_perm_S (w2[0], w2[1], selector); + w2[3] = hc_byte_perm_S (w1[3], w2[0], selector); + w2[2] = hc_byte_perm_S (w1[2], w1[3], selector); + w2[1] = hc_byte_perm_S (w1[1], w1[2], selector); + w2[0] = hc_byte_perm_S (w1[0], w1[1], selector); + w1[3] = hc_byte_perm_S (w0[3], w1[0], selector); + w1[2] = hc_byte_perm_S (w0[2], w0[3], selector); + w1[1] = hc_byte_perm_S (w0[1], w0[2], selector); + w1[0] = hc_byte_perm_S (w0[0], w0[1], selector); + w0[3] = hc_byte_perm_S ( 0, w0[0], selector); w0[2] = 0; w0[1] = 0; w0[0] = 0; - break; - case 4: - c1[0] = hc_bytealign_be_S (w3[3], 0, offset); - c0[3] = hc_bytealign_be_S (w3[2], w3[3], offset); - 
c0[2] = hc_bytealign_be_S (w3[1], w3[2], offset); - c0[1] = hc_bytealign_be_S (w3[0], w3[1], offset); - c0[0] = hc_bytealign_be_S (w2[3], w3[0], offset); - w3[3] = hc_bytealign_be_S (w2[2], w2[3], offset); - w3[2] = hc_bytealign_be_S (w2[1], w2[2], offset); - w3[1] = hc_bytealign_be_S (w2[0], w2[1], offset); - w3[0] = hc_bytealign_be_S (w1[3], w2[0], offset); - w2[3] = hc_bytealign_be_S (w1[2], w1[3], offset); - w2[2] = hc_bytealign_be_S (w1[1], w1[2], offset); - w2[1] = hc_bytealign_be_S (w1[0], w1[1], offset); - w2[0] = hc_bytealign_be_S (w0[3], w1[0], offset); - w1[3] = hc_bytealign_be_S (w0[2], w0[3], offset); - w1[2] = hc_bytealign_be_S (w0[1], w0[2], offset); - w1[1] = hc_bytealign_be_S (w0[0], w0[1], offset); - w1[0] = hc_bytealign_be_S ( 0, w0[0], offset); + case 4: + w7[3] = hc_byte_perm_S (w6[2], w6[3], selector); + w7[2] = hc_byte_perm_S (w6[1], w6[2], selector); + w7[1] = hc_byte_perm_S (w6[0], w6[1], selector); + w7[0] = hc_byte_perm_S (w5[3], w6[0], selector); + w6[3] = hc_byte_perm_S (w5[2], w5[3], selector); + w6[2] = hc_byte_perm_S (w5[1], w5[2], selector); + w6[1] = hc_byte_perm_S (w5[0], w5[1], selector); + w6[0] = hc_byte_perm_S (w4[3], w5[0], selector); + w5[3] = hc_byte_perm_S (w4[2], w4[3], selector); + w5[2] = hc_byte_perm_S (w4[1], w4[2], selector); + w5[1] = hc_byte_perm_S (w4[0], w4[1], selector); + w5[0] = hc_byte_perm_S (w3[3], w4[0], selector); + w4[3] = hc_byte_perm_S (w3[2], w3[3], selector); + w4[2] = hc_byte_perm_S (w3[1], w3[2], selector); + w4[1] = hc_byte_perm_S (w3[0], w3[1], selector); + w4[0] = hc_byte_perm_S (w2[3], w3[0], selector); + w3[3] = hc_byte_perm_S (w2[2], w2[3], selector); + w3[2] = hc_byte_perm_S (w2[1], w2[2], selector); + w3[1] = hc_byte_perm_S (w2[0], w2[1], selector); + w3[0] = hc_byte_perm_S (w1[3], w2[0], selector); + w2[3] = hc_byte_perm_S (w1[2], w1[3], selector); + w2[2] = hc_byte_perm_S (w1[1], w1[2], selector); + w2[1] = hc_byte_perm_S (w1[0], w1[1], selector); + w2[0] = hc_byte_perm_S (w0[3], w1[0], 
selector); + w1[3] = hc_byte_perm_S (w0[2], w0[3], selector); + w1[2] = hc_byte_perm_S (w0[1], w0[2], selector); + w1[1] = hc_byte_perm_S (w0[0], w0[1], selector); + w1[0] = hc_byte_perm_S ( 0, w0[0], selector); w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; - break; - case 5: - c1[1] = hc_bytealign_be_S (w3[3], 0, offset); - c1[0] = hc_bytealign_be_S (w3[2], w3[3], offset); - c0[3] = hc_bytealign_be_S (w3[1], w3[2], offset); - c0[2] = hc_bytealign_be_S (w3[0], w3[1], offset); - c0[1] = hc_bytealign_be_S (w2[3], w3[0], offset); - c0[0] = hc_bytealign_be_S (w2[2], w2[3], offset); - w3[3] = hc_bytealign_be_S (w2[1], w2[2], offset); - w3[2] = hc_bytealign_be_S (w2[0], w2[1], offset); - w3[1] = hc_bytealign_be_S (w1[3], w2[0], offset); - w3[0] = hc_bytealign_be_S (w1[2], w1[3], offset); - w2[3] = hc_bytealign_be_S (w1[1], w1[2], offset); - w2[2] = hc_bytealign_be_S (w1[0], w1[1], offset); - w2[1] = hc_bytealign_be_S (w0[3], w1[0], offset); - w2[0] = hc_bytealign_be_S (w0[2], w0[3], offset); - w1[3] = hc_bytealign_be_S (w0[1], w0[2], offset); - w1[2] = hc_bytealign_be_S (w0[0], w0[1], offset); - w1[1] = hc_bytealign_be_S ( 0, w0[0], offset); + case 5: + w7[3] = hc_byte_perm_S (w6[1], w6[2], selector); + w7[2] = hc_byte_perm_S (w6[0], w6[1], selector); + w7[1] = hc_byte_perm_S (w5[3], w6[0], selector); + w7[0] = hc_byte_perm_S (w5[2], w5[3], selector); + w6[3] = hc_byte_perm_S (w5[1], w5[2], selector); + w6[2] = hc_byte_perm_S (w5[0], w5[1], selector); + w6[1] = hc_byte_perm_S (w4[3], w5[0], selector); + w6[0] = hc_byte_perm_S (w4[2], w4[3], selector); + w5[3] = hc_byte_perm_S (w4[1], w4[2], selector); + w5[2] = hc_byte_perm_S (w4[0], w4[1], selector); + w5[1] = hc_byte_perm_S (w3[3], w4[0], selector); + w5[0] = hc_byte_perm_S (w3[2], w3[3], selector); + w4[3] = hc_byte_perm_S (w3[1], w3[2], selector); + w4[2] = hc_byte_perm_S (w3[0], w3[1], selector); + w4[1] = hc_byte_perm_S (w2[3], w3[0], selector); + w4[0] = hc_byte_perm_S (w2[2], w2[3], selector); + w3[3] = 
hc_byte_perm_S (w2[1], w2[2], selector); + w3[2] = hc_byte_perm_S (w2[0], w2[1], selector); + w3[1] = hc_byte_perm_S (w1[3], w2[0], selector); + w3[0] = hc_byte_perm_S (w1[2], w1[3], selector); + w2[3] = hc_byte_perm_S (w1[1], w1[2], selector); + w2[2] = hc_byte_perm_S (w1[0], w1[1], selector); + w2[1] = hc_byte_perm_S (w0[3], w1[0], selector); + w2[0] = hc_byte_perm_S (w0[2], w0[3], selector); + w1[3] = hc_byte_perm_S (w0[1], w0[2], selector); + w1[2] = hc_byte_perm_S (w0[0], w0[1], selector); + w1[1] = hc_byte_perm_S ( 0, w0[0], selector); w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; - break; - case 6: - c1[2] = hc_bytealign_be_S (w3[3], 0, offset); - c1[1] = hc_bytealign_be_S (w3[2], w3[3], offset); - c1[0] = hc_bytealign_be_S (w3[1], w3[2], offset); - c0[3] = hc_bytealign_be_S (w3[0], w3[1], offset); - c0[2] = hc_bytealign_be_S (w2[3], w3[0], offset); - c0[1] = hc_bytealign_be_S (w2[2], w2[3], offset); - c0[0] = hc_bytealign_be_S (w2[1], w2[2], offset); - w3[3] = hc_bytealign_be_S (w2[0], w2[1], offset); - w3[2] = hc_bytealign_be_S (w1[3], w2[0], offset); - w3[1] = hc_bytealign_be_S (w1[2], w1[3], offset); - w3[0] = hc_bytealign_be_S (w1[1], w1[2], offset); - w2[3] = hc_bytealign_be_S (w1[0], w1[1], offset); - w2[2] = hc_bytealign_be_S (w0[3], w1[0], offset); - w2[1] = hc_bytealign_be_S (w0[2], w0[3], offset); - w2[0] = hc_bytealign_be_S (w0[1], w0[2], offset); - w1[3] = hc_bytealign_be_S (w0[0], w0[1], offset); - w1[2] = hc_bytealign_be_S ( 0, w0[0], offset); + case 6: + w7[3] = hc_byte_perm_S (w6[0], w6[1], selector); + w7[2] = hc_byte_perm_S (w5[3], w6[0], selector); + w7[1] = hc_byte_perm_S (w5[2], w5[3], selector); + w7[0] = hc_byte_perm_S (w5[1], w5[2], selector); + w6[3] = hc_byte_perm_S (w5[0], w5[1], selector); + w6[2] = hc_byte_perm_S (w4[3], w5[0], selector); + w6[1] = hc_byte_perm_S (w4[2], w4[3], selector); + w6[0] = hc_byte_perm_S (w4[1], w4[2], selector); + w5[3] = hc_byte_perm_S (w4[0], w4[1], selector); + w5[2] = hc_byte_perm_S 
(w3[3], w4[0], selector); + w5[1] = hc_byte_perm_S (w3[2], w3[3], selector); + w5[0] = hc_byte_perm_S (w3[1], w3[2], selector); + w4[3] = hc_byte_perm_S (w3[0], w3[1], selector); + w4[2] = hc_byte_perm_S (w2[3], w3[0], selector); + w4[1] = hc_byte_perm_S (w2[2], w2[3], selector); + w4[0] = hc_byte_perm_S (w2[1], w2[2], selector); + w3[3] = hc_byte_perm_S (w2[0], w2[1], selector); + w3[2] = hc_byte_perm_S (w1[3], w2[0], selector); + w3[1] = hc_byte_perm_S (w1[2], w1[3], selector); + w3[0] = hc_byte_perm_S (w1[1], w1[2], selector); + w2[3] = hc_byte_perm_S (w1[0], w1[1], selector); + w2[2] = hc_byte_perm_S (w0[3], w1[0], selector); + w2[1] = hc_byte_perm_S (w0[2], w0[3], selector); + w2[0] = hc_byte_perm_S (w0[1], w0[2], selector); + w1[3] = hc_byte_perm_S (w0[0], w0[1], selector); + w1[2] = hc_byte_perm_S ( 0, w0[0], selector); w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; - break; - case 7: - c1[3] = hc_bytealign_be_S (w3[3], 0, offset); - c1[2] = hc_bytealign_be_S (w3[2], w3[3], offset); - c1[1] = hc_bytealign_be_S (w3[1], w3[2], offset); - c1[0] = hc_bytealign_be_S (w3[0], w3[1], offset); - c0[3] = hc_bytealign_be_S (w2[3], w3[0], offset); - c0[2] = hc_bytealign_be_S (w2[2], w2[3], offset); - c0[1] = hc_bytealign_be_S (w2[1], w2[2], offset); - c0[0] = hc_bytealign_be_S (w2[0], w2[1], offset); - w3[3] = hc_bytealign_be_S (w1[3], w2[0], offset); - w3[2] = hc_bytealign_be_S (w1[2], w1[3], offset); - w3[1] = hc_bytealign_be_S (w1[1], w1[2], offset); - w3[0] = hc_bytealign_be_S (w1[0], w1[1], offset); - w2[3] = hc_bytealign_be_S (w0[3], w1[0], offset); - w2[2] = hc_bytealign_be_S (w0[2], w0[3], offset); - w2[1] = hc_bytealign_be_S (w0[1], w0[2], offset); - w2[0] = hc_bytealign_be_S (w0[0], w0[1], offset); - w1[3] = hc_bytealign_be_S ( 0, w0[0], offset); + case 7: + w7[3] = hc_byte_perm_S (w5[3], w6[0], selector); + w7[2] = hc_byte_perm_S (w5[2], w5[3], selector); + w7[1] = hc_byte_perm_S (w5[1], w5[2], selector); + w7[0] = hc_byte_perm_S (w5[0], 
w5[1], selector); + w6[3] = hc_byte_perm_S (w4[3], w5[0], selector); + w6[2] = hc_byte_perm_S (w4[2], w4[3], selector); + w6[1] = hc_byte_perm_S (w4[1], w4[2], selector); + w6[0] = hc_byte_perm_S (w4[0], w4[1], selector); + w5[3] = hc_byte_perm_S (w3[3], w4[0], selector); + w5[2] = hc_byte_perm_S (w3[2], w3[3], selector); + w5[1] = hc_byte_perm_S (w3[1], w3[2], selector); + w5[0] = hc_byte_perm_S (w3[0], w3[1], selector); + w4[3] = hc_byte_perm_S (w2[3], w3[0], selector); + w4[2] = hc_byte_perm_S (w2[2], w2[3], selector); + w4[1] = hc_byte_perm_S (w2[1], w2[2], selector); + w4[0] = hc_byte_perm_S (w2[0], w2[1], selector); + w3[3] = hc_byte_perm_S (w1[3], w2[0], selector); + w3[2] = hc_byte_perm_S (w1[2], w1[3], selector); + w3[1] = hc_byte_perm_S (w1[1], w1[2], selector); + w3[0] = hc_byte_perm_S (w1[0], w1[1], selector); + w2[3] = hc_byte_perm_S (w0[3], w1[0], selector); + w2[2] = hc_byte_perm_S (w0[2], w0[3], selector); + w2[1] = hc_byte_perm_S (w0[1], w0[2], selector); + w2[0] = hc_byte_perm_S (w0[0], w0[1], selector); + w1[3] = hc_byte_perm_S ( 0, w0[0], selector); w1[2] = 0; w1[1] = 0; w1[0] = 0; @@ -35226,27 +40805,33 @@ DECLSPEC void switch_buffer_by_offset_carry_be_S (u32 *w0, u32 *w1, u32 *w2, u32 w0[2] = 0; w0[1] = 0; w0[0] = 0; - break; - case 8: - c2[0] = hc_bytealign_be_S (w3[3], 0, offset); - c1[3] = hc_bytealign_be_S (w3[2], w3[3], offset); - c1[2] = hc_bytealign_be_S (w3[1], w3[2], offset); - c1[1] = hc_bytealign_be_S (w3[0], w3[1], offset); - c1[0] = hc_bytealign_be_S (w2[3], w3[0], offset); - c0[3] = hc_bytealign_be_S (w2[2], w2[3], offset); - c0[2] = hc_bytealign_be_S (w2[1], w2[2], offset); - c0[1] = hc_bytealign_be_S (w2[0], w2[1], offset); - c0[0] = hc_bytealign_be_S (w1[3], w2[0], offset); - w3[3] = hc_bytealign_be_S (w1[2], w1[3], offset); - w3[2] = hc_bytealign_be_S (w1[1], w1[2], offset); - w3[1] = hc_bytealign_be_S (w1[0], w1[1], offset); - w3[0] = hc_bytealign_be_S (w0[3], w1[0], offset); - w2[3] = hc_bytealign_be_S (w0[2], w0[3], 
offset); - w2[2] = hc_bytealign_be_S (w0[1], w0[2], offset); - w2[1] = hc_bytealign_be_S (w0[0], w0[1], offset); - w2[0] = hc_bytealign_be_S ( 0, w0[0], offset); + case 8: + w7[3] = hc_byte_perm_S (w5[2], w5[3], selector); + w7[2] = hc_byte_perm_S (w5[1], w5[2], selector); + w7[1] = hc_byte_perm_S (w5[0], w5[1], selector); + w7[0] = hc_byte_perm_S (w4[3], w5[0], selector); + w6[3] = hc_byte_perm_S (w4[2], w4[3], selector); + w6[2] = hc_byte_perm_S (w4[1], w4[2], selector); + w6[1] = hc_byte_perm_S (w4[0], w4[1], selector); + w6[0] = hc_byte_perm_S (w3[3], w4[0], selector); + w5[3] = hc_byte_perm_S (w3[2], w3[3], selector); + w5[2] = hc_byte_perm_S (w3[1], w3[2], selector); + w5[1] = hc_byte_perm_S (w3[0], w3[1], selector); + w5[0] = hc_byte_perm_S (w2[3], w3[0], selector); + w4[3] = hc_byte_perm_S (w2[2], w2[3], selector); + w4[2] = hc_byte_perm_S (w2[1], w2[2], selector); + w4[1] = hc_byte_perm_S (w2[0], w2[1], selector); + w4[0] = hc_byte_perm_S (w1[3], w2[0], selector); + w3[3] = hc_byte_perm_S (w1[2], w1[3], selector); + w3[2] = hc_byte_perm_S (w1[1], w1[2], selector); + w3[1] = hc_byte_perm_S (w1[0], w1[1], selector); + w3[0] = hc_byte_perm_S (w0[3], w1[0], selector); + w2[3] = hc_byte_perm_S (w0[2], w0[3], selector); + w2[2] = hc_byte_perm_S (w0[1], w0[2], selector); + w2[1] = hc_byte_perm_S (w0[0], w0[1], selector); + w2[0] = hc_byte_perm_S ( 0, w0[0], selector); w1[3] = 0; w1[2] = 0; w1[1] = 0; @@ -35255,27 +40840,32 @@ DECLSPEC void switch_buffer_by_offset_carry_be_S (u32 *w0, u32 *w1, u32 *w2, u32 w0[2] = 0; w0[1] = 0; w0[0] = 0; - break; - case 9: - c2[1] = hc_bytealign_be_S (w3[3], 0, offset); - c2[0] = hc_bytealign_be_S (w3[2], w3[3], offset); - c1[3] = hc_bytealign_be_S (w3[1], w3[2], offset); - c1[2] = hc_bytealign_be_S (w3[0], w3[1], offset); - c1[1] = hc_bytealign_be_S (w2[3], w3[0], offset); - c1[0] = hc_bytealign_be_S (w2[2], w2[3], offset); - c0[3] = hc_bytealign_be_S (w2[1], w2[2], offset); - c0[2] = hc_bytealign_be_S (w2[0], w2[1], offset); - 
c0[1] = hc_bytealign_be_S (w1[3], w2[0], offset); - c0[0] = hc_bytealign_be_S (w1[2], w1[3], offset); - w3[3] = hc_bytealign_be_S (w1[1], w1[2], offset); - w3[2] = hc_bytealign_be_S (w1[0], w1[1], offset); - w3[1] = hc_bytealign_be_S (w0[3], w1[0], offset); - w3[0] = hc_bytealign_be_S (w0[2], w0[3], offset); - w2[3] = hc_bytealign_be_S (w0[1], w0[2], offset); - w2[2] = hc_bytealign_be_S (w0[0], w0[1], offset); - w2[1] = hc_bytealign_be_S ( 0, w0[0], offset); + case 9: + w7[3] = hc_byte_perm_S (w5[1], w5[2], selector); + w7[2] = hc_byte_perm_S (w5[0], w5[1], selector); + w7[1] = hc_byte_perm_S (w4[3], w5[0], selector); + w7[0] = hc_byte_perm_S (w4[2], w4[3], selector); + w6[3] = hc_byte_perm_S (w4[1], w4[2], selector); + w6[2] = hc_byte_perm_S (w4[0], w4[1], selector); + w6[1] = hc_byte_perm_S (w3[3], w4[0], selector); + w6[0] = hc_byte_perm_S (w3[2], w3[3], selector); + w5[3] = hc_byte_perm_S (w3[1], w3[2], selector); + w5[2] = hc_byte_perm_S (w3[0], w3[1], selector); + w5[1] = hc_byte_perm_S (w2[3], w3[0], selector); + w5[0] = hc_byte_perm_S (w2[2], w2[3], selector); + w4[3] = hc_byte_perm_S (w2[1], w2[2], selector); + w4[2] = hc_byte_perm_S (w2[0], w2[1], selector); + w4[1] = hc_byte_perm_S (w1[3], w2[0], selector); + w4[0] = hc_byte_perm_S (w1[2], w1[3], selector); + w3[3] = hc_byte_perm_S (w1[1], w1[2], selector); + w3[2] = hc_byte_perm_S (w1[0], w1[1], selector); + w3[1] = hc_byte_perm_S (w0[3], w1[0], selector); + w3[0] = hc_byte_perm_S (w0[2], w0[3], selector); + w2[3] = hc_byte_perm_S (w0[1], w0[2], selector); + w2[2] = hc_byte_perm_S (w0[0], w0[1], selector); + w2[1] = hc_byte_perm_S ( 0, w0[0], selector); w2[0] = 0; w1[3] = 0; w1[2] = 0; @@ -35285,27 +40875,31 @@ DECLSPEC void switch_buffer_by_offset_carry_be_S (u32 *w0, u32 *w1, u32 *w2, u32 w0[2] = 0; w0[1] = 0; w0[0] = 0; - break; case 10: - c2[2] = hc_bytealign_be_S (w3[3], 0, offset); - c2[1] = hc_bytealign_be_S (w3[2], w3[3], offset); - c2[0] = hc_bytealign_be_S (w3[1], w3[2], offset); - c1[3] = 
hc_bytealign_be_S (w3[0], w3[1], offset); - c1[2] = hc_bytealign_be_S (w2[3], w3[0], offset); - c1[1] = hc_bytealign_be_S (w2[2], w2[3], offset); - c1[0] = hc_bytealign_be_S (w2[1], w2[2], offset); - c0[3] = hc_bytealign_be_S (w2[0], w2[1], offset); - c0[2] = hc_bytealign_be_S (w1[3], w2[0], offset); - c0[1] = hc_bytealign_be_S (w1[2], w1[3], offset); - c0[0] = hc_bytealign_be_S (w1[1], w1[2], offset); - w3[3] = hc_bytealign_be_S (w1[0], w1[1], offset); - w3[2] = hc_bytealign_be_S (w0[3], w1[0], offset); - w3[1] = hc_bytealign_be_S (w0[2], w0[3], offset); - w3[0] = hc_bytealign_be_S (w0[1], w0[2], offset); - w2[3] = hc_bytealign_be_S (w0[0], w0[1], offset); - w2[2] = hc_bytealign_be_S ( 0, w0[0], offset); + w7[3] = hc_byte_perm_S (w5[0], w5[1], selector); + w7[2] = hc_byte_perm_S (w4[3], w5[0], selector); + w7[1] = hc_byte_perm_S (w4[2], w4[3], selector); + w7[0] = hc_byte_perm_S (w4[1], w4[2], selector); + w6[3] = hc_byte_perm_S (w4[0], w4[1], selector); + w6[2] = hc_byte_perm_S (w3[3], w4[0], selector); + w6[1] = hc_byte_perm_S (w3[2], w3[3], selector); + w6[0] = hc_byte_perm_S (w3[1], w3[2], selector); + w5[3] = hc_byte_perm_S (w3[0], w3[1], selector); + w5[2] = hc_byte_perm_S (w2[3], w3[0], selector); + w5[1] = hc_byte_perm_S (w2[2], w2[3], selector); + w5[0] = hc_byte_perm_S (w2[1], w2[2], selector); + w4[3] = hc_byte_perm_S (w2[0], w2[1], selector); + w4[2] = hc_byte_perm_S (w1[3], w2[0], selector); + w4[1] = hc_byte_perm_S (w1[2], w1[3], selector); + w4[0] = hc_byte_perm_S (w1[1], w1[2], selector); + w3[3] = hc_byte_perm_S (w1[0], w1[1], selector); + w3[2] = hc_byte_perm_S (w0[3], w1[0], selector); + w3[1] = hc_byte_perm_S (w0[2], w0[3], selector); + w3[0] = hc_byte_perm_S (w0[1], w0[2], selector); + w2[3] = hc_byte_perm_S (w0[0], w0[1], selector); + w2[2] = hc_byte_perm_S ( 0, w0[0], selector); w2[1] = 0; w2[0] = 0; w1[3] = 0; @@ -35316,27 +40910,30 @@ DECLSPEC void switch_buffer_by_offset_carry_be_S (u32 *w0, u32 *w1, u32 *w2, u32 w0[2] = 0; w0[1] = 0; 
w0[0] = 0; - break; case 11: - c2[3] = hc_bytealign_be_S (w3[3], 0, offset); - c2[2] = hc_bytealign_be_S (w3[2], w3[3], offset); - c2[1] = hc_bytealign_be_S (w3[1], w3[2], offset); - c2[0] = hc_bytealign_be_S (w3[0], w3[1], offset); - c1[3] = hc_bytealign_be_S (w2[3], w3[0], offset); - c1[2] = hc_bytealign_be_S (w2[2], w2[3], offset); - c1[1] = hc_bytealign_be_S (w2[1], w2[2], offset); - c1[0] = hc_bytealign_be_S (w2[0], w2[1], offset); - c0[3] = hc_bytealign_be_S (w1[3], w2[0], offset); - c0[2] = hc_bytealign_be_S (w1[2], w1[3], offset); - c0[1] = hc_bytealign_be_S (w1[1], w1[2], offset); - c0[0] = hc_bytealign_be_S (w1[0], w1[1], offset); - w3[3] = hc_bytealign_be_S (w0[3], w1[0], offset); - w3[2] = hc_bytealign_be_S (w0[2], w0[3], offset); - w3[1] = hc_bytealign_be_S (w0[1], w0[2], offset); - w3[0] = hc_bytealign_be_S (w0[0], w0[1], offset); - w2[3] = hc_bytealign_be_S ( 0, w0[0], offset); + w7[3] = hc_byte_perm_S (w4[3], w5[0], selector); + w7[2] = hc_byte_perm_S (w4[2], w4[3], selector); + w7[1] = hc_byte_perm_S (w4[1], w4[2], selector); + w7[0] = hc_byte_perm_S (w4[0], w4[1], selector); + w6[3] = hc_byte_perm_S (w3[3], w4[0], selector); + w6[2] = hc_byte_perm_S (w3[2], w3[3], selector); + w6[1] = hc_byte_perm_S (w3[1], w3[2], selector); + w6[0] = hc_byte_perm_S (w3[0], w3[1], selector); + w5[3] = hc_byte_perm_S (w2[3], w3[0], selector); + w5[2] = hc_byte_perm_S (w2[2], w2[3], selector); + w5[1] = hc_byte_perm_S (w2[1], w2[2], selector); + w5[0] = hc_byte_perm_S (w2[0], w2[1], selector); + w4[3] = hc_byte_perm_S (w1[3], w2[0], selector); + w4[2] = hc_byte_perm_S (w1[2], w1[3], selector); + w4[1] = hc_byte_perm_S (w1[1], w1[2], selector); + w4[0] = hc_byte_perm_S (w1[0], w1[1], selector); + w3[3] = hc_byte_perm_S (w0[3], w1[0], selector); + w3[2] = hc_byte_perm_S (w0[2], w0[3], selector); + w3[1] = hc_byte_perm_S (w0[1], w0[2], selector); + w3[0] = hc_byte_perm_S (w0[0], w0[1], selector); + w2[3] = hc_byte_perm_S ( 0, w0[0], selector); w2[2] = 0; w2[1] = 0; 
w2[0] = 0; @@ -35348,27 +40945,29 @@ DECLSPEC void switch_buffer_by_offset_carry_be_S (u32 *w0, u32 *w1, u32 *w2, u32 w0[2] = 0; w0[1] = 0; w0[0] = 0; - break; case 12: - c3[0] = hc_bytealign_be_S (w3[3], 0, offset); - c2[3] = hc_bytealign_be_S (w3[2], w3[3], offset); - c2[2] = hc_bytealign_be_S (w3[1], w3[2], offset); - c2[1] = hc_bytealign_be_S (w3[0], w3[1], offset); - c2[0] = hc_bytealign_be_S (w2[3], w3[0], offset); - c1[3] = hc_bytealign_be_S (w2[2], w2[3], offset); - c1[2] = hc_bytealign_be_S (w2[1], w2[2], offset); - c1[1] = hc_bytealign_be_S (w2[0], w2[1], offset); - c1[0] = hc_bytealign_be_S (w1[3], w2[0], offset); - c0[3] = hc_bytealign_be_S (w1[2], w1[3], offset); - c0[2] = hc_bytealign_be_S (w1[1], w1[2], offset); - c0[1] = hc_bytealign_be_S (w1[0], w1[1], offset); - c0[0] = hc_bytealign_be_S (w0[3], w1[0], offset); - w3[3] = hc_bytealign_be_S (w0[2], w0[3], offset); - w3[2] = hc_bytealign_be_S (w0[1], w0[2], offset); - w3[1] = hc_bytealign_be_S (w0[0], w0[1], offset); - w3[0] = hc_bytealign_be_S ( 0, w0[0], offset); + w7[3] = hc_byte_perm_S (w4[2], w4[3], selector); + w7[2] = hc_byte_perm_S (w4[1], w4[2], selector); + w7[1] = hc_byte_perm_S (w4[0], w4[1], selector); + w7[0] = hc_byte_perm_S (w3[3], w4[0], selector); + w6[3] = hc_byte_perm_S (w3[2], w3[3], selector); + w6[2] = hc_byte_perm_S (w3[1], w3[2], selector); + w6[1] = hc_byte_perm_S (w3[0], w3[1], selector); + w6[0] = hc_byte_perm_S (w2[3], w3[0], selector); + w5[3] = hc_byte_perm_S (w2[2], w2[3], selector); + w5[2] = hc_byte_perm_S (w2[1], w2[2], selector); + w5[1] = hc_byte_perm_S (w2[0], w2[1], selector); + w5[0] = hc_byte_perm_S (w1[3], w2[0], selector); + w4[3] = hc_byte_perm_S (w1[2], w1[3], selector); + w4[2] = hc_byte_perm_S (w1[1], w1[2], selector); + w4[1] = hc_byte_perm_S (w1[0], w1[1], selector); + w4[0] = hc_byte_perm_S (w0[3], w1[0], selector); + w3[3] = hc_byte_perm_S (w0[2], w0[3], selector); + w3[2] = hc_byte_perm_S (w0[1], w0[2], selector); + w3[1] = hc_byte_perm_S (w0[0], 
w0[1], selector); + w3[0] = hc_byte_perm_S ( 0, w0[0], selector); w2[3] = 0; w2[2] = 0; w2[1] = 0; @@ -35381,27 +40980,28 @@ DECLSPEC void switch_buffer_by_offset_carry_be_S (u32 *w0, u32 *w1, u32 *w2, u32 w0[2] = 0; w0[1] = 0; w0[0] = 0; - break; case 13: - c3[1] = hc_bytealign_be_S (w3[3], 0, offset); - c3[0] = hc_bytealign_be_S (w3[2], w3[3], offset); - c2[3] = hc_bytealign_be_S (w3[1], w3[2], offset); - c2[2] = hc_bytealign_be_S (w3[0], w3[1], offset); - c2[1] = hc_bytealign_be_S (w2[3], w3[0], offset); - c2[0] = hc_bytealign_be_S (w2[2], w2[3], offset); - c1[3] = hc_bytealign_be_S (w2[1], w2[2], offset); - c1[2] = hc_bytealign_be_S (w2[0], w2[1], offset); - c1[1] = hc_bytealign_be_S (w1[3], w2[0], offset); - c1[0] = hc_bytealign_be_S (w1[2], w1[3], offset); - c0[3] = hc_bytealign_be_S (w1[1], w1[2], offset); - c0[2] = hc_bytealign_be_S (w1[0], w1[1], offset); - c0[1] = hc_bytealign_be_S (w0[3], w1[0], offset); - c0[0] = hc_bytealign_be_S (w0[2], w0[3], offset); - w3[3] = hc_bytealign_be_S (w0[1], w0[2], offset); - w3[2] = hc_bytealign_be_S (w0[0], w0[1], offset); - w3[1] = hc_bytealign_be_S ( 0, w0[0], offset); + w7[3] = hc_byte_perm_S (w4[1], w4[2], selector); + w7[2] = hc_byte_perm_S (w4[0], w4[1], selector); + w7[1] = hc_byte_perm_S (w3[3], w4[0], selector); + w7[0] = hc_byte_perm_S (w3[2], w3[3], selector); + w6[3] = hc_byte_perm_S (w3[1], w3[2], selector); + w6[2] = hc_byte_perm_S (w3[0], w3[1], selector); + w6[1] = hc_byte_perm_S (w2[3], w3[0], selector); + w6[0] = hc_byte_perm_S (w2[2], w2[3], selector); + w5[3] = hc_byte_perm_S (w2[1], w2[2], selector); + w5[2] = hc_byte_perm_S (w2[0], w2[1], selector); + w5[1] = hc_byte_perm_S (w1[3], w2[0], selector); + w5[0] = hc_byte_perm_S (w1[2], w1[3], selector); + w4[3] = hc_byte_perm_S (w1[1], w1[2], selector); + w4[2] = hc_byte_perm_S (w1[0], w1[1], selector); + w4[1] = hc_byte_perm_S (w0[3], w1[0], selector); + w4[0] = hc_byte_perm_S (w0[2], w0[3], selector); + w3[3] = hc_byte_perm_S (w0[1], w0[2], 
selector); + w3[2] = hc_byte_perm_S (w0[0], w0[1], selector); + w3[1] = hc_byte_perm_S ( 0, w0[0], selector); w3[0] = 0; w2[3] = 0; w2[2] = 0; @@ -35415,27 +41015,27 @@ DECLSPEC void switch_buffer_by_offset_carry_be_S (u32 *w0, u32 *w1, u32 *w2, u32 w0[2] = 0; w0[1] = 0; w0[0] = 0; - break; case 14: - c3[2] = hc_bytealign_be_S (w3[3], 0, offset); - c3[1] = hc_bytealign_be_S (w3[2], w3[3], offset); - c3[0] = hc_bytealign_be_S (w3[1], w3[2], offset); - c2[3] = hc_bytealign_be_S (w3[0], w3[1], offset); - c2[2] = hc_bytealign_be_S (w2[3], w3[0], offset); - c2[1] = hc_bytealign_be_S (w2[2], w2[3], offset); - c2[0] = hc_bytealign_be_S (w2[1], w2[2], offset); - c1[3] = hc_bytealign_be_S (w2[0], w2[1], offset); - c1[2] = hc_bytealign_be_S (w1[3], w2[0], offset); - c1[1] = hc_bytealign_be_S (w1[2], w1[3], offset); - c1[0] = hc_bytealign_be_S (w1[1], w1[2], offset); - c0[3] = hc_bytealign_be_S (w1[0], w1[1], offset); - c0[2] = hc_bytealign_be_S (w0[3], w1[0], offset); - c0[1] = hc_bytealign_be_S (w0[2], w0[3], offset); - c0[0] = hc_bytealign_be_S (w0[1], w0[2], offset); - w3[3] = hc_bytealign_be_S (w0[0], w0[1], offset); - w3[2] = hc_bytealign_be_S ( 0, w0[0], offset); + w7[3] = hc_byte_perm_S (w4[0], w4[1], selector); + w7[2] = hc_byte_perm_S (w3[3], w4[0], selector); + w7[1] = hc_byte_perm_S (w3[2], w3[3], selector); + w7[0] = hc_byte_perm_S (w3[1], w3[2], selector); + w6[3] = hc_byte_perm_S (w3[0], w3[1], selector); + w6[2] = hc_byte_perm_S (w2[3], w3[0], selector); + w6[1] = hc_byte_perm_S (w2[2], w2[3], selector); + w6[0] = hc_byte_perm_S (w2[1], w2[2], selector); + w5[3] = hc_byte_perm_S (w2[0], w2[1], selector); + w5[2] = hc_byte_perm_S (w1[3], w2[0], selector); + w5[1] = hc_byte_perm_S (w1[2], w1[3], selector); + w5[0] = hc_byte_perm_S (w1[1], w1[2], selector); + w4[3] = hc_byte_perm_S (w1[0], w1[1], selector); + w4[2] = hc_byte_perm_S (w0[3], w1[0], selector); + w4[1] = hc_byte_perm_S (w0[2], w0[3], selector); + w4[0] = hc_byte_perm_S (w0[1], w0[2], selector); + 
w3[3] = hc_byte_perm_S (w0[0], w0[1], selector); + w3[2] = hc_byte_perm_S ( 0, w0[0], selector); w3[1] = 0; w3[0] = 0; w2[3] = 0; @@ -35450,27 +41050,26 @@ DECLSPEC void switch_buffer_by_offset_carry_be_S (u32 *w0, u32 *w1, u32 *w2, u32 w0[2] = 0; w0[1] = 0; w0[0] = 0; - break; case 15: - c3[3] = hc_bytealign_be_S (w3[3], 0, offset); - c3[2] = hc_bytealign_be_S (w3[2], w3[3], offset); - c3[1] = hc_bytealign_be_S (w3[1], w3[2], offset); - c3[0] = hc_bytealign_be_S (w3[0], w3[1], offset); - c2[3] = hc_bytealign_be_S (w2[3], w3[0], offset); - c2[2] = hc_bytealign_be_S (w2[2], w2[3], offset); - c2[1] = hc_bytealign_be_S (w2[1], w2[2], offset); - c2[0] = hc_bytealign_be_S (w2[0], w2[1], offset); - c1[3] = hc_bytealign_be_S (w1[3], w2[0], offset); - c1[2] = hc_bytealign_be_S (w1[2], w1[3], offset); - c1[1] = hc_bytealign_be_S (w1[1], w1[2], offset); - c1[0] = hc_bytealign_be_S (w1[0], w1[1], offset); - c0[3] = hc_bytealign_be_S (w0[3], w1[0], offset); - c0[2] = hc_bytealign_be_S (w0[2], w0[3], offset); - c0[1] = hc_bytealign_be_S (w0[1], w0[2], offset); - c0[0] = hc_bytealign_be_S (w0[0], w0[1], offset); - w3[3] = hc_bytealign_be_S ( 0, w0[0], offset); + w7[3] = hc_byte_perm_S (w3[3], w4[0], selector); + w7[2] = hc_byte_perm_S (w3[2], w3[3], selector); + w7[1] = hc_byte_perm_S (w3[1], w3[2], selector); + w7[0] = hc_byte_perm_S (w3[0], w3[1], selector); + w6[3] = hc_byte_perm_S (w2[3], w3[0], selector); + w6[2] = hc_byte_perm_S (w2[2], w2[3], selector); + w6[1] = hc_byte_perm_S (w2[1], w2[2], selector); + w6[0] = hc_byte_perm_S (w2[0], w2[1], selector); + w5[3] = hc_byte_perm_S (w1[3], w2[0], selector); + w5[2] = hc_byte_perm_S (w1[2], w1[3], selector); + w5[1] = hc_byte_perm_S (w1[1], w1[2], selector); + w5[0] = hc_byte_perm_S (w1[0], w1[1], selector); + w4[3] = hc_byte_perm_S (w0[3], w1[0], selector); + w4[2] = hc_byte_perm_S (w0[2], w0[3], selector); + w4[1] = hc_byte_perm_S (w0[1], w0[2], selector); + w4[0] = hc_byte_perm_S (w0[0], w0[1], selector); + w3[3] = 
hc_byte_perm_S ( 0, w0[0], selector); w3[2] = 0; w3[1] = 0; w3[0] = 0; @@ -35486,107 +41085,166 @@ DECLSPEC void switch_buffer_by_offset_carry_be_S (u32 *w0, u32 *w1, u32 *w2, u32 w0[2] = 0; w0[1] = 0; w0[0] = 0; - break; } #endif +} - #if (defined IS_AMD && HAS_VPERM == 1) || defined IS_NV - - #if defined IS_NV - const int selector = (0x76543210 >> ((offset & 3) * 4)) & 0xffff; - #endif - - #if defined IS_AMD - const int selector = 0x0706050403020100 >> ((offset & 3) * 8); - #endif +DECLSPEC void switch_buffer_by_offset_8x4_carry_le_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, u32 *w4, u32 *w5, u32 *w6, u32 *w7, u32 *c0, u32 *c1, u32 *c2, u32 *c3, u32 *c4, u32 *c5, u32 *c6, u32 *c7, const u32 offset) +{ + const int offset_switch = offset / 4; + #if (defined IS_AMD && HAS_VPERM == 0) || defined IS_GENERIC switch (offset_switch) { case 0: - c0[0] = hc_byte_perm_S ( 0, w3[3], selector); - w3[3] = hc_byte_perm_S (w3[3], w3[2], selector); - w3[2] = hc_byte_perm_S (w3[2], w3[1], selector); - w3[1] = hc_byte_perm_S (w3[1], w3[0], selector); - w3[0] = hc_byte_perm_S (w3[0], w2[3], selector); - w2[3] = hc_byte_perm_S (w2[3], w2[2], selector); - w2[2] = hc_byte_perm_S (w2[2], w2[1], selector); - w2[1] = hc_byte_perm_S (w2[1], w2[0], selector); - w2[0] = hc_byte_perm_S (w2[0], w1[3], selector); - w1[3] = hc_byte_perm_S (w1[3], w1[2], selector); - w1[2] = hc_byte_perm_S (w1[2], w1[1], selector); - w1[1] = hc_byte_perm_S (w1[1], w1[0], selector); - w1[0] = hc_byte_perm_S (w1[0], w0[3], selector); - w0[3] = hc_byte_perm_S (w0[3], w0[2], selector); - w0[2] = hc_byte_perm_S (w0[2], w0[1], selector); - w0[1] = hc_byte_perm_S (w0[1], w0[0], selector); - w0[0] = hc_byte_perm_S (w0[0], 0, selector); + c0[0] = hc_bytealign_S (w7[3], 0, offset); + w7[3] = hc_bytealign_S (w7[2], w7[3], offset); + w7[2] = hc_bytealign_S (w7[1], w7[2], offset); + w7[1] = hc_bytealign_S (w7[0], w7[1], offset); + w7[0] = hc_bytealign_S (w6[3], w7[0], offset); + w6[3] = hc_bytealign_S (w6[2], w6[3], offset); + 
w6[2] = hc_bytealign_S (w6[1], w6[2], offset); + w6[1] = hc_bytealign_S (w6[0], w6[1], offset); + w6[0] = hc_bytealign_S (w5[3], w6[0], offset); + w5[3] = hc_bytealign_S (w5[2], w5[3], offset); + w5[2] = hc_bytealign_S (w5[1], w5[2], offset); + w5[1] = hc_bytealign_S (w5[0], w5[1], offset); + w5[0] = hc_bytealign_S (w4[3], w5[0], offset); + w4[3] = hc_bytealign_S (w4[2], w4[3], offset); + w4[2] = hc_bytealign_S (w4[1], w4[2], offset); + w4[1] = hc_bytealign_S (w4[0], w4[1], offset); + w4[0] = hc_bytealign_S (w3[3], w4[0], offset); + w3[3] = hc_bytealign_S (w3[2], w3[3], offset); + w3[2] = hc_bytealign_S (w3[1], w3[2], offset); + w3[1] = hc_bytealign_S (w3[0], w3[1], offset); + w3[0] = hc_bytealign_S (w2[3], w3[0], offset); + w2[3] = hc_bytealign_S (w2[2], w2[3], offset); + w2[2] = hc_bytealign_S (w2[1], w2[2], offset); + w2[1] = hc_bytealign_S (w2[0], w2[1], offset); + w2[0] = hc_bytealign_S (w1[3], w2[0], offset); + w1[3] = hc_bytealign_S (w1[2], w1[3], offset); + w1[2] = hc_bytealign_S (w1[1], w1[2], offset); + w1[1] = hc_bytealign_S (w1[0], w1[1], offset); + w1[0] = hc_bytealign_S (w0[3], w1[0], offset); + w0[3] = hc_bytealign_S (w0[2], w0[3], offset); + w0[2] = hc_bytealign_S (w0[1], w0[2], offset); + w0[1] = hc_bytealign_S (w0[0], w0[1], offset); + w0[0] = hc_bytealign_S ( 0, w0[0], offset); break; case 1: - c0[1] = hc_byte_perm_S ( 0, w3[3], selector); - c0[0] = hc_byte_perm_S (w3[3], w3[2], selector); - w3[3] = hc_byte_perm_S (w3[2], w3[1], selector); - w3[2] = hc_byte_perm_S (w3[1], w3[0], selector); - w3[1] = hc_byte_perm_S (w3[0], w2[3], selector); - w3[0] = hc_byte_perm_S (w2[3], w2[2], selector); - w2[3] = hc_byte_perm_S (w2[2], w2[1], selector); - w2[2] = hc_byte_perm_S (w2[1], w2[0], selector); - w2[1] = hc_byte_perm_S (w2[0], w1[3], selector); - w2[0] = hc_byte_perm_S (w1[3], w1[2], selector); - w1[3] = hc_byte_perm_S (w1[2], w1[1], selector); - w1[2] = hc_byte_perm_S (w1[1], w1[0], selector); - w1[1] = hc_byte_perm_S (w1[0], w0[3], selector); - 
w1[0] = hc_byte_perm_S (w0[3], w0[2], selector); - w0[3] = hc_byte_perm_S (w0[2], w0[1], selector); - w0[2] = hc_byte_perm_S (w0[1], w0[0], selector); - w0[1] = hc_byte_perm_S (w0[0], 0, selector); + c0[1] = hc_bytealign_S (w7[3], 0, offset); + c0[0] = hc_bytealign_S (w7[2], w7[3], offset); + w7[3] = hc_bytealign_S (w7[1], w7[2], offset); + w7[2] = hc_bytealign_S (w7[0], w7[1], offset); + w7[1] = hc_bytealign_S (w6[3], w7[0], offset); + w7[0] = hc_bytealign_S (w6[2], w6[3], offset); + w6[3] = hc_bytealign_S (w6[1], w6[2], offset); + w6[2] = hc_bytealign_S (w6[0], w6[1], offset); + w6[1] = hc_bytealign_S (w5[3], w6[0], offset); + w6[0] = hc_bytealign_S (w5[2], w5[3], offset); + w5[3] = hc_bytealign_S (w5[1], w5[2], offset); + w5[2] = hc_bytealign_S (w5[0], w5[1], offset); + w5[1] = hc_bytealign_S (w4[3], w5[0], offset); + w5[0] = hc_bytealign_S (w4[2], w4[3], offset); + w4[3] = hc_bytealign_S (w4[1], w4[2], offset); + w4[2] = hc_bytealign_S (w4[0], w4[1], offset); + w4[1] = hc_bytealign_S (w3[3], w4[0], offset); + w4[0] = hc_bytealign_S (w3[2], w3[3], offset); + w3[3] = hc_bytealign_S (w3[1], w3[2], offset); + w3[2] = hc_bytealign_S (w3[0], w3[1], offset); + w3[1] = hc_bytealign_S (w2[3], w3[0], offset); + w3[0] = hc_bytealign_S (w2[2], w2[3], offset); + w2[3] = hc_bytealign_S (w2[1], w2[2], offset); + w2[2] = hc_bytealign_S (w2[0], w2[1], offset); + w2[1] = hc_bytealign_S (w1[3], w2[0], offset); + w2[0] = hc_bytealign_S (w1[2], w1[3], offset); + w1[3] = hc_bytealign_S (w1[1], w1[2], offset); + w1[2] = hc_bytealign_S (w1[0], w1[1], offset); + w1[1] = hc_bytealign_S (w0[3], w1[0], offset); + w1[0] = hc_bytealign_S (w0[2], w0[3], offset); + w0[3] = hc_bytealign_S (w0[1], w0[2], offset); + w0[2] = hc_bytealign_S (w0[0], w0[1], offset); + w0[1] = hc_bytealign_S ( 0, w0[0], offset); w0[0] = 0; break; case 2: - c0[2] = hc_byte_perm_S ( 0, w3[3], selector); - c0[1] = hc_byte_perm_S (w3[3], w3[2], selector); - c0[0] = hc_byte_perm_S (w3[2], w3[1], selector); - w3[3] = 
hc_byte_perm_S (w3[1], w3[0], selector); - w3[2] = hc_byte_perm_S (w3[0], w2[3], selector); - w3[1] = hc_byte_perm_S (w2[3], w2[2], selector); - w3[0] = hc_byte_perm_S (w2[2], w2[1], selector); - w2[3] = hc_byte_perm_S (w2[1], w2[0], selector); - w2[2] = hc_byte_perm_S (w2[0], w1[3], selector); - w2[1] = hc_byte_perm_S (w1[3], w1[2], selector); - w2[0] = hc_byte_perm_S (w1[2], w1[1], selector); - w1[3] = hc_byte_perm_S (w1[1], w1[0], selector); - w1[2] = hc_byte_perm_S (w1[0], w0[3], selector); - w1[1] = hc_byte_perm_S (w0[3], w0[2], selector); - w1[0] = hc_byte_perm_S (w0[2], w0[1], selector); - w0[3] = hc_byte_perm_S (w0[1], w0[0], selector); - w0[2] = hc_byte_perm_S (w0[0], 0, selector); + c0[2] = hc_bytealign_S (w7[3], 0, offset); + c0[1] = hc_bytealign_S (w7[2], w7[3], offset); + c0[0] = hc_bytealign_S (w7[1], w7[2], offset); + w7[3] = hc_bytealign_S (w7[0], w7[1], offset); + w7[2] = hc_bytealign_S (w6[3], w7[0], offset); + w7[1] = hc_bytealign_S (w6[2], w6[3], offset); + w7[0] = hc_bytealign_S (w6[1], w6[2], offset); + w6[3] = hc_bytealign_S (w6[0], w6[1], offset); + w6[2] = hc_bytealign_S (w5[3], w6[0], offset); + w6[1] = hc_bytealign_S (w5[2], w5[3], offset); + w6[0] = hc_bytealign_S (w5[1], w5[2], offset); + w5[3] = hc_bytealign_S (w5[0], w5[1], offset); + w5[2] = hc_bytealign_S (w4[3], w5[0], offset); + w5[1] = hc_bytealign_S (w4[2], w4[3], offset); + w5[0] = hc_bytealign_S (w4[1], w4[2], offset); + w4[3] = hc_bytealign_S (w4[0], w4[1], offset); + w4[2] = hc_bytealign_S (w3[3], w4[0], offset); + w4[1] = hc_bytealign_S (w3[2], w3[3], offset); + w4[0] = hc_bytealign_S (w3[1], w3[2], offset); + w3[3] = hc_bytealign_S (w3[0], w3[1], offset); + w3[2] = hc_bytealign_S (w2[3], w3[0], offset); + w3[1] = hc_bytealign_S (w2[2], w2[3], offset); + w3[0] = hc_bytealign_S (w2[1], w2[2], offset); + w2[3] = hc_bytealign_S (w2[0], w2[1], offset); + w2[2] = hc_bytealign_S (w1[3], w2[0], offset); + w2[1] = hc_bytealign_S (w1[2], w1[3], offset); + w2[0] = hc_bytealign_S 
(w1[1], w1[2], offset); + w1[3] = hc_bytealign_S (w1[0], w1[1], offset); + w1[2] = hc_bytealign_S (w0[3], w1[0], offset); + w1[1] = hc_bytealign_S (w0[2], w0[3], offset); + w1[0] = hc_bytealign_S (w0[1], w0[2], offset); + w0[3] = hc_bytealign_S (w0[0], w0[1], offset); + w0[2] = hc_bytealign_S ( 0, w0[0], offset); w0[1] = 0; w0[0] = 0; break; case 3: - c0[3] = hc_byte_perm_S ( 0, w3[3], selector); - c0[2] = hc_byte_perm_S (w3[3], w3[2], selector); - c0[1] = hc_byte_perm_S (w3[2], w3[1], selector); - c0[0] = hc_byte_perm_S (w3[1], w3[0], selector); - w3[3] = hc_byte_perm_S (w3[0], w2[3], selector); - w3[2] = hc_byte_perm_S (w2[3], w2[2], selector); - w3[1] = hc_byte_perm_S (w2[2], w2[1], selector); - w3[0] = hc_byte_perm_S (w2[1], w2[0], selector); - w2[3] = hc_byte_perm_S (w2[0], w1[3], selector); - w2[2] = hc_byte_perm_S (w1[3], w1[2], selector); - w2[1] = hc_byte_perm_S (w1[2], w1[1], selector); - w2[0] = hc_byte_perm_S (w1[1], w1[0], selector); - w1[3] = hc_byte_perm_S (w1[0], w0[3], selector); - w1[2] = hc_byte_perm_S (w0[3], w0[2], selector); - w1[1] = hc_byte_perm_S (w0[2], w0[1], selector); - w1[0] = hc_byte_perm_S (w0[1], w0[0], selector); - w0[3] = hc_byte_perm_S (w0[0], 0, selector); + c0[3] = hc_bytealign_S (w7[3], 0, offset); + c0[2] = hc_bytealign_S (w7[2], w7[3], offset); + c0[1] = hc_bytealign_S (w7[1], w7[2], offset); + c0[0] = hc_bytealign_S (w7[0], w7[1], offset); + w7[3] = hc_bytealign_S (w6[3], w7[0], offset); + w7[2] = hc_bytealign_S (w6[2], w6[3], offset); + w7[1] = hc_bytealign_S (w6[1], w6[2], offset); + w7[0] = hc_bytealign_S (w6[0], w6[1], offset); + w6[3] = hc_bytealign_S (w5[3], w6[0], offset); + w6[2] = hc_bytealign_S (w5[2], w5[3], offset); + w6[1] = hc_bytealign_S (w5[1], w5[2], offset); + w6[0] = hc_bytealign_S (w5[0], w5[1], offset); + w5[3] = hc_bytealign_S (w4[3], w5[0], offset); + w5[2] = hc_bytealign_S (w4[2], w4[3], offset); + w5[1] = hc_bytealign_S (w4[1], w4[2], offset); + w5[0] = hc_bytealign_S (w4[0], w4[1], offset); + w4[3] 
= hc_bytealign_S (w3[3], w4[0], offset); + w4[2] = hc_bytealign_S (w3[2], w3[3], offset); + w4[1] = hc_bytealign_S (w3[1], w3[2], offset); + w4[0] = hc_bytealign_S (w3[0], w3[1], offset); + w3[3] = hc_bytealign_S (w2[3], w3[0], offset); + w3[2] = hc_bytealign_S (w2[2], w2[3], offset); + w3[1] = hc_bytealign_S (w2[1], w2[2], offset); + w3[0] = hc_bytealign_S (w2[0], w2[1], offset); + w2[3] = hc_bytealign_S (w1[3], w2[0], offset); + w2[2] = hc_bytealign_S (w1[2], w1[3], offset); + w2[1] = hc_bytealign_S (w1[1], w1[2], offset); + w2[0] = hc_bytealign_S (w1[0], w1[1], offset); + w1[3] = hc_bytealign_S (w0[3], w1[0], offset); + w1[2] = hc_bytealign_S (w0[2], w0[3], offset); + w1[1] = hc_bytealign_S (w0[1], w0[2], offset); + w1[0] = hc_bytealign_S (w0[0], w0[1], offset); + w0[3] = hc_bytealign_S ( 0, w0[0], offset); w0[2] = 0; w0[1] = 0; w0[0] = 0; @@ -35594,23 +41252,39 @@ DECLSPEC void switch_buffer_by_offset_carry_be_S (u32 *w0, u32 *w1, u32 *w2, u32 break; case 4: - c1[0] = hc_byte_perm_S ( 0, w3[3], selector); - c0[3] = hc_byte_perm_S (w3[3], w3[2], selector); - c0[2] = hc_byte_perm_S (w3[2], w3[1], selector); - c0[1] = hc_byte_perm_S (w3[1], w3[0], selector); - c0[0] = hc_byte_perm_S (w3[0], w2[3], selector); - w3[3] = hc_byte_perm_S (w2[3], w2[2], selector); - w3[2] = hc_byte_perm_S (w2[2], w2[1], selector); - w3[1] = hc_byte_perm_S (w2[1], w2[0], selector); - w3[0] = hc_byte_perm_S (w2[0], w1[3], selector); - w2[3] = hc_byte_perm_S (w1[3], w1[2], selector); - w2[2] = hc_byte_perm_S (w1[2], w1[1], selector); - w2[1] = hc_byte_perm_S (w1[1], w1[0], selector); - w2[0] = hc_byte_perm_S (w1[0], w0[3], selector); - w1[3] = hc_byte_perm_S (w0[3], w0[2], selector); - w1[2] = hc_byte_perm_S (w0[2], w0[1], selector); - w1[1] = hc_byte_perm_S (w0[1], w0[0], selector); - w1[0] = hc_byte_perm_S (w0[0], 0, selector); + c1[0] = hc_bytealign_S (w7[3], 0, offset); + c0[3] = hc_bytealign_S (w7[2], w7[3], offset); + c0[2] = hc_bytealign_S (w7[1], w7[2], offset); + c0[1] = 
hc_bytealign_S (w7[0], w7[1], offset); + c0[0] = hc_bytealign_S (w6[3], w7[0], offset); + w7[3] = hc_bytealign_S (w6[2], w6[3], offset); + w7[2] = hc_bytealign_S (w6[1], w6[2], offset); + w7[1] = hc_bytealign_S (w6[0], w6[1], offset); + w7[0] = hc_bytealign_S (w5[3], w6[0], offset); + w6[3] = hc_bytealign_S (w5[2], w5[3], offset); + w6[2] = hc_bytealign_S (w5[1], w5[2], offset); + w6[1] = hc_bytealign_S (w5[0], w5[1], offset); + w6[0] = hc_bytealign_S (w4[3], w5[0], offset); + w5[3] = hc_bytealign_S (w4[2], w4[3], offset); + w5[2] = hc_bytealign_S (w4[1], w4[2], offset); + w5[1] = hc_bytealign_S (w4[0], w4[1], offset); + w5[0] = hc_bytealign_S (w3[3], w4[0], offset); + w4[3] = hc_bytealign_S (w3[2], w3[3], offset); + w4[2] = hc_bytealign_S (w3[1], w3[2], offset); + w4[1] = hc_bytealign_S (w3[0], w3[1], offset); + w4[0] = hc_bytealign_S (w2[3], w3[0], offset); + w3[3] = hc_bytealign_S (w2[2], w2[3], offset); + w3[2] = hc_bytealign_S (w2[1], w2[2], offset); + w3[1] = hc_bytealign_S (w2[0], w2[1], offset); + w3[0] = hc_bytealign_S (w1[3], w2[0], offset); + w2[3] = hc_bytealign_S (w1[2], w1[3], offset); + w2[2] = hc_bytealign_S (w1[1], w1[2], offset); + w2[1] = hc_bytealign_S (w1[0], w1[1], offset); + w2[0] = hc_bytealign_S (w0[3], w1[0], offset); + w1[3] = hc_bytealign_S (w0[2], w0[3], offset); + w1[2] = hc_bytealign_S (w0[1], w0[2], offset); + w1[1] = hc_bytealign_S (w0[0], w0[1], offset); + w1[0] = hc_bytealign_S ( 0, w0[0], offset); w0[3] = 0; w0[2] = 0; w0[1] = 0; @@ -35619,106 +41293,81 @@ DECLSPEC void switch_buffer_by_offset_carry_be_S (u32 *w0, u32 *w1, u32 *w2, u32 break; case 5: - c1[1] = hc_byte_perm_S ( 0, w3[3], selector); - c1[0] = hc_byte_perm_S (w3[3], w3[2], selector); - c0[3] = hc_byte_perm_S (w3[2], w3[1], selector); - c0[2] = hc_byte_perm_S (w3[1], w3[0], selector); - c0[1] = hc_byte_perm_S (w3[0], w2[3], selector); - c0[0] = hc_byte_perm_S (w2[3], w2[2], selector); - w3[3] = hc_byte_perm_S (w2[2], w2[1], selector); - w3[2] = hc_byte_perm_S (w2[1], 
w2[0], selector); - w3[1] = hc_byte_perm_S (w2[0], w1[3], selector); - w3[0] = hc_byte_perm_S (w1[3], w1[2], selector); - w2[3] = hc_byte_perm_S (w1[2], w1[1], selector); - w2[2] = hc_byte_perm_S (w1[1], w1[0], selector); - w2[1] = hc_byte_perm_S (w1[0], w0[3], selector); - w2[0] = hc_byte_perm_S (w0[3], w0[2], selector); - w1[3] = hc_byte_perm_S (w0[2], w0[1], selector); - w1[2] = hc_byte_perm_S (w0[1], w0[0], selector); - w1[1] = hc_byte_perm_S (w0[0], 0, selector); - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; - - break; - - case 6: - c1[2] = hc_byte_perm_S ( 0, w3[3], selector); - c1[1] = hc_byte_perm_S (w3[3], w3[2], selector); - c1[0] = hc_byte_perm_S (w3[2], w3[1], selector); - c0[3] = hc_byte_perm_S (w3[1], w3[0], selector); - c0[2] = hc_byte_perm_S (w3[0], w2[3], selector); - c0[1] = hc_byte_perm_S (w2[3], w2[2], selector); - c0[0] = hc_byte_perm_S (w2[2], w2[1], selector); - w3[3] = hc_byte_perm_S (w2[1], w2[0], selector); - w3[2] = hc_byte_perm_S (w2[0], w1[3], selector); - w3[1] = hc_byte_perm_S (w1[3], w1[2], selector); - w3[0] = hc_byte_perm_S (w1[2], w1[1], selector); - w2[3] = hc_byte_perm_S (w1[1], w1[0], selector); - w2[2] = hc_byte_perm_S (w1[0], w0[3], selector); - w2[1] = hc_byte_perm_S (w0[3], w0[2], selector); - w2[0] = hc_byte_perm_S (w0[2], w0[1], selector); - w1[3] = hc_byte_perm_S (w0[1], w0[0], selector); - w1[2] = hc_byte_perm_S (w0[0], 0, selector); - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; - - break; - - case 7: - c1[3] = hc_byte_perm_S ( 0, w3[3], selector); - c1[2] = hc_byte_perm_S (w3[3], w3[2], selector); - c1[1] = hc_byte_perm_S (w3[2], w3[1], selector); - c1[0] = hc_byte_perm_S (w3[1], w3[0], selector); - c0[3] = hc_byte_perm_S (w3[0], w2[3], selector); - c0[2] = hc_byte_perm_S (w2[3], w2[2], selector); - c0[1] = hc_byte_perm_S (w2[2], w2[1], selector); - c0[0] = hc_byte_perm_S (w2[1], w2[0], selector); - w3[3] = hc_byte_perm_S (w2[0], w1[3], selector); - w3[2] = 
hc_byte_perm_S (w1[3], w1[2], selector); - w3[1] = hc_byte_perm_S (w1[2], w1[1], selector); - w3[0] = hc_byte_perm_S (w1[1], w1[0], selector); - w2[3] = hc_byte_perm_S (w1[0], w0[3], selector); - w2[2] = hc_byte_perm_S (w0[3], w0[2], selector); - w2[1] = hc_byte_perm_S (w0[2], w0[1], selector); - w2[0] = hc_byte_perm_S (w0[1], w0[0], selector); - w1[3] = hc_byte_perm_S (w0[0], 0, selector); - w1[2] = 0; - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; - - break; - - case 8: - c2[0] = hc_byte_perm_S ( 0, w3[3], selector); - c1[3] = hc_byte_perm_S (w3[3], w3[2], selector); - c1[2] = hc_byte_perm_S (w3[2], w3[1], selector); - c1[1] = hc_byte_perm_S (w3[1], w3[0], selector); - c1[0] = hc_byte_perm_S (w3[0], w2[3], selector); - c0[3] = hc_byte_perm_S (w2[3], w2[2], selector); - c0[2] = hc_byte_perm_S (w2[2], w2[1], selector); - c0[1] = hc_byte_perm_S (w2[1], w2[0], selector); - c0[0] = hc_byte_perm_S (w2[0], w1[3], selector); - w3[3] = hc_byte_perm_S (w1[3], w1[2], selector); - w3[2] = hc_byte_perm_S (w1[2], w1[1], selector); - w3[1] = hc_byte_perm_S (w1[1], w1[0], selector); - w3[0] = hc_byte_perm_S (w1[0], w0[3], selector); - w2[3] = hc_byte_perm_S (w0[3], w0[2], selector); - w2[2] = hc_byte_perm_S (w0[2], w0[1], selector); - w2[1] = hc_byte_perm_S (w0[1], w0[0], selector); - w2[0] = hc_byte_perm_S (w0[0], 0, selector); - w1[3] = 0; - w1[2] = 0; + c1[1] = hc_bytealign_S (w7[3], 0, offset); + c1[0] = hc_bytealign_S (w7[2], w7[3], offset); + c0[3] = hc_bytealign_S (w7[1], w7[2], offset); + c0[2] = hc_bytealign_S (w7[0], w7[1], offset); + c0[1] = hc_bytealign_S (w6[3], w7[0], offset); + c0[0] = hc_bytealign_S (w6[2], w6[3], offset); + w7[3] = hc_bytealign_S (w6[1], w6[2], offset); + w7[2] = hc_bytealign_S (w6[0], w6[1], offset); + w7[1] = hc_bytealign_S (w5[3], w6[0], offset); + w7[0] = hc_bytealign_S (w5[2], w5[3], offset); + w6[3] = hc_bytealign_S (w5[1], w5[2], offset); + w6[2] = hc_bytealign_S (w5[0], w5[1], offset); + w6[1] = 
hc_bytealign_S (w4[3], w5[0], offset); + w6[0] = hc_bytealign_S (w4[2], w4[3], offset); + w5[3] = hc_bytealign_S (w4[1], w4[2], offset); + w5[2] = hc_bytealign_S (w4[0], w4[1], offset); + w5[1] = hc_bytealign_S (w3[3], w4[0], offset); + w5[0] = hc_bytealign_S (w3[2], w3[3], offset); + w4[3] = hc_bytealign_S (w3[1], w3[2], offset); + w4[2] = hc_bytealign_S (w3[0], w3[1], offset); + w4[1] = hc_bytealign_S (w2[3], w3[0], offset); + w4[0] = hc_bytealign_S (w2[2], w2[3], offset); + w3[3] = hc_bytealign_S (w2[1], w2[2], offset); + w3[2] = hc_bytealign_S (w2[0], w2[1], offset); + w3[1] = hc_bytealign_S (w1[3], w2[0], offset); + w3[0] = hc_bytealign_S (w1[2], w1[3], offset); + w2[3] = hc_bytealign_S (w1[1], w1[2], offset); + w2[2] = hc_bytealign_S (w1[0], w1[1], offset); + w2[1] = hc_bytealign_S (w0[3], w1[0], offset); + w2[0] = hc_bytealign_S (w0[2], w0[3], offset); + w1[3] = hc_bytealign_S (w0[1], w0[2], offset); + w1[2] = hc_bytealign_S (w0[0], w0[1], offset); + w1[1] = hc_bytealign_S ( 0, w0[0], offset); + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 6: + c1[2] = hc_bytealign_S (w7[3], 0, offset); + c1[1] = hc_bytealign_S (w7[2], w7[3], offset); + c1[0] = hc_bytealign_S (w7[1], w7[2], offset); + c0[3] = hc_bytealign_S (w7[0], w7[1], offset); + c0[2] = hc_bytealign_S (w6[3], w7[0], offset); + c0[1] = hc_bytealign_S (w6[2], w6[3], offset); + c0[0] = hc_bytealign_S (w6[1], w6[2], offset); + w7[3] = hc_bytealign_S (w6[0], w6[1], offset); + w7[2] = hc_bytealign_S (w5[3], w6[0], offset); + w7[1] = hc_bytealign_S (w5[2], w5[3], offset); + w7[0] = hc_bytealign_S (w5[1], w5[2], offset); + w6[3] = hc_bytealign_S (w5[0], w5[1], offset); + w6[2] = hc_bytealign_S (w4[3], w5[0], offset); + w6[1] = hc_bytealign_S (w4[2], w4[3], offset); + w6[0] = hc_bytealign_S (w4[1], w4[2], offset); + w5[3] = hc_bytealign_S (w4[0], w4[1], offset); + w5[2] = hc_bytealign_S (w3[3], w4[0], offset); + w5[1] = hc_bytealign_S (w3[2], w3[3], offset); + w5[0] = 
hc_bytealign_S (w3[1], w3[2], offset); + w4[3] = hc_bytealign_S (w3[0], w3[1], offset); + w4[2] = hc_bytealign_S (w2[3], w3[0], offset); + w4[1] = hc_bytealign_S (w2[2], w2[3], offset); + w4[0] = hc_bytealign_S (w2[1], w2[2], offset); + w3[3] = hc_bytealign_S (w2[0], w2[1], offset); + w3[2] = hc_bytealign_S (w1[3], w2[0], offset); + w3[1] = hc_bytealign_S (w1[2], w1[3], offset); + w3[0] = hc_bytealign_S (w1[1], w1[2], offset); + w2[3] = hc_bytealign_S (w1[0], w1[1], offset); + w2[2] = hc_bytealign_S (w0[3], w1[0], offset); + w2[1] = hc_bytealign_S (w0[2], w0[3], offset); + w2[0] = hc_bytealign_S (w0[1], w0[2], offset); + w1[3] = hc_bytealign_S (w0[0], w0[1], offset); + w1[2] = hc_bytealign_S ( 0, w0[0], offset); w1[1] = 0; w1[0] = 0; w0[3] = 0; @@ -35728,26 +41377,40 @@ DECLSPEC void switch_buffer_by_offset_carry_be_S (u32 *w0, u32 *w1, u32 *w2, u32 break; - case 9: - c2[1] = hc_byte_perm_S ( 0, w3[3], selector); - c2[0] = hc_byte_perm_S (w3[3], w3[2], selector); - c1[3] = hc_byte_perm_S (w3[2], w3[1], selector); - c1[2] = hc_byte_perm_S (w3[1], w3[0], selector); - c1[1] = hc_byte_perm_S (w3[0], w2[3], selector); - c1[0] = hc_byte_perm_S (w2[3], w2[2], selector); - c0[3] = hc_byte_perm_S (w2[2], w2[1], selector); - c0[2] = hc_byte_perm_S (w2[1], w2[0], selector); - c0[1] = hc_byte_perm_S (w2[0], w1[3], selector); - c0[0] = hc_byte_perm_S (w1[3], w1[2], selector); - w3[3] = hc_byte_perm_S (w1[2], w1[1], selector); - w3[2] = hc_byte_perm_S (w1[1], w1[0], selector); - w3[1] = hc_byte_perm_S (w1[0], w0[3], selector); - w3[0] = hc_byte_perm_S (w0[3], w0[2], selector); - w2[3] = hc_byte_perm_S (w0[2], w0[1], selector); - w2[2] = hc_byte_perm_S (w0[1], w0[0], selector); - w2[1] = hc_byte_perm_S (w0[0], 0, selector); - w2[0] = 0; - w1[3] = 0; + case 7: + c1[3] = hc_bytealign_S (w7[3], 0, offset); + c1[2] = hc_bytealign_S (w7[2], w7[3], offset); + c1[1] = hc_bytealign_S (w7[1], w7[2], offset); + c1[0] = hc_bytealign_S (w7[0], w7[1], offset); + c0[3] = hc_bytealign_S (w6[3], 
w7[0], offset); + c0[2] = hc_bytealign_S (w6[2], w6[3], offset); + c0[1] = hc_bytealign_S (w6[1], w6[2], offset); + c0[0] = hc_bytealign_S (w6[0], w6[1], offset); + w7[3] = hc_bytealign_S (w5[3], w6[0], offset); + w7[2] = hc_bytealign_S (w5[2], w5[3], offset); + w7[1] = hc_bytealign_S (w5[1], w5[2], offset); + w7[0] = hc_bytealign_S (w5[0], w5[1], offset); + w6[3] = hc_bytealign_S (w4[3], w5[0], offset); + w6[2] = hc_bytealign_S (w4[2], w4[3], offset); + w6[1] = hc_bytealign_S (w4[1], w4[2], offset); + w6[0] = hc_bytealign_S (w4[0], w4[1], offset); + w5[3] = hc_bytealign_S (w3[3], w4[0], offset); + w5[2] = hc_bytealign_S (w3[2], w3[3], offset); + w5[1] = hc_bytealign_S (w3[1], w3[2], offset); + w5[0] = hc_bytealign_S (w3[0], w3[1], offset); + w4[3] = hc_bytealign_S (w2[3], w3[0], offset); + w4[2] = hc_bytealign_S (w2[2], w2[3], offset); + w4[1] = hc_bytealign_S (w2[1], w2[2], offset); + w4[0] = hc_bytealign_S (w2[0], w2[1], offset); + w3[3] = hc_bytealign_S (w1[3], w2[0], offset); + w3[2] = hc_bytealign_S (w1[2], w1[3], offset); + w3[1] = hc_bytealign_S (w1[1], w1[2], offset); + w3[0] = hc_bytealign_S (w1[0], w1[1], offset); + w2[3] = hc_bytealign_S (w0[3], w1[0], offset); + w2[2] = hc_bytealign_S (w0[2], w0[3], offset); + w2[1] = hc_bytealign_S (w0[1], w0[2], offset); + w2[0] = hc_bytealign_S (w0[0], w0[1], offset); + w1[3] = hc_bytealign_S ( 0, w0[0], offset); w1[2] = 0; w1[1] = 0; w1[0] = 0; @@ -35758,26 +41421,40 @@ DECLSPEC void switch_buffer_by_offset_carry_be_S (u32 *w0, u32 *w1, u32 *w2, u32 break; - case 10: - c2[2] = hc_byte_perm_S ( 0, w3[3], selector); - c2[1] = hc_byte_perm_S (w3[3], w3[2], selector); - c2[0] = hc_byte_perm_S (w3[2], w3[1], selector); - c1[3] = hc_byte_perm_S (w3[1], w3[0], selector); - c1[2] = hc_byte_perm_S (w3[0], w2[3], selector); - c1[1] = hc_byte_perm_S (w2[3], w2[2], selector); - c1[0] = hc_byte_perm_S (w2[2], w2[1], selector); - c0[3] = hc_byte_perm_S (w2[1], w2[0], selector); - c0[2] = hc_byte_perm_S (w2[0], w1[3], selector); 
- c0[1] = hc_byte_perm_S (w1[3], w1[2], selector); - c0[0] = hc_byte_perm_S (w1[2], w1[1], selector); - w3[3] = hc_byte_perm_S (w1[1], w1[0], selector); - w3[2] = hc_byte_perm_S (w1[0], w0[3], selector); - w3[1] = hc_byte_perm_S (w0[3], w0[2], selector); - w3[0] = hc_byte_perm_S (w0[2], w0[1], selector); - w2[3] = hc_byte_perm_S (w0[1], w0[0], selector); - w2[2] = hc_byte_perm_S (w0[0], 0, selector); - w2[1] = 0; - w2[0] = 0; + case 8: + c2[0] = hc_bytealign_S (w7[3], 0, offset); + c1[3] = hc_bytealign_S (w7[2], w7[3], offset); + c1[2] = hc_bytealign_S (w7[1], w7[2], offset); + c1[1] = hc_bytealign_S (w7[0], w7[1], offset); + c1[0] = hc_bytealign_S (w6[3], w7[0], offset); + c0[3] = hc_bytealign_S (w6[2], w6[3], offset); + c0[2] = hc_bytealign_S (w6[1], w6[2], offset); + c0[1] = hc_bytealign_S (w6[0], w6[1], offset); + c0[0] = hc_bytealign_S (w5[3], w6[0], offset); + w7[3] = hc_bytealign_S (w5[2], w5[3], offset); + w7[2] = hc_bytealign_S (w5[1], w5[2], offset); + w7[1] = hc_bytealign_S (w5[0], w5[1], offset); + w7[0] = hc_bytealign_S (w4[3], w5[0], offset); + w6[3] = hc_bytealign_S (w4[2], w4[3], offset); + w6[2] = hc_bytealign_S (w4[1], w4[2], offset); + w6[1] = hc_bytealign_S (w4[0], w4[1], offset); + w6[0] = hc_bytealign_S (w3[3], w4[0], offset); + w5[3] = hc_bytealign_S (w3[2], w3[3], offset); + w5[2] = hc_bytealign_S (w3[1], w3[2], offset); + w5[1] = hc_bytealign_S (w3[0], w3[1], offset); + w5[0] = hc_bytealign_S (w2[3], w3[0], offset); + w4[3] = hc_bytealign_S (w2[2], w2[3], offset); + w4[2] = hc_bytealign_S (w2[1], w2[2], offset); + w4[1] = hc_bytealign_S (w2[0], w2[1], offset); + w4[0] = hc_bytealign_S (w1[3], w2[0], offset); + w3[3] = hc_bytealign_S (w1[2], w1[3], offset); + w3[2] = hc_bytealign_S (w1[1], w1[2], offset); + w3[1] = hc_bytealign_S (w1[0], w1[1], offset); + w3[0] = hc_bytealign_S (w0[3], w1[0], offset); + w2[3] = hc_bytealign_S (w0[2], w0[3], offset); + w2[2] = hc_bytealign_S (w0[1], w0[2], offset); + w2[1] = hc_bytealign_S (w0[0], w0[1], 
offset); + w2[0] = hc_bytealign_S ( 0, w0[0], offset); w1[3] = 0; w1[2] = 0; w1[1] = 0; @@ -35789,26 +41466,40 @@ DECLSPEC void switch_buffer_by_offset_carry_be_S (u32 *w0, u32 *w1, u32 *w2, u32 break; - case 11: - c2[3] = hc_byte_perm_S ( 0, w3[3], selector); - c2[2] = hc_byte_perm_S (w3[3], w3[2], selector); - c2[1] = hc_byte_perm_S (w3[2], w3[1], selector); - c2[0] = hc_byte_perm_S (w3[1], w3[0], selector); - c1[3] = hc_byte_perm_S (w3[0], w2[3], selector); - c1[2] = hc_byte_perm_S (w2[3], w2[2], selector); - c1[1] = hc_byte_perm_S (w2[2], w2[1], selector); - c1[0] = hc_byte_perm_S (w2[1], w2[0], selector); - c0[3] = hc_byte_perm_S (w2[0], w1[3], selector); - c0[2] = hc_byte_perm_S (w1[3], w1[2], selector); - c0[1] = hc_byte_perm_S (w1[2], w1[1], selector); - c0[0] = hc_byte_perm_S (w1[1], w1[0], selector); - w3[3] = hc_byte_perm_S (w1[0], w0[3], selector); - w3[2] = hc_byte_perm_S (w0[3], w0[2], selector); - w3[1] = hc_byte_perm_S (w0[2], w0[1], selector); - w3[0] = hc_byte_perm_S (w0[1], w0[0], selector); - w2[3] = hc_byte_perm_S (w0[0], 0, selector); - w2[2] = 0; - w2[1] = 0; + case 9: + c2[1] = hc_bytealign_S (w7[3], 0, offset); + c2[0] = hc_bytealign_S (w7[2], w7[3], offset); + c1[3] = hc_bytealign_S (w7[1], w7[2], offset); + c1[2] = hc_bytealign_S (w7[0], w7[1], offset); + c1[1] = hc_bytealign_S (w6[3], w7[0], offset); + c1[0] = hc_bytealign_S (w6[2], w6[3], offset); + c0[3] = hc_bytealign_S (w6[1], w6[2], offset); + c0[2] = hc_bytealign_S (w6[0], w6[1], offset); + c0[1] = hc_bytealign_S (w5[3], w6[0], offset); + c0[0] = hc_bytealign_S (w5[2], w5[3], offset); + w7[3] = hc_bytealign_S (w5[1], w5[2], offset); + w7[2] = hc_bytealign_S (w5[0], w5[1], offset); + w7[1] = hc_bytealign_S (w4[3], w5[0], offset); + w7[0] = hc_bytealign_S (w4[2], w4[3], offset); + w6[3] = hc_bytealign_S (w4[1], w4[2], offset); + w6[2] = hc_bytealign_S (w4[0], w4[1], offset); + w6[1] = hc_bytealign_S (w3[3], w4[0], offset); + w6[0] = hc_bytealign_S (w3[2], w3[3], offset); + w5[3] = 
hc_bytealign_S (w3[1], w3[2], offset); + w5[2] = hc_bytealign_S (w3[0], w3[1], offset); + w5[1] = hc_bytealign_S (w2[3], w3[0], offset); + w5[0] = hc_bytealign_S (w2[2], w2[3], offset); + w4[3] = hc_bytealign_S (w2[1], w2[2], offset); + w4[2] = hc_bytealign_S (w2[0], w2[1], offset); + w4[1] = hc_bytealign_S (w1[3], w2[0], offset); + w4[0] = hc_bytealign_S (w1[2], w1[3], offset); + w3[3] = hc_bytealign_S (w1[1], w1[2], offset); + w3[2] = hc_bytealign_S (w1[0], w1[1], offset); + w3[1] = hc_bytealign_S (w0[3], w1[0], offset); + w3[0] = hc_bytealign_S (w0[2], w0[3], offset); + w2[3] = hc_bytealign_S (w0[1], w0[2], offset); + w2[2] = hc_bytealign_S (w0[0], w0[1], offset); + w2[1] = hc_bytealign_S ( 0, w0[0], offset); w2[0] = 0; w1[3] = 0; w1[2] = 0; @@ -35821,26 +41512,40 @@ DECLSPEC void switch_buffer_by_offset_carry_be_S (u32 *w0, u32 *w1, u32 *w2, u32 break; - case 12: - c3[0] = hc_byte_perm_S ( 0, w3[3], selector); - c2[3] = hc_byte_perm_S (w3[3], w3[2], selector); - c2[2] = hc_byte_perm_S (w3[2], w3[1], selector); - c2[1] = hc_byte_perm_S (w3[1], w3[0], selector); - c2[0] = hc_byte_perm_S (w3[0], w2[3], selector); - c1[3] = hc_byte_perm_S (w2[3], w2[2], selector); - c1[2] = hc_byte_perm_S (w2[2], w2[1], selector); - c1[1] = hc_byte_perm_S (w2[1], w2[0], selector); - c1[0] = hc_byte_perm_S (w2[0], w1[3], selector); - c0[3] = hc_byte_perm_S (w1[3], w1[2], selector); - c0[2] = hc_byte_perm_S (w1[2], w1[1], selector); - c0[1] = hc_byte_perm_S (w1[1], w1[0], selector); - c0[0] = hc_byte_perm_S (w1[0], w0[3], selector); - w3[3] = hc_byte_perm_S (w0[3], w0[2], selector); - w3[2] = hc_byte_perm_S (w0[2], w0[1], selector); - w3[1] = hc_byte_perm_S (w0[1], w0[0], selector); - w3[0] = hc_byte_perm_S (w0[0], 0, selector); - w2[3] = 0; - w2[2] = 0; + case 10: + c2[2] = hc_bytealign_S (w7[3], 0, offset); + c2[1] = hc_bytealign_S (w7[2], w7[3], offset); + c2[0] = hc_bytealign_S (w7[1], w7[2], offset); + c1[3] = hc_bytealign_S (w7[0], w7[1], offset); + c1[2] = hc_bytealign_S 
(w6[3], w7[0], offset); + c1[1] = hc_bytealign_S (w6[2], w6[3], offset); + c1[0] = hc_bytealign_S (w6[1], w6[2], offset); + c0[3] = hc_bytealign_S (w6[0], w6[1], offset); + c0[2] = hc_bytealign_S (w5[3], w6[0], offset); + c0[1] = hc_bytealign_S (w5[2], w5[3], offset); + c0[0] = hc_bytealign_S (w5[1], w5[2], offset); + w7[3] = hc_bytealign_S (w5[0], w5[1], offset); + w7[2] = hc_bytealign_S (w4[3], w5[0], offset); + w7[1] = hc_bytealign_S (w4[2], w4[3], offset); + w7[0] = hc_bytealign_S (w4[1], w4[2], offset); + w6[3] = hc_bytealign_S (w4[0], w4[1], offset); + w6[2] = hc_bytealign_S (w3[3], w4[0], offset); + w6[1] = hc_bytealign_S (w3[2], w3[3], offset); + w6[0] = hc_bytealign_S (w3[1], w3[2], offset); + w5[3] = hc_bytealign_S (w3[0], w3[1], offset); + w5[2] = hc_bytealign_S (w2[3], w3[0], offset); + w5[1] = hc_bytealign_S (w2[2], w2[3], offset); + w5[0] = hc_bytealign_S (w2[1], w2[2], offset); + w4[3] = hc_bytealign_S (w2[0], w2[1], offset); + w4[2] = hc_bytealign_S (w1[3], w2[0], offset); + w4[1] = hc_bytealign_S (w1[2], w1[3], offset); + w4[0] = hc_bytealign_S (w1[1], w1[2], offset); + w3[3] = hc_bytealign_S (w1[0], w1[1], offset); + w3[2] = hc_bytealign_S (w0[3], w1[0], offset); + w3[1] = hc_bytealign_S (w0[2], w0[3], offset); + w3[0] = hc_bytealign_S (w0[1], w0[2], offset); + w2[3] = hc_bytealign_S (w0[0], w0[1], offset); + w2[2] = hc_bytealign_S ( 0, w0[0], offset); w2[1] = 0; w2[0] = 0; w1[3] = 0; @@ -35854,26 +41559,40 @@ DECLSPEC void switch_buffer_by_offset_carry_be_S (u32 *w0, u32 *w1, u32 *w2, u32 break; - case 13: - c3[1] = hc_byte_perm_S ( 0, w3[3], selector); - c3[0] = hc_byte_perm_S (w3[3], w3[2], selector); - c2[3] = hc_byte_perm_S (w3[2], w3[1], selector); - c2[2] = hc_byte_perm_S (w3[1], w3[0], selector); - c2[1] = hc_byte_perm_S (w3[0], w2[3], selector); - c2[0] = hc_byte_perm_S (w2[3], w2[2], selector); - c1[3] = hc_byte_perm_S (w2[2], w2[1], selector); - c1[2] = hc_byte_perm_S (w2[1], w2[0], selector); - c1[1] = hc_byte_perm_S (w2[0], w1[3], 
selector); - c1[0] = hc_byte_perm_S (w1[3], w1[2], selector); - c0[3] = hc_byte_perm_S (w1[2], w1[1], selector); - c0[2] = hc_byte_perm_S (w1[1], w1[0], selector); - c0[1] = hc_byte_perm_S (w1[0], w0[3], selector); - c0[0] = hc_byte_perm_S (w0[3], w0[2], selector); - w3[3] = hc_byte_perm_S (w0[2], w0[1], selector); - w3[2] = hc_byte_perm_S (w0[1], w0[0], selector); - w3[1] = hc_byte_perm_S (w0[0], 0, selector); - w3[0] = 0; - w2[3] = 0; + case 11: + c2[3] = hc_bytealign_S (w7[3], 0, offset); + c2[2] = hc_bytealign_S (w7[2], w7[3], offset); + c2[1] = hc_bytealign_S (w7[1], w7[2], offset); + c2[0] = hc_bytealign_S (w7[0], w7[1], offset); + c1[3] = hc_bytealign_S (w6[3], w7[0], offset); + c1[2] = hc_bytealign_S (w6[2], w6[3], offset); + c1[1] = hc_bytealign_S (w6[1], w6[2], offset); + c1[0] = hc_bytealign_S (w6[0], w6[1], offset); + c0[3] = hc_bytealign_S (w5[3], w6[0], offset); + c0[2] = hc_bytealign_S (w5[2], w5[3], offset); + c0[1] = hc_bytealign_S (w5[1], w5[2], offset); + c0[0] = hc_bytealign_S (w5[0], w5[1], offset); + w7[3] = hc_bytealign_S (w4[3], w5[0], offset); + w7[2] = hc_bytealign_S (w4[2], w4[3], offset); + w7[1] = hc_bytealign_S (w4[1], w4[2], offset); + w7[0] = hc_bytealign_S (w4[0], w4[1], offset); + w6[3] = hc_bytealign_S (w3[3], w4[0], offset); + w6[2] = hc_bytealign_S (w3[2], w3[3], offset); + w6[1] = hc_bytealign_S (w3[1], w3[2], offset); + w6[0] = hc_bytealign_S (w3[0], w3[1], offset); + w5[3] = hc_bytealign_S (w2[3], w3[0], offset); + w5[2] = hc_bytealign_S (w2[2], w2[3], offset); + w5[1] = hc_bytealign_S (w2[1], w2[2], offset); + w5[0] = hc_bytealign_S (w2[0], w2[1], offset); + w4[3] = hc_bytealign_S (w1[3], w2[0], offset); + w4[2] = hc_bytealign_S (w1[2], w1[3], offset); + w4[1] = hc_bytealign_S (w1[1], w1[2], offset); + w4[0] = hc_bytealign_S (w1[0], w1[1], offset); + w3[3] = hc_bytealign_S (w0[3], w1[0], offset); + w3[2] = hc_bytealign_S (w0[2], w0[3], offset); + w3[1] = hc_bytealign_S (w0[1], w0[2], offset); + w3[0] = hc_bytealign_S (w0[0], 
w0[1], offset); + w2[3] = hc_bytealign_S ( 0, w0[0], offset); w2[2] = 0; w2[1] = 0; w2[0] = 0; @@ -35888,26 +41607,40 @@ DECLSPEC void switch_buffer_by_offset_carry_be_S (u32 *w0, u32 *w1, u32 *w2, u32 break; - case 14: - c3[2] = hc_byte_perm_S ( 0, w3[3], selector); - c3[1] = hc_byte_perm_S (w3[3], w3[2], selector); - c3[0] = hc_byte_perm_S (w3[2], w3[1], selector); - c2[3] = hc_byte_perm_S (w3[1], w3[0], selector); - c2[2] = hc_byte_perm_S (w3[0], w2[3], selector); - c2[1] = hc_byte_perm_S (w2[3], w2[2], selector); - c2[0] = hc_byte_perm_S (w2[2], w2[1], selector); - c1[3] = hc_byte_perm_S (w2[1], w2[0], selector); - c1[2] = hc_byte_perm_S (w2[0], w1[3], selector); - c1[1] = hc_byte_perm_S (w1[3], w1[2], selector); - c1[0] = hc_byte_perm_S (w1[2], w1[1], selector); - c0[3] = hc_byte_perm_S (w1[1], w1[0], selector); - c0[2] = hc_byte_perm_S (w1[0], w0[3], selector); - c0[1] = hc_byte_perm_S (w0[3], w0[2], selector); - c0[0] = hc_byte_perm_S (w0[2], w0[1], selector); - w3[3] = hc_byte_perm_S (w0[1], w0[0], selector); - w3[2] = hc_byte_perm_S (w0[0], 0, selector); - w3[1] = 0; - w3[0] = 0; + case 12: + c3[0] = hc_bytealign_S (w7[3], 0, offset); + c2[3] = hc_bytealign_S (w7[2], w7[3], offset); + c2[2] = hc_bytealign_S (w7[1], w7[2], offset); + c2[1] = hc_bytealign_S (w7[0], w7[1], offset); + c2[0] = hc_bytealign_S (w6[3], w7[0], offset); + c1[3] = hc_bytealign_S (w6[2], w6[3], offset); + c1[2] = hc_bytealign_S (w6[1], w6[2], offset); + c1[1] = hc_bytealign_S (w6[0], w6[1], offset); + c1[0] = hc_bytealign_S (w5[3], w6[0], offset); + c0[3] = hc_bytealign_S (w5[2], w5[3], offset); + c0[2] = hc_bytealign_S (w5[1], w5[2], offset); + c0[1] = hc_bytealign_S (w5[0], w5[1], offset); + c0[0] = hc_bytealign_S (w4[3], w5[0], offset); + w7[3] = hc_bytealign_S (w4[2], w4[3], offset); + w7[2] = hc_bytealign_S (w4[1], w4[2], offset); + w7[1] = hc_bytealign_S (w4[0], w4[1], offset); + w7[0] = hc_bytealign_S (w3[3], w4[0], offset); + w6[3] = hc_bytealign_S (w3[2], w3[3], offset); + 
w6[2] = hc_bytealign_S (w3[1], w3[2], offset); + w6[1] = hc_bytealign_S (w3[0], w3[1], offset); + w6[0] = hc_bytealign_S (w2[3], w3[0], offset); + w5[3] = hc_bytealign_S (w2[2], w2[3], offset); + w5[2] = hc_bytealign_S (w2[1], w2[2], offset); + w5[1] = hc_bytealign_S (w2[0], w2[1], offset); + w5[0] = hc_bytealign_S (w1[3], w2[0], offset); + w4[3] = hc_bytealign_S (w1[2], w1[3], offset); + w4[2] = hc_bytealign_S (w1[1], w1[2], offset); + w4[1] = hc_bytealign_S (w1[0], w1[1], offset); + w4[0] = hc_bytealign_S (w0[3], w1[0], offset); + w3[3] = hc_bytealign_S (w0[2], w0[3], offset); + w3[2] = hc_bytealign_S (w0[1], w0[2], offset); + w3[1] = hc_bytealign_S (w0[0], w0[1], offset); + w3[0] = hc_bytealign_S ( 0, w0[0], offset); w2[3] = 0; w2[2] = 0; w2[1] = 0; @@ -35923,26 +41656,40 @@ DECLSPEC void switch_buffer_by_offset_carry_be_S (u32 *w0, u32 *w1, u32 *w2, u32 break; - case 15: - c3[3] = hc_byte_perm_S ( 0, w3[3], selector); - c3[2] = hc_byte_perm_S (w3[3], w3[2], selector); - c3[1] = hc_byte_perm_S (w3[2], w3[1], selector); - c3[0] = hc_byte_perm_S (w3[1], w3[0], selector); - c2[3] = hc_byte_perm_S (w3[0], w2[3], selector); - c2[2] = hc_byte_perm_S (w2[3], w2[2], selector); - c2[1] = hc_byte_perm_S (w2[2], w2[1], selector); - c2[0] = hc_byte_perm_S (w2[1], w2[0], selector); - c1[3] = hc_byte_perm_S (w2[0], w1[3], selector); - c1[2] = hc_byte_perm_S (w1[3], w1[2], selector); - c1[1] = hc_byte_perm_S (w1[2], w1[1], selector); - c1[0] = hc_byte_perm_S (w1[1], w1[0], selector); - c0[3] = hc_byte_perm_S (w1[0], w0[3], selector); - c0[2] = hc_byte_perm_S (w0[3], w0[2], selector); - c0[1] = hc_byte_perm_S (w0[2], w0[1], selector); - c0[0] = hc_byte_perm_S (w0[1], w0[0], selector); - w3[3] = hc_byte_perm_S (w0[0], 0, selector); - w3[2] = 0; - w3[1] = 0; + case 13: + c3[1] = hc_bytealign_S (w7[3], 0, offset); + c3[0] = hc_bytealign_S (w7[2], w7[3], offset); + c2[3] = hc_bytealign_S (w7[1], w7[2], offset); + c2[2] = hc_bytealign_S (w7[0], w7[1], offset); + c2[1] = 
hc_bytealign_S (w6[3], w7[0], offset); + c2[0] = hc_bytealign_S (w6[2], w6[3], offset); + c1[3] = hc_bytealign_S (w6[1], w6[2], offset); + c1[2] = hc_bytealign_S (w6[0], w6[1], offset); + c1[1] = hc_bytealign_S (w5[3], w6[0], offset); + c1[0] = hc_bytealign_S (w5[2], w5[3], offset); + c0[3] = hc_bytealign_S (w5[1], w5[2], offset); + c0[2] = hc_bytealign_S (w5[0], w5[1], offset); + c0[1] = hc_bytealign_S (w4[3], w5[0], offset); + c0[0] = hc_bytealign_S (w4[2], w4[3], offset); + w7[3] = hc_bytealign_S (w4[1], w4[2], offset); + w7[2] = hc_bytealign_S (w4[0], w4[1], offset); + w7[1] = hc_bytealign_S (w3[3], w4[0], offset); + w7[0] = hc_bytealign_S (w3[2], w3[3], offset); + w6[3] = hc_bytealign_S (w3[1], w3[2], offset); + w6[2] = hc_bytealign_S (w3[0], w3[1], offset); + w6[1] = hc_bytealign_S (w2[3], w3[0], offset); + w6[0] = hc_bytealign_S (w2[2], w2[3], offset); + w5[3] = hc_bytealign_S (w2[1], w2[2], offset); + w5[2] = hc_bytealign_S (w2[0], w2[1], offset); + w5[1] = hc_bytealign_S (w1[3], w2[0], offset); + w5[0] = hc_bytealign_S (w1[2], w1[3], offset); + w4[3] = hc_bytealign_S (w1[1], w1[2], offset); + w4[2] = hc_bytealign_S (w1[0], w1[1], offset); + w4[1] = hc_bytealign_S (w0[3], w1[0], offset); + w4[0] = hc_bytealign_S (w0[2], w0[3], offset); + w3[3] = hc_bytealign_S (w0[1], w0[2], offset); + w3[2] = hc_bytealign_S (w0[0], w0[1], offset); + w3[1] = hc_bytealign_S ( 0, w0[0], offset); w3[0] = 0; w2[3] = 0; w2[2] = 0; @@ -35957,191 +41704,104 @@ DECLSPEC void switch_buffer_by_offset_carry_be_S (u32 *w0, u32 *w1, u32 *w2, u32 w0[1] = 0; w0[0] = 0; - break; - } - #endif -} - -DECLSPEC void switch_buffer_by_offset_8x4_le_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, u32 *w4, u32 *w5, u32 *w6, u32 *w7, const u32 offset) -{ - const int offset_switch = offset / 4; - - #if (defined IS_AMD && HAS_VPERM == 0) || defined IS_GENERIC - switch (offset_switch) - { - case 0: - w7[3] = hc_bytealign_S (w7[2], w7[3], offset); - w7[2] = hc_bytealign_S (w7[1], w7[2], offset); - w7[1] = 
hc_bytealign_S (w7[0], w7[1], offset); - w7[0] = hc_bytealign_S (w6[3], w7[0], offset); - w6[3] = hc_bytealign_S (w6[2], w6[3], offset); - w6[2] = hc_bytealign_S (w6[1], w6[2], offset); - w6[1] = hc_bytealign_S (w6[0], w6[1], offset); - w6[0] = hc_bytealign_S (w5[3], w6[0], offset); - w5[3] = hc_bytealign_S (w5[2], w5[3], offset); - w5[2] = hc_bytealign_S (w5[1], w5[2], offset); - w5[1] = hc_bytealign_S (w5[0], w5[1], offset); - w5[0] = hc_bytealign_S (w4[3], w5[0], offset); - w4[3] = hc_bytealign_S (w4[2], w4[3], offset); - w4[2] = hc_bytealign_S (w4[1], w4[2], offset); - w4[1] = hc_bytealign_S (w4[0], w4[1], offset); - w4[0] = hc_bytealign_S (w3[3], w4[0], offset); - w3[3] = hc_bytealign_S (w3[2], w3[3], offset); - w3[2] = hc_bytealign_S (w3[1], w3[2], offset); - w3[1] = hc_bytealign_S (w3[0], w3[1], offset); - w3[0] = hc_bytealign_S (w2[3], w3[0], offset); - w2[3] = hc_bytealign_S (w2[2], w2[3], offset); - w2[2] = hc_bytealign_S (w2[1], w2[2], offset); - w2[1] = hc_bytealign_S (w2[0], w2[1], offset); - w2[0] = hc_bytealign_S (w1[3], w2[0], offset); - w1[3] = hc_bytealign_S (w1[2], w1[3], offset); - w1[2] = hc_bytealign_S (w1[1], w1[2], offset); - w1[1] = hc_bytealign_S (w1[0], w1[1], offset); - w1[0] = hc_bytealign_S (w0[3], w1[0], offset); - w0[3] = hc_bytealign_S (w0[2], w0[3], offset); - w0[2] = hc_bytealign_S (w0[1], w0[2], offset); - w0[1] = hc_bytealign_S (w0[0], w0[1], offset); - w0[0] = hc_bytealign_S ( 0, w0[0], offset); - - break; - - case 1: - w7[3] = hc_bytealign_S (w7[1], w7[2], offset); - w7[2] = hc_bytealign_S (w7[0], w7[1], offset); - w7[1] = hc_bytealign_S (w6[3], w7[0], offset); - w7[0] = hc_bytealign_S (w6[2], w6[3], offset); - w6[3] = hc_bytealign_S (w6[1], w6[2], offset); - w6[2] = hc_bytealign_S (w6[0], w6[1], offset); - w6[1] = hc_bytealign_S (w5[3], w6[0], offset); - w6[0] = hc_bytealign_S (w5[2], w5[3], offset); - w5[3] = hc_bytealign_S (w5[1], w5[2], offset); - w5[2] = hc_bytealign_S (w5[0], w5[1], offset); - w5[1] = hc_bytealign_S 
(w4[3], w5[0], offset); - w5[0] = hc_bytealign_S (w4[2], w4[3], offset); - w4[3] = hc_bytealign_S (w4[1], w4[2], offset); - w4[2] = hc_bytealign_S (w4[0], w4[1], offset); - w4[1] = hc_bytealign_S (w3[3], w4[0], offset); - w4[0] = hc_bytealign_S (w3[2], w3[3], offset); - w3[3] = hc_bytealign_S (w3[1], w3[2], offset); - w3[2] = hc_bytealign_S (w3[0], w3[1], offset); - w3[1] = hc_bytealign_S (w2[3], w3[0], offset); - w3[0] = hc_bytealign_S (w2[2], w2[3], offset); - w2[3] = hc_bytealign_S (w2[1], w2[2], offset); - w2[2] = hc_bytealign_S (w2[0], w2[1], offset); - w2[1] = hc_bytealign_S (w1[3], w2[0], offset); - w2[0] = hc_bytealign_S (w1[2], w1[3], offset); - w1[3] = hc_bytealign_S (w1[1], w1[2], offset); - w1[2] = hc_bytealign_S (w1[0], w1[1], offset); - w1[1] = hc_bytealign_S (w0[3], w1[0], offset); - w1[0] = hc_bytealign_S (w0[2], w0[3], offset); - w0[3] = hc_bytealign_S (w0[1], w0[2], offset); - w0[2] = hc_bytealign_S (w0[0], w0[1], offset); - w0[1] = hc_bytealign_S ( 0, w0[0], offset); - w0[0] = 0; - - break; - - case 2: - w7[3] = hc_bytealign_S (w7[0], w7[1], offset); - w7[2] = hc_bytealign_S (w6[3], w7[0], offset); - w7[1] = hc_bytealign_S (w6[2], w6[3], offset); - w7[0] = hc_bytealign_S (w6[1], w6[2], offset); - w6[3] = hc_bytealign_S (w6[0], w6[1], offset); - w6[2] = hc_bytealign_S (w5[3], w6[0], offset); - w6[1] = hc_bytealign_S (w5[2], w5[3], offset); - w6[0] = hc_bytealign_S (w5[1], w5[2], offset); - w5[3] = hc_bytealign_S (w5[0], w5[1], offset); - w5[2] = hc_bytealign_S (w4[3], w5[0], offset); - w5[1] = hc_bytealign_S (w4[2], w4[3], offset); - w5[0] = hc_bytealign_S (w4[1], w4[2], offset); - w4[3] = hc_bytealign_S (w4[0], w4[1], offset); - w4[2] = hc_bytealign_S (w3[3], w4[0], offset); - w4[1] = hc_bytealign_S (w3[2], w3[3], offset); - w4[0] = hc_bytealign_S (w3[1], w3[2], offset); - w3[3] = hc_bytealign_S (w3[0], w3[1], offset); - w3[2] = hc_bytealign_S (w2[3], w3[0], offset); - w3[1] = hc_bytealign_S (w2[2], w2[3], offset); - w3[0] = hc_bytealign_S 
(w2[1], w2[2], offset); - w2[3] = hc_bytealign_S (w2[0], w2[1], offset); - w2[2] = hc_bytealign_S (w1[3], w2[0], offset); - w2[1] = hc_bytealign_S (w1[2], w1[3], offset); - w2[0] = hc_bytealign_S (w1[1], w1[2], offset); - w1[3] = hc_bytealign_S (w1[0], w1[1], offset); - w1[2] = hc_bytealign_S (w0[3], w1[0], offset); - w1[1] = hc_bytealign_S (w0[2], w0[3], offset); - w1[0] = hc_bytealign_S (w0[1], w0[2], offset); - w0[3] = hc_bytealign_S (w0[0], w0[1], offset); - w0[2] = hc_bytealign_S ( 0, w0[0], offset); - w0[1] = 0; - w0[0] = 0; - break; - case 3: - w7[3] = hc_bytealign_S (w6[3], w7[0], offset); - w7[2] = hc_bytealign_S (w6[2], w6[3], offset); - w7[1] = hc_bytealign_S (w6[1], w6[2], offset); - w7[0] = hc_bytealign_S (w6[0], w6[1], offset); - w6[3] = hc_bytealign_S (w5[3], w6[0], offset); - w6[2] = hc_bytealign_S (w5[2], w5[3], offset); - w6[1] = hc_bytealign_S (w5[1], w5[2], offset); - w6[0] = hc_bytealign_S (w5[0], w5[1], offset); - w5[3] = hc_bytealign_S (w4[3], w5[0], offset); - w5[2] = hc_bytealign_S (w4[2], w4[3], offset); - w5[1] = hc_bytealign_S (w4[1], w4[2], offset); - w5[0] = hc_bytealign_S (w4[0], w4[1], offset); - w4[3] = hc_bytealign_S (w3[3], w4[0], offset); - w4[2] = hc_bytealign_S (w3[2], w3[3], offset); - w4[1] = hc_bytealign_S (w3[1], w3[2], offset); - w4[0] = hc_bytealign_S (w3[0], w3[1], offset); - w3[3] = hc_bytealign_S (w2[3], w3[0], offset); - w3[2] = hc_bytealign_S (w2[2], w2[3], offset); - w3[1] = hc_bytealign_S (w2[1], w2[2], offset); - w3[0] = hc_bytealign_S (w2[0], w2[1], offset); - w2[3] = hc_bytealign_S (w1[3], w2[0], offset); - w2[2] = hc_bytealign_S (w1[2], w1[3], offset); - w2[1] = hc_bytealign_S (w1[1], w1[2], offset); - w2[0] = hc_bytealign_S (w1[0], w1[1], offset); - w1[3] = hc_bytealign_S (w0[3], w1[0], offset); - w1[2] = hc_bytealign_S (w0[2], w0[3], offset); - w1[1] = hc_bytealign_S (w0[1], w0[2], offset); - w1[0] = hc_bytealign_S (w0[0], w0[1], offset); - w0[3] = hc_bytealign_S ( 0, w0[0], offset); + case 14: + c3[2] = 
hc_bytealign_S (w7[3], 0, offset); + c3[1] = hc_bytealign_S (w7[2], w7[3], offset); + c3[0] = hc_bytealign_S (w7[1], w7[2], offset); + c2[3] = hc_bytealign_S (w7[0], w7[1], offset); + c2[2] = hc_bytealign_S (w6[3], w7[0], offset); + c2[1] = hc_bytealign_S (w6[2], w6[3], offset); + c2[0] = hc_bytealign_S (w6[1], w6[2], offset); + c1[3] = hc_bytealign_S (w6[0], w6[1], offset); + c1[2] = hc_bytealign_S (w5[3], w6[0], offset); + c1[1] = hc_bytealign_S (w5[2], w5[3], offset); + c1[0] = hc_bytealign_S (w5[1], w5[2], offset); + c0[3] = hc_bytealign_S (w5[0], w5[1], offset); + c0[2] = hc_bytealign_S (w4[3], w5[0], offset); + c0[1] = hc_bytealign_S (w4[2], w4[3], offset); + c0[0] = hc_bytealign_S (w4[1], w4[2], offset); + w7[3] = hc_bytealign_S (w4[0], w4[1], offset); + w7[2] = hc_bytealign_S (w3[3], w4[0], offset); + w7[1] = hc_bytealign_S (w3[2], w3[3], offset); + w7[0] = hc_bytealign_S (w3[1], w3[2], offset); + w6[3] = hc_bytealign_S (w3[0], w3[1], offset); + w6[2] = hc_bytealign_S (w2[3], w3[0], offset); + w6[1] = hc_bytealign_S (w2[2], w2[3], offset); + w6[0] = hc_bytealign_S (w2[1], w2[2], offset); + w5[3] = hc_bytealign_S (w2[0], w2[1], offset); + w5[2] = hc_bytealign_S (w1[3], w2[0], offset); + w5[1] = hc_bytealign_S (w1[2], w1[3], offset); + w5[0] = hc_bytealign_S (w1[1], w1[2], offset); + w4[3] = hc_bytealign_S (w1[0], w1[1], offset); + w4[2] = hc_bytealign_S (w0[3], w1[0], offset); + w4[1] = hc_bytealign_S (w0[2], w0[3], offset); + w4[0] = hc_bytealign_S (w0[1], w0[2], offset); + w3[3] = hc_bytealign_S (w0[0], w0[1], offset); + w3[2] = hc_bytealign_S ( 0, w0[0], offset); + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; - case 4: - w7[3] = hc_bytealign_S (w6[2], w6[3], offset); - w7[2] = hc_bytealign_S (w6[1], w6[2], offset); - w7[1] = hc_bytealign_S (w6[0], w6[1], offset); - w7[0] = hc_bytealign_S (w5[3], w6[0], offset); - w6[3] = 
hc_bytealign_S (w5[2], w5[3], offset); - w6[2] = hc_bytealign_S (w5[1], w5[2], offset); - w6[1] = hc_bytealign_S (w5[0], w5[1], offset); - w6[0] = hc_bytealign_S (w4[3], w5[0], offset); - w5[3] = hc_bytealign_S (w4[2], w4[3], offset); - w5[2] = hc_bytealign_S (w4[1], w4[2], offset); - w5[1] = hc_bytealign_S (w4[0], w4[1], offset); - w5[0] = hc_bytealign_S (w3[3], w4[0], offset); - w4[3] = hc_bytealign_S (w3[2], w3[3], offset); - w4[2] = hc_bytealign_S (w3[1], w3[2], offset); - w4[1] = hc_bytealign_S (w3[0], w3[1], offset); - w4[0] = hc_bytealign_S (w2[3], w3[0], offset); - w3[3] = hc_bytealign_S (w2[2], w2[3], offset); - w3[2] = hc_bytealign_S (w2[1], w2[2], offset); - w3[1] = hc_bytealign_S (w2[0], w2[1], offset); - w3[0] = hc_bytealign_S (w1[3], w2[0], offset); - w2[3] = hc_bytealign_S (w1[2], w1[3], offset); - w2[2] = hc_bytealign_S (w1[1], w1[2], offset); - w2[1] = hc_bytealign_S (w1[0], w1[1], offset); - w2[0] = hc_bytealign_S (w0[3], w1[0], offset); - w1[3] = hc_bytealign_S (w0[2], w0[3], offset); - w1[2] = hc_bytealign_S (w0[1], w0[2], offset); - w1[1] = hc_bytealign_S (w0[0], w0[1], offset); - w1[0] = hc_bytealign_S ( 0, w0[0], offset); + case 15: + c3[3] = hc_bytealign_S (w7[3], 0, offset); + c3[2] = hc_bytealign_S (w7[2], w7[3], offset); + c3[1] = hc_bytealign_S (w7[1], w7[2], offset); + c3[0] = hc_bytealign_S (w7[0], w7[1], offset); + c2[3] = hc_bytealign_S (w6[3], w7[0], offset); + c2[2] = hc_bytealign_S (w6[2], w6[3], offset); + c2[1] = hc_bytealign_S (w6[1], w6[2], offset); + c2[0] = hc_bytealign_S (w6[0], w6[1], offset); + c1[3] = hc_bytealign_S (w5[3], w6[0], offset); + c1[2] = hc_bytealign_S (w5[2], w5[3], offset); + c1[1] = hc_bytealign_S (w5[1], w5[2], offset); + c1[0] = hc_bytealign_S (w5[0], w5[1], offset); + c0[3] = hc_bytealign_S (w4[3], w5[0], offset); + c0[2] = hc_bytealign_S (w4[2], w4[3], offset); + c0[1] = hc_bytealign_S (w4[1], w4[2], offset); + c0[0] = hc_bytealign_S (w4[0], w4[1], offset); + w7[3] = hc_bytealign_S (w3[3], w4[0], 
offset); + w7[2] = hc_bytealign_S (w3[2], w3[3], offset); + w7[1] = hc_bytealign_S (w3[1], w3[2], offset); + w7[0] = hc_bytealign_S (w3[0], w3[1], offset); + w6[3] = hc_bytealign_S (w2[3], w3[0], offset); + w6[2] = hc_bytealign_S (w2[2], w2[3], offset); + w6[1] = hc_bytealign_S (w2[1], w2[2], offset); + w6[0] = hc_bytealign_S (w2[0], w2[1], offset); + w5[3] = hc_bytealign_S (w1[3], w2[0], offset); + w5[2] = hc_bytealign_S (w1[2], w1[3], offset); + w5[1] = hc_bytealign_S (w1[1], w1[2], offset); + w5[0] = hc_bytealign_S (w1[0], w1[1], offset); + w4[3] = hc_bytealign_S (w0[3], w1[0], offset); + w4[2] = hc_bytealign_S (w0[2], w0[3], offset); + w4[1] = hc_bytealign_S (w0[1], w0[2], offset); + w4[0] = hc_bytealign_S (w0[0], w0[1], offset); + w3[3] = hc_bytealign_S ( 0, w0[0], offset); + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; @@ -36149,34 +41809,51 @@ DECLSPEC void switch_buffer_by_offset_8x4_le_S (u32 *w0, u32 *w1, u32 *w2, u32 * break; - case 5: - w7[3] = hc_bytealign_S (w6[1], w6[2], offset); - w7[2] = hc_bytealign_S (w6[0], w6[1], offset); - w7[1] = hc_bytealign_S (w5[3], w6[0], offset); - w7[0] = hc_bytealign_S (w5[2], w5[3], offset); - w6[3] = hc_bytealign_S (w5[1], w5[2], offset); - w6[2] = hc_bytealign_S (w5[0], w5[1], offset); - w6[1] = hc_bytealign_S (w4[3], w5[0], offset); - w6[0] = hc_bytealign_S (w4[2], w4[3], offset); - w5[3] = hc_bytealign_S (w4[1], w4[2], offset); - w5[2] = hc_bytealign_S (w4[0], w4[1], offset); - w5[1] = hc_bytealign_S (w3[3], w4[0], offset); - w5[0] = hc_bytealign_S (w3[2], w3[3], offset); - w4[3] = hc_bytealign_S (w3[1], w3[2], offset); - w4[2] = hc_bytealign_S (w3[0], w3[1], offset); - w4[1] = hc_bytealign_S (w2[3], w3[0], offset); - w4[0] = hc_bytealign_S (w2[2], w2[3], offset); - w3[3] = hc_bytealign_S (w2[1], w2[2], offset); - w3[2] = hc_bytealign_S (w2[0], w2[1], offset); - w3[1] = hc_bytealign_S 
(w1[3], w2[0], offset); - w3[0] = hc_bytealign_S (w1[2], w1[3], offset); - w2[3] = hc_bytealign_S (w1[1], w1[2], offset); - w2[2] = hc_bytealign_S (w1[0], w1[1], offset); - w2[1] = hc_bytealign_S (w0[3], w1[0], offset); - w2[0] = hc_bytealign_S (w0[2], w0[3], offset); - w1[3] = hc_bytealign_S (w0[1], w0[2], offset); - w1[2] = hc_bytealign_S (w0[0], w0[1], offset); - w1[1] = hc_bytealign_S ( 0, w0[0], offset); + case 16: + c4[0] = hc_bytealign_S (w7[3], 0, offset); + c3[3] = hc_bytealign_S (w7[2], w7[3], offset); + c3[2] = hc_bytealign_S (w7[1], w7[2], offset); + c3[1] = hc_bytealign_S (w7[0], w7[1], offset); + c3[0] = hc_bytealign_S (w6[3], w7[0], offset); + c2[3] = hc_bytealign_S (w6[2], w6[3], offset); + c2[2] = hc_bytealign_S (w6[1], w6[2], offset); + c2[1] = hc_bytealign_S (w6[0], w6[1], offset); + c2[0] = hc_bytealign_S (w5[3], w6[0], offset); + c1[3] = hc_bytealign_S (w5[2], w5[3], offset); + c1[2] = hc_bytealign_S (w5[1], w5[2], offset); + c1[1] = hc_bytealign_S (w5[0], w5[1], offset); + c1[0] = hc_bytealign_S (w4[3], w5[0], offset); + c0[3] = hc_bytealign_S (w4[2], w4[3], offset); + c0[2] = hc_bytealign_S (w4[1], w4[2], offset); + c0[1] = hc_bytealign_S (w4[0], w4[1], offset); + c0[0] = hc_bytealign_S (w3[3], w4[0], offset); + w7[3] = hc_bytealign_S (w3[2], w3[3], offset); + w7[2] = hc_bytealign_S (w3[1], w3[2], offset); + w7[1] = hc_bytealign_S (w3[0], w3[1], offset); + w7[0] = hc_bytealign_S (w2[3], w3[0], offset); + w6[3] = hc_bytealign_S (w2[2], w2[3], offset); + w6[2] = hc_bytealign_S (w2[1], w2[2], offset); + w6[1] = hc_bytealign_S (w2[0], w2[1], offset); + w6[0] = hc_bytealign_S (w1[3], w2[0], offset); + w5[3] = hc_bytealign_S (w1[2], w1[3], offset); + w5[2] = hc_bytealign_S (w1[1], w1[2], offset); + w5[1] = hc_bytealign_S (w1[0], w1[1], offset); + w5[0] = hc_bytealign_S (w0[3], w1[0], offset); + w4[3] = hc_bytealign_S (w0[2], w0[3], offset); + w4[2] = hc_bytealign_S (w0[1], w0[2], offset); + w4[1] = hc_bytealign_S (w0[0], w0[1], offset); + w4[0] = 
hc_bytealign_S ( 0, w0[0], offset); + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; @@ -36185,33 +41862,51 @@ DECLSPEC void switch_buffer_by_offset_8x4_le_S (u32 *w0, u32 *w1, u32 *w2, u32 * break; - case 6: - w7[3] = hc_bytealign_S (w6[0], w6[1], offset); - w7[2] = hc_bytealign_S (w5[3], w6[0], offset); - w7[1] = hc_bytealign_S (w5[2], w5[3], offset); - w7[0] = hc_bytealign_S (w5[1], w5[2], offset); - w6[3] = hc_bytealign_S (w5[0], w5[1], offset); - w6[2] = hc_bytealign_S (w4[3], w5[0], offset); - w6[1] = hc_bytealign_S (w4[2], w4[3], offset); - w6[0] = hc_bytealign_S (w4[1], w4[2], offset); - w5[3] = hc_bytealign_S (w4[0], w4[1], offset); - w5[2] = hc_bytealign_S (w3[3], w4[0], offset); - w5[1] = hc_bytealign_S (w3[2], w3[3], offset); - w5[0] = hc_bytealign_S (w3[1], w3[2], offset); - w4[3] = hc_bytealign_S (w3[0], w3[1], offset); - w4[2] = hc_bytealign_S (w2[3], w3[0], offset); - w4[1] = hc_bytealign_S (w2[2], w2[3], offset); - w4[0] = hc_bytealign_S (w2[1], w2[2], offset); - w3[3] = hc_bytealign_S (w2[0], w2[1], offset); - w3[2] = hc_bytealign_S (w1[3], w2[0], offset); - w3[1] = hc_bytealign_S (w1[2], w1[3], offset); - w3[0] = hc_bytealign_S (w1[1], w1[2], offset); - w2[3] = hc_bytealign_S (w1[0], w1[1], offset); - w2[2] = hc_bytealign_S (w0[3], w1[0], offset); - w2[1] = hc_bytealign_S (w0[2], w0[3], offset); - w2[0] = hc_bytealign_S (w0[1], w0[2], offset); - w1[3] = hc_bytealign_S (w0[0], w0[1], offset); - w1[2] = hc_bytealign_S ( 0, w0[0], offset); + case 17: + c4[1] = hc_bytealign_S (w7[3], 0, offset); + c4[0] = hc_bytealign_S (w7[2], w7[3], offset); + c3[3] = hc_bytealign_S (w7[1], w7[2], offset); + c3[2] = hc_bytealign_S (w7[0], w7[1], offset); + c3[1] = hc_bytealign_S (w6[3], w7[0], offset); + c3[0] = hc_bytealign_S (w6[2], w6[3], offset); + c2[3] = hc_bytealign_S (w6[1], w6[2], offset); + c2[2] = hc_bytealign_S (w6[0], w6[1], 
offset); + c2[1] = hc_bytealign_S (w5[3], w6[0], offset); + c2[0] = hc_bytealign_S (w5[2], w5[3], offset); + c1[3] = hc_bytealign_S (w5[1], w5[2], offset); + c1[2] = hc_bytealign_S (w5[0], w5[1], offset); + c1[1] = hc_bytealign_S (w4[3], w5[0], offset); + c1[0] = hc_bytealign_S (w4[2], w4[3], offset); + c0[3] = hc_bytealign_S (w4[1], w4[2], offset); + c0[2] = hc_bytealign_S (w4[0], w4[1], offset); + c0[1] = hc_bytealign_S (w3[3], w4[0], offset); + c0[0] = hc_bytealign_S (w3[2], w3[3], offset); + w7[3] = hc_bytealign_S (w3[1], w3[2], offset); + w7[2] = hc_bytealign_S (w3[0], w3[1], offset); + w7[1] = hc_bytealign_S (w2[3], w3[0], offset); + w7[0] = hc_bytealign_S (w2[2], w2[3], offset); + w6[3] = hc_bytealign_S (w2[1], w2[2], offset); + w6[2] = hc_bytealign_S (w2[0], w2[1], offset); + w6[1] = hc_bytealign_S (w1[3], w2[0], offset); + w6[0] = hc_bytealign_S (w1[2], w1[3], offset); + w5[3] = hc_bytealign_S (w1[1], w1[2], offset); + w5[2] = hc_bytealign_S (w1[0], w1[1], offset); + w5[1] = hc_bytealign_S (w0[3], w1[0], offset); + w5[0] = hc_bytealign_S (w0[2], w0[3], offset); + w4[3] = hc_bytealign_S (w0[1], w0[2], offset); + w4[2] = hc_bytealign_S (w0[0], w0[1], offset); + w4[1] = hc_bytealign_S ( 0, w0[0], offset); + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; @@ -36221,32 +41916,107 @@ DECLSPEC void switch_buffer_by_offset_8x4_le_S (u32 *w0, u32 *w1, u32 *w2, u32 * break; - case 7: - w7[3] = hc_bytealign_S (w5[3], w6[0], offset); - w7[2] = hc_bytealign_S (w5[2], w5[3], offset); - w7[1] = hc_bytealign_S (w5[1], w5[2], offset); - w7[0] = hc_bytealign_S (w5[0], w5[1], offset); - w6[3] = hc_bytealign_S (w4[3], w5[0], offset); - w6[2] = hc_bytealign_S (w4[2], w4[3], offset); - w6[1] = hc_bytealign_S (w4[1], w4[2], offset); - w6[0] = hc_bytealign_S (w4[0], w4[1], offset); - w5[3] = hc_bytealign_S (w3[3], w4[0], offset); - w5[2] = hc_bytealign_S 
(w3[2], w3[3], offset); - w5[1] = hc_bytealign_S (w3[1], w3[2], offset); - w5[0] = hc_bytealign_S (w3[0], w3[1], offset); - w4[3] = hc_bytealign_S (w2[3], w3[0], offset); - w4[2] = hc_bytealign_S (w2[2], w2[3], offset); - w4[1] = hc_bytealign_S (w2[1], w2[2], offset); - w4[0] = hc_bytealign_S (w2[0], w2[1], offset); - w3[3] = hc_bytealign_S (w1[3], w2[0], offset); - w3[2] = hc_bytealign_S (w1[2], w1[3], offset); - w3[1] = hc_bytealign_S (w1[1], w1[2], offset); - w3[0] = hc_bytealign_S (w1[0], w1[1], offset); - w2[3] = hc_bytealign_S (w0[3], w1[0], offset); - w2[2] = hc_bytealign_S (w0[2], w0[3], offset); - w2[1] = hc_bytealign_S (w0[1], w0[2], offset); - w2[0] = hc_bytealign_S (w0[0], w0[1], offset); - w1[3] = hc_bytealign_S ( 0, w0[0], offset); + case 18: + c4[2] = hc_bytealign_S (w7[3], 0, offset); + c4[1] = hc_bytealign_S (w7[2], w7[3], offset); + c4[0] = hc_bytealign_S (w7[1], w7[2], offset); + c3[3] = hc_bytealign_S (w7[0], w7[1], offset); + c3[2] = hc_bytealign_S (w6[3], w7[0], offset); + c3[1] = hc_bytealign_S (w6[2], w6[3], offset); + c3[0] = hc_bytealign_S (w6[1], w6[2], offset); + c2[3] = hc_bytealign_S (w6[0], w6[1], offset); + c2[2] = hc_bytealign_S (w5[3], w6[0], offset); + c2[1] = hc_bytealign_S (w5[2], w5[3], offset); + c2[0] = hc_bytealign_S (w5[1], w5[2], offset); + c1[3] = hc_bytealign_S (w5[0], w5[1], offset); + c1[2] = hc_bytealign_S (w4[3], w5[0], offset); + c1[1] = hc_bytealign_S (w4[2], w4[3], offset); + c1[0] = hc_bytealign_S (w4[1], w4[2], offset); + c0[3] = hc_bytealign_S (w4[0], w4[1], offset); + c0[2] = hc_bytealign_S (w3[3], w4[0], offset); + c0[1] = hc_bytealign_S (w3[2], w3[3], offset); + c0[0] = hc_bytealign_S (w3[1], w3[2], offset); + w7[3] = hc_bytealign_S (w3[0], w3[1], offset); + w7[2] = hc_bytealign_S (w2[3], w3[0], offset); + w7[1] = hc_bytealign_S (w2[2], w2[3], offset); + w7[0] = hc_bytealign_S (w2[1], w2[2], offset); + w6[3] = hc_bytealign_S (w2[0], w2[1], offset); + w6[2] = hc_bytealign_S (w1[3], w2[0], offset); + w6[1] = 
hc_bytealign_S (w1[2], w1[3], offset); + w6[0] = hc_bytealign_S (w1[1], w1[2], offset); + w5[3] = hc_bytealign_S (w1[0], w1[1], offset); + w5[2] = hc_bytealign_S (w0[3], w1[0], offset); + w5[1] = hc_bytealign_S (w0[2], w0[3], offset); + w5[0] = hc_bytealign_S (w0[1], w0[2], offset); + w4[3] = hc_bytealign_S (w0[0], w0[1], offset); + w4[2] = hc_bytealign_S ( 0, w0[0], offset); + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 19: + c4[3] = hc_bytealign_S (w7[3], 0, offset); + c4[2] = hc_bytealign_S (w7[2], w7[3], offset); + c4[1] = hc_bytealign_S (w7[1], w7[2], offset); + c4[0] = hc_bytealign_S (w7[0], w7[1], offset); + c3[3] = hc_bytealign_S (w6[3], w7[0], offset); + c3[2] = hc_bytealign_S (w6[2], w6[3], offset); + c3[1] = hc_bytealign_S (w6[1], w6[2], offset); + c3[0] = hc_bytealign_S (w6[0], w6[1], offset); + c2[3] = hc_bytealign_S (w5[3], w6[0], offset); + c2[2] = hc_bytealign_S (w5[2], w5[3], offset); + c2[1] = hc_bytealign_S (w5[1], w5[2], offset); + c2[0] = hc_bytealign_S (w5[0], w5[1], offset); + c1[3] = hc_bytealign_S (w4[3], w5[0], offset); + c1[2] = hc_bytealign_S (w4[2], w4[3], offset); + c1[1] = hc_bytealign_S (w4[1], w4[2], offset); + c1[0] = hc_bytealign_S (w4[0], w4[1], offset); + c0[3] = hc_bytealign_S (w3[3], w4[0], offset); + c0[2] = hc_bytealign_S (w3[2], w3[3], offset); + c0[1] = hc_bytealign_S (w3[1], w3[2], offset); + c0[0] = hc_bytealign_S (w3[0], w3[1], offset); + w7[3] = hc_bytealign_S (w2[3], w3[0], offset); + w7[2] = hc_bytealign_S (w2[2], w2[3], offset); + w7[1] = hc_bytealign_S (w2[1], w2[2], offset); + w7[0] = hc_bytealign_S (w2[0], w2[1], offset); + w6[3] = hc_bytealign_S (w1[3], w2[0], offset); + w6[2] = hc_bytealign_S (w1[2], w1[3], offset); + w6[1] = hc_bytealign_S (w1[1], w1[2], offset); + w6[0] = hc_bytealign_S (w1[0], w1[1], 
offset); + w5[3] = hc_bytealign_S (w0[3], w1[0], offset); + w5[2] = hc_bytealign_S (w0[2], w0[3], offset); + w5[1] = hc_bytealign_S (w0[1], w0[2], offset); + w5[0] = hc_bytealign_S (w0[0], w0[1], offset); + w4[3] = hc_bytealign_S ( 0, w0[0], offset); + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; @@ -36257,31 +42027,52 @@ DECLSPEC void switch_buffer_by_offset_8x4_le_S (u32 *w0, u32 *w1, u32 *w2, u32 * break; - case 8: - w7[3] = hc_bytealign_S (w5[2], w5[3], offset); - w7[2] = hc_bytealign_S (w5[1], w5[2], offset); - w7[1] = hc_bytealign_S (w5[0], w5[1], offset); - w7[0] = hc_bytealign_S (w4[3], w5[0], offset); - w6[3] = hc_bytealign_S (w4[2], w4[3], offset); - w6[2] = hc_bytealign_S (w4[1], w4[2], offset); - w6[1] = hc_bytealign_S (w4[0], w4[1], offset); - w6[0] = hc_bytealign_S (w3[3], w4[0], offset); - w5[3] = hc_bytealign_S (w3[2], w3[3], offset); - w5[2] = hc_bytealign_S (w3[1], w3[2], offset); - w5[1] = hc_bytealign_S (w3[0], w3[1], offset); - w5[0] = hc_bytealign_S (w2[3], w3[0], offset); - w4[3] = hc_bytealign_S (w2[2], w2[3], offset); - w4[2] = hc_bytealign_S (w2[1], w2[2], offset); - w4[1] = hc_bytealign_S (w2[0], w2[1], offset); - w4[0] = hc_bytealign_S (w1[3], w2[0], offset); - w3[3] = hc_bytealign_S (w1[2], w1[3], offset); - w3[2] = hc_bytealign_S (w1[1], w1[2], offset); - w3[1] = hc_bytealign_S (w1[0], w1[1], offset); - w3[0] = hc_bytealign_S (w0[3], w1[0], offset); - w2[3] = hc_bytealign_S (w0[2], w0[3], offset); - w2[2] = hc_bytealign_S (w0[1], w0[2], offset); - w2[1] = hc_bytealign_S (w0[0], w0[1], offset); - w2[0] = hc_bytealign_S ( 0, w0[0], offset); + case 20: + c5[0] = hc_bytealign_S (w7[3], 0, offset); + c4[3] = hc_bytealign_S (w7[2], w7[3], offset); + c4[2] = hc_bytealign_S (w7[1], w7[2], offset); + c4[1] = hc_bytealign_S (w7[0], w7[1], offset); + c4[0] = hc_bytealign_S (w6[3], w7[0], offset); + c3[3] = 
hc_bytealign_S (w6[2], w6[3], offset); + c3[2] = hc_bytealign_S (w6[1], w6[2], offset); + c3[1] = hc_bytealign_S (w6[0], w6[1], offset); + c3[0] = hc_bytealign_S (w5[3], w6[0], offset); + c2[3] = hc_bytealign_S (w5[2], w5[3], offset); + c2[2] = hc_bytealign_S (w5[1], w5[2], offset); + c2[1] = hc_bytealign_S (w5[0], w5[1], offset); + c2[0] = hc_bytealign_S (w4[3], w5[0], offset); + c1[3] = hc_bytealign_S (w4[2], w4[3], offset); + c1[2] = hc_bytealign_S (w4[1], w4[2], offset); + c1[1] = hc_bytealign_S (w4[0], w4[1], offset); + c1[0] = hc_bytealign_S (w3[3], w4[0], offset); + c0[3] = hc_bytealign_S (w3[2], w3[3], offset); + c0[2] = hc_bytealign_S (w3[1], w3[2], offset); + c0[1] = hc_bytealign_S (w3[0], w3[1], offset); + c0[0] = hc_bytealign_S (w2[3], w3[0], offset); + w7[3] = hc_bytealign_S (w2[2], w2[3], offset); + w7[2] = hc_bytealign_S (w2[1], w2[2], offset); + w7[1] = hc_bytealign_S (w2[0], w2[1], offset); + w7[0] = hc_bytealign_S (w1[3], w2[0], offset); + w6[3] = hc_bytealign_S (w1[2], w1[3], offset); + w6[2] = hc_bytealign_S (w1[1], w1[2], offset); + w6[1] = hc_bytealign_S (w1[0], w1[1], offset); + w6[0] = hc_bytealign_S (w0[3], w1[0], offset); + w5[3] = hc_bytealign_S (w0[2], w0[3], offset); + w5[2] = hc_bytealign_S (w0[1], w0[2], offset); + w5[1] = hc_bytealign_S (w0[0], w0[1], offset); + w5[0] = hc_bytealign_S ( 0, w0[0], offset); + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; @@ -36293,30 +42084,52 @@ DECLSPEC void switch_buffer_by_offset_8x4_le_S (u32 *w0, u32 *w1, u32 *w2, u32 * break; - case 9: - w7[3] = hc_bytealign_S (w5[1], w5[2], offset); - w7[2] = hc_bytealign_S (w5[0], w5[1], offset); - w7[1] = hc_bytealign_S (w4[3], w5[0], offset); - w7[0] = hc_bytealign_S (w4[2], w4[3], offset); - w6[3] = hc_bytealign_S (w4[1], w4[2], offset); - w6[2] = hc_bytealign_S (w4[0], w4[1], offset); - w6[1] = hc_bytealign_S (w3[3], 
w4[0], offset); - w6[0] = hc_bytealign_S (w3[2], w3[3], offset); - w5[3] = hc_bytealign_S (w3[1], w3[2], offset); - w5[2] = hc_bytealign_S (w3[0], w3[1], offset); - w5[1] = hc_bytealign_S (w2[3], w3[0], offset); - w5[0] = hc_bytealign_S (w2[2], w2[3], offset); - w4[3] = hc_bytealign_S (w2[1], w2[2], offset); - w4[2] = hc_bytealign_S (w2[0], w2[1], offset); - w4[1] = hc_bytealign_S (w1[3], w2[0], offset); - w4[0] = hc_bytealign_S (w1[2], w1[3], offset); - w3[3] = hc_bytealign_S (w1[1], w1[2], offset); - w3[2] = hc_bytealign_S (w1[0], w1[1], offset); - w3[1] = hc_bytealign_S (w0[3], w1[0], offset); - w3[0] = hc_bytealign_S (w0[2], w0[3], offset); - w2[3] = hc_bytealign_S (w0[1], w0[2], offset); - w2[2] = hc_bytealign_S (w0[0], w0[1], offset); - w2[1] = hc_bytealign_S ( 0, w0[0], offset); + case 21: + c5[1] = hc_bytealign_S (w7[3], 0, offset); + c5[0] = hc_bytealign_S (w7[2], w7[3], offset); + c4[3] = hc_bytealign_S (w7[1], w7[2], offset); + c4[2] = hc_bytealign_S (w7[0], w7[1], offset); + c4[1] = hc_bytealign_S (w6[3], w7[0], offset); + c4[0] = hc_bytealign_S (w6[2], w6[3], offset); + c3[3] = hc_bytealign_S (w6[1], w6[2], offset); + c3[2] = hc_bytealign_S (w6[0], w6[1], offset); + c3[1] = hc_bytealign_S (w5[3], w6[0], offset); + c3[0] = hc_bytealign_S (w5[2], w5[3], offset); + c2[3] = hc_bytealign_S (w5[1], w5[2], offset); + c2[2] = hc_bytealign_S (w5[0], w5[1], offset); + c2[1] = hc_bytealign_S (w4[3], w5[0], offset); + c2[0] = hc_bytealign_S (w4[2], w4[3], offset); + c1[3] = hc_bytealign_S (w4[1], w4[2], offset); + c1[2] = hc_bytealign_S (w4[0], w4[1], offset); + c1[1] = hc_bytealign_S (w3[3], w4[0], offset); + c1[0] = hc_bytealign_S (w3[2], w3[3], offset); + c0[3] = hc_bytealign_S (w3[1], w3[2], offset); + c0[2] = hc_bytealign_S (w3[0], w3[1], offset); + c0[1] = hc_bytealign_S (w2[3], w3[0], offset); + c0[0] = hc_bytealign_S (w2[2], w2[3], offset); + w7[3] = hc_bytealign_S (w2[1], w2[2], offset); + w7[2] = hc_bytealign_S (w2[0], w2[1], offset); + w7[1] = 
hc_bytealign_S (w1[3], w2[0], offset); + w7[0] = hc_bytealign_S (w1[2], w1[3], offset); + w6[3] = hc_bytealign_S (w1[1], w1[2], offset); + w6[2] = hc_bytealign_S (w1[0], w1[1], offset); + w6[1] = hc_bytealign_S (w0[3], w1[0], offset); + w6[0] = hc_bytealign_S (w0[2], w0[3], offset); + w5[3] = hc_bytealign_S (w0[1], w0[2], offset); + w5[2] = hc_bytealign_S (w0[0], w0[1], offset); + w5[1] = hc_bytealign_S ( 0, w0[0], offset); + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; @@ -36329,29 +42142,52 @@ DECLSPEC void switch_buffer_by_offset_8x4_le_S (u32 *w0, u32 *w1, u32 *w2, u32 * break; - case 10: - w7[3] = hc_bytealign_S (w5[0], w5[1], offset); - w7[2] = hc_bytealign_S (w4[3], w5[0], offset); - w7[1] = hc_bytealign_S (w4[2], w4[3], offset); - w7[0] = hc_bytealign_S (w4[1], w4[2], offset); - w6[3] = hc_bytealign_S (w4[0], w4[1], offset); - w6[2] = hc_bytealign_S (w3[3], w4[0], offset); - w6[1] = hc_bytealign_S (w3[2], w3[3], offset); - w6[0] = hc_bytealign_S (w3[1], w3[2], offset); - w5[3] = hc_bytealign_S (w3[0], w3[1], offset); - w5[2] = hc_bytealign_S (w2[3], w3[0], offset); - w5[1] = hc_bytealign_S (w2[2], w2[3], offset); - w5[0] = hc_bytealign_S (w2[1], w2[2], offset); - w4[3] = hc_bytealign_S (w2[0], w2[1], offset); - w4[2] = hc_bytealign_S (w1[3], w2[0], offset); - w4[1] = hc_bytealign_S (w1[2], w1[3], offset); - w4[0] = hc_bytealign_S (w1[1], w1[2], offset); - w3[3] = hc_bytealign_S (w1[0], w1[1], offset); - w3[2] = hc_bytealign_S (w0[3], w1[0], offset); - w3[1] = hc_bytealign_S (w0[2], w0[3], offset); - w3[0] = hc_bytealign_S (w0[1], w0[2], offset); - w2[3] = hc_bytealign_S (w0[0], w0[1], offset); - w2[2] = hc_bytealign_S ( 0, w0[0], offset); + case 22: + c5[2] = hc_bytealign_S (w7[3], 0, offset); + c5[1] = hc_bytealign_S (w7[2], w7[3], offset); + c5[0] = hc_bytealign_S (w7[1], w7[2], offset); + c4[3] = hc_bytealign_S 
(w7[0], w7[1], offset); + c4[2] = hc_bytealign_S (w6[3], w7[0], offset); + c4[1] = hc_bytealign_S (w6[2], w6[3], offset); + c4[0] = hc_bytealign_S (w6[1], w6[2], offset); + c3[3] = hc_bytealign_S (w6[0], w6[1], offset); + c3[2] = hc_bytealign_S (w5[3], w6[0], offset); + c3[1] = hc_bytealign_S (w5[2], w5[3], offset); + c3[0] = hc_bytealign_S (w5[1], w5[2], offset); + c2[3] = hc_bytealign_S (w5[0], w5[1], offset); + c2[2] = hc_bytealign_S (w4[3], w5[0], offset); + c2[1] = hc_bytealign_S (w4[2], w4[3], offset); + c2[0] = hc_bytealign_S (w4[1], w4[2], offset); + c1[3] = hc_bytealign_S (w4[0], w4[1], offset); + c1[2] = hc_bytealign_S (w3[3], w4[0], offset); + c1[1] = hc_bytealign_S (w3[2], w3[3], offset); + c1[0] = hc_bytealign_S (w3[1], w3[2], offset); + c0[3] = hc_bytealign_S (w3[0], w3[1], offset); + c0[2] = hc_bytealign_S (w2[3], w3[0], offset); + c0[1] = hc_bytealign_S (w2[2], w2[3], offset); + c0[0] = hc_bytealign_S (w2[1], w2[2], offset); + w7[3] = hc_bytealign_S (w2[0], w2[1], offset); + w7[2] = hc_bytealign_S (w1[3], w2[0], offset); + w7[1] = hc_bytealign_S (w1[2], w1[3], offset); + w7[0] = hc_bytealign_S (w1[1], w1[2], offset); + w6[3] = hc_bytealign_S (w1[0], w1[1], offset); + w6[2] = hc_bytealign_S (w0[3], w1[0], offset); + w6[1] = hc_bytealign_S (w0[2], w0[3], offset); + w6[0] = hc_bytealign_S (w0[1], w0[2], offset); + w5[3] = hc_bytealign_S (w0[0], w0[1], offset); + w5[2] = hc_bytealign_S ( 0, w0[0], offset); + w5[1] = 0; + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; @@ -36365,28 +42201,52 @@ DECLSPEC void switch_buffer_by_offset_8x4_le_S (u32 *w0, u32 *w1, u32 *w2, u32 * break; - case 11: - w7[3] = hc_bytealign_S (w4[3], w5[0], offset); - w7[2] = hc_bytealign_S (w4[2], w4[3], offset); - w7[1] = hc_bytealign_S (w4[1], w4[2], offset); - w7[0] = hc_bytealign_S (w4[0], w4[1], offset); - w6[3] = hc_bytealign_S (w3[3], w4[0], 
offset); - w6[2] = hc_bytealign_S (w3[2], w3[3], offset); - w6[1] = hc_bytealign_S (w3[1], w3[2], offset); - w6[0] = hc_bytealign_S (w3[0], w3[1], offset); - w5[3] = hc_bytealign_S (w2[3], w3[0], offset); - w5[2] = hc_bytealign_S (w2[2], w2[3], offset); - w5[1] = hc_bytealign_S (w2[1], w2[2], offset); - w5[0] = hc_bytealign_S (w2[0], w2[1], offset); - w4[3] = hc_bytealign_S (w1[3], w2[0], offset); - w4[2] = hc_bytealign_S (w1[2], w1[3], offset); - w4[1] = hc_bytealign_S (w1[1], w1[2], offset); - w4[0] = hc_bytealign_S (w1[0], w1[1], offset); - w3[3] = hc_bytealign_S (w0[3], w1[0], offset); - w3[2] = hc_bytealign_S (w0[2], w0[3], offset); - w3[1] = hc_bytealign_S (w0[1], w0[2], offset); - w3[0] = hc_bytealign_S (w0[0], w0[1], offset); - w2[3] = hc_bytealign_S ( 0, w0[0], offset); + case 23: + c5[3] = hc_bytealign_S (w7[3], 0, offset); + c5[2] = hc_bytealign_S (w7[2], w7[3], offset); + c5[1] = hc_bytealign_S (w7[1], w7[2], offset); + c5[0] = hc_bytealign_S (w7[0], w7[1], offset); + c4[3] = hc_bytealign_S (w6[3], w7[0], offset); + c4[2] = hc_bytealign_S (w6[2], w6[3], offset); + c4[1] = hc_bytealign_S (w6[1], w6[2], offset); + c4[0] = hc_bytealign_S (w6[0], w6[1], offset); + c3[3] = hc_bytealign_S (w5[3], w6[0], offset); + c3[2] = hc_bytealign_S (w5[2], w5[3], offset); + c3[1] = hc_bytealign_S (w5[1], w5[2], offset); + c3[0] = hc_bytealign_S (w5[0], w5[1], offset); + c2[3] = hc_bytealign_S (w4[3], w5[0], offset); + c2[2] = hc_bytealign_S (w4[2], w4[3], offset); + c2[1] = hc_bytealign_S (w4[1], w4[2], offset); + c2[0] = hc_bytealign_S (w4[0], w4[1], offset); + c1[3] = hc_bytealign_S (w3[3], w4[0], offset); + c1[2] = hc_bytealign_S (w3[2], w3[3], offset); + c1[1] = hc_bytealign_S (w3[1], w3[2], offset); + c1[0] = hc_bytealign_S (w3[0], w3[1], offset); + c0[3] = hc_bytealign_S (w2[3], w3[0], offset); + c0[2] = hc_bytealign_S (w2[2], w2[3], offset); + c0[1] = hc_bytealign_S (w2[1], w2[2], offset); + c0[0] = hc_bytealign_S (w2[0], w2[1], offset); + w7[3] = hc_bytealign_S 
(w1[3], w2[0], offset); + w7[2] = hc_bytealign_S (w1[2], w1[3], offset); + w7[1] = hc_bytealign_S (w1[1], w1[2], offset); + w7[0] = hc_bytealign_S (w1[0], w1[1], offset); + w6[3] = hc_bytealign_S (w0[3], w1[0], offset); + w6[2] = hc_bytealign_S (w0[2], w0[3], offset); + w6[1] = hc_bytealign_S (w0[1], w0[2], offset); + w6[0] = hc_bytealign_S (w0[0], w0[1], offset); + w5[3] = hc_bytealign_S ( 0, w0[0], offset); + w5[2] = 0; + w5[1] = 0; + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; @@ -36394,34 +42254,59 @@ DECLSPEC void switch_buffer_by_offset_8x4_le_S (u32 *w0, u32 *w1, u32 *w2, u32 * w1[2] = 0; w1[1] = 0; w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; - - break; - - case 12: - w7[3] = hc_bytealign_S (w4[2], w4[3], offset); - w7[2] = hc_bytealign_S (w4[1], w4[2], offset); - w7[1] = hc_bytealign_S (w4[0], w4[1], offset); - w7[0] = hc_bytealign_S (w3[3], w4[0], offset); - w6[3] = hc_bytealign_S (w3[2], w3[3], offset); - w6[2] = hc_bytealign_S (w3[1], w3[2], offset); - w6[1] = hc_bytealign_S (w3[0], w3[1], offset); - w6[0] = hc_bytealign_S (w2[3], w3[0], offset); - w5[3] = hc_bytealign_S (w2[2], w2[3], offset); - w5[2] = hc_bytealign_S (w2[1], w2[2], offset); - w5[1] = hc_bytealign_S (w2[0], w2[1], offset); - w5[0] = hc_bytealign_S (w1[3], w2[0], offset); - w4[3] = hc_bytealign_S (w1[2], w1[3], offset); - w4[2] = hc_bytealign_S (w1[1], w1[2], offset); - w4[1] = hc_bytealign_S (w1[0], w1[1], offset); - w4[0] = hc_bytealign_S (w0[3], w1[0], offset); - w3[3] = hc_bytealign_S (w0[2], w0[3], offset); - w3[2] = hc_bytealign_S (w0[1], w0[2], offset); - w3[1] = hc_bytealign_S (w0[0], w0[1], offset); - w3[0] = hc_bytealign_S ( 0, w0[0], offset); + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 24: + c6[0] = hc_bytealign_S (w7[3], 0, offset); + c5[3] = hc_bytealign_S (w7[2], w7[3], offset); + c5[2] = hc_bytealign_S (w7[1], 
w7[2], offset); + c5[1] = hc_bytealign_S (w7[0], w7[1], offset); + c5[0] = hc_bytealign_S (w6[3], w7[0], offset); + c4[3] = hc_bytealign_S (w6[2], w6[3], offset); + c4[2] = hc_bytealign_S (w6[1], w6[2], offset); + c4[1] = hc_bytealign_S (w6[0], w6[1], offset); + c4[0] = hc_bytealign_S (w5[3], w6[0], offset); + c3[3] = hc_bytealign_S (w5[2], w5[3], offset); + c3[2] = hc_bytealign_S (w5[1], w5[2], offset); + c3[1] = hc_bytealign_S (w5[0], w5[1], offset); + c3[0] = hc_bytealign_S (w4[3], w5[0], offset); + c2[3] = hc_bytealign_S (w4[2], w4[3], offset); + c2[2] = hc_bytealign_S (w4[1], w4[2], offset); + c2[1] = hc_bytealign_S (w4[0], w4[1], offset); + c2[0] = hc_bytealign_S (w3[3], w4[0], offset); + c1[3] = hc_bytealign_S (w3[2], w3[3], offset); + c1[2] = hc_bytealign_S (w3[1], w3[2], offset); + c1[1] = hc_bytealign_S (w3[0], w3[1], offset); + c1[0] = hc_bytealign_S (w2[3], w3[0], offset); + c0[3] = hc_bytealign_S (w2[2], w2[3], offset); + c0[2] = hc_bytealign_S (w2[1], w2[2], offset); + c0[1] = hc_bytealign_S (w2[0], w2[1], offset); + c0[0] = hc_bytealign_S (w1[3], w2[0], offset); + w7[3] = hc_bytealign_S (w1[2], w1[3], offset); + w7[2] = hc_bytealign_S (w1[1], w1[2], offset); + w7[1] = hc_bytealign_S (w1[0], w1[1], offset); + w7[0] = hc_bytealign_S (w0[3], w1[0], offset); + w6[3] = hc_bytealign_S (w0[2], w0[3], offset); + w6[2] = hc_bytealign_S (w0[1], w0[2], offset); + w6[1] = hc_bytealign_S (w0[0], w0[1], offset); + w6[0] = hc_bytealign_S ( 0, w0[0], offset); + w5[3] = 0; + w5[2] = 0; + w5[1] = 0; + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; @@ -36437,26 +42322,52 @@ DECLSPEC void switch_buffer_by_offset_8x4_le_S (u32 *w0, u32 *w1, u32 *w2, u32 * break; - case 13: - w7[3] = hc_bytealign_S (w4[1], w4[2], offset); - w7[2] = hc_bytealign_S (w4[0], w4[1], offset); - w7[1] = hc_bytealign_S (w3[3], w4[0], offset); - w7[0] = hc_bytealign_S (w3[2], w3[3], offset); - 
w6[3] = hc_bytealign_S (w3[1], w3[2], offset); - w6[2] = hc_bytealign_S (w3[0], w3[1], offset); - w6[1] = hc_bytealign_S (w2[3], w3[0], offset); - w6[0] = hc_bytealign_S (w2[2], w2[3], offset); - w5[3] = hc_bytealign_S (w2[1], w2[2], offset); - w5[2] = hc_bytealign_S (w2[0], w2[1], offset); - w5[1] = hc_bytealign_S (w1[3], w2[0], offset); - w5[0] = hc_bytealign_S (w1[2], w1[3], offset); - w4[3] = hc_bytealign_S (w1[1], w1[2], offset); - w4[2] = hc_bytealign_S (w1[0], w1[1], offset); - w4[1] = hc_bytealign_S (w0[3], w1[0], offset); - w4[0] = hc_bytealign_S (w0[2], w0[3], offset); - w3[3] = hc_bytealign_S (w0[1], w0[2], offset); - w3[2] = hc_bytealign_S (w0[0], w0[1], offset); - w3[1] = hc_bytealign_S ( 0, w0[0], offset); + case 25: + c6[1] = hc_bytealign_S (w7[3], 0, offset); + c6[0] = hc_bytealign_S (w7[2], w7[3], offset); + c5[3] = hc_bytealign_S (w7[1], w7[2], offset); + c5[2] = hc_bytealign_S (w7[0], w7[1], offset); + c5[1] = hc_bytealign_S (w6[3], w7[0], offset); + c5[0] = hc_bytealign_S (w6[2], w6[3], offset); + c4[3] = hc_bytealign_S (w6[1], w6[2], offset); + c4[2] = hc_bytealign_S (w6[0], w6[1], offset); + c4[1] = hc_bytealign_S (w5[3], w6[0], offset); + c4[0] = hc_bytealign_S (w5[2], w5[3], offset); + c3[3] = hc_bytealign_S (w5[1], w5[2], offset); + c3[2] = hc_bytealign_S (w5[0], w5[1], offset); + c3[1] = hc_bytealign_S (w4[3], w5[0], offset); + c3[0] = hc_bytealign_S (w4[2], w4[3], offset); + c2[3] = hc_bytealign_S (w4[1], w4[2], offset); + c2[2] = hc_bytealign_S (w4[0], w4[1], offset); + c2[1] = hc_bytealign_S (w3[3], w4[0], offset); + c2[0] = hc_bytealign_S (w3[2], w3[3], offset); + c1[3] = hc_bytealign_S (w3[1], w3[2], offset); + c1[2] = hc_bytealign_S (w3[0], w3[1], offset); + c1[1] = hc_bytealign_S (w2[3], w3[0], offset); + c1[0] = hc_bytealign_S (w2[2], w2[3], offset); + c0[3] = hc_bytealign_S (w2[1], w2[2], offset); + c0[2] = hc_bytealign_S (w2[0], w2[1], offset); + c0[1] = hc_bytealign_S (w1[3], w2[0], offset); + c0[0] = hc_bytealign_S (w1[2], 
w1[3], offset); + w7[3] = hc_bytealign_S (w1[1], w1[2], offset); + w7[2] = hc_bytealign_S (w1[0], w1[1], offset); + w7[1] = hc_bytealign_S (w0[3], w1[0], offset); + w7[0] = hc_bytealign_S (w0[2], w0[3], offset); + w6[3] = hc_bytealign_S (w0[1], w0[2], offset); + w6[2] = hc_bytealign_S (w0[0], w0[1], offset); + w6[1] = hc_bytealign_S ( 0, w0[0], offset); + w6[0] = 0; + w5[3] = 0; + w5[2] = 0; + w5[1] = 0; + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; @@ -36473,25 +42384,52 @@ DECLSPEC void switch_buffer_by_offset_8x4_le_S (u32 *w0, u32 *w1, u32 *w2, u32 * break; - case 14: - w7[3] = hc_bytealign_S (w4[0], w4[1], offset); - w7[2] = hc_bytealign_S (w3[3], w4[0], offset); - w7[1] = hc_bytealign_S (w3[2], w3[3], offset); - w7[0] = hc_bytealign_S (w3[1], w3[2], offset); - w6[3] = hc_bytealign_S (w3[0], w3[1], offset); - w6[2] = hc_bytealign_S (w2[3], w3[0], offset); - w6[1] = hc_bytealign_S (w2[2], w2[3], offset); - w6[0] = hc_bytealign_S (w2[1], w2[2], offset); - w5[3] = hc_bytealign_S (w2[0], w2[1], offset); - w5[2] = hc_bytealign_S (w1[3], w2[0], offset); - w5[1] = hc_bytealign_S (w1[2], w1[3], offset); - w5[0] = hc_bytealign_S (w1[1], w1[2], offset); - w4[3] = hc_bytealign_S (w1[0], w1[1], offset); - w4[2] = hc_bytealign_S (w0[3], w1[0], offset); - w4[1] = hc_bytealign_S (w0[2], w0[3], offset); - w4[0] = hc_bytealign_S (w0[1], w0[2], offset); - w3[3] = hc_bytealign_S (w0[0], w0[1], offset); - w3[2] = hc_bytealign_S ( 0, w0[0], offset); + case 26: + c6[2] = hc_bytealign_S (w7[3], 0, offset); + c6[1] = hc_bytealign_S (w7[2], w7[3], offset); + c6[0] = hc_bytealign_S (w7[1], w7[2], offset); + c5[3] = hc_bytealign_S (w7[0], w7[1], offset); + c5[2] = hc_bytealign_S (w6[3], w7[0], offset); + c5[1] = hc_bytealign_S (w6[2], w6[3], offset); + c5[0] = hc_bytealign_S (w6[1], w6[2], offset); + c4[3] = hc_bytealign_S (w6[0], w6[1], offset); + c4[2] = hc_bytealign_S (w5[3], w6[0], offset); + 
c4[1] = hc_bytealign_S (w5[2], w5[3], offset); + c4[0] = hc_bytealign_S (w5[1], w5[2], offset); + c3[3] = hc_bytealign_S (w5[0], w5[1], offset); + c3[2] = hc_bytealign_S (w4[3], w5[0], offset); + c3[1] = hc_bytealign_S (w4[2], w4[3], offset); + c3[0] = hc_bytealign_S (w4[1], w4[2], offset); + c2[3] = hc_bytealign_S (w4[0], w4[1], offset); + c2[2] = hc_bytealign_S (w3[3], w4[0], offset); + c2[1] = hc_bytealign_S (w3[2], w3[3], offset); + c2[0] = hc_bytealign_S (w3[1], w3[2], offset); + c1[3] = hc_bytealign_S (w3[0], w3[1], offset); + c1[2] = hc_bytealign_S (w2[3], w3[0], offset); + c1[1] = hc_bytealign_S (w2[2], w2[3], offset); + c1[0] = hc_bytealign_S (w2[1], w2[2], offset); + c0[3] = hc_bytealign_S (w2[0], w2[1], offset); + c0[2] = hc_bytealign_S (w1[3], w2[0], offset); + c0[1] = hc_bytealign_S (w1[2], w1[3], offset); + c0[0] = hc_bytealign_S (w1[1], w1[2], offset); + w7[3] = hc_bytealign_S (w1[0], w1[1], offset); + w7[2] = hc_bytealign_S (w0[3], w1[0], offset); + w7[1] = hc_bytealign_S (w0[2], w0[3], offset); + w7[0] = hc_bytealign_S (w0[1], w0[2], offset); + w6[3] = hc_bytealign_S (w0[0], w0[1], offset); + w6[2] = hc_bytealign_S ( 0, w0[0], offset); + w6[1] = 0; + w6[0] = 0; + w5[3] = 0; + w5[2] = 0; + w5[1] = 0; + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; @@ -36509,24 +42447,52 @@ DECLSPEC void switch_buffer_by_offset_8x4_le_S (u32 *w0, u32 *w1, u32 *w2, u32 * break; - case 15: - w7[3] = hc_bytealign_S (w3[3], w4[0], offset); - w7[2] = hc_bytealign_S (w3[2], w3[3], offset); - w7[1] = hc_bytealign_S (w3[1], w3[2], offset); - w7[0] = hc_bytealign_S (w3[0], w3[1], offset); - w6[3] = hc_bytealign_S (w2[3], w3[0], offset); - w6[2] = hc_bytealign_S (w2[2], w2[3], offset); - w6[1] = hc_bytealign_S (w2[1], w2[2], offset); - w6[0] = hc_bytealign_S (w2[0], w2[1], offset); - w5[3] = hc_bytealign_S (w1[3], w2[0], offset); - w5[2] = hc_bytealign_S (w1[2], w1[3], offset); - w5[1] = hc_bytealign_S 
(w1[1], w1[2], offset); - w5[0] = hc_bytealign_S (w1[0], w1[1], offset); - w4[3] = hc_bytealign_S (w0[3], w1[0], offset); - w4[2] = hc_bytealign_S (w0[2], w0[3], offset); - w4[1] = hc_bytealign_S (w0[1], w0[2], offset); - w4[0] = hc_bytealign_S (w0[0], w0[1], offset); - w3[3] = hc_bytealign_S ( 0, w0[0], offset); + case 27: + c6[3] = hc_bytealign_S (w7[3], 0, offset); + c6[2] = hc_bytealign_S (w7[2], w7[3], offset); + c6[1] = hc_bytealign_S (w7[1], w7[2], offset); + c6[0] = hc_bytealign_S (w7[0], w7[1], offset); + c5[3] = hc_bytealign_S (w6[3], w7[0], offset); + c5[2] = hc_bytealign_S (w6[2], w6[3], offset); + c5[1] = hc_bytealign_S (w6[1], w6[2], offset); + c5[0] = hc_bytealign_S (w6[0], w6[1], offset); + c4[3] = hc_bytealign_S (w5[3], w6[0], offset); + c4[2] = hc_bytealign_S (w5[2], w5[3], offset); + c4[1] = hc_bytealign_S (w5[1], w5[2], offset); + c4[0] = hc_bytealign_S (w5[0], w5[1], offset); + c3[3] = hc_bytealign_S (w4[3], w5[0], offset); + c3[2] = hc_bytealign_S (w4[2], w4[3], offset); + c3[1] = hc_bytealign_S (w4[1], w4[2], offset); + c3[0] = hc_bytealign_S (w4[0], w4[1], offset); + c2[3] = hc_bytealign_S (w3[3], w4[0], offset); + c2[2] = hc_bytealign_S (w3[2], w3[3], offset); + c2[1] = hc_bytealign_S (w3[1], w3[2], offset); + c2[0] = hc_bytealign_S (w3[0], w3[1], offset); + c1[3] = hc_bytealign_S (w2[3], w3[0], offset); + c1[2] = hc_bytealign_S (w2[2], w2[3], offset); + c1[1] = hc_bytealign_S (w2[1], w2[2], offset); + c1[0] = hc_bytealign_S (w2[0], w2[1], offset); + c0[3] = hc_bytealign_S (w1[3], w2[0], offset); + c0[2] = hc_bytealign_S (w1[2], w1[3], offset); + c0[1] = hc_bytealign_S (w1[1], w1[2], offset); + c0[0] = hc_bytealign_S (w1[0], w1[1], offset); + w7[3] = hc_bytealign_S (w0[3], w1[0], offset); + w7[2] = hc_bytealign_S (w0[2], w0[3], offset); + w7[1] = hc_bytealign_S (w0[1], w0[2], offset); + w7[0] = hc_bytealign_S (w0[0], w0[1], offset); + w6[3] = hc_bytealign_S ( 0, w0[0], offset); + w6[2] = 0; + w6[1] = 0; + w6[0] = 0; + w5[3] = 0; + w5[2] = 
0; + w5[1] = 0; + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; w3[2] = 0; w3[1] = 0; w3[0] = 0; @@ -36545,23 +42511,52 @@ DECLSPEC void switch_buffer_by_offset_8x4_le_S (u32 *w0, u32 *w1, u32 *w2, u32 * break; - case 16: - w7[3] = hc_bytealign_S (w3[2], w3[3], offset); - w7[2] = hc_bytealign_S (w3[1], w3[2], offset); - w7[1] = hc_bytealign_S (w3[0], w3[1], offset); - w7[0] = hc_bytealign_S (w2[3], w3[0], offset); - w6[3] = hc_bytealign_S (w2[2], w2[3], offset); - w6[2] = hc_bytealign_S (w2[1], w2[2], offset); - w6[1] = hc_bytealign_S (w2[0], w2[1], offset); - w6[0] = hc_bytealign_S (w1[3], w2[0], offset); - w5[3] = hc_bytealign_S (w1[2], w1[3], offset); - w5[2] = hc_bytealign_S (w1[1], w1[2], offset); - w5[1] = hc_bytealign_S (w1[0], w1[1], offset); - w5[0] = hc_bytealign_S (w0[3], w1[0], offset); - w4[3] = hc_bytealign_S (w0[2], w0[3], offset); - w4[2] = hc_bytealign_S (w0[1], w0[2], offset); - w4[1] = hc_bytealign_S (w0[0], w0[1], offset); - w4[0] = hc_bytealign_S ( 0, w0[0], offset); + case 28: + c7[0] = hc_bytealign_S (w7[3], 0, offset); + c6[3] = hc_bytealign_S (w7[2], w7[3], offset); + c6[2] = hc_bytealign_S (w7[1], w7[2], offset); + c6[1] = hc_bytealign_S (w7[0], w7[1], offset); + c6[0] = hc_bytealign_S (w6[3], w7[0], offset); + c5[3] = hc_bytealign_S (w6[2], w6[3], offset); + c5[2] = hc_bytealign_S (w6[1], w6[2], offset); + c5[1] = hc_bytealign_S (w6[0], w6[1], offset); + c5[0] = hc_bytealign_S (w5[3], w6[0], offset); + c4[3] = hc_bytealign_S (w5[2], w5[3], offset); + c4[2] = hc_bytealign_S (w5[1], w5[2], offset); + c4[1] = hc_bytealign_S (w5[0], w5[1], offset); + c4[0] = hc_bytealign_S (w4[3], w5[0], offset); + c3[3] = hc_bytealign_S (w4[2], w4[3], offset); + c3[2] = hc_bytealign_S (w4[1], w4[2], offset); + c3[1] = hc_bytealign_S (w4[0], w4[1], offset); + c3[0] = hc_bytealign_S (w3[3], w4[0], offset); + c2[3] = hc_bytealign_S (w3[2], w3[3], offset); + c2[2] = hc_bytealign_S (w3[1], w3[2], offset); + c2[1] = hc_bytealign_S 
(w3[0], w3[1], offset); + c2[0] = hc_bytealign_S (w2[3], w3[0], offset); + c1[3] = hc_bytealign_S (w2[2], w2[3], offset); + c1[2] = hc_bytealign_S (w2[1], w2[2], offset); + c1[1] = hc_bytealign_S (w2[0], w2[1], offset); + c1[0] = hc_bytealign_S (w1[3], w2[0], offset); + c0[3] = hc_bytealign_S (w1[2], w1[3], offset); + c0[2] = hc_bytealign_S (w1[1], w1[2], offset); + c0[1] = hc_bytealign_S (w1[0], w1[1], offset); + c0[0] = hc_bytealign_S (w0[3], w1[0], offset); + w7[3] = hc_bytealign_S (w0[2], w0[3], offset); + w7[2] = hc_bytealign_S (w0[1], w0[2], offset); + w7[1] = hc_bytealign_S (w0[0], w0[1], offset); + w7[0] = hc_bytealign_S ( 0, w0[0], offset); + w6[3] = 0; + w6[2] = 0; + w6[1] = 0; + w6[0] = 0; + w5[3] = 0; + w5[2] = 0; + w5[1] = 0; + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; w3[3] = 0; w3[2] = 0; w3[1] = 0; @@ -36581,22 +42576,52 @@ DECLSPEC void switch_buffer_by_offset_8x4_le_S (u32 *w0, u32 *w1, u32 *w2, u32 * break; - case 17: - w7[3] = hc_bytealign_S (w3[1], w3[2], offset); - w7[2] = hc_bytealign_S (w3[0], w3[1], offset); - w7[1] = hc_bytealign_S (w2[3], w3[0], offset); - w7[0] = hc_bytealign_S (w2[2], w2[3], offset); - w6[3] = hc_bytealign_S (w2[1], w2[2], offset); - w6[2] = hc_bytealign_S (w2[0], w2[1], offset); - w6[1] = hc_bytealign_S (w1[3], w2[0], offset); - w6[0] = hc_bytealign_S (w1[2], w1[3], offset); - w5[3] = hc_bytealign_S (w1[1], w1[2], offset); - w5[2] = hc_bytealign_S (w1[0], w1[1], offset); - w5[1] = hc_bytealign_S (w0[3], w1[0], offset); - w5[0] = hc_bytealign_S (w0[2], w0[3], offset); - w4[3] = hc_bytealign_S (w0[1], w0[2], offset); - w4[2] = hc_bytealign_S (w0[0], w0[1], offset); - w4[1] = hc_bytealign_S ( 0, w0[0], offset); + case 29: + c7[1] = hc_bytealign_S (w7[3], 0, offset); + c7[0] = hc_bytealign_S (w7[2], w7[3], offset); + c6[3] = hc_bytealign_S (w7[1], w7[2], offset); + c6[2] = hc_bytealign_S (w7[0], w7[1], offset); + c6[1] = hc_bytealign_S (w6[3], w7[0], offset); + c6[0] = hc_bytealign_S (w6[2], w6[3], 
offset); + c5[3] = hc_bytealign_S (w6[1], w6[2], offset); + c5[2] = hc_bytealign_S (w6[0], w6[1], offset); + c5[1] = hc_bytealign_S (w5[3], w6[0], offset); + c5[0] = hc_bytealign_S (w5[2], w5[3], offset); + c4[3] = hc_bytealign_S (w5[1], w5[2], offset); + c4[2] = hc_bytealign_S (w5[0], w5[1], offset); + c4[1] = hc_bytealign_S (w4[3], w5[0], offset); + c4[0] = hc_bytealign_S (w4[2], w4[3], offset); + c3[3] = hc_bytealign_S (w4[1], w4[2], offset); + c3[2] = hc_bytealign_S (w4[0], w4[1], offset); + c3[1] = hc_bytealign_S (w3[3], w4[0], offset); + c3[0] = hc_bytealign_S (w3[2], w3[3], offset); + c2[3] = hc_bytealign_S (w3[1], w3[2], offset); + c2[2] = hc_bytealign_S (w3[0], w3[1], offset); + c2[1] = hc_bytealign_S (w2[3], w3[0], offset); + c2[0] = hc_bytealign_S (w2[2], w2[3], offset); + c1[3] = hc_bytealign_S (w2[1], w2[2], offset); + c1[2] = hc_bytealign_S (w2[0], w2[1], offset); + c1[1] = hc_bytealign_S (w1[3], w2[0], offset); + c1[0] = hc_bytealign_S (w1[2], w1[3], offset); + c0[3] = hc_bytealign_S (w1[1], w1[2], offset); + c0[2] = hc_bytealign_S (w1[0], w1[1], offset); + c0[1] = hc_bytealign_S (w0[3], w1[0], offset); + c0[0] = hc_bytealign_S (w0[2], w0[3], offset); + w7[3] = hc_bytealign_S (w0[1], w0[2], offset); + w7[2] = hc_bytealign_S (w0[0], w0[1], offset); + w7[1] = hc_bytealign_S ( 0, w0[0], offset); + w7[0] = 0; + w6[3] = 0; + w6[2] = 0; + w6[1] = 0; + w6[0] = 0; + w5[3] = 0; + w5[2] = 0; + w5[1] = 0; + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; w4[0] = 0; w3[3] = 0; w3[2] = 0; @@ -36617,21 +42642,52 @@ DECLSPEC void switch_buffer_by_offset_8x4_le_S (u32 *w0, u32 *w1, u32 *w2, u32 * break; - case 18: - w7[3] = hc_bytealign_S (w3[0], w3[1], offset); - w7[2] = hc_bytealign_S (w2[3], w3[0], offset); - w7[1] = hc_bytealign_S (w2[2], w2[3], offset); - w7[0] = hc_bytealign_S (w2[1], w2[2], offset); - w6[3] = hc_bytealign_S (w2[0], w2[1], offset); - w6[2] = hc_bytealign_S (w1[3], w2[0], offset); - w6[1] = hc_bytealign_S (w1[2], w1[3], offset); - w6[0] = 
hc_bytealign_S (w1[1], w1[2], offset); - w5[3] = hc_bytealign_S (w1[0], w1[1], offset); - w5[2] = hc_bytealign_S (w0[3], w1[0], offset); - w5[1] = hc_bytealign_S (w0[2], w0[3], offset); - w5[0] = hc_bytealign_S (w0[1], w0[2], offset); - w4[3] = hc_bytealign_S (w0[0], w0[1], offset); - w4[2] = hc_bytealign_S ( 0, w0[0], offset); + case 30: + c7[2] = hc_bytealign_S (w7[3], 0, offset); + c7[1] = hc_bytealign_S (w7[2], w7[3], offset); + c7[0] = hc_bytealign_S (w7[1], w7[2], offset); + c6[3] = hc_bytealign_S (w7[0], w7[1], offset); + c6[2] = hc_bytealign_S (w6[3], w7[0], offset); + c6[1] = hc_bytealign_S (w6[2], w6[3], offset); + c6[0] = hc_bytealign_S (w6[1], w6[2], offset); + c5[3] = hc_bytealign_S (w6[0], w6[1], offset); + c5[2] = hc_bytealign_S (w5[3], w6[0], offset); + c5[1] = hc_bytealign_S (w5[2], w5[3], offset); + c5[0] = hc_bytealign_S (w5[1], w5[2], offset); + c4[3] = hc_bytealign_S (w5[0], w5[1], offset); + c4[2] = hc_bytealign_S (w4[3], w5[0], offset); + c4[1] = hc_bytealign_S (w4[2], w4[3], offset); + c4[0] = hc_bytealign_S (w4[1], w4[2], offset); + c3[3] = hc_bytealign_S (w4[0], w4[1], offset); + c3[2] = hc_bytealign_S (w3[3], w4[0], offset); + c3[1] = hc_bytealign_S (w3[2], w3[3], offset); + c3[0] = hc_bytealign_S (w3[1], w3[2], offset); + c2[3] = hc_bytealign_S (w3[0], w3[1], offset); + c2[2] = hc_bytealign_S (w2[3], w3[0], offset); + c2[1] = hc_bytealign_S (w2[2], w2[3], offset); + c2[0] = hc_bytealign_S (w2[1], w2[2], offset); + c1[3] = hc_bytealign_S (w2[0], w2[1], offset); + c1[2] = hc_bytealign_S (w1[3], w2[0], offset); + c1[1] = hc_bytealign_S (w1[2], w1[3], offset); + c1[0] = hc_bytealign_S (w1[1], w1[2], offset); + c0[3] = hc_bytealign_S (w1[0], w1[1], offset); + c0[2] = hc_bytealign_S (w0[3], w1[0], offset); + c0[1] = hc_bytealign_S (w0[2], w0[3], offset); + c0[0] = hc_bytealign_S (w0[1], w0[2], offset); + w7[3] = hc_bytealign_S (w0[0], w0[1], offset); + w7[2] = hc_bytealign_S ( 0, w0[0], offset); + w7[1] = 0; + w7[0] = 0; + w6[3] = 0; + w6[2] = 
0; + w6[1] = 0; + w6[0] = 0; + w5[3] = 0; + w5[2] = 0; + w5[1] = 0; + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; w4[1] = 0; w4[0] = 0; w3[3] = 0; @@ -36653,20 +42709,52 @@ DECLSPEC void switch_buffer_by_offset_8x4_le_S (u32 *w0, u32 *w1, u32 *w2, u32 * break; - case 19: - w7[3] = hc_bytealign_S (w2[3], w3[0], offset); - w7[2] = hc_bytealign_S (w2[2], w2[3], offset); - w7[1] = hc_bytealign_S (w2[1], w2[2], offset); - w7[0] = hc_bytealign_S (w2[0], w2[1], offset); - w6[3] = hc_bytealign_S (w1[3], w2[0], offset); - w6[2] = hc_bytealign_S (w1[2], w1[3], offset); - w6[1] = hc_bytealign_S (w1[1], w1[2], offset); - w6[0] = hc_bytealign_S (w1[0], w1[1], offset); - w5[3] = hc_bytealign_S (w0[3], w1[0], offset); - w5[2] = hc_bytealign_S (w0[2], w0[3], offset); - w5[1] = hc_bytealign_S (w0[1], w0[2], offset); - w5[0] = hc_bytealign_S (w0[0], w0[1], offset); - w4[3] = hc_bytealign_S ( 0, w0[0], offset); + case 31: + c7[3] = hc_bytealign_S (w7[3], 0, offset); + c7[2] = hc_bytealign_S (w7[2], w7[3], offset); + c7[1] = hc_bytealign_S (w7[1], w7[2], offset); + c7[0] = hc_bytealign_S (w7[0], w7[1], offset); + c6[3] = hc_bytealign_S (w6[3], w7[0], offset); + c6[2] = hc_bytealign_S (w6[2], w6[3], offset); + c6[1] = hc_bytealign_S (w6[1], w6[2], offset); + c6[0] = hc_bytealign_S (w6[0], w6[1], offset); + c5[3] = hc_bytealign_S (w5[3], w6[0], offset); + c5[2] = hc_bytealign_S (w5[2], w5[3], offset); + c5[1] = hc_bytealign_S (w5[1], w5[2], offset); + c5[0] = hc_bytealign_S (w5[0], w5[1], offset); + c4[3] = hc_bytealign_S (w4[3], w5[0], offset); + c4[2] = hc_bytealign_S (w4[2], w4[3], offset); + c4[1] = hc_bytealign_S (w4[1], w4[2], offset); + c4[0] = hc_bytealign_S (w4[0], w4[1], offset); + c3[3] = hc_bytealign_S (w3[3], w4[0], offset); + c3[2] = hc_bytealign_S (w3[2], w3[3], offset); + c3[1] = hc_bytealign_S (w3[1], w3[2], offset); + c3[0] = hc_bytealign_S (w3[0], w3[1], offset); + c2[3] = hc_bytealign_S (w2[3], w3[0], offset); + c2[2] = hc_bytealign_S (w2[2], w2[3], offset); + c2[1] = 
hc_bytealign_S (w2[1], w2[2], offset); + c2[0] = hc_bytealign_S (w2[0], w2[1], offset); + c1[3] = hc_bytealign_S (w1[3], w2[0], offset); + c1[2] = hc_bytealign_S (w1[2], w1[3], offset); + c1[1] = hc_bytealign_S (w1[1], w1[2], offset); + c1[0] = hc_bytealign_S (w1[0], w1[1], offset); + c0[3] = hc_bytealign_S (w0[3], w1[0], offset); + c0[2] = hc_bytealign_S (w0[2], w0[3], offset); + c0[1] = hc_bytealign_S (w0[1], w0[2], offset); + c0[0] = hc_bytealign_S (w0[0], w0[1], offset); + w7[3] = hc_bytealign_S ( 0, w0[0], offset); + w7[2] = 0; + w7[1] = 0; + w7[0] = 0; + w6[3] = 0; + w6[2] = 0; + w6[1] = 0; + w6[0] = 0; + w5[3] = 0; + w5[2] = 0; + w5[1] = 0; + w5[0] = 0; + w4[3] = 0; w4[2] = 0; w4[1] = 0; w4[0] = 0; @@ -36687,36 +42775,255 @@ DECLSPEC void switch_buffer_by_offset_8x4_le_S (u32 *w0, u32 *w1, u32 *w2, u32 * w0[1] = 0; w0[0] = 0; + break; + } + #endif + + #if (defined IS_AMD && HAS_VPERM == 1) || defined IS_NV + + const int offset_mod_4 = offset & 3; + + const int offset_minus_4 = 4 - offset_mod_4; + + #if defined IS_NV + const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; + #endif + + #if defined IS_AMD + const int selector = 0x0706050403020100 >> (offset_minus_4 * 8); + #endif + + switch (offset_switch) + { + case 0: + c0[0] = hc_byte_perm_S (w7[3], 0, selector); + w7[3] = hc_byte_perm_S (w7[2], w7[3], selector); + w7[2] = hc_byte_perm_S (w7[1], w7[2], selector); + w7[1] = hc_byte_perm_S (w7[0], w7[1], selector); + w7[0] = hc_byte_perm_S (w6[3], w7[0], selector); + w6[3] = hc_byte_perm_S (w6[2], w6[3], selector); + w6[2] = hc_byte_perm_S (w6[1], w6[2], selector); + w6[1] = hc_byte_perm_S (w6[0], w6[1], selector); + w6[0] = hc_byte_perm_S (w5[3], w6[0], selector); + w5[3] = hc_byte_perm_S (w5[2], w5[3], selector); + w5[2] = hc_byte_perm_S (w5[1], w5[2], selector); + w5[1] = hc_byte_perm_S (w5[0], w5[1], selector); + w5[0] = hc_byte_perm_S (w4[3], w5[0], selector); + w4[3] = hc_byte_perm_S (w4[2], w4[3], selector); + w4[2] = hc_byte_perm_S 
(w4[1], w4[2], selector); + w4[1] = hc_byte_perm_S (w4[0], w4[1], selector); + w4[0] = hc_byte_perm_S (w3[3], w4[0], selector); + w3[3] = hc_byte_perm_S (w3[2], w3[3], selector); + w3[2] = hc_byte_perm_S (w3[1], w3[2], selector); + w3[1] = hc_byte_perm_S (w3[0], w3[1], selector); + w3[0] = hc_byte_perm_S (w2[3], w3[0], selector); + w2[3] = hc_byte_perm_S (w2[2], w2[3], selector); + w2[2] = hc_byte_perm_S (w2[1], w2[2], selector); + w2[1] = hc_byte_perm_S (w2[0], w2[1], selector); + w2[0] = hc_byte_perm_S (w1[3], w2[0], selector); + w1[3] = hc_byte_perm_S (w1[2], w1[3], selector); + w1[2] = hc_byte_perm_S (w1[1], w1[2], selector); + w1[1] = hc_byte_perm_S (w1[0], w1[1], selector); + w1[0] = hc_byte_perm_S (w0[3], w1[0], selector); + w0[3] = hc_byte_perm_S (w0[2], w0[3], selector); + w0[2] = hc_byte_perm_S (w0[1], w0[2], selector); + w0[1] = hc_byte_perm_S (w0[0], w0[1], selector); + w0[0] = hc_byte_perm_S ( 0, w0[0], selector); + + break; + + case 1: + c0[1] = hc_byte_perm_S (w7[3], 0, selector); + c0[0] = hc_byte_perm_S (w7[2], w7[3], selector); + w7[3] = hc_byte_perm_S (w7[1], w7[2], selector); + w7[2] = hc_byte_perm_S (w7[0], w7[1], selector); + w7[1] = hc_byte_perm_S (w6[3], w7[0], selector); + w7[0] = hc_byte_perm_S (w6[2], w6[3], selector); + w6[3] = hc_byte_perm_S (w6[1], w6[2], selector); + w6[2] = hc_byte_perm_S (w6[0], w6[1], selector); + w6[1] = hc_byte_perm_S (w5[3], w6[0], selector); + w6[0] = hc_byte_perm_S (w5[2], w5[3], selector); + w5[3] = hc_byte_perm_S (w5[1], w5[2], selector); + w5[2] = hc_byte_perm_S (w5[0], w5[1], selector); + w5[1] = hc_byte_perm_S (w4[3], w5[0], selector); + w5[0] = hc_byte_perm_S (w4[2], w4[3], selector); + w4[3] = hc_byte_perm_S (w4[1], w4[2], selector); + w4[2] = hc_byte_perm_S (w4[0], w4[1], selector); + w4[1] = hc_byte_perm_S (w3[3], w4[0], selector); + w4[0] = hc_byte_perm_S (w3[2], w3[3], selector); + w3[3] = hc_byte_perm_S (w3[1], w3[2], selector); + w3[2] = hc_byte_perm_S (w3[0], w3[1], selector); + w3[1] = 
hc_byte_perm_S (w2[3], w3[0], selector); + w3[0] = hc_byte_perm_S (w2[2], w2[3], selector); + w2[3] = hc_byte_perm_S (w2[1], w2[2], selector); + w2[2] = hc_byte_perm_S (w2[0], w2[1], selector); + w2[1] = hc_byte_perm_S (w1[3], w2[0], selector); + w2[0] = hc_byte_perm_S (w1[2], w1[3], selector); + w1[3] = hc_byte_perm_S (w1[1], w1[2], selector); + w1[2] = hc_byte_perm_S (w1[0], w1[1], selector); + w1[1] = hc_byte_perm_S (w0[3], w1[0], selector); + w1[0] = hc_byte_perm_S (w0[2], w0[3], selector); + w0[3] = hc_byte_perm_S (w0[1], w0[2], selector); + w0[2] = hc_byte_perm_S (w0[0], w0[1], selector); + w0[1] = hc_byte_perm_S ( 0, w0[0], selector); + w0[0] = 0; + + break; + + case 2: + c0[2] = hc_byte_perm_S (w7[3], 0, selector); + c0[1] = hc_byte_perm_S (w7[2], w7[3], selector); + c0[0] = hc_byte_perm_S (w7[1], w7[2], selector); + w7[3] = hc_byte_perm_S (w7[0], w7[1], selector); + w7[2] = hc_byte_perm_S (w6[3], w7[0], selector); + w7[1] = hc_byte_perm_S (w6[2], w6[3], selector); + w7[0] = hc_byte_perm_S (w6[1], w6[2], selector); + w6[3] = hc_byte_perm_S (w6[0], w6[1], selector); + w6[2] = hc_byte_perm_S (w5[3], w6[0], selector); + w6[1] = hc_byte_perm_S (w5[2], w5[3], selector); + w6[0] = hc_byte_perm_S (w5[1], w5[2], selector); + w5[3] = hc_byte_perm_S (w5[0], w5[1], selector); + w5[2] = hc_byte_perm_S (w4[3], w5[0], selector); + w5[1] = hc_byte_perm_S (w4[2], w4[3], selector); + w5[0] = hc_byte_perm_S (w4[1], w4[2], selector); + w4[3] = hc_byte_perm_S (w4[0], w4[1], selector); + w4[2] = hc_byte_perm_S (w3[3], w4[0], selector); + w4[1] = hc_byte_perm_S (w3[2], w3[3], selector); + w4[0] = hc_byte_perm_S (w3[1], w3[2], selector); + w3[3] = hc_byte_perm_S (w3[0], w3[1], selector); + w3[2] = hc_byte_perm_S (w2[3], w3[0], selector); + w3[1] = hc_byte_perm_S (w2[2], w2[3], selector); + w3[0] = hc_byte_perm_S (w2[1], w2[2], selector); + w2[3] = hc_byte_perm_S (w2[0], w2[1], selector); + w2[2] = hc_byte_perm_S (w1[3], w2[0], selector); + w2[1] = hc_byte_perm_S (w1[2], w1[3], 
selector); + w2[0] = hc_byte_perm_S (w1[1], w1[2], selector); + w1[3] = hc_byte_perm_S (w1[0], w1[1], selector); + w1[2] = hc_byte_perm_S (w0[3], w1[0], selector); + w1[1] = hc_byte_perm_S (w0[2], w0[3], selector); + w1[0] = hc_byte_perm_S (w0[1], w0[2], selector); + w0[3] = hc_byte_perm_S (w0[0], w0[1], selector); + w0[2] = hc_byte_perm_S ( 0, w0[0], selector); + w0[1] = 0; + w0[0] = 0; + + break; + + case 3: + c0[3] = hc_byte_perm_S (w7[3], 0, selector); + c0[2] = hc_byte_perm_S (w7[2], w7[3], selector); + c0[1] = hc_byte_perm_S (w7[1], w7[2], selector); + c0[0] = hc_byte_perm_S (w7[0], w7[1], selector); + w7[3] = hc_byte_perm_S (w6[3], w7[0], selector); + w7[2] = hc_byte_perm_S (w6[2], w6[3], selector); + w7[1] = hc_byte_perm_S (w6[1], w6[2], selector); + w7[0] = hc_byte_perm_S (w6[0], w6[1], selector); + w6[3] = hc_byte_perm_S (w5[3], w6[0], selector); + w6[2] = hc_byte_perm_S (w5[2], w5[3], selector); + w6[1] = hc_byte_perm_S (w5[1], w5[2], selector); + w6[0] = hc_byte_perm_S (w5[0], w5[1], selector); + w5[3] = hc_byte_perm_S (w4[3], w5[0], selector); + w5[2] = hc_byte_perm_S (w4[2], w4[3], selector); + w5[1] = hc_byte_perm_S (w4[1], w4[2], selector); + w5[0] = hc_byte_perm_S (w4[0], w4[1], selector); + w4[3] = hc_byte_perm_S (w3[3], w4[0], selector); + w4[2] = hc_byte_perm_S (w3[2], w3[3], selector); + w4[1] = hc_byte_perm_S (w3[1], w3[2], selector); + w4[0] = hc_byte_perm_S (w3[0], w3[1], selector); + w3[3] = hc_byte_perm_S (w2[3], w3[0], selector); + w3[2] = hc_byte_perm_S (w2[2], w2[3], selector); + w3[1] = hc_byte_perm_S (w2[1], w2[2], selector); + w3[0] = hc_byte_perm_S (w2[0], w2[1], selector); + w2[3] = hc_byte_perm_S (w1[3], w2[0], selector); + w2[2] = hc_byte_perm_S (w1[2], w1[3], selector); + w2[1] = hc_byte_perm_S (w1[1], w1[2], selector); + w2[0] = hc_byte_perm_S (w1[0], w1[1], selector); + w1[3] = hc_byte_perm_S (w0[3], w1[0], selector); + w1[2] = hc_byte_perm_S (w0[2], w0[3], selector); + w1[1] = hc_byte_perm_S (w0[1], w0[2], selector); + w1[0] 
= hc_byte_perm_S (w0[0], w0[1], selector); + w0[3] = hc_byte_perm_S ( 0, w0[0], selector); + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 4: + c1[0] = hc_byte_perm_S (w7[3], 0, selector); + c0[3] = hc_byte_perm_S (w7[2], w7[3], selector); + c0[2] = hc_byte_perm_S (w7[1], w7[2], selector); + c0[1] = hc_byte_perm_S (w7[0], w7[1], selector); + c0[0] = hc_byte_perm_S (w6[3], w7[0], selector); + w7[3] = hc_byte_perm_S (w6[2], w6[3], selector); + w7[2] = hc_byte_perm_S (w6[1], w6[2], selector); + w7[1] = hc_byte_perm_S (w6[0], w6[1], selector); + w7[0] = hc_byte_perm_S (w5[3], w6[0], selector); + w6[3] = hc_byte_perm_S (w5[2], w5[3], selector); + w6[2] = hc_byte_perm_S (w5[1], w5[2], selector); + w6[1] = hc_byte_perm_S (w5[0], w5[1], selector); + w6[0] = hc_byte_perm_S (w4[3], w5[0], selector); + w5[3] = hc_byte_perm_S (w4[2], w4[3], selector); + w5[2] = hc_byte_perm_S (w4[1], w4[2], selector); + w5[1] = hc_byte_perm_S (w4[0], w4[1], selector); + w5[0] = hc_byte_perm_S (w3[3], w4[0], selector); + w4[3] = hc_byte_perm_S (w3[2], w3[3], selector); + w4[2] = hc_byte_perm_S (w3[1], w3[2], selector); + w4[1] = hc_byte_perm_S (w3[0], w3[1], selector); + w4[0] = hc_byte_perm_S (w2[3], w3[0], selector); + w3[3] = hc_byte_perm_S (w2[2], w2[3], selector); + w3[2] = hc_byte_perm_S (w2[1], w2[2], selector); + w3[1] = hc_byte_perm_S (w2[0], w2[1], selector); + w3[0] = hc_byte_perm_S (w1[3], w2[0], selector); + w2[3] = hc_byte_perm_S (w1[2], w1[3], selector); + w2[2] = hc_byte_perm_S (w1[1], w1[2], selector); + w2[1] = hc_byte_perm_S (w1[0], w1[1], selector); + w2[0] = hc_byte_perm_S (w0[3], w1[0], selector); + w1[3] = hc_byte_perm_S (w0[2], w0[3], selector); + w1[2] = hc_byte_perm_S (w0[1], w0[2], selector); + w1[1] = hc_byte_perm_S (w0[0], w0[1], selector); + w1[0] = hc_byte_perm_S ( 0, w0[0], selector); + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + break; - case 20: - w7[3] = hc_bytealign_S (w2[2], w2[3], offset); - w7[2] = hc_bytealign_S (w2[1], w2[2], offset); 
- w7[1] = hc_bytealign_S (w2[0], w2[1], offset); - w7[0] = hc_bytealign_S (w1[3], w2[0], offset); - w6[3] = hc_bytealign_S (w1[2], w1[3], offset); - w6[2] = hc_bytealign_S (w1[1], w1[2], offset); - w6[1] = hc_bytealign_S (w1[0], w1[1], offset); - w6[0] = hc_bytealign_S (w0[3], w1[0], offset); - w5[3] = hc_bytealign_S (w0[2], w0[3], offset); - w5[2] = hc_bytealign_S (w0[1], w0[2], offset); - w5[1] = hc_bytealign_S (w0[0], w0[1], offset); - w5[0] = hc_bytealign_S ( 0, w0[0], offset); - w4[3] = 0; - w4[2] = 0; - w4[1] = 0; - w4[0] = 0; - w3[3] = 0; - w3[2] = 0; - w3[1] = 0; - w3[0] = 0; - w2[3] = 0; - w2[2] = 0; - w2[1] = 0; - w2[0] = 0; - w1[3] = 0; - w1[2] = 0; - w1[1] = 0; + case 5: + c1[1] = hc_byte_perm_S (w7[3], 0, selector); + c1[0] = hc_byte_perm_S (w7[2], w7[3], selector); + c0[3] = hc_byte_perm_S (w7[1], w7[2], selector); + c0[2] = hc_byte_perm_S (w7[0], w7[1], selector); + c0[1] = hc_byte_perm_S (w6[3], w7[0], selector); + c0[0] = hc_byte_perm_S (w6[2], w6[3], selector); + w7[3] = hc_byte_perm_S (w6[1], w6[2], selector); + w7[2] = hc_byte_perm_S (w6[0], w6[1], selector); + w7[1] = hc_byte_perm_S (w5[3], w6[0], selector); + w7[0] = hc_byte_perm_S (w5[2], w5[3], selector); + w6[3] = hc_byte_perm_S (w5[1], w5[2], selector); + w6[2] = hc_byte_perm_S (w5[0], w5[1], selector); + w6[1] = hc_byte_perm_S (w4[3], w5[0], selector); + w6[0] = hc_byte_perm_S (w4[2], w4[3], selector); + w5[3] = hc_byte_perm_S (w4[1], w4[2], selector); + w5[2] = hc_byte_perm_S (w4[0], w4[1], selector); + w5[1] = hc_byte_perm_S (w3[3], w4[0], selector); + w5[0] = hc_byte_perm_S (w3[2], w3[3], selector); + w4[3] = hc_byte_perm_S (w3[1], w3[2], selector); + w4[2] = hc_byte_perm_S (w3[0], w3[1], selector); + w4[1] = hc_byte_perm_S (w2[3], w3[0], selector); + w4[0] = hc_byte_perm_S (w2[2], w2[3], selector); + w3[3] = hc_byte_perm_S (w2[1], w2[2], selector); + w3[2] = hc_byte_perm_S (w2[0], w2[1], selector); + w3[1] = hc_byte_perm_S (w1[3], w2[0], selector); + w3[0] = hc_byte_perm_S (w1[2], 
w1[3], selector); + w2[3] = hc_byte_perm_S (w1[1], w1[2], selector); + w2[2] = hc_byte_perm_S (w1[0], w1[1], selector); + w2[1] = hc_byte_perm_S (w0[3], w1[0], selector); + w2[0] = hc_byte_perm_S (w0[2], w0[3], selector); + w1[3] = hc_byte_perm_S (w0[1], w0[2], selector); + w1[2] = hc_byte_perm_S (w0[0], w0[1], selector); + w1[1] = hc_byte_perm_S ( 0, w0[0], selector); w1[0] = 0; w0[3] = 0; w0[2] = 0; @@ -36725,33 +43032,40 @@ DECLSPEC void switch_buffer_by_offset_8x4_le_S (u32 *w0, u32 *w1, u32 *w2, u32 * break; - case 21: - w7[3] = hc_bytealign_S (w2[1], w2[2], offset); - w7[2] = hc_bytealign_S (w2[0], w2[1], offset); - w7[1] = hc_bytealign_S (w1[3], w2[0], offset); - w7[0] = hc_bytealign_S (w1[2], w1[3], offset); - w6[3] = hc_bytealign_S (w1[1], w1[2], offset); - w6[2] = hc_bytealign_S (w1[0], w1[1], offset); - w6[1] = hc_bytealign_S (w0[3], w1[0], offset); - w6[0] = hc_bytealign_S (w0[2], w0[3], offset); - w5[3] = hc_bytealign_S (w0[1], w0[2], offset); - w5[2] = hc_bytealign_S (w0[0], w0[1], offset); - w5[1] = hc_bytealign_S ( 0, w0[0], offset); - w5[0] = 0; - w4[3] = 0; - w4[2] = 0; - w4[1] = 0; - w4[0] = 0; - w3[3] = 0; - w3[2] = 0; - w3[1] = 0; - w3[0] = 0; - w2[3] = 0; - w2[2] = 0; - w2[1] = 0; - w2[0] = 0; - w1[3] = 0; - w1[2] = 0; + case 6: + c1[2] = hc_byte_perm_S (w7[3], 0, selector); + c1[1] = hc_byte_perm_S (w7[2], w7[3], selector); + c1[0] = hc_byte_perm_S (w7[1], w7[2], selector); + c0[3] = hc_byte_perm_S (w7[0], w7[1], selector); + c0[2] = hc_byte_perm_S (w6[3], w7[0], selector); + c0[1] = hc_byte_perm_S (w6[2], w6[3], selector); + c0[0] = hc_byte_perm_S (w6[1], w6[2], selector); + w7[3] = hc_byte_perm_S (w6[0], w6[1], selector); + w7[2] = hc_byte_perm_S (w5[3], w6[0], selector); + w7[1] = hc_byte_perm_S (w5[2], w5[3], selector); + w7[0] = hc_byte_perm_S (w5[1], w5[2], selector); + w6[3] = hc_byte_perm_S (w5[0], w5[1], selector); + w6[2] = hc_byte_perm_S (w4[3], w5[0], selector); + w6[1] = hc_byte_perm_S (w4[2], w4[3], selector); + w6[0] = 
hc_byte_perm_S (w4[1], w4[2], selector); + w5[3] = hc_byte_perm_S (w4[0], w4[1], selector); + w5[2] = hc_byte_perm_S (w3[3], w4[0], selector); + w5[1] = hc_byte_perm_S (w3[2], w3[3], selector); + w5[0] = hc_byte_perm_S (w3[1], w3[2], selector); + w4[3] = hc_byte_perm_S (w3[0], w3[1], selector); + w4[2] = hc_byte_perm_S (w2[3], w3[0], selector); + w4[1] = hc_byte_perm_S (w2[2], w2[3], selector); + w4[0] = hc_byte_perm_S (w2[1], w2[2], selector); + w3[3] = hc_byte_perm_S (w2[0], w2[1], selector); + w3[2] = hc_byte_perm_S (w1[3], w2[0], selector); + w3[1] = hc_byte_perm_S (w1[2], w1[3], selector); + w3[0] = hc_byte_perm_S (w1[1], w1[2], selector); + w2[3] = hc_byte_perm_S (w1[0], w1[1], selector); + w2[2] = hc_byte_perm_S (w0[3], w1[0], selector); + w2[1] = hc_byte_perm_S (w0[2], w0[3], selector); + w2[0] = hc_byte_perm_S (w0[1], w0[2], selector); + w1[3] = hc_byte_perm_S (w0[0], w0[1], selector); + w1[2] = hc_byte_perm_S ( 0, w0[0], selector); w1[1] = 0; w1[0] = 0; w0[3] = 0; @@ -36761,32 +43075,40 @@ DECLSPEC void switch_buffer_by_offset_8x4_le_S (u32 *w0, u32 *w1, u32 *w2, u32 * break; - case 22: - w7[3] = hc_bytealign_S (w2[0], w2[1], offset); - w7[2] = hc_bytealign_S (w1[3], w2[0], offset); - w7[1] = hc_bytealign_S (w1[2], w1[3], offset); - w7[0] = hc_bytealign_S (w1[1], w1[2], offset); - w6[3] = hc_bytealign_S (w1[0], w1[1], offset); - w6[2] = hc_bytealign_S (w0[3], w1[0], offset); - w6[1] = hc_bytealign_S (w0[2], w0[3], offset); - w6[0] = hc_bytealign_S (w0[1], w0[2], offset); - w5[3] = hc_bytealign_S (w0[0], w0[1], offset); - w5[2] = hc_bytealign_S ( 0, w0[0], offset); - w5[1] = 0; - w5[0] = 0; - w4[3] = 0; - w4[2] = 0; - w4[1] = 0; - w4[0] = 0; - w3[3] = 0; - w3[2] = 0; - w3[1] = 0; - w3[0] = 0; - w2[3] = 0; - w2[2] = 0; - w2[1] = 0; - w2[0] = 0; - w1[3] = 0; + case 7: + c1[3] = hc_byte_perm_S (w7[3], 0, selector); + c1[2] = hc_byte_perm_S (w7[2], w7[3], selector); + c1[1] = hc_byte_perm_S (w7[1], w7[2], selector); + c1[0] = hc_byte_perm_S (w7[0], w7[1], 
selector); + c0[3] = hc_byte_perm_S (w6[3], w7[0], selector); + c0[2] = hc_byte_perm_S (w6[2], w6[3], selector); + c0[1] = hc_byte_perm_S (w6[1], w6[2], selector); + c0[0] = hc_byte_perm_S (w6[0], w6[1], selector); + w7[3] = hc_byte_perm_S (w5[3], w6[0], selector); + w7[2] = hc_byte_perm_S (w5[2], w5[3], selector); + w7[1] = hc_byte_perm_S (w5[1], w5[2], selector); + w7[0] = hc_byte_perm_S (w5[0], w5[1], selector); + w6[3] = hc_byte_perm_S (w4[3], w5[0], selector); + w6[2] = hc_byte_perm_S (w4[2], w4[3], selector); + w6[1] = hc_byte_perm_S (w4[1], w4[2], selector); + w6[0] = hc_byte_perm_S (w4[0], w4[1], selector); + w5[3] = hc_byte_perm_S (w3[3], w4[0], selector); + w5[2] = hc_byte_perm_S (w3[2], w3[3], selector); + w5[1] = hc_byte_perm_S (w3[1], w3[2], selector); + w5[0] = hc_byte_perm_S (w3[0], w3[1], selector); + w4[3] = hc_byte_perm_S (w2[3], w3[0], selector); + w4[2] = hc_byte_perm_S (w2[2], w2[3], selector); + w4[1] = hc_byte_perm_S (w2[1], w2[2], selector); + w4[0] = hc_byte_perm_S (w2[0], w2[1], selector); + w3[3] = hc_byte_perm_S (w1[3], w2[0], selector); + w3[2] = hc_byte_perm_S (w1[2], w1[3], selector); + w3[1] = hc_byte_perm_S (w1[1], w1[2], selector); + w3[0] = hc_byte_perm_S (w1[0], w1[1], selector); + w2[3] = hc_byte_perm_S (w0[3], w1[0], selector); + w2[2] = hc_byte_perm_S (w0[2], w0[3], selector); + w2[1] = hc_byte_perm_S (w0[1], w0[2], selector); + w2[0] = hc_byte_perm_S (w0[0], w0[1], selector); + w1[3] = hc_byte_perm_S ( 0, w0[0], selector); w1[2] = 0; w1[1] = 0; w1[0] = 0; @@ -36797,31 +43119,40 @@ DECLSPEC void switch_buffer_by_offset_8x4_le_S (u32 *w0, u32 *w1, u32 *w2, u32 * break; - case 23: - w7[3] = hc_bytealign_S (w1[3], w2[0], offset); - w7[2] = hc_bytealign_S (w1[2], w1[3], offset); - w7[1] = hc_bytealign_S (w1[1], w1[2], offset); - w7[0] = hc_bytealign_S (w1[0], w1[1], offset); - w6[3] = hc_bytealign_S (w0[3], w1[0], offset); - w6[2] = hc_bytealign_S (w0[2], w0[3], offset); - w6[1] = hc_bytealign_S (w0[1], w0[2], offset); - w6[0] = 
hc_bytealign_S (w0[0], w0[1], offset); - w5[3] = hc_bytealign_S ( 0, w0[0], offset); - w5[2] = 0; - w5[1] = 0; - w5[0] = 0; - w4[3] = 0; - w4[2] = 0; - w4[1] = 0; - w4[0] = 0; - w3[3] = 0; - w3[2] = 0; - w3[1] = 0; - w3[0] = 0; - w2[3] = 0; - w2[2] = 0; - w2[1] = 0; - w2[0] = 0; + case 8: + c2[0] = hc_byte_perm_S (w7[3], 0, selector); + c1[3] = hc_byte_perm_S (w7[2], w7[3], selector); + c1[2] = hc_byte_perm_S (w7[1], w7[2], selector); + c1[1] = hc_byte_perm_S (w7[0], w7[1], selector); + c1[0] = hc_byte_perm_S (w6[3], w7[0], selector); + c0[3] = hc_byte_perm_S (w6[2], w6[3], selector); + c0[2] = hc_byte_perm_S (w6[1], w6[2], selector); + c0[1] = hc_byte_perm_S (w6[0], w6[1], selector); + c0[0] = hc_byte_perm_S (w5[3], w6[0], selector); + w7[3] = hc_byte_perm_S (w5[2], w5[3], selector); + w7[2] = hc_byte_perm_S (w5[1], w5[2], selector); + w7[1] = hc_byte_perm_S (w5[0], w5[1], selector); + w7[0] = hc_byte_perm_S (w4[3], w5[0], selector); + w6[3] = hc_byte_perm_S (w4[2], w4[3], selector); + w6[2] = hc_byte_perm_S (w4[1], w4[2], selector); + w6[1] = hc_byte_perm_S (w4[0], w4[1], selector); + w6[0] = hc_byte_perm_S (w3[3], w4[0], selector); + w5[3] = hc_byte_perm_S (w3[2], w3[3], selector); + w5[2] = hc_byte_perm_S (w3[1], w3[2], selector); + w5[1] = hc_byte_perm_S (w3[0], w3[1], selector); + w5[0] = hc_byte_perm_S (w2[3], w3[0], selector); + w4[3] = hc_byte_perm_S (w2[2], w2[3], selector); + w4[2] = hc_byte_perm_S (w2[1], w2[2], selector); + w4[1] = hc_byte_perm_S (w2[0], w2[1], selector); + w4[0] = hc_byte_perm_S (w1[3], w2[0], selector); + w3[3] = hc_byte_perm_S (w1[2], w1[3], selector); + w3[2] = hc_byte_perm_S (w1[1], w1[2], selector); + w3[1] = hc_byte_perm_S (w1[0], w1[1], selector); + w3[0] = hc_byte_perm_S (w0[3], w1[0], selector); + w2[3] = hc_byte_perm_S (w0[2], w0[3], selector); + w2[2] = hc_byte_perm_S (w0[1], w0[2], selector); + w2[1] = hc_byte_perm_S (w0[0], w0[1], selector); + w2[0] = hc_byte_perm_S ( 0, w0[0], selector); w1[3] = 0; w1[2] = 0; w1[1] = 0; 
@@ -36833,30 +43164,40 @@ DECLSPEC void switch_buffer_by_offset_8x4_le_S (u32 *w0, u32 *w1, u32 *w2, u32 * break; - case 24: - w7[3] = hc_bytealign_S (w1[2], w1[3], offset); - w7[2] = hc_bytealign_S (w1[1], w1[2], offset); - w7[1] = hc_bytealign_S (w1[0], w1[1], offset); - w7[0] = hc_bytealign_S (w0[3], w1[0], offset); - w6[3] = hc_bytealign_S (w0[2], w0[3], offset); - w6[2] = hc_bytealign_S (w0[1], w0[2], offset); - w6[1] = hc_bytealign_S (w0[0], w0[1], offset); - w6[0] = hc_bytealign_S ( 0, w0[0], offset); - w5[3] = 0; - w5[2] = 0; - w5[1] = 0; - w5[0] = 0; - w4[3] = 0; - w4[2] = 0; - w4[1] = 0; - w4[0] = 0; - w3[3] = 0; - w3[2] = 0; - w3[1] = 0; - w3[0] = 0; - w2[3] = 0; - w2[2] = 0; - w2[1] = 0; + case 9: + c2[1] = hc_byte_perm_S (w7[3], 0, selector); + c2[0] = hc_byte_perm_S (w7[2], w7[3], selector); + c1[3] = hc_byte_perm_S (w7[1], w7[2], selector); + c1[2] = hc_byte_perm_S (w7[0], w7[1], selector); + c1[1] = hc_byte_perm_S (w6[3], w7[0], selector); + c1[0] = hc_byte_perm_S (w6[2], w6[3], selector); + c0[3] = hc_byte_perm_S (w6[1], w6[2], selector); + c0[2] = hc_byte_perm_S (w6[0], w6[1], selector); + c0[1] = hc_byte_perm_S (w5[3], w6[0], selector); + c0[0] = hc_byte_perm_S (w5[2], w5[3], selector); + w7[3] = hc_byte_perm_S (w5[1], w5[2], selector); + w7[2] = hc_byte_perm_S (w5[0], w5[1], selector); + w7[1] = hc_byte_perm_S (w4[3], w5[0], selector); + w7[0] = hc_byte_perm_S (w4[2], w4[3], selector); + w6[3] = hc_byte_perm_S (w4[1], w4[2], selector); + w6[2] = hc_byte_perm_S (w4[0], w4[1], selector); + w6[1] = hc_byte_perm_S (w3[3], w4[0], selector); + w6[0] = hc_byte_perm_S (w3[2], w3[3], selector); + w5[3] = hc_byte_perm_S (w3[1], w3[2], selector); + w5[2] = hc_byte_perm_S (w3[0], w3[1], selector); + w5[1] = hc_byte_perm_S (w2[3], w3[0], selector); + w5[0] = hc_byte_perm_S (w2[2], w2[3], selector); + w4[3] = hc_byte_perm_S (w2[1], w2[2], selector); + w4[2] = hc_byte_perm_S (w2[0], w2[1], selector); + w4[1] = hc_byte_perm_S (w1[3], w2[0], selector); + w4[0] = 
hc_byte_perm_S (w1[2], w1[3], selector); + w3[3] = hc_byte_perm_S (w1[1], w1[2], selector); + w3[2] = hc_byte_perm_S (w1[0], w1[1], selector); + w3[1] = hc_byte_perm_S (w0[3], w1[0], selector); + w3[0] = hc_byte_perm_S (w0[2], w0[3], selector); + w2[3] = hc_byte_perm_S (w0[1], w0[2], selector); + w2[2] = hc_byte_perm_S (w0[0], w0[1], selector); + w2[1] = hc_byte_perm_S ( 0, w0[0], selector); w2[0] = 0; w1[3] = 0; w1[2] = 0; @@ -36869,29 +43210,40 @@ DECLSPEC void switch_buffer_by_offset_8x4_le_S (u32 *w0, u32 *w1, u32 *w2, u32 * break; - case 25: - w7[3] = hc_bytealign_S (w1[1], w1[2], offset); - w7[2] = hc_bytealign_S (w1[0], w1[1], offset); - w7[1] = hc_bytealign_S (w0[3], w1[0], offset); - w7[0] = hc_bytealign_S (w0[2], w0[3], offset); - w6[3] = hc_bytealign_S (w0[1], w0[2], offset); - w6[2] = hc_bytealign_S (w0[0], w0[1], offset); - w6[1] = hc_bytealign_S ( 0, w0[0], offset); - w6[0] = 0; - w5[3] = 0; - w5[2] = 0; - w5[1] = 0; - w5[0] = 0; - w4[3] = 0; - w4[2] = 0; - w4[1] = 0; - w4[0] = 0; - w3[3] = 0; - w3[2] = 0; - w3[1] = 0; - w3[0] = 0; - w2[3] = 0; - w2[2] = 0; + case 10: + c2[2] = hc_byte_perm_S (w7[3], 0, selector); + c2[1] = hc_byte_perm_S (w7[2], w7[3], selector); + c2[0] = hc_byte_perm_S (w7[1], w7[2], selector); + c1[3] = hc_byte_perm_S (w7[0], w7[1], selector); + c1[2] = hc_byte_perm_S (w6[3], w7[0], selector); + c1[1] = hc_byte_perm_S (w6[2], w6[3], selector); + c1[0] = hc_byte_perm_S (w6[1], w6[2], selector); + c0[3] = hc_byte_perm_S (w6[0], w6[1], selector); + c0[2] = hc_byte_perm_S (w5[3], w6[0], selector); + c0[1] = hc_byte_perm_S (w5[2], w5[3], selector); + c0[0] = hc_byte_perm_S (w5[1], w5[2], selector); + w7[3] = hc_byte_perm_S (w5[0], w5[1], selector); + w7[2] = hc_byte_perm_S (w4[3], w5[0], selector); + w7[1] = hc_byte_perm_S (w4[2], w4[3], selector); + w7[0] = hc_byte_perm_S (w4[1], w4[2], selector); + w6[3] = hc_byte_perm_S (w4[0], w4[1], selector); + w6[2] = hc_byte_perm_S (w3[3], w4[0], selector); + w6[1] = hc_byte_perm_S (w3[2], 
w3[3], selector); + w6[0] = hc_byte_perm_S (w3[1], w3[2], selector); + w5[3] = hc_byte_perm_S (w3[0], w3[1], selector); + w5[2] = hc_byte_perm_S (w2[3], w3[0], selector); + w5[1] = hc_byte_perm_S (w2[2], w2[3], selector); + w5[0] = hc_byte_perm_S (w2[1], w2[2], selector); + w4[3] = hc_byte_perm_S (w2[0], w2[1], selector); + w4[2] = hc_byte_perm_S (w1[3], w2[0], selector); + w4[1] = hc_byte_perm_S (w1[2], w1[3], selector); + w4[0] = hc_byte_perm_S (w1[1], w1[2], selector); + w3[3] = hc_byte_perm_S (w1[0], w1[1], selector); + w3[2] = hc_byte_perm_S (w0[3], w1[0], selector); + w3[1] = hc_byte_perm_S (w0[2], w0[3], selector); + w3[0] = hc_byte_perm_S (w0[1], w0[2], selector); + w2[3] = hc_byte_perm_S (w0[0], w0[1], selector); + w2[2] = hc_byte_perm_S ( 0, w0[0], selector); w2[1] = 0; w2[0] = 0; w1[3] = 0; @@ -36905,28 +43257,40 @@ DECLSPEC void switch_buffer_by_offset_8x4_le_S (u32 *w0, u32 *w1, u32 *w2, u32 * break; - case 26: - w7[3] = hc_bytealign_S (w1[0], w1[1], offset); - w7[2] = hc_bytealign_S (w0[3], w1[0], offset); - w7[1] = hc_bytealign_S (w0[2], w0[3], offset); - w7[0] = hc_bytealign_S (w0[1], w0[2], offset); - w6[3] = hc_bytealign_S (w0[0], w0[1], offset); - w6[2] = hc_bytealign_S ( 0, w0[0], offset); - w6[1] = 0; - w6[0] = 0; - w5[3] = 0; - w5[2] = 0; - w5[1] = 0; - w5[0] = 0; - w4[3] = 0; - w4[2] = 0; - w4[1] = 0; - w4[0] = 0; - w3[3] = 0; - w3[2] = 0; - w3[1] = 0; - w3[0] = 0; - w2[3] = 0; + case 11: + c2[3] = hc_byte_perm_S (w7[3], 0, selector); + c2[2] = hc_byte_perm_S (w7[2], w7[3], selector); + c2[1] = hc_byte_perm_S (w7[1], w7[2], selector); + c2[0] = hc_byte_perm_S (w7[0], w7[1], selector); + c1[3] = hc_byte_perm_S (w6[3], w7[0], selector); + c1[2] = hc_byte_perm_S (w6[2], w6[3], selector); + c1[1] = hc_byte_perm_S (w6[1], w6[2], selector); + c1[0] = hc_byte_perm_S (w6[0], w6[1], selector); + c0[3] = hc_byte_perm_S (w5[3], w6[0], selector); + c0[2] = hc_byte_perm_S (w5[2], w5[3], selector); + c0[1] = hc_byte_perm_S (w5[1], w5[2], selector); + c0[0] 
= hc_byte_perm_S (w5[0], w5[1], selector); + w7[3] = hc_byte_perm_S (w4[3], w5[0], selector); + w7[2] = hc_byte_perm_S (w4[2], w4[3], selector); + w7[1] = hc_byte_perm_S (w4[1], w4[2], selector); + w7[0] = hc_byte_perm_S (w4[0], w4[1], selector); + w6[3] = hc_byte_perm_S (w3[3], w4[0], selector); + w6[2] = hc_byte_perm_S (w3[2], w3[3], selector); + w6[1] = hc_byte_perm_S (w3[1], w3[2], selector); + w6[0] = hc_byte_perm_S (w3[0], w3[1], selector); + w5[3] = hc_byte_perm_S (w2[3], w3[0], selector); + w5[2] = hc_byte_perm_S (w2[2], w2[3], selector); + w5[1] = hc_byte_perm_S (w2[1], w2[2], selector); + w5[0] = hc_byte_perm_S (w2[0], w2[1], selector); + w4[3] = hc_byte_perm_S (w1[3], w2[0], selector); + w4[2] = hc_byte_perm_S (w1[2], w1[3], selector); + w4[1] = hc_byte_perm_S (w1[1], w1[2], selector); + w4[0] = hc_byte_perm_S (w1[0], w1[1], selector); + w3[3] = hc_byte_perm_S (w0[3], w1[0], selector); + w3[2] = hc_byte_perm_S (w0[2], w0[3], selector); + w3[1] = hc_byte_perm_S (w0[1], w0[2], selector); + w3[0] = hc_byte_perm_S (w0[0], w0[1], selector); + w2[3] = hc_byte_perm_S ( 0, w0[0], selector); w2[2] = 0; w2[1] = 0; w2[0] = 0; @@ -36941,27 +43305,40 @@ DECLSPEC void switch_buffer_by_offset_8x4_le_S (u32 *w0, u32 *w1, u32 *w2, u32 * break; - case 27: - w7[3] = hc_bytealign_S (w0[3], w1[0], offset); - w7[2] = hc_bytealign_S (w0[2], w0[3], offset); - w7[1] = hc_bytealign_S (w0[1], w0[2], offset); - w7[0] = hc_bytealign_S (w0[0], w0[1], offset); - w6[3] = hc_bytealign_S ( 0, w0[0], offset); - w6[2] = 0; - w6[1] = 0; - w6[0] = 0; - w5[3] = 0; - w5[2] = 0; - w5[1] = 0; - w5[0] = 0; - w4[3] = 0; - w4[2] = 0; - w4[1] = 0; - w4[0] = 0; - w3[3] = 0; - w3[2] = 0; - w3[1] = 0; - w3[0] = 0; + case 12: + c3[0] = hc_byte_perm_S (w7[3], 0, selector); + c2[3] = hc_byte_perm_S (w7[2], w7[3], selector); + c2[2] = hc_byte_perm_S (w7[1], w7[2], selector); + c2[1] = hc_byte_perm_S (w7[0], w7[1], selector); + c2[0] = hc_byte_perm_S (w6[3], w7[0], selector); + c1[3] = hc_byte_perm_S 
(w6[2], w6[3], selector); + c1[2] = hc_byte_perm_S (w6[1], w6[2], selector); + c1[1] = hc_byte_perm_S (w6[0], w6[1], selector); + c1[0] = hc_byte_perm_S (w5[3], w6[0], selector); + c0[3] = hc_byte_perm_S (w5[2], w5[3], selector); + c0[2] = hc_byte_perm_S (w5[1], w5[2], selector); + c0[1] = hc_byte_perm_S (w5[0], w5[1], selector); + c0[0] = hc_byte_perm_S (w4[3], w5[0], selector); + w7[3] = hc_byte_perm_S (w4[2], w4[3], selector); + w7[2] = hc_byte_perm_S (w4[1], w4[2], selector); + w7[1] = hc_byte_perm_S (w4[0], w4[1], selector); + w7[0] = hc_byte_perm_S (w3[3], w4[0], selector); + w6[3] = hc_byte_perm_S (w3[2], w3[3], selector); + w6[2] = hc_byte_perm_S (w3[1], w3[2], selector); + w6[1] = hc_byte_perm_S (w3[0], w3[1], selector); + w6[0] = hc_byte_perm_S (w2[3], w3[0], selector); + w5[3] = hc_byte_perm_S (w2[2], w2[3], selector); + w5[2] = hc_byte_perm_S (w2[1], w2[2], selector); + w5[1] = hc_byte_perm_S (w2[0], w2[1], selector); + w5[0] = hc_byte_perm_S (w1[3], w2[0], selector); + w4[3] = hc_byte_perm_S (w1[2], w1[3], selector); + w4[2] = hc_byte_perm_S (w1[1], w1[2], selector); + w4[1] = hc_byte_perm_S (w1[0], w1[1], selector); + w4[0] = hc_byte_perm_S (w0[3], w1[0], selector); + w3[3] = hc_byte_perm_S (w0[2], w0[3], selector); + w3[2] = hc_byte_perm_S (w0[1], w0[2], selector); + w3[1] = hc_byte_perm_S (w0[0], w0[1], selector); + w3[0] = hc_byte_perm_S ( 0, w0[0], selector); w2[3] = 0; w2[2] = 0; w2[1] = 0; @@ -36977,26 +43354,40 @@ DECLSPEC void switch_buffer_by_offset_8x4_le_S (u32 *w0, u32 *w1, u32 *w2, u32 * break; - case 28: - w7[3] = hc_bytealign_S (w0[2], w0[3], offset); - w7[2] = hc_bytealign_S (w0[1], w0[2], offset); - w7[1] = hc_bytealign_S (w0[0], w0[1], offset); - w7[0] = hc_bytealign_S ( 0, w0[0], offset); - w6[3] = 0; - w6[2] = 0; - w6[1] = 0; - w6[0] = 0; - w5[3] = 0; - w5[2] = 0; - w5[1] = 0; - w5[0] = 0; - w4[3] = 0; - w4[2] = 0; - w4[1] = 0; - w4[0] = 0; - w3[3] = 0; - w3[2] = 0; - w3[1] = 0; + case 13: + c3[1] = hc_byte_perm_S (w7[3], 0, 
selector); + c3[0] = hc_byte_perm_S (w7[2], w7[3], selector); + c2[3] = hc_byte_perm_S (w7[1], w7[2], selector); + c2[2] = hc_byte_perm_S (w7[0], w7[1], selector); + c2[1] = hc_byte_perm_S (w6[3], w7[0], selector); + c2[0] = hc_byte_perm_S (w6[2], w6[3], selector); + c1[3] = hc_byte_perm_S (w6[1], w6[2], selector); + c1[2] = hc_byte_perm_S (w6[0], w6[1], selector); + c1[1] = hc_byte_perm_S (w5[3], w6[0], selector); + c1[0] = hc_byte_perm_S (w5[2], w5[3], selector); + c0[3] = hc_byte_perm_S (w5[1], w5[2], selector); + c0[2] = hc_byte_perm_S (w5[0], w5[1], selector); + c0[1] = hc_byte_perm_S (w4[3], w5[0], selector); + c0[0] = hc_byte_perm_S (w4[2], w4[3], selector); + w7[3] = hc_byte_perm_S (w4[1], w4[2], selector); + w7[2] = hc_byte_perm_S (w4[0], w4[1], selector); + w7[1] = hc_byte_perm_S (w3[3], w4[0], selector); + w7[0] = hc_byte_perm_S (w3[2], w3[3], selector); + w6[3] = hc_byte_perm_S (w3[1], w3[2], selector); + w6[2] = hc_byte_perm_S (w3[0], w3[1], selector); + w6[1] = hc_byte_perm_S (w2[3], w3[0], selector); + w6[0] = hc_byte_perm_S (w2[2], w2[3], selector); + w5[3] = hc_byte_perm_S (w2[1], w2[2], selector); + w5[2] = hc_byte_perm_S (w2[0], w2[1], selector); + w5[1] = hc_byte_perm_S (w1[3], w2[0], selector); + w5[0] = hc_byte_perm_S (w1[2], w1[3], selector); + w4[3] = hc_byte_perm_S (w1[1], w1[2], selector); + w4[2] = hc_byte_perm_S (w1[0], w1[1], selector); + w4[1] = hc_byte_perm_S (w0[3], w1[0], selector); + w4[0] = hc_byte_perm_S (w0[2], w0[3], selector); + w3[3] = hc_byte_perm_S (w0[1], w0[2], selector); + w3[2] = hc_byte_perm_S (w0[0], w0[1], selector); + w3[1] = hc_byte_perm_S ( 0, w0[0], selector); w3[0] = 0; w2[3] = 0; w2[2] = 0; @@ -37013,25 +43404,40 @@ DECLSPEC void switch_buffer_by_offset_8x4_le_S (u32 *w0, u32 *w1, u32 *w2, u32 * break; - case 29: - w7[3] = hc_bytealign_S (w0[1], w0[2], offset); - w7[2] = hc_bytealign_S (w0[0], w0[1], offset); - w7[1] = hc_bytealign_S ( 0, w0[0], offset); - w7[0] = 0; - w6[3] = 0; - w6[2] = 0; - w6[1] = 0; - 
w6[0] = 0; - w5[3] = 0; - w5[2] = 0; - w5[1] = 0; - w5[0] = 0; - w4[3] = 0; - w4[2] = 0; - w4[1] = 0; - w4[0] = 0; - w3[3] = 0; - w3[2] = 0; + case 14: + c3[2] = hc_byte_perm_S (w7[3], 0, selector); + c3[1] = hc_byte_perm_S (w7[2], w7[3], selector); + c3[0] = hc_byte_perm_S (w7[1], w7[2], selector); + c2[3] = hc_byte_perm_S (w7[0], w7[1], selector); + c2[2] = hc_byte_perm_S (w6[3], w7[0], selector); + c2[1] = hc_byte_perm_S (w6[2], w6[3], selector); + c2[0] = hc_byte_perm_S (w6[1], w6[2], selector); + c1[3] = hc_byte_perm_S (w6[0], w6[1], selector); + c1[2] = hc_byte_perm_S (w5[3], w6[0], selector); + c1[1] = hc_byte_perm_S (w5[2], w5[3], selector); + c1[0] = hc_byte_perm_S (w5[1], w5[2], selector); + c0[3] = hc_byte_perm_S (w5[0], w5[1], selector); + c0[2] = hc_byte_perm_S (w4[3], w5[0], selector); + c0[1] = hc_byte_perm_S (w4[2], w4[3], selector); + c0[0] = hc_byte_perm_S (w4[1], w4[2], selector); + w7[3] = hc_byte_perm_S (w4[0], w4[1], selector); + w7[2] = hc_byte_perm_S (w3[3], w4[0], selector); + w7[1] = hc_byte_perm_S (w3[2], w3[3], selector); + w7[0] = hc_byte_perm_S (w3[1], w3[2], selector); + w6[3] = hc_byte_perm_S (w3[0], w3[1], selector); + w6[2] = hc_byte_perm_S (w2[3], w3[0], selector); + w6[1] = hc_byte_perm_S (w2[2], w2[3], selector); + w6[0] = hc_byte_perm_S (w2[1], w2[2], selector); + w5[3] = hc_byte_perm_S (w2[0], w2[1], selector); + w5[2] = hc_byte_perm_S (w1[3], w2[0], selector); + w5[1] = hc_byte_perm_S (w1[2], w1[3], selector); + w5[0] = hc_byte_perm_S (w1[1], w1[2], selector); + w4[3] = hc_byte_perm_S (w1[0], w1[1], selector); + w4[2] = hc_byte_perm_S (w0[3], w1[0], selector); + w4[1] = hc_byte_perm_S (w0[2], w0[3], selector); + w4[0] = hc_byte_perm_S (w0[1], w0[2], selector); + w3[3] = hc_byte_perm_S (w0[0], w0[1], selector); + w3[2] = hc_byte_perm_S ( 0, w0[0], selector); w3[1] = 0; w3[0] = 0; w2[3] = 0; @@ -37049,24 +43455,40 @@ DECLSPEC void switch_buffer_by_offset_8x4_le_S (u32 *w0, u32 *w1, u32 *w2, u32 * break; - case 30: - w7[3] = 
hc_bytealign_S (w0[0], w0[1], offset); - w7[2] = hc_bytealign_S ( 0, w0[0], offset); - w7[1] = 0; - w7[0] = 0; - w6[3] = 0; - w6[2] = 0; - w6[1] = 0; - w6[0] = 0; - w5[3] = 0; - w5[2] = 0; - w5[1] = 0; - w5[0] = 0; - w4[3] = 0; - w4[2] = 0; - w4[1] = 0; - w4[0] = 0; - w3[3] = 0; + case 15: + c3[3] = hc_byte_perm_S (w7[3], 0, selector); + c3[2] = hc_byte_perm_S (w7[2], w7[3], selector); + c3[1] = hc_byte_perm_S (w7[1], w7[2], selector); + c3[0] = hc_byte_perm_S (w7[0], w7[1], selector); + c2[3] = hc_byte_perm_S (w6[3], w7[0], selector); + c2[2] = hc_byte_perm_S (w6[2], w6[3], selector); + c2[1] = hc_byte_perm_S (w6[1], w6[2], selector); + c2[0] = hc_byte_perm_S (w6[0], w6[1], selector); + c1[3] = hc_byte_perm_S (w5[3], w6[0], selector); + c1[2] = hc_byte_perm_S (w5[2], w5[3], selector); + c1[1] = hc_byte_perm_S (w5[1], w5[2], selector); + c1[0] = hc_byte_perm_S (w5[0], w5[1], selector); + c0[3] = hc_byte_perm_S (w4[3], w5[0], selector); + c0[2] = hc_byte_perm_S (w4[2], w4[3], selector); + c0[1] = hc_byte_perm_S (w4[1], w4[2], selector); + c0[0] = hc_byte_perm_S (w4[0], w4[1], selector); + w7[3] = hc_byte_perm_S (w3[3], w4[0], selector); + w7[2] = hc_byte_perm_S (w3[2], w3[3], selector); + w7[1] = hc_byte_perm_S (w3[1], w3[2], selector); + w7[0] = hc_byte_perm_S (w3[0], w3[1], selector); + w6[3] = hc_byte_perm_S (w2[3], w3[0], selector); + w6[2] = hc_byte_perm_S (w2[2], w2[3], selector); + w6[1] = hc_byte_perm_S (w2[1], w2[2], selector); + w6[0] = hc_byte_perm_S (w2[0], w2[1], selector); + w5[3] = hc_byte_perm_S (w1[3], w2[0], selector); + w5[2] = hc_byte_perm_S (w1[2], w1[3], selector); + w5[1] = hc_byte_perm_S (w1[1], w1[2], selector); + w5[0] = hc_byte_perm_S (w1[0], w1[1], selector); + w4[3] = hc_byte_perm_S (w0[3], w1[0], selector); + w4[2] = hc_byte_perm_S (w0[2], w0[3], selector); + w4[1] = hc_byte_perm_S (w0[1], w0[2], selector); + w4[0] = hc_byte_perm_S (w0[0], w0[1], selector); + w3[3] = hc_byte_perm_S ( 0, w0[0], selector); w3[2] = 0; w3[1] = 0; w3[0] = 0; 
@@ -37085,23 +43507,40 @@ DECLSPEC void switch_buffer_by_offset_8x4_le_S (u32 *w0, u32 *w1, u32 *w2, u32 * break; - case 31: - w7[3] = hc_bytealign_S ( 0, w0[0], offset); - w7[2] = 0; - w7[1] = 0; - w7[0] = 0; - w6[3] = 0; - w6[2] = 0; - w6[1] = 0; - w6[0] = 0; - w5[3] = 0; - w5[2] = 0; - w5[1] = 0; - w5[0] = 0; - w4[3] = 0; - w4[2] = 0; - w4[1] = 0; - w4[0] = 0; + case 16: + c4[0] = hc_byte_perm_S (w7[3], 0, selector); + c3[3] = hc_byte_perm_S (w7[2], w7[3], selector); + c3[2] = hc_byte_perm_S (w7[1], w7[2], selector); + c3[1] = hc_byte_perm_S (w7[0], w7[1], selector); + c3[0] = hc_byte_perm_S (w6[3], w7[0], selector); + c2[3] = hc_byte_perm_S (w6[2], w6[3], selector); + c2[2] = hc_byte_perm_S (w6[1], w6[2], selector); + c2[1] = hc_byte_perm_S (w6[0], w6[1], selector); + c2[0] = hc_byte_perm_S (w5[3], w6[0], selector); + c1[3] = hc_byte_perm_S (w5[2], w5[3], selector); + c1[2] = hc_byte_perm_S (w5[1], w5[2], selector); + c1[1] = hc_byte_perm_S (w5[0], w5[1], selector); + c1[0] = hc_byte_perm_S (w4[3], w5[0], selector); + c0[3] = hc_byte_perm_S (w4[2], w4[3], selector); + c0[2] = hc_byte_perm_S (w4[1], w4[2], selector); + c0[1] = hc_byte_perm_S (w4[0], w4[1], selector); + c0[0] = hc_byte_perm_S (w3[3], w4[0], selector); + w7[3] = hc_byte_perm_S (w3[2], w3[3], selector); + w7[2] = hc_byte_perm_S (w3[1], w3[2], selector); + w7[1] = hc_byte_perm_S (w3[0], w3[1], selector); + w7[0] = hc_byte_perm_S (w2[3], w3[0], selector); + w6[3] = hc_byte_perm_S (w2[2], w2[3], selector); + w6[2] = hc_byte_perm_S (w2[1], w2[2], selector); + w6[1] = hc_byte_perm_S (w2[0], w2[1], selector); + w6[0] = hc_byte_perm_S (w1[3], w2[0], selector); + w5[3] = hc_byte_perm_S (w1[2], w1[3], selector); + w5[2] = hc_byte_perm_S (w1[1], w1[2], selector); + w5[1] = hc_byte_perm_S (w1[0], w1[1], selector); + w5[0] = hc_byte_perm_S (w0[3], w1[0], selector); + w4[3] = hc_byte_perm_S (w0[2], w0[3], selector); + w4[2] = hc_byte_perm_S (w0[1], w0[2], selector); + w4[1] = hc_byte_perm_S (w0[0], w0[1], 
selector); + w4[0] = hc_byte_perm_S ( 0, w0[0], selector); w3[3] = 0; w3[2] = 0; w3[1] = 0; @@ -37120,296 +43559,277 @@ DECLSPEC void switch_buffer_by_offset_8x4_le_S (u32 *w0, u32 *w1, u32 *w2, u32 * w0[0] = 0; break; - } - #endif - - #if (defined IS_AMD && HAS_VPERM == 1) || defined IS_NV - - const int offset_mod_4 = offset & 3; - - const int offset_minus_4 = 4 - offset_mod_4; - - #if defined IS_NV - const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; - #endif - - #if defined IS_AMD - const int selector = 0x0706050403020100 >> (offset_minus_4 * 8); - #endif - - switch (offset_switch) - { - case 0: - w7[3] = hc_byte_perm_S (w7[2], w7[3], selector); - w7[2] = hc_byte_perm_S (w7[1], w7[2], selector); - w7[1] = hc_byte_perm_S (w7[0], w7[1], selector); - w7[0] = hc_byte_perm_S (w6[3], w7[0], selector); - w6[3] = hc_byte_perm_S (w6[2], w6[3], selector); - w6[2] = hc_byte_perm_S (w6[1], w6[2], selector); - w6[1] = hc_byte_perm_S (w6[0], w6[1], selector); - w6[0] = hc_byte_perm_S (w5[3], w6[0], selector); - w5[3] = hc_byte_perm_S (w5[2], w5[3], selector); - w5[2] = hc_byte_perm_S (w5[1], w5[2], selector); - w5[1] = hc_byte_perm_S (w5[0], w5[1], selector); - w5[0] = hc_byte_perm_S (w4[3], w5[0], selector); - w4[3] = hc_byte_perm_S (w4[2], w4[3], selector); - w4[2] = hc_byte_perm_S (w4[1], w4[2], selector); - w4[1] = hc_byte_perm_S (w4[0], w4[1], selector); - w4[0] = hc_byte_perm_S (w3[3], w4[0], selector); - w3[3] = hc_byte_perm_S (w3[2], w3[3], selector); - w3[2] = hc_byte_perm_S (w3[1], w3[2], selector); - w3[1] = hc_byte_perm_S (w3[0], w3[1], selector); - w3[0] = hc_byte_perm_S (w2[3], w3[0], selector); - w2[3] = hc_byte_perm_S (w2[2], w2[3], selector); - w2[2] = hc_byte_perm_S (w2[1], w2[2], selector); - w2[1] = hc_byte_perm_S (w2[0], w2[1], selector); - w2[0] = hc_byte_perm_S (w1[3], w2[0], selector); - w1[3] = hc_byte_perm_S (w1[2], w1[3], selector); - w1[2] = hc_byte_perm_S (w1[1], w1[2], selector); - w1[1] = hc_byte_perm_S (w1[0], w1[1], selector); 
- w1[0] = hc_byte_perm_S (w0[3], w1[0], selector); - w0[3] = hc_byte_perm_S (w0[2], w0[3], selector); - w0[2] = hc_byte_perm_S (w0[1], w0[2], selector); - w0[1] = hc_byte_perm_S (w0[0], w0[1], selector); - w0[0] = hc_byte_perm_S ( 0, w0[0], selector); - break; - - case 1: - w7[3] = hc_byte_perm_S (w7[1], w7[2], selector); - w7[2] = hc_byte_perm_S (w7[0], w7[1], selector); - w7[1] = hc_byte_perm_S (w6[3], w7[0], selector); - w7[0] = hc_byte_perm_S (w6[2], w6[3], selector); - w6[3] = hc_byte_perm_S (w6[1], w6[2], selector); - w6[2] = hc_byte_perm_S (w6[0], w6[1], selector); - w6[1] = hc_byte_perm_S (w5[3], w6[0], selector); - w6[0] = hc_byte_perm_S (w5[2], w5[3], selector); - w5[3] = hc_byte_perm_S (w5[1], w5[2], selector); - w5[2] = hc_byte_perm_S (w5[0], w5[1], selector); - w5[1] = hc_byte_perm_S (w4[3], w5[0], selector); - w5[0] = hc_byte_perm_S (w4[2], w4[3], selector); - w4[3] = hc_byte_perm_S (w4[1], w4[2], selector); - w4[2] = hc_byte_perm_S (w4[0], w4[1], selector); - w4[1] = hc_byte_perm_S (w3[3], w4[0], selector); - w4[0] = hc_byte_perm_S (w3[2], w3[3], selector); - w3[3] = hc_byte_perm_S (w3[1], w3[2], selector); - w3[2] = hc_byte_perm_S (w3[0], w3[1], selector); - w3[1] = hc_byte_perm_S (w2[3], w3[0], selector); - w3[0] = hc_byte_perm_S (w2[2], w2[3], selector); - w2[3] = hc_byte_perm_S (w2[1], w2[2], selector); - w2[2] = hc_byte_perm_S (w2[0], w2[1], selector); - w2[1] = hc_byte_perm_S (w1[3], w2[0], selector); - w2[0] = hc_byte_perm_S (w1[2], w1[3], selector); - w1[3] = hc_byte_perm_S (w1[1], w1[2], selector); - w1[2] = hc_byte_perm_S (w1[0], w1[1], selector); - w1[1] = hc_byte_perm_S (w0[3], w1[0], selector); - w1[0] = hc_byte_perm_S (w0[2], w0[3], selector); - w0[3] = hc_byte_perm_S (w0[1], w0[2], selector); - w0[2] = hc_byte_perm_S (w0[0], w0[1], selector); - w0[1] = hc_byte_perm_S ( 0, w0[0], selector); - w0[0] = 0; - break; - - case 2: - w7[3] = hc_byte_perm_S (w7[0], w7[1], selector); - w7[2] = hc_byte_perm_S (w6[3], w7[0], selector); - w7[1] = 
hc_byte_perm_S (w6[2], w6[3], selector); - w7[0] = hc_byte_perm_S (w6[1], w6[2], selector); - w6[3] = hc_byte_perm_S (w6[0], w6[1], selector); - w6[2] = hc_byte_perm_S (w5[3], w6[0], selector); - w6[1] = hc_byte_perm_S (w5[2], w5[3], selector); - w6[0] = hc_byte_perm_S (w5[1], w5[2], selector); - w5[3] = hc_byte_perm_S (w5[0], w5[1], selector); - w5[2] = hc_byte_perm_S (w4[3], w5[0], selector); - w5[1] = hc_byte_perm_S (w4[2], w4[3], selector); - w5[0] = hc_byte_perm_S (w4[1], w4[2], selector); - w4[3] = hc_byte_perm_S (w4[0], w4[1], selector); - w4[2] = hc_byte_perm_S (w3[3], w4[0], selector); - w4[1] = hc_byte_perm_S (w3[2], w3[3], selector); - w4[0] = hc_byte_perm_S (w3[1], w3[2], selector); - w3[3] = hc_byte_perm_S (w3[0], w3[1], selector); - w3[2] = hc_byte_perm_S (w2[3], w3[0], selector); - w3[1] = hc_byte_perm_S (w2[2], w2[3], selector); - w3[0] = hc_byte_perm_S (w2[1], w2[2], selector); - w2[3] = hc_byte_perm_S (w2[0], w2[1], selector); - w2[2] = hc_byte_perm_S (w1[3], w2[0], selector); - w2[1] = hc_byte_perm_S (w1[2], w1[3], selector); - w2[0] = hc_byte_perm_S (w1[1], w1[2], selector); - w1[3] = hc_byte_perm_S (w1[0], w1[1], selector); - w1[2] = hc_byte_perm_S (w0[3], w1[0], selector); - w1[1] = hc_byte_perm_S (w0[2], w0[3], selector); - w1[0] = hc_byte_perm_S (w0[1], w0[2], selector); - w0[3] = hc_byte_perm_S (w0[0], w0[1], selector); - w0[2] = hc_byte_perm_S ( 0, w0[0], selector); - w0[1] = 0; - w0[0] = 0; - break; - case 3: - w7[3] = hc_byte_perm_S (w6[3], w7[0], selector); - w7[2] = hc_byte_perm_S (w6[2], w6[3], selector); - w7[1] = hc_byte_perm_S (w6[1], w6[2], selector); - w7[0] = hc_byte_perm_S (w6[0], w6[1], selector); - w6[3] = hc_byte_perm_S (w5[3], w6[0], selector); - w6[2] = hc_byte_perm_S (w5[2], w5[3], selector); - w6[1] = hc_byte_perm_S (w5[1], w5[2], selector); - w6[0] = hc_byte_perm_S (w5[0], w5[1], selector); - w5[3] = hc_byte_perm_S (w4[3], w5[0], selector); - w5[2] = hc_byte_perm_S (w4[2], w4[3], selector); - w5[1] = hc_byte_perm_S 
(w4[1], w4[2], selector); - w5[0] = hc_byte_perm_S (w4[0], w4[1], selector); - w4[3] = hc_byte_perm_S (w3[3], w4[0], selector); - w4[2] = hc_byte_perm_S (w3[2], w3[3], selector); - w4[1] = hc_byte_perm_S (w3[1], w3[2], selector); - w4[0] = hc_byte_perm_S (w3[0], w3[1], selector); - w3[3] = hc_byte_perm_S (w2[3], w3[0], selector); - w3[2] = hc_byte_perm_S (w2[2], w2[3], selector); - w3[1] = hc_byte_perm_S (w2[1], w2[2], selector); - w3[0] = hc_byte_perm_S (w2[0], w2[1], selector); - w2[3] = hc_byte_perm_S (w1[3], w2[0], selector); - w2[2] = hc_byte_perm_S (w1[2], w1[3], selector); - w2[1] = hc_byte_perm_S (w1[1], w1[2], selector); - w2[0] = hc_byte_perm_S (w1[0], w1[1], selector); - w1[3] = hc_byte_perm_S (w0[3], w1[0], selector); - w1[2] = hc_byte_perm_S (w0[2], w0[3], selector); - w1[1] = hc_byte_perm_S (w0[1], w0[2], selector); - w1[0] = hc_byte_perm_S (w0[0], w0[1], selector); - w0[3] = hc_byte_perm_S ( 0, w0[0], selector); + case 17: + c4[1] = hc_byte_perm_S (w7[3], 0, selector); + c4[0] = hc_byte_perm_S (w7[2], w7[3], selector); + c3[3] = hc_byte_perm_S (w7[1], w7[2], selector); + c3[2] = hc_byte_perm_S (w7[0], w7[1], selector); + c3[1] = hc_byte_perm_S (w6[3], w7[0], selector); + c3[0] = hc_byte_perm_S (w6[2], w6[3], selector); + c2[3] = hc_byte_perm_S (w6[1], w6[2], selector); + c2[2] = hc_byte_perm_S (w6[0], w6[1], selector); + c2[1] = hc_byte_perm_S (w5[3], w6[0], selector); + c2[0] = hc_byte_perm_S (w5[2], w5[3], selector); + c1[3] = hc_byte_perm_S (w5[1], w5[2], selector); + c1[2] = hc_byte_perm_S (w5[0], w5[1], selector); + c1[1] = hc_byte_perm_S (w4[3], w5[0], selector); + c1[0] = hc_byte_perm_S (w4[2], w4[3], selector); + c0[3] = hc_byte_perm_S (w4[1], w4[2], selector); + c0[2] = hc_byte_perm_S (w4[0], w4[1], selector); + c0[1] = hc_byte_perm_S (w3[3], w4[0], selector); + c0[0] = hc_byte_perm_S (w3[2], w3[3], selector); + w7[3] = hc_byte_perm_S (w3[1], w3[2], selector); + w7[2] = hc_byte_perm_S (w3[0], w3[1], selector); + w7[1] = hc_byte_perm_S 
(w2[3], w3[0], selector); + w7[0] = hc_byte_perm_S (w2[2], w2[3], selector); + w6[3] = hc_byte_perm_S (w2[1], w2[2], selector); + w6[2] = hc_byte_perm_S (w2[0], w2[1], selector); + w6[1] = hc_byte_perm_S (w1[3], w2[0], selector); + w6[0] = hc_byte_perm_S (w1[2], w1[3], selector); + w5[3] = hc_byte_perm_S (w1[1], w1[2], selector); + w5[2] = hc_byte_perm_S (w1[0], w1[1], selector); + w5[1] = hc_byte_perm_S (w0[3], w1[0], selector); + w5[0] = hc_byte_perm_S (w0[2], w0[3], selector); + w4[3] = hc_byte_perm_S (w0[1], w0[2], selector); + w4[2] = hc_byte_perm_S (w0[0], w0[1], selector); + w4[1] = hc_byte_perm_S ( 0, w0[0], selector); + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; + break; - case 4: - w7[3] = hc_byte_perm_S (w6[2], w6[3], selector); - w7[2] = hc_byte_perm_S (w6[1], w6[2], selector); - w7[1] = hc_byte_perm_S (w6[0], w6[1], selector); - w7[0] = hc_byte_perm_S (w5[3], w6[0], selector); - w6[3] = hc_byte_perm_S (w5[2], w5[3], selector); - w6[2] = hc_byte_perm_S (w5[1], w5[2], selector); - w6[1] = hc_byte_perm_S (w5[0], w5[1], selector); - w6[0] = hc_byte_perm_S (w4[3], w5[0], selector); - w5[3] = hc_byte_perm_S (w4[2], w4[3], selector); - w5[2] = hc_byte_perm_S (w4[1], w4[2], selector); - w5[1] = hc_byte_perm_S (w4[0], w4[1], selector); - w5[0] = hc_byte_perm_S (w3[3], w4[0], selector); - w4[3] = hc_byte_perm_S (w3[2], w3[3], selector); - w4[2] = hc_byte_perm_S (w3[1], w3[2], selector); - w4[1] = hc_byte_perm_S (w3[0], w3[1], selector); - w4[0] = hc_byte_perm_S (w2[3], w3[0], selector); - w3[3] = hc_byte_perm_S (w2[2], w2[3], selector); - w3[2] = hc_byte_perm_S (w2[1], w2[2], selector); - w3[1] = hc_byte_perm_S (w2[0], w2[1], selector); - w3[0] = hc_byte_perm_S (w1[3], w2[0], selector); - w2[3] = hc_byte_perm_S (w1[2], w1[3], selector); - w2[2] = hc_byte_perm_S (w1[1], w1[2], selector); - w2[1] 
= hc_byte_perm_S (w1[0], w1[1], selector); - w2[0] = hc_byte_perm_S (w0[3], w1[0], selector); - w1[3] = hc_byte_perm_S (w0[2], w0[3], selector); - w1[2] = hc_byte_perm_S (w0[1], w0[2], selector); - w1[1] = hc_byte_perm_S (w0[0], w0[1], selector); - w1[0] = hc_byte_perm_S ( 0, w0[0], selector); + case 18: + c4[2] = hc_byte_perm_S (w7[3], 0, selector); + c4[1] = hc_byte_perm_S (w7[2], w7[3], selector); + c4[0] = hc_byte_perm_S (w7[1], w7[2], selector); + c3[3] = hc_byte_perm_S (w7[0], w7[1], selector); + c3[2] = hc_byte_perm_S (w6[3], w7[0], selector); + c3[1] = hc_byte_perm_S (w6[2], w6[3], selector); + c3[0] = hc_byte_perm_S (w6[1], w6[2], selector); + c2[3] = hc_byte_perm_S (w6[0], w6[1], selector); + c2[2] = hc_byte_perm_S (w5[3], w6[0], selector); + c2[1] = hc_byte_perm_S (w5[2], w5[3], selector); + c2[0] = hc_byte_perm_S (w5[1], w5[2], selector); + c1[3] = hc_byte_perm_S (w5[0], w5[1], selector); + c1[2] = hc_byte_perm_S (w4[3], w5[0], selector); + c1[1] = hc_byte_perm_S (w4[2], w4[3], selector); + c1[0] = hc_byte_perm_S (w4[1], w4[2], selector); + c0[3] = hc_byte_perm_S (w4[0], w4[1], selector); + c0[2] = hc_byte_perm_S (w3[3], w4[0], selector); + c0[1] = hc_byte_perm_S (w3[2], w3[3], selector); + c0[0] = hc_byte_perm_S (w3[1], w3[2], selector); + w7[3] = hc_byte_perm_S (w3[0], w3[1], selector); + w7[2] = hc_byte_perm_S (w2[3], w3[0], selector); + w7[1] = hc_byte_perm_S (w2[2], w2[3], selector); + w7[0] = hc_byte_perm_S (w2[1], w2[2], selector); + w6[3] = hc_byte_perm_S (w2[0], w2[1], selector); + w6[2] = hc_byte_perm_S (w1[3], w2[0], selector); + w6[1] = hc_byte_perm_S (w1[2], w1[3], selector); + w6[0] = hc_byte_perm_S (w1[1], w1[2], selector); + w5[3] = hc_byte_perm_S (w1[0], w1[1], selector); + w5[2] = hc_byte_perm_S (w0[3], w1[0], selector); + w5[1] = hc_byte_perm_S (w0[2], w0[3], selector); + w5[0] = hc_byte_perm_S (w0[1], w0[2], selector); + w4[3] = hc_byte_perm_S (w0[0], w0[1], selector); + w4[2] = hc_byte_perm_S ( 0, w0[0], selector); + w4[1] = 0; + 
w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; + break; - case 5: - w7[3] = hc_byte_perm_S (w6[1], w6[2], selector); - w7[2] = hc_byte_perm_S (w6[0], w6[1], selector); - w7[1] = hc_byte_perm_S (w5[3], w6[0], selector); - w7[0] = hc_byte_perm_S (w5[2], w5[3], selector); - w6[3] = hc_byte_perm_S (w5[1], w5[2], selector); - w6[2] = hc_byte_perm_S (w5[0], w5[1], selector); - w6[1] = hc_byte_perm_S (w4[3], w5[0], selector); - w6[0] = hc_byte_perm_S (w4[2], w4[3], selector); - w5[3] = hc_byte_perm_S (w4[1], w4[2], selector); - w5[2] = hc_byte_perm_S (w4[0], w4[1], selector); - w5[1] = hc_byte_perm_S (w3[3], w4[0], selector); - w5[0] = hc_byte_perm_S (w3[2], w3[3], selector); - w4[3] = hc_byte_perm_S (w3[1], w3[2], selector); - w4[2] = hc_byte_perm_S (w3[0], w3[1], selector); - w4[1] = hc_byte_perm_S (w2[3], w3[0], selector); - w4[0] = hc_byte_perm_S (w2[2], w2[3], selector); - w3[3] = hc_byte_perm_S (w2[1], w2[2], selector); - w3[2] = hc_byte_perm_S (w2[0], w2[1], selector); - w3[1] = hc_byte_perm_S (w1[3], w2[0], selector); - w3[0] = hc_byte_perm_S (w1[2], w1[3], selector); - w2[3] = hc_byte_perm_S (w1[1], w1[2], selector); - w2[2] = hc_byte_perm_S (w1[0], w1[1], selector); - w2[1] = hc_byte_perm_S (w0[3], w1[0], selector); - w2[0] = hc_byte_perm_S (w0[2], w0[3], selector); - w1[3] = hc_byte_perm_S (w0[1], w0[2], selector); - w1[2] = hc_byte_perm_S (w0[0], w0[1], selector); - w1[1] = hc_byte_perm_S ( 0, w0[0], selector); + case 19: + c4[3] = hc_byte_perm_S (w7[3], 0, selector); + c4[2] = hc_byte_perm_S (w7[2], w7[3], selector); + c4[1] = hc_byte_perm_S (w7[1], w7[2], selector); + c4[0] = hc_byte_perm_S (w7[0], w7[1], selector); + c3[3] = hc_byte_perm_S (w6[3], w7[0], selector); + c3[2] = hc_byte_perm_S (w6[2], w6[3], selector); + c3[1] = hc_byte_perm_S (w6[1], w6[2], selector); + c3[0] = hc_byte_perm_S 
(w6[0], w6[1], selector); + c2[3] = hc_byte_perm_S (w5[3], w6[0], selector); + c2[2] = hc_byte_perm_S (w5[2], w5[3], selector); + c2[1] = hc_byte_perm_S (w5[1], w5[2], selector); + c2[0] = hc_byte_perm_S (w5[0], w5[1], selector); + c1[3] = hc_byte_perm_S (w4[3], w5[0], selector); + c1[2] = hc_byte_perm_S (w4[2], w4[3], selector); + c1[1] = hc_byte_perm_S (w4[1], w4[2], selector); + c1[0] = hc_byte_perm_S (w4[0], w4[1], selector); + c0[3] = hc_byte_perm_S (w3[3], w4[0], selector); + c0[2] = hc_byte_perm_S (w3[2], w3[3], selector); + c0[1] = hc_byte_perm_S (w3[1], w3[2], selector); + c0[0] = hc_byte_perm_S (w3[0], w3[1], selector); + w7[3] = hc_byte_perm_S (w2[3], w3[0], selector); + w7[2] = hc_byte_perm_S (w2[2], w2[3], selector); + w7[1] = hc_byte_perm_S (w2[1], w2[2], selector); + w7[0] = hc_byte_perm_S (w2[0], w2[1], selector); + w6[3] = hc_byte_perm_S (w1[3], w2[0], selector); + w6[2] = hc_byte_perm_S (w1[2], w1[3], selector); + w6[1] = hc_byte_perm_S (w1[1], w1[2], selector); + w6[0] = hc_byte_perm_S (w1[0], w1[1], selector); + w5[3] = hc_byte_perm_S (w0[3], w1[0], selector); + w5[2] = hc_byte_perm_S (w0[2], w0[3], selector); + w5[1] = hc_byte_perm_S (w0[1], w0[2], selector); + w5[0] = hc_byte_perm_S (w0[0], w0[1], selector); + w4[3] = hc_byte_perm_S ( 0, w0[0], selector); + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; + break; - case 6: - w7[3] = hc_byte_perm_S (w6[0], w6[1], selector); - w7[2] = hc_byte_perm_S (w5[3], w6[0], selector); - w7[1] = hc_byte_perm_S (w5[2], w5[3], selector); - w7[0] = hc_byte_perm_S (w5[1], w5[2], selector); - w6[3] = hc_byte_perm_S (w5[0], w5[1], selector); - w6[2] = hc_byte_perm_S (w4[3], w5[0], selector); - w6[1] = hc_byte_perm_S (w4[2], w4[3], selector); - w6[0] = hc_byte_perm_S (w4[1], w4[2], selector); - w5[3] = hc_byte_perm_S (w4[0], 
w4[1], selector); - w5[2] = hc_byte_perm_S (w3[3], w4[0], selector); - w5[1] = hc_byte_perm_S (w3[2], w3[3], selector); - w5[0] = hc_byte_perm_S (w3[1], w3[2], selector); - w4[3] = hc_byte_perm_S (w3[0], w3[1], selector); - w4[2] = hc_byte_perm_S (w2[3], w3[0], selector); - w4[1] = hc_byte_perm_S (w2[2], w2[3], selector); - w4[0] = hc_byte_perm_S (w2[1], w2[2], selector); - w3[3] = hc_byte_perm_S (w2[0], w2[1], selector); - w3[2] = hc_byte_perm_S (w1[3], w2[0], selector); - w3[1] = hc_byte_perm_S (w1[2], w1[3], selector); - w3[0] = hc_byte_perm_S (w1[1], w1[2], selector); - w2[3] = hc_byte_perm_S (w1[0], w1[1], selector); - w2[2] = hc_byte_perm_S (w0[3], w1[0], selector); - w2[1] = hc_byte_perm_S (w0[2], w0[3], selector); - w2[0] = hc_byte_perm_S (w0[1], w0[2], selector); - w1[3] = hc_byte_perm_S (w0[0], w0[1], selector); - w1[2] = hc_byte_perm_S ( 0, w0[0], selector); + case 20: + c5[0] = hc_byte_perm_S (w7[3], 0, selector); + c4[3] = hc_byte_perm_S (w7[2], w7[3], selector); + c4[2] = hc_byte_perm_S (w7[1], w7[2], selector); + c4[1] = hc_byte_perm_S (w7[0], w7[1], selector); + c4[0] = hc_byte_perm_S (w6[3], w7[0], selector); + c3[3] = hc_byte_perm_S (w6[2], w6[3], selector); + c3[2] = hc_byte_perm_S (w6[1], w6[2], selector); + c3[1] = hc_byte_perm_S (w6[0], w6[1], selector); + c3[0] = hc_byte_perm_S (w5[3], w6[0], selector); + c2[3] = hc_byte_perm_S (w5[2], w5[3], selector); + c2[2] = hc_byte_perm_S (w5[1], w5[2], selector); + c2[1] = hc_byte_perm_S (w5[0], w5[1], selector); + c2[0] = hc_byte_perm_S (w4[3], w5[0], selector); + c1[3] = hc_byte_perm_S (w4[2], w4[3], selector); + c1[2] = hc_byte_perm_S (w4[1], w4[2], selector); + c1[1] = hc_byte_perm_S (w4[0], w4[1], selector); + c1[0] = hc_byte_perm_S (w3[3], w4[0], selector); + c0[3] = hc_byte_perm_S (w3[2], w3[3], selector); + c0[2] = hc_byte_perm_S (w3[1], w3[2], selector); + c0[1] = hc_byte_perm_S (w3[0], w3[1], selector); + c0[0] = hc_byte_perm_S (w2[3], w3[0], selector); + w7[3] = hc_byte_perm_S (w2[2], w2[3], 
selector); + w7[2] = hc_byte_perm_S (w2[1], w2[2], selector); + w7[1] = hc_byte_perm_S (w2[0], w2[1], selector); + w7[0] = hc_byte_perm_S (w1[3], w2[0], selector); + w6[3] = hc_byte_perm_S (w1[2], w1[3], selector); + w6[2] = hc_byte_perm_S (w1[1], w1[2], selector); + w6[1] = hc_byte_perm_S (w1[0], w1[1], selector); + w6[0] = hc_byte_perm_S (w0[3], w1[0], selector); + w5[3] = hc_byte_perm_S (w0[2], w0[3], selector); + w5[2] = hc_byte_perm_S (w0[1], w0[2], selector); + w5[1] = hc_byte_perm_S (w0[0], w0[1], selector); + w5[0] = hc_byte_perm_S ( 0, w0[0], selector); + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; + break; - case 7: - w7[3] = hc_byte_perm_S (w5[3], w6[0], selector); - w7[2] = hc_byte_perm_S (w5[2], w5[3], selector); - w7[1] = hc_byte_perm_S (w5[1], w5[2], selector); - w7[0] = hc_byte_perm_S (w5[0], w5[1], selector); - w6[3] = hc_byte_perm_S (w4[3], w5[0], selector); - w6[2] = hc_byte_perm_S (w4[2], w4[3], selector); - w6[1] = hc_byte_perm_S (w4[1], w4[2], selector); - w6[0] = hc_byte_perm_S (w4[0], w4[1], selector); - w5[3] = hc_byte_perm_S (w3[3], w4[0], selector); - w5[2] = hc_byte_perm_S (w3[2], w3[3], selector); - w5[1] = hc_byte_perm_S (w3[1], w3[2], selector); - w5[0] = hc_byte_perm_S (w3[0], w3[1], selector); - w4[3] = hc_byte_perm_S (w2[3], w3[0], selector); - w4[2] = hc_byte_perm_S (w2[2], w2[3], selector); - w4[1] = hc_byte_perm_S (w2[1], w2[2], selector); - w4[0] = hc_byte_perm_S (w2[0], w2[1], selector); - w3[3] = hc_byte_perm_S (w1[3], w2[0], selector); - w3[2] = hc_byte_perm_S (w1[2], w1[3], selector); - w3[1] = hc_byte_perm_S (w1[1], w1[2], selector); - w3[0] = hc_byte_perm_S (w1[0], w1[1], selector); - w2[3] = hc_byte_perm_S (w0[3], w1[0], selector); - w2[2] = hc_byte_perm_S (w0[2], w0[3], selector); - w2[1] = hc_byte_perm_S (w0[1], w0[2], 
selector); - w2[0] = hc_byte_perm_S (w0[0], w0[1], selector); - w1[3] = hc_byte_perm_S ( 0, w0[0], selector); + case 21: + c5[1] = hc_byte_perm_S (w7[3], 0, selector); + c5[0] = hc_byte_perm_S (w7[2], w7[3], selector); + c4[3] = hc_byte_perm_S (w7[1], w7[2], selector); + c4[2] = hc_byte_perm_S (w7[0], w7[1], selector); + c4[1] = hc_byte_perm_S (w6[3], w7[0], selector); + c4[0] = hc_byte_perm_S (w6[2], w6[3], selector); + c3[3] = hc_byte_perm_S (w6[1], w6[2], selector); + c3[2] = hc_byte_perm_S (w6[0], w6[1], selector); + c3[1] = hc_byte_perm_S (w5[3], w6[0], selector); + c3[0] = hc_byte_perm_S (w5[2], w5[3], selector); + c2[3] = hc_byte_perm_S (w5[1], w5[2], selector); + c2[2] = hc_byte_perm_S (w5[0], w5[1], selector); + c2[1] = hc_byte_perm_S (w4[3], w5[0], selector); + c2[0] = hc_byte_perm_S (w4[2], w4[3], selector); + c1[3] = hc_byte_perm_S (w4[1], w4[2], selector); + c1[2] = hc_byte_perm_S (w4[0], w4[1], selector); + c1[1] = hc_byte_perm_S (w3[3], w4[0], selector); + c1[0] = hc_byte_perm_S (w3[2], w3[3], selector); + c0[3] = hc_byte_perm_S (w3[1], w3[2], selector); + c0[2] = hc_byte_perm_S (w3[0], w3[1], selector); + c0[1] = hc_byte_perm_S (w2[3], w3[0], selector); + c0[0] = hc_byte_perm_S (w2[2], w2[3], selector); + w7[3] = hc_byte_perm_S (w2[1], w2[2], selector); + w7[2] = hc_byte_perm_S (w2[0], w2[1], selector); + w7[1] = hc_byte_perm_S (w1[3], w2[0], selector); + w7[0] = hc_byte_perm_S (w1[2], w1[3], selector); + w6[3] = hc_byte_perm_S (w1[1], w1[2], selector); + w6[2] = hc_byte_perm_S (w1[0], w1[1], selector); + w6[1] = hc_byte_perm_S (w0[3], w1[0], selector); + w6[0] = hc_byte_perm_S (w0[2], w0[3], selector); + w5[3] = hc_byte_perm_S (w0[1], w0[2], selector); + w5[2] = hc_byte_perm_S (w0[0], w0[1], selector); + w5[1] = hc_byte_perm_S ( 0, w0[0], selector); + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; w1[2] = 0; w1[1] = 
0; w1[0] = 0; @@ -37417,33 +43837,57 @@ DECLSPEC void switch_buffer_by_offset_8x4_le_S (u32 *w0, u32 *w1, u32 *w2, u32 * w0[2] = 0; w0[1] = 0; w0[0] = 0; + break; - case 8: - w7[3] = hc_byte_perm_S (w5[2], w5[3], selector); - w7[2] = hc_byte_perm_S (w5[1], w5[2], selector); - w7[1] = hc_byte_perm_S (w5[0], w5[1], selector); - w7[0] = hc_byte_perm_S (w4[3], w5[0], selector); - w6[3] = hc_byte_perm_S (w4[2], w4[3], selector); - w6[2] = hc_byte_perm_S (w4[1], w4[2], selector); - w6[1] = hc_byte_perm_S (w4[0], w4[1], selector); - w6[0] = hc_byte_perm_S (w3[3], w4[0], selector); - w5[3] = hc_byte_perm_S (w3[2], w3[3], selector); - w5[2] = hc_byte_perm_S (w3[1], w3[2], selector); - w5[1] = hc_byte_perm_S (w3[0], w3[1], selector); - w5[0] = hc_byte_perm_S (w2[3], w3[0], selector); - w4[3] = hc_byte_perm_S (w2[2], w2[3], selector); - w4[2] = hc_byte_perm_S (w2[1], w2[2], selector); - w4[1] = hc_byte_perm_S (w2[0], w2[1], selector); - w4[0] = hc_byte_perm_S (w1[3], w2[0], selector); - w3[3] = hc_byte_perm_S (w1[2], w1[3], selector); - w3[2] = hc_byte_perm_S (w1[1], w1[2], selector); - w3[1] = hc_byte_perm_S (w1[0], w1[1], selector); - w3[0] = hc_byte_perm_S (w0[3], w1[0], selector); - w2[3] = hc_byte_perm_S (w0[2], w0[3], selector); - w2[2] = hc_byte_perm_S (w0[1], w0[2], selector); - w2[1] = hc_byte_perm_S (w0[0], w0[1], selector); - w2[0] = hc_byte_perm_S ( 0, w0[0], selector); + case 22: + c5[2] = hc_byte_perm_S (w7[3], 0, selector); + c5[1] = hc_byte_perm_S (w7[2], w7[3], selector); + c5[0] = hc_byte_perm_S (w7[1], w7[2], selector); + c4[3] = hc_byte_perm_S (w7[0], w7[1], selector); + c4[2] = hc_byte_perm_S (w6[3], w7[0], selector); + c4[1] = hc_byte_perm_S (w6[2], w6[3], selector); + c4[0] = hc_byte_perm_S (w6[1], w6[2], selector); + c3[3] = hc_byte_perm_S (w6[0], w6[1], selector); + c3[2] = hc_byte_perm_S (w5[3], w6[0], selector); + c3[1] = hc_byte_perm_S (w5[2], w5[3], selector); + c3[0] = hc_byte_perm_S (w5[1], w5[2], selector); + c2[3] = hc_byte_perm_S (w5[0], 
w5[1], selector); + c2[2] = hc_byte_perm_S (w4[3], w5[0], selector); + c2[1] = hc_byte_perm_S (w4[2], w4[3], selector); + c2[0] = hc_byte_perm_S (w4[1], w4[2], selector); + c1[3] = hc_byte_perm_S (w4[0], w4[1], selector); + c1[2] = hc_byte_perm_S (w3[3], w4[0], selector); + c1[1] = hc_byte_perm_S (w3[2], w3[3], selector); + c1[0] = hc_byte_perm_S (w3[1], w3[2], selector); + c0[3] = hc_byte_perm_S (w3[0], w3[1], selector); + c0[2] = hc_byte_perm_S (w2[3], w3[0], selector); + c0[1] = hc_byte_perm_S (w2[2], w2[3], selector); + c0[0] = hc_byte_perm_S (w2[1], w2[2], selector); + w7[3] = hc_byte_perm_S (w2[0], w2[1], selector); + w7[2] = hc_byte_perm_S (w1[3], w2[0], selector); + w7[1] = hc_byte_perm_S (w1[2], w1[3], selector); + w7[0] = hc_byte_perm_S (w1[1], w1[2], selector); + w6[3] = hc_byte_perm_S (w1[0], w1[1], selector); + w6[2] = hc_byte_perm_S (w0[3], w1[0], selector); + w6[1] = hc_byte_perm_S (w0[2], w0[3], selector); + w6[0] = hc_byte_perm_S (w0[1], w0[2], selector); + w5[3] = hc_byte_perm_S (w0[0], w0[1], selector); + w5[2] = hc_byte_perm_S ( 0, w0[0], selector); + w5[1] = 0; + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; @@ -37452,32 +43896,180 @@ DECLSPEC void switch_buffer_by_offset_8x4_le_S (u32 *w0, u32 *w1, u32 *w2, u32 * w0[2] = 0; w0[1] = 0; w0[0] = 0; + break; - case 9: - w7[3] = hc_byte_perm_S (w5[1], w5[2], selector); - w7[2] = hc_byte_perm_S (w5[0], w5[1], selector); - w7[1] = hc_byte_perm_S (w4[3], w5[0], selector); - w7[0] = hc_byte_perm_S (w4[2], w4[3], selector); - w6[3] = hc_byte_perm_S (w4[1], w4[2], selector); - w6[2] = hc_byte_perm_S (w4[0], w4[1], selector); - w6[1] = hc_byte_perm_S (w3[3], w4[0], selector); - w6[0] = hc_byte_perm_S (w3[2], w3[3], selector); - w5[3] = hc_byte_perm_S (w3[1], w3[2], selector); - w5[2] = hc_byte_perm_S (w3[0], w3[1], selector); - w5[1] = hc_byte_perm_S 
(w2[3], w3[0], selector); - w5[0] = hc_byte_perm_S (w2[2], w2[3], selector); - w4[3] = hc_byte_perm_S (w2[1], w2[2], selector); - w4[2] = hc_byte_perm_S (w2[0], w2[1], selector); - w4[1] = hc_byte_perm_S (w1[3], w2[0], selector); - w4[0] = hc_byte_perm_S (w1[2], w1[3], selector); - w3[3] = hc_byte_perm_S (w1[1], w1[2], selector); - w3[2] = hc_byte_perm_S (w1[0], w1[1], selector); - w3[1] = hc_byte_perm_S (w0[3], w1[0], selector); - w3[0] = hc_byte_perm_S (w0[2], w0[3], selector); - w2[3] = hc_byte_perm_S (w0[1], w0[2], selector); - w2[2] = hc_byte_perm_S (w0[0], w0[1], selector); - w2[1] = hc_byte_perm_S ( 0, w0[0], selector); + case 23: + c5[3] = hc_byte_perm_S (w7[3], 0, selector); + c5[2] = hc_byte_perm_S (w7[2], w7[3], selector); + c5[1] = hc_byte_perm_S (w7[1], w7[2], selector); + c5[0] = hc_byte_perm_S (w7[0], w7[1], selector); + c4[3] = hc_byte_perm_S (w6[3], w7[0], selector); + c4[2] = hc_byte_perm_S (w6[2], w6[3], selector); + c4[1] = hc_byte_perm_S (w6[1], w6[2], selector); + c4[0] = hc_byte_perm_S (w6[0], w6[1], selector); + c3[3] = hc_byte_perm_S (w5[3], w6[0], selector); + c3[2] = hc_byte_perm_S (w5[2], w5[3], selector); + c3[1] = hc_byte_perm_S (w5[1], w5[2], selector); + c3[0] = hc_byte_perm_S (w5[0], w5[1], selector); + c2[3] = hc_byte_perm_S (w4[3], w5[0], selector); + c2[2] = hc_byte_perm_S (w4[2], w4[3], selector); + c2[1] = hc_byte_perm_S (w4[1], w4[2], selector); + c2[0] = hc_byte_perm_S (w4[0], w4[1], selector); + c1[3] = hc_byte_perm_S (w3[3], w4[0], selector); + c1[2] = hc_byte_perm_S (w3[2], w3[3], selector); + c1[1] = hc_byte_perm_S (w3[1], w3[2], selector); + c1[0] = hc_byte_perm_S (w3[0], w3[1], selector); + c0[3] = hc_byte_perm_S (w2[3], w3[0], selector); + c0[2] = hc_byte_perm_S (w2[2], w2[3], selector); + c0[1] = hc_byte_perm_S (w2[1], w2[2], selector); + c0[0] = hc_byte_perm_S (w2[0], w2[1], selector); + w7[3] = hc_byte_perm_S (w1[3], w2[0], selector); + w7[2] = hc_byte_perm_S (w1[2], w1[3], selector); + w7[1] = hc_byte_perm_S 
(w1[1], w1[2], selector); + w7[0] = hc_byte_perm_S (w1[0], w1[1], selector); + w6[3] = hc_byte_perm_S (w0[3], w1[0], selector); + w6[2] = hc_byte_perm_S (w0[2], w0[3], selector); + w6[1] = hc_byte_perm_S (w0[1], w0[2], selector); + w6[0] = hc_byte_perm_S (w0[0], w0[1], selector); + w5[3] = hc_byte_perm_S ( 0, w0[0], selector); + w5[2] = 0; + w5[1] = 0; + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 24: + c6[0] = hc_byte_perm_S (w7[3], 0, selector); + c5[3] = hc_byte_perm_S (w7[2], w7[3], selector); + c5[2] = hc_byte_perm_S (w7[1], w7[2], selector); + c5[1] = hc_byte_perm_S (w7[0], w7[1], selector); + c5[0] = hc_byte_perm_S (w6[3], w7[0], selector); + c4[3] = hc_byte_perm_S (w6[2], w6[3], selector); + c4[2] = hc_byte_perm_S (w6[1], w6[2], selector); + c4[1] = hc_byte_perm_S (w6[0], w6[1], selector); + c4[0] = hc_byte_perm_S (w5[3], w6[0], selector); + c3[3] = hc_byte_perm_S (w5[2], w5[3], selector); + c3[2] = hc_byte_perm_S (w5[1], w5[2], selector); + c3[1] = hc_byte_perm_S (w5[0], w5[1], selector); + c3[0] = hc_byte_perm_S (w4[3], w5[0], selector); + c2[3] = hc_byte_perm_S (w4[2], w4[3], selector); + c2[2] = hc_byte_perm_S (w4[1], w4[2], selector); + c2[1] = hc_byte_perm_S (w4[0], w4[1], selector); + c2[0] = hc_byte_perm_S (w3[3], w4[0], selector); + c1[3] = hc_byte_perm_S (w3[2], w3[3], selector); + c1[2] = hc_byte_perm_S (w3[1], w3[2], selector); + c1[1] = hc_byte_perm_S (w3[0], w3[1], selector); + c1[0] = hc_byte_perm_S (w2[3], w3[0], selector); + c0[3] = hc_byte_perm_S (w2[2], w2[3], selector); + c0[2] = hc_byte_perm_S (w2[1], w2[2], selector); + c0[1] = hc_byte_perm_S (w2[0], w2[1], selector); + c0[0] = hc_byte_perm_S (w1[3], w2[0], selector); + w7[3] = hc_byte_perm_S (w1[2], w1[3], selector); + w7[2] = hc_byte_perm_S 
(w1[1], w1[2], selector); + w7[1] = hc_byte_perm_S (w1[0], w1[1], selector); + w7[0] = hc_byte_perm_S (w0[3], w1[0], selector); + w6[3] = hc_byte_perm_S (w0[2], w0[3], selector); + w6[2] = hc_byte_perm_S (w0[1], w0[2], selector); + w6[1] = hc_byte_perm_S (w0[0], w0[1], selector); + w6[0] = hc_byte_perm_S ( 0, w0[0], selector); + w5[3] = 0; + w5[2] = 0; + w5[1] = 0; + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 25: + c6[1] = hc_byte_perm_S (w7[3], 0, selector); + c6[0] = hc_byte_perm_S (w7[2], w7[3], selector); + c5[3] = hc_byte_perm_S (w7[1], w7[2], selector); + c5[2] = hc_byte_perm_S (w7[0], w7[1], selector); + c5[1] = hc_byte_perm_S (w6[3], w7[0], selector); + c5[0] = hc_byte_perm_S (w6[2], w6[3], selector); + c4[3] = hc_byte_perm_S (w6[1], w6[2], selector); + c4[2] = hc_byte_perm_S (w6[0], w6[1], selector); + c4[1] = hc_byte_perm_S (w5[3], w6[0], selector); + c4[0] = hc_byte_perm_S (w5[2], w5[3], selector); + c3[3] = hc_byte_perm_S (w5[1], w5[2], selector); + c3[2] = hc_byte_perm_S (w5[0], w5[1], selector); + c3[1] = hc_byte_perm_S (w4[3], w5[0], selector); + c3[0] = hc_byte_perm_S (w4[2], w4[3], selector); + c2[3] = hc_byte_perm_S (w4[1], w4[2], selector); + c2[2] = hc_byte_perm_S (w4[0], w4[1], selector); + c2[1] = hc_byte_perm_S (w3[3], w4[0], selector); + c2[0] = hc_byte_perm_S (w3[2], w3[3], selector); + c1[3] = hc_byte_perm_S (w3[1], w3[2], selector); + c1[2] = hc_byte_perm_S (w3[0], w3[1], selector); + c1[1] = hc_byte_perm_S (w2[3], w3[0], selector); + c1[0] = hc_byte_perm_S (w2[2], w2[3], selector); + c0[3] = hc_byte_perm_S (w2[1], w2[2], selector); + c0[2] = hc_byte_perm_S (w2[0], w2[1], selector); + c0[1] = hc_byte_perm_S (w1[3], w2[0], selector); + c0[0] = hc_byte_perm_S (w1[2], w1[3], selector); + w7[3] = 
hc_byte_perm_S (w1[1], w1[2], selector); + w7[2] = hc_byte_perm_S (w1[0], w1[1], selector); + w7[1] = hc_byte_perm_S (w0[3], w1[0], selector); + w7[0] = hc_byte_perm_S (w0[2], w0[3], selector); + w6[3] = hc_byte_perm_S (w0[1], w0[2], selector); + w6[2] = hc_byte_perm_S (w0[0], w0[1], selector); + w6[1] = hc_byte_perm_S ( 0, w0[0], selector); + w6[0] = 0; + w5[3] = 0; + w5[2] = 0; + w5[1] = 0; + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; @@ -37487,31 +44079,59 @@ DECLSPEC void switch_buffer_by_offset_8x4_le_S (u32 *w0, u32 *w1, u32 *w2, u32 * w0[2] = 0; w0[1] = 0; w0[0] = 0; + break; - case 10: - w7[3] = hc_byte_perm_S (w5[0], w5[1], selector); - w7[2] = hc_byte_perm_S (w4[3], w5[0], selector); - w7[1] = hc_byte_perm_S (w4[2], w4[3], selector); - w7[0] = hc_byte_perm_S (w4[1], w4[2], selector); - w6[3] = hc_byte_perm_S (w4[0], w4[1], selector); - w6[2] = hc_byte_perm_S (w3[3], w4[0], selector); - w6[1] = hc_byte_perm_S (w3[2], w3[3], selector); - w6[0] = hc_byte_perm_S (w3[1], w3[2], selector); - w5[3] = hc_byte_perm_S (w3[0], w3[1], selector); - w5[2] = hc_byte_perm_S (w2[3], w3[0], selector); - w5[1] = hc_byte_perm_S (w2[2], w2[3], selector); - w5[0] = hc_byte_perm_S (w2[1], w2[2], selector); - w4[3] = hc_byte_perm_S (w2[0], w2[1], selector); - w4[2] = hc_byte_perm_S (w1[3], w2[0], selector); - w4[1] = hc_byte_perm_S (w1[2], w1[3], selector); - w4[0] = hc_byte_perm_S (w1[1], w1[2], selector); - w3[3] = hc_byte_perm_S (w1[0], w1[1], selector); - w3[2] = hc_byte_perm_S (w0[3], w1[0], selector); - w3[1] = hc_byte_perm_S (w0[2], w0[3], selector); - w3[0] = hc_byte_perm_S (w0[1], w0[2], selector); - w2[3] = hc_byte_perm_S (w0[0], w0[1], selector); - w2[2] = hc_byte_perm_S ( 0, w0[0], selector); + case 26: + c6[2] = hc_byte_perm_S (w7[3], 0, selector); + c6[1] = hc_byte_perm_S (w7[2], w7[3], selector); + c6[0] = hc_byte_perm_S 
(w7[1], w7[2], selector); + c5[3] = hc_byte_perm_S (w7[0], w7[1], selector); + c5[2] = hc_byte_perm_S (w6[3], w7[0], selector); + c5[1] = hc_byte_perm_S (w6[2], w6[3], selector); + c5[0] = hc_byte_perm_S (w6[1], w6[2], selector); + c4[3] = hc_byte_perm_S (w6[0], w6[1], selector); + c4[2] = hc_byte_perm_S (w5[3], w6[0], selector); + c4[1] = hc_byte_perm_S (w5[2], w5[3], selector); + c4[0] = hc_byte_perm_S (w5[1], w5[2], selector); + c3[3] = hc_byte_perm_S (w5[0], w5[1], selector); + c3[2] = hc_byte_perm_S (w4[3], w5[0], selector); + c3[1] = hc_byte_perm_S (w4[2], w4[3], selector); + c3[0] = hc_byte_perm_S (w4[1], w4[2], selector); + c2[3] = hc_byte_perm_S (w4[0], w4[1], selector); + c2[2] = hc_byte_perm_S (w3[3], w4[0], selector); + c2[1] = hc_byte_perm_S (w3[2], w3[3], selector); + c2[0] = hc_byte_perm_S (w3[1], w3[2], selector); + c1[3] = hc_byte_perm_S (w3[0], w3[1], selector); + c1[2] = hc_byte_perm_S (w2[3], w3[0], selector); + c1[1] = hc_byte_perm_S (w2[2], w2[3], selector); + c1[0] = hc_byte_perm_S (w2[1], w2[2], selector); + c0[3] = hc_byte_perm_S (w2[0], w2[1], selector); + c0[2] = hc_byte_perm_S (w1[3], w2[0], selector); + c0[1] = hc_byte_perm_S (w1[2], w1[3], selector); + c0[0] = hc_byte_perm_S (w1[1], w1[2], selector); + w7[3] = hc_byte_perm_S (w1[0], w1[1], selector); + w7[2] = hc_byte_perm_S (w0[3], w1[0], selector); + w7[1] = hc_byte_perm_S (w0[2], w0[3], selector); + w7[0] = hc_byte_perm_S (w0[1], w0[2], selector); + w6[3] = hc_byte_perm_S (w0[0], w0[1], selector); + w6[2] = hc_byte_perm_S ( 0, w0[0], selector); + w6[1] = 0; + w6[0] = 0; + w5[3] = 0; + w5[2] = 0; + w5[1] = 0; + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; @@ -37522,30 +44142,59 @@ DECLSPEC void switch_buffer_by_offset_8x4_le_S (u32 *w0, u32 *w1, u32 *w2, u32 * w0[2] = 0; w0[1] = 0; w0[0] = 0; + break; - case 11: - w7[3] = hc_byte_perm_S (w4[3], w5[0], 
selector); - w7[2] = hc_byte_perm_S (w4[2], w4[3], selector); - w7[1] = hc_byte_perm_S (w4[1], w4[2], selector); - w7[0] = hc_byte_perm_S (w4[0], w4[1], selector); - w6[3] = hc_byte_perm_S (w3[3], w4[0], selector); - w6[2] = hc_byte_perm_S (w3[2], w3[3], selector); - w6[1] = hc_byte_perm_S (w3[1], w3[2], selector); - w6[0] = hc_byte_perm_S (w3[0], w3[1], selector); - w5[3] = hc_byte_perm_S (w2[3], w3[0], selector); - w5[2] = hc_byte_perm_S (w2[2], w2[3], selector); - w5[1] = hc_byte_perm_S (w2[1], w2[2], selector); - w5[0] = hc_byte_perm_S (w2[0], w2[1], selector); - w4[3] = hc_byte_perm_S (w1[3], w2[0], selector); - w4[2] = hc_byte_perm_S (w1[2], w1[3], selector); - w4[1] = hc_byte_perm_S (w1[1], w1[2], selector); - w4[0] = hc_byte_perm_S (w1[0], w1[1], selector); - w3[3] = hc_byte_perm_S (w0[3], w1[0], selector); - w3[2] = hc_byte_perm_S (w0[2], w0[3], selector); - w3[1] = hc_byte_perm_S (w0[1], w0[2], selector); - w3[0] = hc_byte_perm_S (w0[0], w0[1], selector); - w2[3] = hc_byte_perm_S ( 0, w0[0], selector); + case 27: + c6[3] = hc_byte_perm_S (w7[3], 0, selector); + c6[2] = hc_byte_perm_S (w7[2], w7[3], selector); + c6[1] = hc_byte_perm_S (w7[1], w7[2], selector); + c6[0] = hc_byte_perm_S (w7[0], w7[1], selector); + c5[3] = hc_byte_perm_S (w6[3], w7[0], selector); + c5[2] = hc_byte_perm_S (w6[2], w6[3], selector); + c5[1] = hc_byte_perm_S (w6[1], w6[2], selector); + c5[0] = hc_byte_perm_S (w6[0], w6[1], selector); + c4[3] = hc_byte_perm_S (w5[3], w6[0], selector); + c4[2] = hc_byte_perm_S (w5[2], w5[3], selector); + c4[1] = hc_byte_perm_S (w5[1], w5[2], selector); + c4[0] = hc_byte_perm_S (w5[0], w5[1], selector); + c3[3] = hc_byte_perm_S (w4[3], w5[0], selector); + c3[2] = hc_byte_perm_S (w4[2], w4[3], selector); + c3[1] = hc_byte_perm_S (w4[1], w4[2], selector); + c3[0] = hc_byte_perm_S (w4[0], w4[1], selector); + c2[3] = hc_byte_perm_S (w3[3], w4[0], selector); + c2[2] = hc_byte_perm_S (w3[2], w3[3], selector); + c2[1] = hc_byte_perm_S (w3[1], w3[2], 
selector); + c2[0] = hc_byte_perm_S (w3[0], w3[1], selector); + c1[3] = hc_byte_perm_S (w2[3], w3[0], selector); + c1[2] = hc_byte_perm_S (w2[2], w2[3], selector); + c1[1] = hc_byte_perm_S (w2[1], w2[2], selector); + c1[0] = hc_byte_perm_S (w2[0], w2[1], selector); + c0[3] = hc_byte_perm_S (w1[3], w2[0], selector); + c0[2] = hc_byte_perm_S (w1[2], w1[3], selector); + c0[1] = hc_byte_perm_S (w1[1], w1[2], selector); + c0[0] = hc_byte_perm_S (w1[0], w1[1], selector); + w7[3] = hc_byte_perm_S (w0[3], w1[0], selector); + w7[2] = hc_byte_perm_S (w0[2], w0[3], selector); + w7[1] = hc_byte_perm_S (w0[1], w0[2], selector); + w7[0] = hc_byte_perm_S (w0[0], w0[1], selector); + w6[3] = hc_byte_perm_S ( 0, w0[0], selector); + w6[2] = 0; + w6[1] = 0; + w6[0] = 0; + w5[3] = 0; + w5[2] = 0; + w5[1] = 0; + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; @@ -37557,29 +44206,59 @@ DECLSPEC void switch_buffer_by_offset_8x4_le_S (u32 *w0, u32 *w1, u32 *w2, u32 * w0[2] = 0; w0[1] = 0; w0[0] = 0; + break; - case 12: - w7[3] = hc_byte_perm_S (w4[2], w4[3], selector); - w7[2] = hc_byte_perm_S (w4[1], w4[2], selector); - w7[1] = hc_byte_perm_S (w4[0], w4[1], selector); - w7[0] = hc_byte_perm_S (w3[3], w4[0], selector); - w6[3] = hc_byte_perm_S (w3[2], w3[3], selector); - w6[2] = hc_byte_perm_S (w3[1], w3[2], selector); - w6[1] = hc_byte_perm_S (w3[0], w3[1], selector); - w6[0] = hc_byte_perm_S (w2[3], w3[0], selector); - w5[3] = hc_byte_perm_S (w2[2], w2[3], selector); - w5[2] = hc_byte_perm_S (w2[1], w2[2], selector); - w5[1] = hc_byte_perm_S (w2[0], w2[1], selector); - w5[0] = hc_byte_perm_S (w1[3], w2[0], selector); - w4[3] = hc_byte_perm_S (w1[2], w1[3], selector); - w4[2] = hc_byte_perm_S (w1[1], w1[2], selector); - w4[1] = hc_byte_perm_S (w1[0], w1[1], selector); - w4[0] = hc_byte_perm_S (w0[3], w1[0], selector); - w3[3] = hc_byte_perm_S (w0[2], w0[3], selector); - w3[2] = 
hc_byte_perm_S (w0[1], w0[2], selector); - w3[1] = hc_byte_perm_S (w0[0], w0[1], selector); - w3[0] = hc_byte_perm_S ( 0, w0[0], selector); + case 28: + c7[0] = hc_byte_perm_S (w7[3], 0, selector); + c6[3] = hc_byte_perm_S (w7[2], w7[3], selector); + c6[2] = hc_byte_perm_S (w7[1], w7[2], selector); + c6[1] = hc_byte_perm_S (w7[0], w7[1], selector); + c6[0] = hc_byte_perm_S (w6[3], w7[0], selector); + c5[3] = hc_byte_perm_S (w6[2], w6[3], selector); + c5[2] = hc_byte_perm_S (w6[1], w6[2], selector); + c5[1] = hc_byte_perm_S (w6[0], w6[1], selector); + c5[0] = hc_byte_perm_S (w5[3], w6[0], selector); + c4[3] = hc_byte_perm_S (w5[2], w5[3], selector); + c4[2] = hc_byte_perm_S (w5[1], w5[2], selector); + c4[1] = hc_byte_perm_S (w5[0], w5[1], selector); + c4[0] = hc_byte_perm_S (w4[3], w5[0], selector); + c3[3] = hc_byte_perm_S (w4[2], w4[3], selector); + c3[2] = hc_byte_perm_S (w4[1], w4[2], selector); + c3[1] = hc_byte_perm_S (w4[0], w4[1], selector); + c3[0] = hc_byte_perm_S (w3[3], w4[0], selector); + c2[3] = hc_byte_perm_S (w3[2], w3[3], selector); + c2[2] = hc_byte_perm_S (w3[1], w3[2], selector); + c2[1] = hc_byte_perm_S (w3[0], w3[1], selector); + c2[0] = hc_byte_perm_S (w2[3], w3[0], selector); + c1[3] = hc_byte_perm_S (w2[2], w2[3], selector); + c1[2] = hc_byte_perm_S (w2[1], w2[2], selector); + c1[1] = hc_byte_perm_S (w2[0], w2[1], selector); + c1[0] = hc_byte_perm_S (w1[3], w2[0], selector); + c0[3] = hc_byte_perm_S (w1[2], w1[3], selector); + c0[2] = hc_byte_perm_S (w1[1], w1[2], selector); + c0[1] = hc_byte_perm_S (w1[0], w1[1], selector); + c0[0] = hc_byte_perm_S (w0[3], w1[0], selector); + w7[3] = hc_byte_perm_S (w0[2], w0[3], selector); + w7[2] = hc_byte_perm_S (w0[1], w0[2], selector); + w7[1] = hc_byte_perm_S (w0[0], w0[1], selector); + w7[0] = hc_byte_perm_S ( 0, w0[0], selector); + w6[3] = 0; + w6[2] = 0; + w6[1] = 0; + w6[0] = 0; + w5[3] = 0; + w5[2] = 0; + w5[1] = 0; + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + 
w3[2] = 0; + w3[1] = 0; + w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; @@ -37592,28 +44271,59 @@ DECLSPEC void switch_buffer_by_offset_8x4_le_S (u32 *w0, u32 *w1, u32 *w2, u32 * w0[2] = 0; w0[1] = 0; w0[0] = 0; + break; - case 13: - w7[3] = hc_byte_perm_S (w4[1], w4[2], selector); - w7[2] = hc_byte_perm_S (w4[0], w4[1], selector); - w7[1] = hc_byte_perm_S (w3[3], w4[0], selector); - w7[0] = hc_byte_perm_S (w3[2], w3[3], selector); - w6[3] = hc_byte_perm_S (w3[1], w3[2], selector); - w6[2] = hc_byte_perm_S (w3[0], w3[1], selector); - w6[1] = hc_byte_perm_S (w2[3], w3[0], selector); - w6[0] = hc_byte_perm_S (w2[2], w2[3], selector); - w5[3] = hc_byte_perm_S (w2[1], w2[2], selector); - w5[2] = hc_byte_perm_S (w2[0], w2[1], selector); - w5[1] = hc_byte_perm_S (w1[3], w2[0], selector); - w5[0] = hc_byte_perm_S (w1[2], w1[3], selector); - w4[3] = hc_byte_perm_S (w1[1], w1[2], selector); - w4[2] = hc_byte_perm_S (w1[0], w1[1], selector); - w4[1] = hc_byte_perm_S (w0[3], w1[0], selector); - w4[0] = hc_byte_perm_S (w0[2], w0[3], selector); - w3[3] = hc_byte_perm_S (w0[1], w0[2], selector); - w3[2] = hc_byte_perm_S (w0[0], w0[1], selector); - w3[1] = hc_byte_perm_S ( 0, w0[0], selector); + case 29: + c7[1] = hc_byte_perm_S (w7[3], 0, selector); + c7[0] = hc_byte_perm_S (w7[2], w7[3], selector); + c6[3] = hc_byte_perm_S (w7[1], w7[2], selector); + c6[2] = hc_byte_perm_S (w7[0], w7[1], selector); + c6[1] = hc_byte_perm_S (w6[3], w7[0], selector); + c6[0] = hc_byte_perm_S (w6[2], w6[3], selector); + c5[3] = hc_byte_perm_S (w6[1], w6[2], selector); + c5[2] = hc_byte_perm_S (w6[0], w6[1], selector); + c5[1] = hc_byte_perm_S (w5[3], w6[0], selector); + c5[0] = hc_byte_perm_S (w5[2], w5[3], selector); + c4[3] = hc_byte_perm_S (w5[1], w5[2], selector); + c4[2] = hc_byte_perm_S (w5[0], w5[1], selector); + c4[1] = hc_byte_perm_S (w4[3], w5[0], selector); + c4[0] = hc_byte_perm_S (w4[2], w4[3], selector); + c3[3] = hc_byte_perm_S (w4[1], w4[2], selector); + c3[2] = hc_byte_perm_S 
(w4[0], w4[1], selector); + c3[1] = hc_byte_perm_S (w3[3], w4[0], selector); + c3[0] = hc_byte_perm_S (w3[2], w3[3], selector); + c2[3] = hc_byte_perm_S (w3[1], w3[2], selector); + c2[2] = hc_byte_perm_S (w3[0], w3[1], selector); + c2[1] = hc_byte_perm_S (w2[3], w3[0], selector); + c2[0] = hc_byte_perm_S (w2[2], w2[3], selector); + c1[3] = hc_byte_perm_S (w2[1], w2[2], selector); + c1[2] = hc_byte_perm_S (w2[0], w2[1], selector); + c1[1] = hc_byte_perm_S (w1[3], w2[0], selector); + c1[0] = hc_byte_perm_S (w1[2], w1[3], selector); + c0[3] = hc_byte_perm_S (w1[1], w1[2], selector); + c0[2] = hc_byte_perm_S (w1[0], w1[1], selector); + c0[1] = hc_byte_perm_S (w0[3], w1[0], selector); + c0[0] = hc_byte_perm_S (w0[2], w0[3], selector); + w7[3] = hc_byte_perm_S (w0[1], w0[2], selector); + w7[2] = hc_byte_perm_S (w0[0], w0[1], selector); + w7[1] = hc_byte_perm_S ( 0, w0[0], selector); + w7[0] = 0; + w6[3] = 0; + w6[2] = 0; + w6[1] = 0; + w6[0] = 0; + w5[3] = 0; + w5[2] = 0; + w5[1] = 0; + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; @@ -37627,27 +44337,59 @@ DECLSPEC void switch_buffer_by_offset_8x4_le_S (u32 *w0, u32 *w1, u32 *w2, u32 * w0[2] = 0; w0[1] = 0; w0[0] = 0; + break; - case 14: - w7[3] = hc_byte_perm_S (w4[0], w4[1], selector); - w7[2] = hc_byte_perm_S (w3[3], w4[0], selector); - w7[1] = hc_byte_perm_S (w3[2], w3[3], selector); - w7[0] = hc_byte_perm_S (w3[1], w3[2], selector); - w6[3] = hc_byte_perm_S (w3[0], w3[1], selector); - w6[2] = hc_byte_perm_S (w2[3], w3[0], selector); - w6[1] = hc_byte_perm_S (w2[2], w2[3], selector); - w6[0] = hc_byte_perm_S (w2[1], w2[2], selector); - w5[3] = hc_byte_perm_S (w2[0], w2[1], selector); - w5[2] = hc_byte_perm_S (w1[3], w2[0], selector); - w5[1] = hc_byte_perm_S (w1[2], w1[3], selector); - w5[0] = hc_byte_perm_S (w1[1], w1[2], selector); - w4[3] = hc_byte_perm_S (w1[0], w1[1], selector); - w4[2] = hc_byte_perm_S (w0[3], w1[0], 
selector); - w4[1] = hc_byte_perm_S (w0[2], w0[3], selector); - w4[0] = hc_byte_perm_S (w0[1], w0[2], selector); - w3[3] = hc_byte_perm_S (w0[0], w0[1], selector); - w3[2] = hc_byte_perm_S ( 0, w0[0], selector); + case 30: + c7[2] = hc_byte_perm_S (w7[3], 0, selector); + c7[1] = hc_byte_perm_S (w7[2], w7[3], selector); + c7[0] = hc_byte_perm_S (w7[1], w7[2], selector); + c6[3] = hc_byte_perm_S (w7[0], w7[1], selector); + c6[2] = hc_byte_perm_S (w6[3], w7[0], selector); + c6[1] = hc_byte_perm_S (w6[2], w6[3], selector); + c6[0] = hc_byte_perm_S (w6[1], w6[2], selector); + c5[3] = hc_byte_perm_S (w6[0], w6[1], selector); + c5[2] = hc_byte_perm_S (w5[3], w6[0], selector); + c5[1] = hc_byte_perm_S (w5[2], w5[3], selector); + c5[0] = hc_byte_perm_S (w5[1], w5[2], selector); + c4[3] = hc_byte_perm_S (w5[0], w5[1], selector); + c4[2] = hc_byte_perm_S (w4[3], w5[0], selector); + c4[1] = hc_byte_perm_S (w4[2], w4[3], selector); + c4[0] = hc_byte_perm_S (w4[1], w4[2], selector); + c3[3] = hc_byte_perm_S (w4[0], w4[1], selector); + c3[2] = hc_byte_perm_S (w3[3], w4[0], selector); + c3[1] = hc_byte_perm_S (w3[2], w3[3], selector); + c3[0] = hc_byte_perm_S (w3[1], w3[2], selector); + c2[3] = hc_byte_perm_S (w3[0], w3[1], selector); + c2[2] = hc_byte_perm_S (w2[3], w3[0], selector); + c2[1] = hc_byte_perm_S (w2[2], w2[3], selector); + c2[0] = hc_byte_perm_S (w2[1], w2[2], selector); + c1[3] = hc_byte_perm_S (w2[0], w2[1], selector); + c1[2] = hc_byte_perm_S (w1[3], w2[0], selector); + c1[1] = hc_byte_perm_S (w1[2], w1[3], selector); + c1[0] = hc_byte_perm_S (w1[1], w1[2], selector); + c0[3] = hc_byte_perm_S (w1[0], w1[1], selector); + c0[2] = hc_byte_perm_S (w0[3], w1[0], selector); + c0[1] = hc_byte_perm_S (w0[2], w0[3], selector); + c0[0] = hc_byte_perm_S (w0[1], w0[2], selector); + w7[3] = hc_byte_perm_S (w0[0], w0[1], selector); + w7[2] = hc_byte_perm_S ( 0, w0[0], selector); + w7[1] = 0; + w7[0] = 0; + w6[3] = 0; + w6[2] = 0; + w6[1] = 0; + w6[0] = 0; + w5[3] = 0; + w5[2] = 
0; + w5[1] = 0; + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; @@ -37662,26 +44404,59 @@ DECLSPEC void switch_buffer_by_offset_8x4_le_S (u32 *w0, u32 *w1, u32 *w2, u32 * w0[2] = 0; w0[1] = 0; w0[0] = 0; + break; - case 15: - w7[3] = hc_byte_perm_S (w3[3], w4[0], selector); - w7[2] = hc_byte_perm_S (w3[2], w3[3], selector); - w7[1] = hc_byte_perm_S (w3[1], w3[2], selector); - w7[0] = hc_byte_perm_S (w3[0], w3[1], selector); - w6[3] = hc_byte_perm_S (w2[3], w3[0], selector); - w6[2] = hc_byte_perm_S (w2[2], w2[3], selector); - w6[1] = hc_byte_perm_S (w2[1], w2[2], selector); - w6[0] = hc_byte_perm_S (w2[0], w2[1], selector); - w5[3] = hc_byte_perm_S (w1[3], w2[0], selector); - w5[2] = hc_byte_perm_S (w1[2], w1[3], selector); - w5[1] = hc_byte_perm_S (w1[1], w1[2], selector); - w5[0] = hc_byte_perm_S (w1[0], w1[1], selector); - w4[3] = hc_byte_perm_S (w0[3], w1[0], selector); - w4[2] = hc_byte_perm_S (w0[2], w0[3], selector); - w4[1] = hc_byte_perm_S (w0[1], w0[2], selector); - w4[0] = hc_byte_perm_S (w0[0], w0[1], selector); - w3[3] = hc_byte_perm_S ( 0, w0[0], selector); + case 31: + c7[3] = hc_byte_perm_S (w7[3], 0, selector); + c7[2] = hc_byte_perm_S (w7[2], w7[3], selector); + c7[1] = hc_byte_perm_S (w7[1], w7[2], selector); + c7[0] = hc_byte_perm_S (w7[0], w7[1], selector); + c6[3] = hc_byte_perm_S (w6[3], w7[0], selector); + c6[2] = hc_byte_perm_S (w6[2], w6[3], selector); + c6[1] = hc_byte_perm_S (w6[1], w6[2], selector); + c6[0] = hc_byte_perm_S (w6[0], w6[1], selector); + c5[3] = hc_byte_perm_S (w5[3], w6[0], selector); + c5[2] = hc_byte_perm_S (w5[2], w5[3], selector); + c5[1] = hc_byte_perm_S (w5[1], w5[2], selector); + c5[0] = hc_byte_perm_S (w5[0], w5[1], selector); + c4[3] = hc_byte_perm_S (w4[3], w5[0], selector); + c4[2] = hc_byte_perm_S (w4[2], w4[3], selector); + c4[1] = hc_byte_perm_S (w4[1], w4[2], selector); + c4[0] = hc_byte_perm_S (w4[0], w4[1], selector); + c3[3] = 
hc_byte_perm_S (w3[3], w4[0], selector); + c3[2] = hc_byte_perm_S (w3[2], w3[3], selector); + c3[1] = hc_byte_perm_S (w3[1], w3[2], selector); + c3[0] = hc_byte_perm_S (w3[0], w3[1], selector); + c2[3] = hc_byte_perm_S (w2[3], w3[0], selector); + c2[2] = hc_byte_perm_S (w2[2], w2[3], selector); + c2[1] = hc_byte_perm_S (w2[1], w2[2], selector); + c2[0] = hc_byte_perm_S (w2[0], w2[1], selector); + c1[3] = hc_byte_perm_S (w1[3], w2[0], selector); + c1[2] = hc_byte_perm_S (w1[2], w1[3], selector); + c1[1] = hc_byte_perm_S (w1[1], w1[2], selector); + c1[0] = hc_byte_perm_S (w1[0], w1[1], selector); + c0[3] = hc_byte_perm_S (w0[3], w1[0], selector); + c0[2] = hc_byte_perm_S (w0[2], w0[3], selector); + c0[1] = hc_byte_perm_S (w0[1], w0[2], selector); + c0[0] = hc_byte_perm_S (w0[0], w0[1], selector); + w7[3] = hc_byte_perm_S ( 0, w0[0], selector); + w7[2] = 0; + w7[1] = 0; + w7[0] = 0; + w6[3] = 0; + w6[2] = 0; + w6[1] = 0; + w6[0] = 0; + w5[3] = 0; + w5[2] = 0; + w5[1] = 0; + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; w3[2] = 0; w3[1] = 0; w3[0] = 0; @@ -37697,6 +44472,7 @@ DECLSPEC void switch_buffer_by_offset_8x4_le_S (u32 *w0, u32 *w1, u32 *w2, u32 * w0[2] = 0; w0[1] = 0; w0[0] = 0; + break; } #endif diff --git a/OpenCL/inc_common.h b/OpenCL/inc_common.h index 07137297b..6e39b2ca3 100644 --- a/OpenCL/inc_common.h +++ b/OpenCL/inc_common.h @@ -262,6 +262,7 @@ DECLSPEC void switch_buffer_by_offset_carry_le (u32x *w0, u32x *w1, u32x *w2, u3 DECLSPEC void switch_buffer_by_offset_be (u32x *w0, u32x *w1, u32x *w2, u32x *w3, const u32 offset); DECLSPEC void switch_buffer_by_offset_carry_be (u32x *w0, u32x *w1, u32x *w2, u32x *w3, u32x *c0, u32x *c1, u32x *c2, u32x *c3, const u32 offset); DECLSPEC void switch_buffer_by_offset_8x4_le (u32x *w0, u32x *w1, u32x *w2, u32x *w3, u32x *w4, u32x *w5, u32x *w6, u32x *w7, const u32 offset); +DECLSPEC void switch_buffer_by_offset_8x4_carry_le (u32x *w0, u32x *w1, u32x *w2, u32x *w3, u32x *w4, u32x *w5, 
u32x *w6, u32x *w7, u32x *c0, u32x *c1, u32x *c2, u32x *c3, u32x *c4, u32x *c5, u32x *c6, u32x *c7, const u32 offset); DECLSPEC void switch_buffer_by_offset_8x4_be (u32x *w0, u32x *w1, u32x *w2, u32x *w3, u32x *w4, u32x *w5, u32x *w6, u32x *w7, const u32 offset); DECLSPEC void switch_buffer_by_offset_8x4_carry_be (u32x *w0, u32x *w1, u32x *w2, u32x *w3, u32x *w4, u32x *w5, u32x *w6, u32x *w7, u32x *c0, u32x *c1, u32x *c2, u32x *c3, u32x *c4, u32x *c5, u32x *c6, u32x *c7, const u32 offset); DECLSPEC void switch_buffer_by_offset_1x64_le (u32x *w, const u32 offset); @@ -289,6 +290,7 @@ DECLSPEC void switch_buffer_by_offset_carry_le_S (u32 *w0, u32 *w1, u32 *w2, u32 DECLSPEC void switch_buffer_by_offset_be_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, const u32 offset); DECLSPEC void switch_buffer_by_offset_carry_be_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, u32 *c0, u32 *c1, u32 *c2, u32 *c3, const u32 offset); DECLSPEC void switch_buffer_by_offset_8x4_le_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, u32 *w4, u32 *w5, u32 *w6, u32 *w7, const u32 offset); +DECLSPEC void switch_buffer_by_offset_8x4_carry_le_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, u32 *w4, u32 *w5, u32 *w6, u32 *w7, u32 *c0, u32 *c1, u32 *c2, u32 *c3, u32 *c4, u32 *c5, u32 *c6, u32 *c7, const u32 offset); DECLSPEC void switch_buffer_by_offset_8x4_be_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, u32 *w4, u32 *w5, u32 *w6, u32 *w7, const u32 offset); DECLSPEC void switch_buffer_by_offset_8x4_carry_be_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, u32 *w4, u32 *w5, u32 *w6, u32 *w7, u32 *c0, u32 *c1, u32 *c2, u32 *c3, u32 *c4, u32 *c5, u32 *c6, u32 *c7, const u32 offset); DECLSPEC void switch_buffer_by_offset_1x64_le_S (u32 *w, const u32 offset); diff --git a/OpenCL/inc_hash_blake2b.cl b/OpenCL/inc_hash_blake2b.cl new file mode 100644 index 000000000..ac4377c2f --- /dev/null +++ b/OpenCL/inc_hash_blake2b.cl @@ -0,0 +1,662 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +#include "inc_vendor.h" +#include 
"inc_types.h" +#include "inc_platform.h" +#include "inc_common.h" +#include "inc_hash_blake2b.h" + +DECLSPEC void blake2b_transform (u64 *h, const u64 *m, const u32 len, const u64 f0) +{ + const u64 t0 = hl32_to_64_S (0, len); + + u64 v[16]; + + v[ 0] = h[0]; + v[ 1] = h[1]; + v[ 2] = h[2]; + v[ 3] = h[3]; + v[ 4] = h[4]; + v[ 5] = h[5]; + v[ 6] = h[6]; + v[ 7] = h[7]; + v[ 8] = BLAKE2B_IV_00; + v[ 9] = BLAKE2B_IV_01; + v[10] = BLAKE2B_IV_02; + v[11] = BLAKE2B_IV_03; + v[12] = BLAKE2B_IV_04 ^ t0; + v[13] = BLAKE2B_IV_05; // ^ t1; + v[14] = BLAKE2B_IV_06 ^ f0; + v[15] = BLAKE2B_IV_07; // ^ f1; + + BLAKE2B_ROUND ( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + BLAKE2B_ROUND (14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3); + BLAKE2B_ROUND (11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4); + BLAKE2B_ROUND ( 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8); + BLAKE2B_ROUND ( 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13); + BLAKE2B_ROUND ( 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9); + BLAKE2B_ROUND (12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11); + BLAKE2B_ROUND (13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10); + BLAKE2B_ROUND ( 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5); + BLAKE2B_ROUND (10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13 , 0); + BLAKE2B_ROUND ( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + BLAKE2B_ROUND (14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3); + + h[0] = h[0] ^ v[0] ^ v[ 8]; + h[1] = h[1] ^ v[1] ^ v[ 9]; + h[2] = h[2] ^ v[2] ^ v[10]; + h[3] = h[3] ^ v[3] ^ v[11]; + h[4] = h[4] ^ v[4] ^ v[12]; + h[5] = h[5] ^ v[5] ^ v[13]; + h[6] = h[6] ^ v[6] ^ v[14]; + h[7] = h[7] ^ v[7] ^ v[15]; +} + +DECLSPEC void blake2b_init (blake2b_ctx_t *ctx) +{ + ctx->h[0] = BLAKE2B_IV_00 ^ 0x01010040; // default output length: 0x40 = 64 bytes + ctx->h[1] = BLAKE2B_IV_01; + ctx->h[2] = BLAKE2B_IV_02; + ctx->h[3] = BLAKE2B_IV_03; + ctx->h[4] = BLAKE2B_IV_04; + ctx->h[5] = 
BLAKE2B_IV_05; + ctx->h[6] = BLAKE2B_IV_06; + ctx->h[7] = BLAKE2B_IV_07; + + ctx->m[ 0] = 0; + ctx->m[ 1] = 0; + ctx->m[ 2] = 0; + ctx->m[ 3] = 0; + ctx->m[ 4] = 0; + ctx->m[ 5] = 0; + ctx->m[ 6] = 0; + ctx->m[ 7] = 0; + ctx->m[ 8] = 0; + ctx->m[ 9] = 0; + ctx->m[10] = 0; + ctx->m[11] = 0; + ctx->m[12] = 0; + ctx->m[13] = 0; + ctx->m[14] = 0; + ctx->m[15] = 0; + + ctx->len = 0; +} + +DECLSPEC void blake2b_update_128 (blake2b_ctx_t *ctx, u32 *w0, u32 *w1, u32 *w2, u32 *w3, u32 *w4, u32 *w5, u32 *w6, u32 *w7, const u32 len) +{ + MAYBE_VOLATILE const u32 pos = ctx->len & 127; + + if (pos == 0) + { + if (ctx->len > 0) // if new block (pos == 0) AND the (old) len is not zero => transform + { + blake2b_transform (ctx->h, ctx->m, ctx->len, BLAKE2B_UPDATE); + } + + ctx->m[ 0] = hl32_to_64_S (w0[1], w0[0]); + ctx->m[ 1] = hl32_to_64_S (w0[3], w0[2]); + ctx->m[ 2] = hl32_to_64_S (w1[1], w1[0]); + ctx->m[ 3] = hl32_to_64_S (w1[3], w1[2]); + ctx->m[ 4] = hl32_to_64_S (w2[1], w2[0]); + ctx->m[ 5] = hl32_to_64_S (w2[3], w2[2]); + ctx->m[ 6] = hl32_to_64_S (w3[1], w3[0]); + ctx->m[ 7] = hl32_to_64_S (w3[3], w3[2]); + ctx->m[ 8] = hl32_to_64_S (w4[1], w4[0]); + ctx->m[ 9] = hl32_to_64_S (w4[3], w4[2]); + ctx->m[10] = hl32_to_64_S (w5[1], w5[0]); + ctx->m[11] = hl32_to_64_S (w5[3], w5[2]); + ctx->m[12] = hl32_to_64_S (w6[1], w6[0]); + ctx->m[13] = hl32_to_64_S (w6[3], w6[2]); + ctx->m[14] = hl32_to_64_S (w7[1], w7[0]); + ctx->m[15] = hl32_to_64_S (w7[3], w7[2]); + } + else + { + if ((pos + len) <= 128) + { + switch_buffer_by_offset_8x4_le_S (w0, w1, w2, w3, w4, w5, w6, w7, pos); + + ctx->m[ 0] |= hl32_to_64_S (w0[1], w0[0]); + ctx->m[ 1] |= hl32_to_64_S (w0[3], w0[2]); + ctx->m[ 2] |= hl32_to_64_S (w1[1], w1[0]); + ctx->m[ 3] |= hl32_to_64_S (w1[3], w1[2]); + ctx->m[ 4] |= hl32_to_64_S (w2[1], w2[0]); + ctx->m[ 5] |= hl32_to_64_S (w2[3], w2[2]); + ctx->m[ 6] |= hl32_to_64_S (w3[1], w3[0]); + ctx->m[ 7] |= hl32_to_64_S (w3[3], w3[2]); + ctx->m[ 8] |= hl32_to_64_S (w4[1], w4[0]); + 
ctx->m[ 9] |= hl32_to_64_S (w4[3], w4[2]); + ctx->m[10] |= hl32_to_64_S (w5[1], w5[0]); + ctx->m[11] |= hl32_to_64_S (w5[3], w5[2]); + ctx->m[12] |= hl32_to_64_S (w6[1], w6[0]); + ctx->m[13] |= hl32_to_64_S (w6[3], w6[2]); + ctx->m[14] |= hl32_to_64_S (w7[1], w7[0]); + ctx->m[15] |= hl32_to_64_S (w7[3], w7[2]); + } + else + { + u32 c0[4] = { 0 }; + u32 c1[4] = { 0 }; + u32 c2[4] = { 0 }; + u32 c3[4] = { 0 }; + u32 c4[4] = { 0 }; + u32 c5[4] = { 0 }; + u32 c6[4] = { 0 }; + u32 c7[4] = { 0 }; + + switch_buffer_by_offset_8x4_carry_le_S (w0, w1, w2, w3, w4, w5, w6, w7, c0, c1, c2, c3, c4, c5, c6, c7, pos); + + ctx->m[ 0] |= hl32_to_64_S (w0[1], w0[0]); + ctx->m[ 1] |= hl32_to_64_S (w0[3], w0[2]); + ctx->m[ 2] |= hl32_to_64_S (w1[1], w1[0]); + ctx->m[ 3] |= hl32_to_64_S (w1[3], w1[2]); + ctx->m[ 4] |= hl32_to_64_S (w2[1], w2[0]); + ctx->m[ 5] |= hl32_to_64_S (w2[3], w2[2]); + ctx->m[ 6] |= hl32_to_64_S (w3[1], w3[0]); + ctx->m[ 7] |= hl32_to_64_S (w3[3], w3[2]); + ctx->m[ 8] |= hl32_to_64_S (w4[1], w4[0]); + ctx->m[ 9] |= hl32_to_64_S (w4[3], w4[2]); + ctx->m[10] |= hl32_to_64_S (w5[1], w5[0]); + ctx->m[11] |= hl32_to_64_S (w5[3], w5[2]); + ctx->m[12] |= hl32_to_64_S (w6[1], w6[0]); + ctx->m[13] |= hl32_to_64_S (w6[3], w6[2]); + ctx->m[14] |= hl32_to_64_S (w7[1], w7[0]); + ctx->m[15] |= hl32_to_64_S (w7[3], w7[2]); + + // len must be a multiple of 128 (not ctx->len) for BLAKE2B_UPDATE: + + const u32 cur_len = ((ctx->len + len) / 128) * 128; + + blake2b_transform (ctx->h, ctx->m, cur_len, BLAKE2B_UPDATE); + + ctx->m[ 0] = hl32_to_64_S (c0[1], c0[0]); + ctx->m[ 1] = hl32_to_64_S (c0[3], c0[2]); + ctx->m[ 2] = hl32_to_64_S (c1[1], c1[0]); + ctx->m[ 3] = hl32_to_64_S (c1[3], c1[2]); + ctx->m[ 4] = hl32_to_64_S (c2[1], c2[0]); + ctx->m[ 5] = hl32_to_64_S (c2[3], c2[2]); + ctx->m[ 6] = hl32_to_64_S (c3[1], c3[0]); + ctx->m[ 7] = hl32_to_64_S (c3[3], c3[2]); + ctx->m[ 8] = hl32_to_64_S (c4[1], c4[0]); + ctx->m[ 9] = hl32_to_64_S (c4[3], c4[2]); + ctx->m[10] = hl32_to_64_S 
(c5[1], c5[0]); + ctx->m[11] = hl32_to_64_S (c5[3], c5[2]); + ctx->m[12] = hl32_to_64_S (c6[1], c6[0]); + ctx->m[13] = hl32_to_64_S (c6[3], c6[2]); + ctx->m[14] = hl32_to_64_S (c7[1], c7[0]); + ctx->m[15] = hl32_to_64_S (c7[3], c7[2]); + } + } + + ctx->len += len; +} + +DECLSPEC void blake2b_update (blake2b_ctx_t *ctx, const u32 *w, const u32 len) +{ + u32 w0[4]; + u32 w1[4]; + u32 w2[4]; + u32 w3[4]; + u32 w4[4]; + u32 w5[4]; + u32 w6[4]; + u32 w7[4]; + + const int limit = (const int) len - 128; // int type needed, could be negative + + int pos1; + int pos4; + + for (pos1 = 0, pos4 = 0; pos1 < limit; pos1 += 128, pos4 += 32) + { + w0[0] = w[pos4 + 0]; + w0[1] = w[pos4 + 1]; + w0[2] = w[pos4 + 2]; + w0[3] = w[pos4 + 3]; + w1[0] = w[pos4 + 4]; + w1[1] = w[pos4 + 5]; + w1[2] = w[pos4 + 6]; + w1[3] = w[pos4 + 7]; + w2[0] = w[pos4 + 8]; + w2[1] = w[pos4 + 9]; + w2[2] = w[pos4 + 10]; + w2[3] = w[pos4 + 11]; + w3[0] = w[pos4 + 12]; + w3[1] = w[pos4 + 13]; + w3[2] = w[pos4 + 14]; + w3[3] = w[pos4 + 15]; + w4[0] = w[pos4 + 16]; + w4[1] = w[pos4 + 17]; + w4[2] = w[pos4 + 18]; + w4[3] = w[pos4 + 19]; + w5[0] = w[pos4 + 20]; + w5[1] = w[pos4 + 21]; + w5[2] = w[pos4 + 22]; + w5[3] = w[pos4 + 23]; + w6[0] = w[pos4 + 24]; + w6[1] = w[pos4 + 25]; + w6[2] = w[pos4 + 26]; + w6[3] = w[pos4 + 27]; + w7[0] = w[pos4 + 28]; + w7[1] = w[pos4 + 29]; + w7[2] = w[pos4 + 30]; + w7[3] = w[pos4 + 31]; + + blake2b_update_128 (ctx, w0, w1, w2, w3, w4, w5, w6, w7, 128); + } + + w0[0] = w[pos4 + 0]; + w0[1] = w[pos4 + 1]; + w0[2] = w[pos4 + 2]; + w0[3] = w[pos4 + 3]; + w1[0] = w[pos4 + 4]; + w1[1] = w[pos4 + 5]; + w1[2] = w[pos4 + 6]; + w1[3] = w[pos4 + 7]; + w2[0] = w[pos4 + 8]; + w2[1] = w[pos4 + 9]; + w2[2] = w[pos4 + 10]; + w2[3] = w[pos4 + 11]; + w3[0] = w[pos4 + 12]; + w3[1] = w[pos4 + 13]; + w3[2] = w[pos4 + 14]; + w3[3] = w[pos4 + 15]; + w4[0] = w[pos4 + 16]; + w4[1] = w[pos4 + 17]; + w4[2] = w[pos4 + 18]; + w4[3] = w[pos4 + 19]; + w5[0] = w[pos4 + 20]; + w5[1] = w[pos4 + 21]; + w5[2] = 
w[pos4 + 22]; + w5[3] = w[pos4 + 23]; + w6[0] = w[pos4 + 24]; + w6[1] = w[pos4 + 25]; + w6[2] = w[pos4 + 26]; + w6[3] = w[pos4 + 27]; + w7[0] = w[pos4 + 28]; + w7[1] = w[pos4 + 29]; + w7[2] = w[pos4 + 30]; + w7[3] = w[pos4 + 31]; + + blake2b_update_128 (ctx, w0, w1, w2, w3, w4, w5, w6, w7, len - (u32) pos1); +} + +DECLSPEC void blake2b_update_global (blake2b_ctx_t *ctx, GLOBAL_AS const u32 *w, const u32 len) +{ + u32 w0[4]; + u32 w1[4]; + u32 w2[4]; + u32 w3[4]; + u32 w4[4]; + u32 w5[4]; + u32 w6[4]; + u32 w7[4]; + + const int limit = (const int) len - 128; // int type needed, could be negative + + int pos1; + int pos4; + + for (pos1 = 0, pos4 = 0; pos1 < limit; pos1 += 128, pos4 += 32) + { + w0[0] = w[pos4 + 0]; + w0[1] = w[pos4 + 1]; + w0[2] = w[pos4 + 2]; + w0[3] = w[pos4 + 3]; + w1[0] = w[pos4 + 4]; + w1[1] = w[pos4 + 5]; + w1[2] = w[pos4 + 6]; + w1[3] = w[pos4 + 7]; + w2[0] = w[pos4 + 8]; + w2[1] = w[pos4 + 9]; + w2[2] = w[pos4 + 10]; + w2[3] = w[pos4 + 11]; + w3[0] = w[pos4 + 12]; + w3[1] = w[pos4 + 13]; + w3[2] = w[pos4 + 14]; + w3[3] = w[pos4 + 15]; + w4[0] = w[pos4 + 16]; + w4[1] = w[pos4 + 17]; + w4[2] = w[pos4 + 18]; + w4[3] = w[pos4 + 19]; + w5[0] = w[pos4 + 20]; + w5[1] = w[pos4 + 21]; + w5[2] = w[pos4 + 22]; + w5[3] = w[pos4 + 23]; + w6[0] = w[pos4 + 24]; + w6[1] = w[pos4 + 25]; + w6[2] = w[pos4 + 26]; + w6[3] = w[pos4 + 27]; + w7[0] = w[pos4 + 28]; + w7[1] = w[pos4 + 29]; + w7[2] = w[pos4 + 30]; + w7[3] = w[pos4 + 31]; + + blake2b_update_128 (ctx, w0, w1, w2, w3, w4, w5, w6, w7, 128); + } + + w0[0] = w[pos4 + 0]; + w0[1] = w[pos4 + 1]; + w0[2] = w[pos4 + 2]; + w0[3] = w[pos4 + 3]; + w1[0] = w[pos4 + 4]; + w1[1] = w[pos4 + 5]; + w1[2] = w[pos4 + 6]; + w1[3] = w[pos4 + 7]; + w2[0] = w[pos4 + 8]; + w2[1] = w[pos4 + 9]; + w2[2] = w[pos4 + 10]; + w2[3] = w[pos4 + 11]; + w3[0] = w[pos4 + 12]; + w3[1] = w[pos4 + 13]; + w3[2] = w[pos4 + 14]; + w3[3] = w[pos4 + 15]; + w4[0] = w[pos4 + 16]; + w4[1] = w[pos4 + 17]; + w4[2] = w[pos4 + 18]; + w4[3] = w[pos4 + 
19]; + w5[0] = w[pos4 + 20]; + w5[1] = w[pos4 + 21]; + w5[2] = w[pos4 + 22]; + w5[3] = w[pos4 + 23]; + w6[0] = w[pos4 + 24]; + w6[1] = w[pos4 + 25]; + w6[2] = w[pos4 + 26]; + w6[3] = w[pos4 + 27]; + w7[0] = w[pos4 + 28]; + w7[1] = w[pos4 + 29]; + w7[2] = w[pos4 + 30]; + w7[3] = w[pos4 + 31]; + + blake2b_update_128 (ctx, w0, w1, w2, w3, w4, w5, w6, w7, len - (u32) pos1); +} + +DECLSPEC void blake2b_final (blake2b_ctx_t *ctx) +{ + blake2b_transform (ctx->h, ctx->m, ctx->len, BLAKE2B_FINAL); +} + +DECLSPEC void blake2b_transform_vector (u64x *h, const u64x *m, const u32x len, const u64 f0) +{ + const u64x t0 = hl32_to_64 (0, len); + + u64x v[16]; + + v[ 0] = h[0]; + v[ 1] = h[1]; + v[ 2] = h[2]; + v[ 3] = h[3]; + v[ 4] = h[4]; + v[ 5] = h[5]; + v[ 6] = h[6]; + v[ 7] = h[7]; + v[ 8] = BLAKE2B_IV_00; + v[ 9] = BLAKE2B_IV_01; + v[10] = BLAKE2B_IV_02; + v[11] = BLAKE2B_IV_03; + v[12] = BLAKE2B_IV_04 ^ t0; + v[13] = BLAKE2B_IV_05; // ^ t1; + v[14] = BLAKE2B_IV_06 ^ f0; + v[15] = BLAKE2B_IV_07; // ^ f1; + + BLAKE2B_ROUND_VECTOR ( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + BLAKE2B_ROUND_VECTOR (14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3); + BLAKE2B_ROUND_VECTOR (11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4); + BLAKE2B_ROUND_VECTOR ( 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8); + BLAKE2B_ROUND_VECTOR ( 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13); + BLAKE2B_ROUND_VECTOR ( 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9); + BLAKE2B_ROUND_VECTOR (12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11); + BLAKE2B_ROUND_VECTOR (13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10); + BLAKE2B_ROUND_VECTOR ( 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5); + BLAKE2B_ROUND_VECTOR (10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13 , 0); + BLAKE2B_ROUND_VECTOR ( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + BLAKE2B_ROUND_VECTOR (14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3); + + h[0] = h[0] ^ v[0] ^ v[ 8]; 
+  h[1] = h[1] ^ v[1] ^ v[ 9];
+  h[2] = h[2] ^ v[2] ^ v[10];
+  h[3] = h[3] ^ v[3] ^ v[11];
+  h[4] = h[4] ^ v[4] ^ v[12];
+  h[5] = h[5] ^ v[5] ^ v[13];
+  h[6] = h[6] ^ v[6] ^ v[14];
+  h[7] = h[7] ^ v[7] ^ v[15];
+}
+
+DECLSPEC void blake2b_init_vector (blake2b_ctx_vector_t *ctx)
+{
+  ctx->h[0] = BLAKE2B_IV_00 ^ 0x01010040; // default output length: 0x40 = 64 bytes
+  ctx->h[1] = BLAKE2B_IV_01;
+  ctx->h[2] = BLAKE2B_IV_02;
+  ctx->h[3] = BLAKE2B_IV_03;
+  ctx->h[4] = BLAKE2B_IV_04;
+  ctx->h[5] = BLAKE2B_IV_05;
+  ctx->h[6] = BLAKE2B_IV_06;
+  ctx->h[7] = BLAKE2B_IV_07;
+
+  ctx->m[ 0] = 0;
+  ctx->m[ 1] = 0;
+  ctx->m[ 2] = 0;
+  ctx->m[ 3] = 0;
+  ctx->m[ 4] = 0;
+  ctx->m[ 5] = 0;
+  ctx->m[ 6] = 0;
+  ctx->m[ 7] = 0;
+  ctx->m[ 8] = 0;
+  ctx->m[ 9] = 0;
+  ctx->m[10] = 0;
+  ctx->m[11] = 0;
+  ctx->m[12] = 0;
+  ctx->m[13] = 0;
+  ctx->m[14] = 0;
+  ctx->m[15] = 0;
+
+  ctx->len = 0;
+}
+
+// Absorb the bytes held in w0..w7 (eight lanes of four 32-bit words) into the
+// vector context.
+//
+// ctx->m always holds the most recent block, which has not been compressed yet.
+// That block is only compressed once further input arrives, so that
+// blake2b_final_vector () can compress the very last block with BLAKE2B_FINAL.
+//
+// ctx  context to update (h, m and len are modified)
+// w0..w7  input words; shifted in place by the current buffer offset when the
+//         context is not block-aligned, so the caller's copies are clobbered
+// len  number of input bytes; the callers in this file pass at most 128
+
+DECLSPEC void blake2b_update_vector_128 (blake2b_ctx_vector_t *ctx, u32x *w0, u32x *w1, u32x *w2, u32x *w3, u32x *w4, u32x *w5, u32x *w6, u32x *w7, const u32 len)
+{
+  // A zero-length update must be a no-op. Without this guard an empty update
+  // on a block-aligned context with pending data (pos == 0, ctx->len > 0)
+  // would compress the pending block as BLAKE2B_UPDATE and overwrite ctx->m,
+  // so the final transform would run on the wrong block.
+
+  if (len == 0) return;
+
+  MAYBE_VOLATILE const u32 pos = ctx->len & 127;
+
+  if (pos == 0)
+  {
+    if (ctx->len > 0) // if new block (pos == 0) AND the (old) len is not zero => transform
+    {
+      blake2b_transform_vector (ctx->h, ctx->m, (u32x) ctx->len, BLAKE2B_UPDATE);
+    }
+
+    ctx->m[ 0] = hl32_to_64 (w0[1], w0[0]);
+    ctx->m[ 1] = hl32_to_64 (w0[3], w0[2]);
+    ctx->m[ 2] = hl32_to_64 (w1[1], w1[0]);
+    ctx->m[ 3] = hl32_to_64 (w1[3], w1[2]);
+    ctx->m[ 4] = hl32_to_64 (w2[1], w2[0]);
+    ctx->m[ 5] = hl32_to_64 (w2[3], w2[2]);
+    ctx->m[ 6] = hl32_to_64 (w3[1], w3[0]);
+    ctx->m[ 7] = hl32_to_64 (w3[3], w3[2]);
+    ctx->m[ 8] = hl32_to_64 (w4[1], w4[0]);
+    ctx->m[ 9] = hl32_to_64 (w4[3], w4[2]);
+    ctx->m[10] = hl32_to_64 (w5[1], w5[0]);
+    ctx->m[11] = hl32_to_64 (w5[3], w5[2]);
+    ctx->m[12] = hl32_to_64 (w6[1], w6[0]);
+    ctx->m[13] = hl32_to_64 (w6[3], w6[2]);
+    ctx->m[14] = hl32_to_64 (w7[1], w7[0]);
+    ctx->m[15] = hl32_to_64 (w7[3], w7[2]);
+  }
+  else
+  {
+    if ((pos + len) <= 128)
+    {
+      // the new bytes still fit into the pending block: shift and merge
+
+      switch_buffer_by_offset_8x4_le (w0, w1, w2, w3, w4, w5, w6, w7, pos);
+
+      ctx->m[ 0] |= hl32_to_64 (w0[1], w0[0]);
+      ctx->m[ 1] |= hl32_to_64 (w0[3], w0[2]);
+      ctx->m[ 2] |= hl32_to_64 (w1[1], w1[0]);
+      ctx->m[ 3] |= hl32_to_64 (w1[3], w1[2]);
+      ctx->m[ 4] |= hl32_to_64 (w2[1], w2[0]);
+      ctx->m[ 5] |= hl32_to_64 (w2[3], w2[2]);
+      ctx->m[ 6] |= hl32_to_64 (w3[1], w3[0]);
+      ctx->m[ 7] |= hl32_to_64 (w3[3], w3[2]);
+      ctx->m[ 8] |= hl32_to_64 (w4[1], w4[0]);
+      ctx->m[ 9] |= hl32_to_64 (w4[3], w4[2]);
+      ctx->m[10] |= hl32_to_64 (w5[1], w5[0]);
+      ctx->m[11] |= hl32_to_64 (w5[3], w5[2]);
+      ctx->m[12] |= hl32_to_64 (w6[1], w6[0]);
+      ctx->m[13] |= hl32_to_64 (w6[3], w6[2]);
+      ctx->m[14] |= hl32_to_64 (w7[1], w7[0]);
+      ctx->m[15] |= hl32_to_64 (w7[3], w7[2]);
+    }
+    else
+    {
+      // the new bytes overflow the pending block: c0..c7 receive the part
+      // that is carried over into the next block
+
+      u32x c0[4] = { 0 };
+      u32x c1[4] = { 0 };
+      u32x c2[4] = { 0 };
+      u32x c3[4] = { 0 };
+      u32x c4[4] = { 0 };
+      u32x c5[4] = { 0 };
+      u32x c6[4] = { 0 };
+      u32x c7[4] = { 0 };
+
+      switch_buffer_by_offset_8x4_carry_le (w0, w1, w2, w3, w4, w5, w6, w7, c0, c1, c2, c3, c4, c5, c6, c7, pos);
+
+      ctx->m[ 0] |= hl32_to_64 (w0[1], w0[0]);
+      ctx->m[ 1] |= hl32_to_64 (w0[3], w0[2]);
+      ctx->m[ 2] |= hl32_to_64 (w1[1], w1[0]);
+      ctx->m[ 3] |= hl32_to_64 (w1[3], w1[2]);
+      ctx->m[ 4] |= hl32_to_64 (w2[1], w2[0]);
+      ctx->m[ 5] |= hl32_to_64 (w2[3], w2[2]);
+      ctx->m[ 6] |= hl32_to_64 (w3[1], w3[0]);
+      ctx->m[ 7] |= hl32_to_64 (w3[3], w3[2]);
+      ctx->m[ 8] |= hl32_to_64 (w4[1], w4[0]);
+      ctx->m[ 9] |= hl32_to_64 (w4[3], w4[2]);
+      ctx->m[10] |= hl32_to_64 (w5[1], w5[0]);
+      ctx->m[11] |= hl32_to_64 (w5[3], w5[2]);
+      ctx->m[12] |= hl32_to_64 (w6[1], w6[0]);
+      ctx->m[13] |= hl32_to_64 (w6[3], w6[2]);
+      ctx->m[14] |= hl32_to_64 (w7[1], w7[0]);
+      ctx->m[15] |= hl32_to_64 (w7[3], w7[2]);
+
+      // len must be a multiple of 128 (not ctx->len) for BLAKE2B_UPDATE:
+
+      const u32x cur_len = ((ctx->len + len) / 128) * 128;
+
+      blake2b_transform_vector (ctx->h, ctx->m, cur_len, BLAKE2B_UPDATE);
+
+      ctx->m[ 0] = hl32_to_64 (c0[1], c0[0]);
+      ctx->m[ 1] = hl32_to_64 (c0[3], c0[2]);
+      ctx->m[ 2] = hl32_to_64 (c1[1], c1[0]);
+      ctx->m[ 3] = hl32_to_64 (c1[3], c1[2]);
+      ctx->m[ 4] = hl32_to_64 (c2[1], c2[0]);
+      ctx->m[ 5] = hl32_to_64 (c2[3], c2[2]);
+      ctx->m[ 6] = hl32_to_64 (c3[1], c3[0]);
+      ctx->m[ 7] = hl32_to_64 (c3[3], c3[2]);
+      ctx->m[ 8] = hl32_to_64 (c4[1], c4[0]);
+      ctx->m[ 9] = hl32_to_64 (c4[3], c4[2]);
+      ctx->m[10] = hl32_to_64 (c5[1], c5[0]);
+      ctx->m[11] = hl32_to_64 (c5[3], c5[2]);
+      ctx->m[12] = hl32_to_64 (c6[1], c6[0]);
+      ctx->m[13] = hl32_to_64 (c6[3], c6[2]);
+      ctx->m[14] = hl32_to_64 (c7[1], c7[0]);
+      ctx->m[15] = hl32_to_64 (c7[3], c7[2]);
+    }
+  }
+
+  ctx->len += len;
+}
+
+DECLSPEC void blake2b_update_vector (blake2b_ctx_vector_t *ctx, const u32x *w, const u32 len)
+{
+  u32x w0[4];
+  u32x w1[4];
+  u32x w2[4];
+  u32x w3[4];
+  u32x w4[4];
+  u32x w5[4];
+  u32x w6[4];
+  u32x w7[4];
+
+  const int limit = (const int) len - 128; // int type needed, could be negative
+
+  int pos1;
+  int pos4;
+
+  for (pos1 = 0, pos4 = 0; pos1 < limit; pos1 += 128, pos4 += 32)
+  {
+    w0[0] = w[pos4 +  0];
+    w0[1] = w[pos4 +  1];
+    w0[2] = w[pos4 +  2];
+    w0[3] = w[pos4 +  3];
+    w1[0] = w[pos4 +  4];
+    w1[1] = w[pos4 +  5];
+    w1[2] = w[pos4 +  6];
+    w1[3] = w[pos4 +  7];
+    w2[0] = w[pos4 +  8];
+    w2[1] = w[pos4 +  9];
+    w2[2] = w[pos4 + 10];
+    w2[3] = w[pos4 + 11];
+    w3[0] = w[pos4 + 12];
+    w3[1] = w[pos4 + 13];
+    w3[2] = w[pos4 + 14];
+    w3[3] = w[pos4 + 15];
+    w4[0] = w[pos4 + 16];
+    w4[1] = w[pos4 + 17];
+    w4[2] = w[pos4 + 18];
+    w4[3] = w[pos4 + 19];
+    w5[0] = w[pos4 + 20];
+    w5[1] = w[pos4 + 21];
+    w5[2] = w[pos4 + 22];
+    w5[3] = w[pos4 + 23];
+    w6[0] = w[pos4 + 24];
+    w6[1] = w[pos4 + 25];
+    w6[2] = w[pos4 + 26];
+    w6[3] = w[pos4 + 27];
+    w7[0] = w[pos4 + 28];
+    w7[1] = w[pos4 + 29];
+    w7[2] = w[pos4 + 30];
+    w7[3] = w[pos4 + 31];
+
+    blake2b_update_vector_128 (ctx, w0, w1, w2, w3, w4, w5, w6, w7, 128);
+  }
+
+  w0[0] = w[pos4 +  0];
+  w0[1] = w[pos4 +  1];
+  w0[2] = w[pos4 +  2];
+  w0[3] =
w[pos4 + 3]; + w1[0] = w[pos4 + 4]; + w1[1] = w[pos4 + 5]; + w1[2] = w[pos4 + 6]; + w1[3] = w[pos4 + 7]; + w2[0] = w[pos4 + 8]; + w2[1] = w[pos4 + 9]; + w2[2] = w[pos4 + 10]; + w2[3] = w[pos4 + 11]; + w3[0] = w[pos4 + 12]; + w3[1] = w[pos4 + 13]; + w3[2] = w[pos4 + 14]; + w3[3] = w[pos4 + 15]; + w4[0] = w[pos4 + 16]; + w4[1] = w[pos4 + 17]; + w4[2] = w[pos4 + 18]; + w4[3] = w[pos4 + 19]; + w5[0] = w[pos4 + 20]; + w5[1] = w[pos4 + 21]; + w5[2] = w[pos4 + 22]; + w5[3] = w[pos4 + 23]; + w6[0] = w[pos4 + 24]; + w6[1] = w[pos4 + 25]; + w6[2] = w[pos4 + 26]; + w6[3] = w[pos4 + 27]; + w7[0] = w[pos4 + 28]; + w7[1] = w[pos4 + 29]; + w7[2] = w[pos4 + 30]; + w7[3] = w[pos4 + 31]; + + blake2b_update_vector_128 (ctx, w0, w1, w2, w3, w4, w5, w6, w7, len - (u32) pos1); +} + +DECLSPEC void blake2b_final_vector (blake2b_ctx_vector_t *ctx) +{ + blake2b_transform_vector (ctx->h, ctx->m, (u32x) ctx->len, BLAKE2B_FINAL); +} diff --git a/OpenCL/inc_hash_blake2b.h b/OpenCL/inc_hash_blake2b.h new file mode 100644 index 000000000..798b651b7 --- /dev/null +++ b/OpenCL/inc_hash_blake2b.h @@ -0,0 +1,90 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +#ifndef _INC_HASH_BLAKE2B_H +#define _INC_HASH_BLAKE2B_H + +#define BLAKE2B_UPDATE 0 +#define BLAKE2B_FINAL -1 + +#define BLAKE2B_G(k0,k1,a,b,c,d) \ +{ \ + a = a + b + m[k0]; \ + d = hc_rotr64_S (d ^ a, 32); \ + c = c + d; \ + b = hc_rotr64_S (b ^ c, 24); \ + a = a + b + m[k1]; \ + d = hc_rotr64_S (d ^ a, 16); \ + c = c + d; \ + b = hc_rotr64_S (b ^ c, 63); \ +} + +#define BLAKE2B_ROUND(c0,c1,c2,c3,c4,c5,c6,c7,c8,c9,ca,cb,cc,cd,ce,cf) \ +{ \ + BLAKE2B_G (c0, c1, v[0], v[4], v[ 8], v[12]); \ + BLAKE2B_G (c2, c3, v[1], v[5], v[ 9], v[13]); \ + BLAKE2B_G (c4, c5, v[2], v[6], v[10], v[14]); \ + BLAKE2B_G (c6, c7, v[3], v[7], v[11], v[15]); \ + BLAKE2B_G (c8, c9, v[0], v[5], v[10], v[15]); \ + BLAKE2B_G (ca, cb, v[1], v[6], v[11], v[12]); \ + BLAKE2B_G (cc, cd, v[2], v[7], v[ 8], v[13]); \ + BLAKE2B_G (ce, cf, v[3], v[4], 
v[ 9], v[14]); \ +} + +#define BLAKE2B_G_VECTOR(k0,k1,a,b,c,d) \ +{ \ + a = a + b + m[k0]; \ + d = hc_rotr64 (d ^ a, 32); \ + c = c + d; \ + b = hc_rotr64 (b ^ c, 24); \ + a = a + b + m[k1]; \ + d = hc_rotr64 (d ^ a, 16); \ + c = c + d; \ + b = hc_rotr64 (b ^ c, 63); \ +} + +#define BLAKE2B_ROUND_VECTOR(c0,c1,c2,c3,c4,c5,c6,c7,c8,c9,ca,cb,cc,cd,ce,cf) \ +{ \ + BLAKE2B_G_VECTOR (c0, c1, v[0], v[4], v[ 8], v[12]); \ + BLAKE2B_G_VECTOR (c2, c3, v[1], v[5], v[ 9], v[13]); \ + BLAKE2B_G_VECTOR (c4, c5, v[2], v[6], v[10], v[14]); \ + BLAKE2B_G_VECTOR (c6, c7, v[3], v[7], v[11], v[15]); \ + BLAKE2B_G_VECTOR (c8, c9, v[0], v[5], v[10], v[15]); \ + BLAKE2B_G_VECTOR (ca, cb, v[1], v[6], v[11], v[12]); \ + BLAKE2B_G_VECTOR (cc, cd, v[2], v[7], v[ 8], v[13]); \ + BLAKE2B_G_VECTOR (ce, cf, v[3], v[4], v[ 9], v[14]); \ +} + +typedef struct blake2b_ctx +{ + u64 m[16]; // buffer + u64 h[ 8]; // digest + + u32 len; + +} blake2b_ctx_t; + +typedef struct blake2b_ctx_vector +{ + u64x m[16]; // buffer + u64x h[ 8]; // digest + + u32 len; + +} blake2b_ctx_vector_t; + +DECLSPEC void blake2b_transform (u64 *h, const u64 *m, const u32 len, const u64 f0); +DECLSPEC void blake2b_init (blake2b_ctx_t *ctx); +DECLSPEC void blake2b_update (blake2b_ctx_t *ctx, const u32 *w, const u32 len); +DECLSPEC void blake2b_update_global (blake2b_ctx_t *ctx, GLOBAL_AS const u32 *w, const u32 len); +DECLSPEC void blake2b_final (blake2b_ctx_t *ctx); + +DECLSPEC void blake2b_transform_vector (u64x *h, const u64x *m, const u32x len, const u64 f0); +DECLSPEC void blake2b_init_vector (blake2b_ctx_vector_t *ctx); +DECLSPEC void blake2b_update_vector (blake2b_ctx_vector_t *ctx, const u32x *w, const u32 len); +DECLSPEC void blake2b_final_vector (blake2b_ctx_vector_t *ctx); + + +#endif // _INC_HASH_BLAKE2B_H diff --git a/OpenCL/m00600_a0-optimized.cl b/OpenCL/m00600_a0-optimized.cl index 0967e3cff..1a499c113 100644 --- a/OpenCL/m00600_a0-optimized.cl +++ b/OpenCL/m00600_a0-optimized.cl @@ -13,117 +13,15 @@ #include 
"inc_rp_optimized.h" #include "inc_rp_optimized.cl" #include "inc_simd.cl" +#include "inc_hash_blake2b.cl" #endif -typedef struct blake2 -{ - u64 h[8]; - u64 t[2]; - u64 f[2]; - u32 buflen; - u32 outlen; - -} blake2_t; - -#define BLAKE2B_FINAL 1 -#define BLAKE2B_UPDATE 0 - -#define BLAKE2B_G(k0,k1,a,b,c,d) \ - do { \ - a = a + b + m[(k0)]; \ - d = hc_rotr64 (d ^ a, 32); \ - c = c + d; \ - b = hc_rotr64 (b ^ c, 24); \ - a = a + b + m[(k1)]; \ - d = hc_rotr64 (d ^ a, 16); \ - c = c + d; \ - b = hc_rotr64 (b ^ c, 63); \ - } while (0) - -#define BLAKE2B_ROUND(c0,c1,c2,c3,c4,c5,c6,c7,c8,c9,ca,cb,cc,cd,ce,cf) \ - do { \ - BLAKE2B_G ((c0),(c1),v[ 0],v[ 4],v[ 8],v[12]); \ - BLAKE2B_G ((c2),(c3),v[ 1],v[ 5],v[ 9],v[13]); \ - BLAKE2B_G ((c4),(c5),v[ 2],v[ 6],v[10],v[14]); \ - BLAKE2B_G ((c6),(c7),v[ 3],v[ 7],v[11],v[15]); \ - BLAKE2B_G ((c8),(c9),v[ 0],v[ 5],v[10],v[15]); \ - BLAKE2B_G ((ca),(cb),v[ 1],v[ 6],v[11],v[12]); \ - BLAKE2B_G ((cc),(cd),v[ 2],v[ 7],v[ 8],v[13]); \ - BLAKE2B_G ((ce),(cf),v[ 3],v[ 4],v[ 9],v[14]); \ -} while (0) - -DECLSPEC void blake2b_transform (u64x *h, u64x *t, u64x *f, u64x *m, u64x *v, const u32x *w0, const u32x *w1, const u32x *w2, const u32x *w3, const u32x out_len, const u8 isFinal) -{ - if (isFinal) - f[0] = -1; - - t[0] += hl32_to_64 (0, out_len); - - m[ 0] = hl32_to_64 (w0[1], w0[0]); - m[ 1] = hl32_to_64 (w0[3], w0[2]); - m[ 2] = hl32_to_64 (w1[1], w1[0]); - m[ 3] = hl32_to_64 (w1[3], w1[2]); - m[ 4] = hl32_to_64 (w2[1], w2[0]); - m[ 5] = hl32_to_64 (w2[3], w2[2]); - m[ 6] = hl32_to_64 (w3[1], w3[0]); - m[ 7] = hl32_to_64 (w3[3], w3[2]); - m[ 8] = 0; - m[ 9] = 0; - m[10] = 0; - m[11] = 0; - m[12] = 0; - m[13] = 0; - m[14] = 0; - m[15] = 0; - - v[ 0] = h[0]; - v[ 1] = h[1]; - v[ 2] = h[2]; - v[ 3] = h[3]; - v[ 4] = h[4]; - v[ 5] = h[5]; - v[ 6] = h[6]; - v[ 7] = h[7]; - v[ 8] = BLAKE2B_IV_00; - v[ 9] = BLAKE2B_IV_01; - v[10] = BLAKE2B_IV_02; - v[11] = BLAKE2B_IV_03; - v[12] = BLAKE2B_IV_04 ^ t[0]; - v[13] = BLAKE2B_IV_05 ^ t[1]; - v[14] = 
BLAKE2B_IV_06 ^ f[0]; - v[15] = BLAKE2B_IV_07 ^ f[1]; - - BLAKE2B_ROUND ( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - BLAKE2B_ROUND (14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3); - BLAKE2B_ROUND (11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4); - BLAKE2B_ROUND ( 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8); - BLAKE2B_ROUND ( 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13); - BLAKE2B_ROUND ( 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9); - BLAKE2B_ROUND (12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11); - BLAKE2B_ROUND (13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10); - BLAKE2B_ROUND ( 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5); - BLAKE2B_ROUND (10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13 , 0); - BLAKE2B_ROUND ( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - BLAKE2B_ROUND (14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3); - - h[0] = h[0] ^ v[0] ^ v[ 8]; - h[1] = h[1] ^ v[1] ^ v[ 9]; - h[2] = h[2] ^ v[2] ^ v[10]; - h[3] = h[3] ^ v[3] ^ v[11]; - h[4] = h[4] ^ v[4] ^ v[12]; - h[5] = h[5] ^ v[5] ^ v[13]; - h[6] = h[6] ^ v[6] ^ v[14]; - h[7] = h[7] ^ v[7] ^ v[15]; -} - -KERNEL_FQ void m00600_m04 (KERN_ATTR_RULES_ESALT (blake2_t)) +KERNEL_FQ void m00600_m04 (KERN_ATTR_RULES ()) { /** - * modifier + * base */ - const u64 lid = get_local_id (0); - const u64 gid = get_global_id (0); if (gid >= gid_max) return; @@ -142,24 +40,6 @@ KERNEL_FQ void m00600_m04 (KERN_ATTR_RULES_ESALT (blake2_t)) const u32 pw_len = pws[gid].pw_len & 63; - u64 tmp_h[8]; - u64 tmp_t[2]; - u64 tmp_f[2]; - - tmp_h[0] = esalt_bufs[digests_offset].h[0]; - tmp_h[1] = esalt_bufs[digests_offset].h[1]; - tmp_h[2] = esalt_bufs[digests_offset].h[2]; - tmp_h[3] = esalt_bufs[digests_offset].h[3]; - tmp_h[4] = esalt_bufs[digests_offset].h[4]; - tmp_h[5] = esalt_bufs[digests_offset].h[5]; - tmp_h[6] = esalt_bufs[digests_offset].h[6]; - tmp_h[7] = esalt_bufs[digests_offset].h[7]; - - tmp_t[0] = 
esalt_bufs[digests_offset].t[0]; - tmp_t[1] = esalt_bufs[digests_offset].t[1]; - tmp_f[0] = esalt_bufs[digests_offset].f[0]; - tmp_f[1] = esalt_bufs[digests_offset].f[1]; - /** * loop */ @@ -173,64 +53,61 @@ KERNEL_FQ void m00600_m04 (KERN_ATTR_RULES_ESALT (blake2_t)) const u32x out_len = apply_rules_vect_optimized (pw_buf0, pw_buf1, pw_len, rules_buf, il_pos, w0, w1); - u64x digest[8]; u64x m[16]; - u64x v[16]; + + m[ 0] = hl32_to_64 (w0[1], w0[0]); + m[ 1] = hl32_to_64 (w0[3], w0[2]); + m[ 2] = hl32_to_64 (w1[1], w1[0]); + m[ 3] = hl32_to_64 (w1[3], w1[2]); + m[ 4] = hl32_to_64 (w2[1], w2[0]); + m[ 5] = hl32_to_64 (w2[3], w2[2]); + m[ 6] = hl32_to_64 (w3[1], w3[0]); + m[ 7] = hl32_to_64 (w3[3], w3[2]); + m[ 8] = 0; + m[ 9] = 0; + m[10] = 0; + m[11] = 0; + m[12] = 0; + m[13] = 0; + m[14] = 0; + m[15] = 0; u64x h[8]; - u64x t[2]; - u64x f[2]; - - h[0] = tmp_h[0]; - h[1] = tmp_h[1]; - h[2] = tmp_h[2]; - h[3] = tmp_h[3]; - h[4] = tmp_h[4]; - h[5] = tmp_h[5]; - h[6] = tmp_h[6]; - h[7] = tmp_h[7]; - - t[0] = tmp_t[0]; - t[1] = tmp_t[1]; - f[0] = tmp_f[0]; - f[1] = tmp_f[1]; - - blake2b_transform (h, t, f, m, v, w0, w1, w2, w3, out_len, BLAKE2B_FINAL); - - digest[0] = h[0]; - digest[1] = h[1]; - digest[2] = h[2]; - digest[3] = h[3]; - digest[4] = h[4]; - digest[5] = h[5]; - digest[6] = h[6]; - digest[7] = h[7]; - - const u32x r0 = h32_from_64 (digest[0]); - const u32x r1 = l32_from_64 (digest[0]); - const u32x r2 = h32_from_64 (digest[1]); - const u32x r3 = l32_from_64 (digest[1]); + + h[0] = BLAKE2B_IV_00 ^ 0x01010040; + h[1] = BLAKE2B_IV_01; + h[2] = BLAKE2B_IV_02; + h[3] = BLAKE2B_IV_03; + h[4] = BLAKE2B_IV_04; + h[5] = BLAKE2B_IV_05; + h[6] = BLAKE2B_IV_06; + h[7] = BLAKE2B_IV_07; + + blake2b_transform_vector (h, m, out_len, BLAKE2B_FINAL); + + const u32x r0 = h32_from_64 (h[0]); + const u32x r1 = l32_from_64 (h[0]); + const u32x r2 = h32_from_64 (h[1]); + const u32x r3 = l32_from_64 (h[1]); COMPARE_M_SIMD (r0, r1, r2, r3); } } -KERNEL_FQ void m00600_m08 
(KERN_ATTR_RULES_ESALT (blake2_t)) +KERNEL_FQ void m00600_m08 (KERN_ATTR_RULES ()) { } -KERNEL_FQ void m00600_m16 (KERN_ATTR_RULES_ESALT (blake2_t)) +KERNEL_FQ void m00600_m16 (KERN_ATTR_RULES ()) { } -KERNEL_FQ void m00600_s04 (KERN_ATTR_RULES_ESALT (blake2_t)) +KERNEL_FQ void m00600_s04 (KERN_ATTR_RULES ()) { /** - * modifier + * base */ - const u64 lid = get_local_id (0); - const u64 gid = get_global_id (0); if (gid >= gid_max) return; @@ -249,24 +126,6 @@ KERNEL_FQ void m00600_s04 (KERN_ATTR_RULES_ESALT (blake2_t)) const u32 pw_len = pws[gid].pw_len & 63; - u64 tmp_h[8]; - u64 tmp_t[2]; - u64 tmp_f[2]; - - tmp_h[0] = esalt_bufs[digests_offset].h[0]; - tmp_h[1] = esalt_bufs[digests_offset].h[1]; - tmp_h[2] = esalt_bufs[digests_offset].h[2]; - tmp_h[3] = esalt_bufs[digests_offset].h[3]; - tmp_h[4] = esalt_bufs[digests_offset].h[4]; - tmp_h[5] = esalt_bufs[digests_offset].h[5]; - tmp_h[6] = esalt_bufs[digests_offset].h[6]; - tmp_h[7] = esalt_bufs[digests_offset].h[7]; - - tmp_t[0] = esalt_bufs[digests_offset].t[0]; - tmp_t[1] = esalt_bufs[digests_offset].t[1]; - tmp_f[0] = esalt_bufs[digests_offset].f[0]; - tmp_f[1] = esalt_bufs[digests_offset].f[1]; - /** * digest */ @@ -292,52 +151,51 @@ KERNEL_FQ void m00600_s04 (KERN_ATTR_RULES_ESALT (blake2_t)) const u32x out_len = apply_rules_vect_optimized (pw_buf0, pw_buf1, pw_len, rules_buf, il_pos, w0, w1); - u64x digest[8]; u64x m[16]; - u64x v[16]; + + m[ 0] = hl32_to_64 (w0[1], w0[0]); + m[ 1] = hl32_to_64 (w0[3], w0[2]); + m[ 2] = hl32_to_64 (w1[1], w1[0]); + m[ 3] = hl32_to_64 (w1[3], w1[2]); + m[ 4] = hl32_to_64 (w2[1], w2[0]); + m[ 5] = hl32_to_64 (w2[3], w2[2]); + m[ 6] = hl32_to_64 (w3[1], w3[0]); + m[ 7] = hl32_to_64 (w3[3], w3[2]); + m[ 8] = 0; + m[ 9] = 0; + m[10] = 0; + m[11] = 0; + m[12] = 0; + m[13] = 0; + m[14] = 0; + m[15] = 0; u64x h[8]; - u64x t[2]; - u64x f[2]; - - h[0] = tmp_h[0]; - h[1] = tmp_h[1]; - h[2] = tmp_h[2]; - h[3] = tmp_h[3]; - h[4] = tmp_h[4]; - h[5] = tmp_h[5]; - h[6] = tmp_h[6]; - h[7] 
= tmp_h[7]; - - t[0] = tmp_t[0]; - t[1] = tmp_t[1]; - f[0] = tmp_f[0]; - f[1] = tmp_f[1]; - - blake2b_transform (h, t, f, m, v, w0, w1, w2, w3, out_len, BLAKE2B_FINAL); - - digest[0] = h[0]; - digest[1] = h[1]; - digest[2] = h[2]; - digest[3] = h[3]; - digest[4] = h[4]; - digest[5] = h[5]; - digest[6] = h[6]; - digest[7] = h[7]; - - const u32x r0 = h32_from_64 (digest[0]); - const u32x r1 = l32_from_64 (digest[0]); - const u32x r2 = h32_from_64 (digest[1]); - const u32x r3 = l32_from_64 (digest[1]); + + h[0] = BLAKE2B_IV_00 ^ 0x01010040; + h[1] = BLAKE2B_IV_01; + h[2] = BLAKE2B_IV_02; + h[3] = BLAKE2B_IV_03; + h[4] = BLAKE2B_IV_04; + h[5] = BLAKE2B_IV_05; + h[6] = BLAKE2B_IV_06; + h[7] = BLAKE2B_IV_07; + + blake2b_transform_vector (h, m, out_len, BLAKE2B_FINAL); + + const u32x r0 = h32_from_64 (h[0]); + const u32x r1 = l32_from_64 (h[0]); + const u32x r2 = h32_from_64 (h[1]); + const u32x r3 = l32_from_64 (h[1]); COMPARE_S_SIMD (r0, r1, r2, r3); } } -KERNEL_FQ void m00600_s08 (KERN_ATTR_RULES_ESALT (blake2_t)) +KERNEL_FQ void m00600_s08 (KERN_ATTR_RULES ()) { } -KERNEL_FQ void m00600_s16 (KERN_ATTR_RULES_ESALT (blake2_t)) +KERNEL_FQ void m00600_s16 (KERN_ATTR_RULES ()) { } diff --git a/OpenCL/m00600_a0-pure.cl b/OpenCL/m00600_a0-pure.cl new file mode 100644 index 000000000..07dd567f0 --- /dev/null +++ b/OpenCL/m00600_a0-pure.cl @@ -0,0 +1,111 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +//#define NEW_SIMD_CODE + +#ifdef KERNEL_STATIC +#include "inc_vendor.h" +#include "inc_types.h" +#include "inc_platform.cl" +#include "inc_common.cl" +#include "inc_rp.h" +#include "inc_rp.cl" +#include "inc_scalar.cl" +#include "inc_hash_blake2b.cl" +#endif + +KERNEL_FQ void m00600_mxx (KERN_ATTR_RULES ()) +{ + /** + * base + */ + + const u64 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * base + */ + + COPY_PW (pws[gid]); + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + pw_t tmp = PASTE_PW; + + 
tmp.pw_len = apply_rules (rules_buf[il_pos].cmds, tmp.i, tmp.pw_len); + + blake2b_ctx_t ctx; + + blake2b_init (&ctx); + blake2b_update (&ctx, tmp.i, tmp.pw_len); + blake2b_final (&ctx); + + const u32 r0 = h32_from_64_S (ctx.h[0]); + const u32 r1 = l32_from_64_S (ctx.h[0]); + const u32 r2 = h32_from_64_S (ctx.h[1]); + const u32 r3 = l32_from_64_S (ctx.h[1]); + + COMPARE_M_SCALAR (r0, r1, r2, r3); + } +} + +KERNEL_FQ void m00600_sxx (KERN_ATTR_RULES ()) +{ + /** + * base + */ + + const u64 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * digest + */ + + const u32 search[4] = + { + digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + digests_buf[digests_offset].digest_buf[DGST_R2], + digests_buf[digests_offset].digest_buf[DGST_R3] + }; + + /** + * base + */ + + COPY_PW (pws[gid]); + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + pw_t tmp = PASTE_PW; + + tmp.pw_len = apply_rules (rules_buf[il_pos].cmds, tmp.i, tmp.pw_len); + + blake2b_ctx_t ctx; + + blake2b_init (&ctx); + blake2b_update (&ctx, tmp.i, tmp.pw_len); + blake2b_final (&ctx); + + const u32 r0 = h32_from_64_S (ctx.h[0]); + const u32 r1 = l32_from_64_S (ctx.h[0]); + const u32 r2 = h32_from_64_S (ctx.h[1]); + const u32 r3 = l32_from_64_S (ctx.h[1]); + + COMPARE_S_SCALAR (r0, r1, r2, r3); + } +} diff --git a/OpenCL/m00600_a1-optimized.cl b/OpenCL/m00600_a1-optimized.cl index dea787469..64c852492 100644 --- a/OpenCL/m00600_a1-optimized.cl +++ b/OpenCL/m00600_a1-optimized.cl @@ -11,117 +11,18 @@ #include "inc_platform.cl" #include "inc_common.cl" #include "inc_simd.cl" +#include "inc_hash_blake2b.cl" #endif -typedef struct blake2 -{ - u64 h[8]; - u64 t[2]; - u64 f[2]; - u32 buflen; - u32 outlen; - -} blake2_t; - -#define BLAKE2B_FINAL 1 -#define BLAKE2B_UPDATE 0 - -#define BLAKE2B_G(k0,k1,a,b,c,d) \ - do { \ - a = a + b + m[(k0)]; \ - d = hc_rotr64 (d ^ a, 32); \ - c = c + d; \ - b = hc_rotr64 (b ^ c, 24); \ - a = a 
+ b + m[(k1)]; \ - d = hc_rotr64 (d ^ a, 16); \ - c = c + d; \ - b = hc_rotr64 (b ^ c, 63); \ - } while (0) - -#define BLAKE2B_ROUND(c0,c1,c2,c3,c4,c5,c6,c7,c8,c9,ca,cb,cc,cd,ce,cf) \ - do { \ - BLAKE2B_G ((c0),(c1),v[ 0],v[ 4],v[ 8],v[12]); \ - BLAKE2B_G ((c2),(c3),v[ 1],v[ 5],v[ 9],v[13]); \ - BLAKE2B_G ((c4),(c5),v[ 2],v[ 6],v[10],v[14]); \ - BLAKE2B_G ((c6),(c7),v[ 3],v[ 7],v[11],v[15]); \ - BLAKE2B_G ((c8),(c9),v[ 0],v[ 5],v[10],v[15]); \ - BLAKE2B_G ((ca),(cb),v[ 1],v[ 6],v[11],v[12]); \ - BLAKE2B_G ((cc),(cd),v[ 2],v[ 7],v[ 8],v[13]); \ - BLAKE2B_G ((ce),(cf),v[ 3],v[ 4],v[ 9],v[14]); \ -} while (0) - -DECLSPEC void blake2b_transform (u64x *h, u64x *t, u64x *f, u64x *m, u64x *v, const u32x *w0, const u32x *w1, const u32x *w2, const u32x *w3, const u32x out_len, const u8 isFinal) -{ - if (isFinal) - f[0] = -1; - - t[0] += hl32_to_64 (0, out_len); - - m[ 0] = hl32_to_64 (w0[1], w0[0]); - m[ 1] = hl32_to_64 (w0[3], w0[2]); - m[ 2] = hl32_to_64 (w1[1], w1[0]); - m[ 3] = hl32_to_64 (w1[3], w1[2]); - m[ 4] = hl32_to_64 (w2[1], w2[0]); - m[ 5] = hl32_to_64 (w2[3], w2[2]); - m[ 6] = hl32_to_64 (w3[1], w3[0]); - m[ 7] = hl32_to_64 (w3[3], w3[2]); - m[ 8] = 0; - m[ 9] = 0; - m[10] = 0; - m[11] = 0; - m[12] = 0; - m[13] = 0; - m[14] = 0; - m[15] = 0; - - v[ 0] = h[0]; - v[ 1] = h[1]; - v[ 2] = h[2]; - v[ 3] = h[3]; - v[ 4] = h[4]; - v[ 5] = h[5]; - v[ 6] = h[6]; - v[ 7] = h[7]; - v[ 8] = BLAKE2B_IV_00; - v[ 9] = BLAKE2B_IV_01; - v[10] = BLAKE2B_IV_02; - v[11] = BLAKE2B_IV_03; - v[12] = BLAKE2B_IV_04 ^ t[0]; - v[13] = BLAKE2B_IV_05 ^ t[1]; - v[14] = BLAKE2B_IV_06 ^ f[0]; - v[15] = BLAKE2B_IV_07 ^ f[1]; - - BLAKE2B_ROUND ( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - BLAKE2B_ROUND (14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3); - BLAKE2B_ROUND (11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4); - BLAKE2B_ROUND ( 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8); - BLAKE2B_ROUND ( 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13); - 
BLAKE2B_ROUND ( 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9); - BLAKE2B_ROUND (12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11); - BLAKE2B_ROUND (13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10); - BLAKE2B_ROUND ( 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5); - BLAKE2B_ROUND (10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13 , 0); - BLAKE2B_ROUND ( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - BLAKE2B_ROUND (14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3); - - h[0] = h[0] ^ v[0] ^ v[ 8]; - h[1] = h[1] ^ v[1] ^ v[ 9]; - h[2] = h[2] ^ v[2] ^ v[10]; - h[3] = h[3] ^ v[3] ^ v[11]; - h[4] = h[4] ^ v[4] ^ v[12]; - h[5] = h[5] ^ v[5] ^ v[13]; - h[6] = h[6] ^ v[6] ^ v[14]; - h[7] = h[7] ^ v[7] ^ v[15]; -} - -KERNEL_FQ void m00600_m04 (KERN_ATTR_ESALT (blake2_t)) +KERNEL_FQ void m00600_m04 (KERN_ATTR_BASIC ()) { /** - * modifier + * base */ const u64 gid = get_global_id (0); - const u64 lid = get_local_id (0); + + if (gid >= gid_max) return; u32 pw_buf0[4]; u32 pw_buf1[4]; @@ -137,24 +38,6 @@ KERNEL_FQ void m00600_m04 (KERN_ATTR_ESALT (blake2_t)) const u32 pw_l_len = pws[gid].pw_len & 63; - u64 tmp_h[8]; - u64 tmp_t[2]; - u64 tmp_f[2]; - - tmp_h[0] = esalt_bufs[digests_offset].h[0]; - tmp_h[1] = esalt_bufs[digests_offset].h[1]; - tmp_h[2] = esalt_bufs[digests_offset].h[2]; - tmp_h[3] = esalt_bufs[digests_offset].h[3]; - tmp_h[4] = esalt_bufs[digests_offset].h[4]; - tmp_h[5] = esalt_bufs[digests_offset].h[5]; - tmp_h[6] = esalt_bufs[digests_offset].h[6]; - tmp_h[7] = esalt_bufs[digests_offset].h[7]; - - tmp_t[0] = esalt_bufs[digests_offset].t[0]; - tmp_t[1] = esalt_bufs[digests_offset].t[1]; - tmp_f[0] = esalt_bufs[digests_offset].f[0]; - tmp_f[1] = esalt_bufs[digests_offset].f[1]; - /** * loop */ @@ -228,64 +111,61 @@ KERNEL_FQ void m00600_m04 (KERN_ATTR_ESALT (blake2_t)) w3[2] = wordl3[2] | wordr3[2]; w3[3] = wordl3[3] | wordr3[3]; - u64x digest[8]; u64x m[16]; - u64x v[16]; + + m[ 0] = hl32_to_64 (w0[1], w0[0]); + m[ 1] = 
hl32_to_64 (w0[3], w0[2]); + m[ 2] = hl32_to_64 (w1[1], w1[0]); + m[ 3] = hl32_to_64 (w1[3], w1[2]); + m[ 4] = hl32_to_64 (w2[1], w2[0]); + m[ 5] = hl32_to_64 (w2[3], w2[2]); + m[ 6] = hl32_to_64 (w3[1], w3[0]); + m[ 7] = hl32_to_64 (w3[3], w3[2]); + m[ 8] = 0; + m[ 9] = 0; + m[10] = 0; + m[11] = 0; + m[12] = 0; + m[13] = 0; + m[14] = 0; + m[15] = 0; u64x h[8]; - u64x t[2]; - u64x f[2]; - - h[0] = tmp_h[0]; - h[1] = tmp_h[1]; - h[2] = tmp_h[2]; - h[3] = tmp_h[3]; - h[4] = tmp_h[4]; - h[5] = tmp_h[5]; - h[6] = tmp_h[6]; - h[7] = tmp_h[7]; - - t[0] = tmp_t[0]; - t[1] = tmp_t[1]; - f[0] = tmp_f[0]; - f[1] = tmp_f[1]; - - blake2b_transform (h, t, f, m, v, w0, w1, w2, w3, out_len, BLAKE2B_FINAL); - - digest[0] = h[0]; - digest[1] = h[1]; - digest[2] = h[2]; - digest[3] = h[3]; - digest[4] = h[4]; - digest[5] = h[5]; - digest[6] = h[6]; - digest[7] = h[7]; - - const u32x r0 = h32_from_64 (digest[0]); - const u32x r1 = l32_from_64 (digest[0]); - const u32x r2 = h32_from_64 (digest[1]); - const u32x r3 = l32_from_64 (digest[1]); + + h[0] = BLAKE2B_IV_00 ^ 0x01010040; + h[1] = BLAKE2B_IV_01; + h[2] = BLAKE2B_IV_02; + h[3] = BLAKE2B_IV_03; + h[4] = BLAKE2B_IV_04; + h[5] = BLAKE2B_IV_05; + h[6] = BLAKE2B_IV_06; + h[7] = BLAKE2B_IV_07; + + blake2b_transform_vector (h, m, out_len, BLAKE2B_FINAL); + + const u32x r0 = h32_from_64 (h[0]); + const u32x r1 = l32_from_64 (h[0]); + const u32x r2 = h32_from_64 (h[1]); + const u32x r3 = l32_from_64 (h[1]); COMPARE_M_SIMD (r0, r1, r2, r3); } } -KERNEL_FQ void m00600_m08 (KERN_ATTR_ESALT (blake2_t)) +KERNEL_FQ void m00600_m08 (KERN_ATTR_BASIC ()) { } -KERNEL_FQ void m00600_m16 (KERN_ATTR_ESALT (blake2_t)) +KERNEL_FQ void m00600_m16 (KERN_ATTR_BASIC ()) { } -KERNEL_FQ void m00600_s04 (KERN_ATTR_ESALT (blake2_t)) +KERNEL_FQ void m00600_s04 (KERN_ATTR_BASIC ()) { /** - * modifier + * base */ - const u64 lid = get_local_id (0); - const u64 gid = get_global_id (0); if (gid >= gid_max) return; @@ -304,24 +184,6 @@ KERNEL_FQ void m00600_s04 
(KERN_ATTR_ESALT (blake2_t)) const u32 pw_l_len = pws[gid].pw_len & 63; - u64 tmp_h[8]; - u64 tmp_t[2]; - u64 tmp_f[2]; - - tmp_h[0] = esalt_bufs[digests_offset].h[0]; - tmp_h[1] = esalt_bufs[digests_offset].h[1]; - tmp_h[2] = esalt_bufs[digests_offset].h[2]; - tmp_h[3] = esalt_bufs[digests_offset].h[3]; - tmp_h[4] = esalt_bufs[digests_offset].h[4]; - tmp_h[5] = esalt_bufs[digests_offset].h[5]; - tmp_h[6] = esalt_bufs[digests_offset].h[6]; - tmp_h[7] = esalt_bufs[digests_offset].h[7]; - - tmp_t[0] = esalt_bufs[digests_offset].t[0]; - tmp_t[1] = esalt_bufs[digests_offset].t[1]; - tmp_f[0] = esalt_bufs[digests_offset].f[0]; - tmp_f[1] = esalt_bufs[digests_offset].f[1]; - /** * digest */ @@ -407,52 +269,51 @@ KERNEL_FQ void m00600_s04 (KERN_ATTR_ESALT (blake2_t)) w3[2] = wordl3[2] | wordr3[2]; w3[3] = wordl3[3] | wordr3[3]; - u64x digest[8]; u64x m[16]; - u64x v[16]; + + m[ 0] = hl32_to_64 (w0[1], w0[0]); + m[ 1] = hl32_to_64 (w0[3], w0[2]); + m[ 2] = hl32_to_64 (w1[1], w1[0]); + m[ 3] = hl32_to_64 (w1[3], w1[2]); + m[ 4] = hl32_to_64 (w2[1], w2[0]); + m[ 5] = hl32_to_64 (w2[3], w2[2]); + m[ 6] = hl32_to_64 (w3[1], w3[0]); + m[ 7] = hl32_to_64 (w3[3], w3[2]); + m[ 8] = 0; + m[ 9] = 0; + m[10] = 0; + m[11] = 0; + m[12] = 0; + m[13] = 0; + m[14] = 0; + m[15] = 0; u64x h[8]; - u64x t[2]; - u64x f[2]; - - h[0] = tmp_h[0]; - h[1] = tmp_h[1]; - h[2] = tmp_h[2]; - h[3] = tmp_h[3]; - h[4] = tmp_h[4]; - h[5] = tmp_h[5]; - h[6] = tmp_h[6]; - h[7] = tmp_h[7]; - - t[0] = tmp_t[0]; - t[1] = tmp_t[1]; - f[0] = tmp_f[0]; - f[1] = tmp_f[1]; - - blake2b_transform (h, t, f, m, v, w0, w1, w2, w3, out_len, BLAKE2B_FINAL); - - digest[0] = h[0]; - digest[1] = h[1]; - digest[2] = h[2]; - digest[3] = h[3]; - digest[4] = h[4]; - digest[5] = h[5]; - digest[6] = h[6]; - digest[7] = h[7]; - - const u32x r0 = h32_from_64 (digest[0]); - const u32x r1 = l32_from_64 (digest[0]); - const u32x r2 = h32_from_64 (digest[1]); - const u32x r3 = l32_from_64 (digest[1]); + + h[0] = BLAKE2B_IV_00 ^ 
0x01010040; + h[1] = BLAKE2B_IV_01; + h[2] = BLAKE2B_IV_02; + h[3] = BLAKE2B_IV_03; + h[4] = BLAKE2B_IV_04; + h[5] = BLAKE2B_IV_05; + h[6] = BLAKE2B_IV_06; + h[7] = BLAKE2B_IV_07; + + blake2b_transform_vector (h, m, out_len, BLAKE2B_FINAL); + + const u32x r0 = h32_from_64 (h[0]); + const u32x r1 = l32_from_64 (h[0]); + const u32x r2 = h32_from_64 (h[1]); + const u32x r3 = l32_from_64 (h[1]); COMPARE_S_SIMD (r0, r1, r2, r3); } } -KERNEL_FQ void m00600_s08 (KERN_ATTR_ESALT (blake2_t)) +KERNEL_FQ void m00600_s08 (KERN_ATTR_BASIC ()) { } -KERNEL_FQ void m00600_s16 (KERN_ATTR_ESALT (blake2_t)) +KERNEL_FQ void m00600_s16 (KERN_ATTR_BASIC ()) { } diff --git a/OpenCL/m00600_a1-pure.cl b/OpenCL/m00600_a1-pure.cl new file mode 100644 index 000000000..4cc7c9707 --- /dev/null +++ b/OpenCL/m00600_a1-pure.cl @@ -0,0 +1,109 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +//#define NEW_SIMD_CODE + +#ifdef KERNEL_STATIC +#include "inc_vendor.h" +#include "inc_types.h" +#include "inc_platform.cl" +#include "inc_common.cl" +#include "inc_scalar.cl" +#include "inc_hash_blake2b.cl" +#endif + +KERNEL_FQ void m00600_mxx (KERN_ATTR_BASIC ()) +{ + /** + * base + */ + + const u64 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * base + */ + + blake2b_ctx_t ctx0; + + blake2b_init (&ctx0); + + blake2b_update_global (&ctx0, pws[gid].i, pws[gid].pw_len); + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + blake2b_ctx_t ctx = ctx0; + + blake2b_update_global (&ctx, combs_buf[il_pos].i, combs_buf[il_pos].pw_len); + + blake2b_final (&ctx); + + const u32 r0 = h32_from_64_S (ctx.h[0]); + const u32 r1 = l32_from_64_S (ctx.h[0]); + const u32 r2 = h32_from_64_S (ctx.h[1]); + const u32 r3 = l32_from_64_S (ctx.h[1]); + + COMPARE_M_SCALAR (r0, r1, r2, r3); + } +} + +KERNEL_FQ void m00600_sxx (KERN_ATTR_BASIC ()) +{ + /** + * base + */ + + const u64 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * digest + */ + + 
const u32 search[4] = + { + digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + digests_buf[digests_offset].digest_buf[DGST_R2], + digests_buf[digests_offset].digest_buf[DGST_R3] + }; + + /** + * base + */ + + blake2b_ctx_t ctx0; + + blake2b_init (&ctx0); + + blake2b_update_global (&ctx0, pws[gid].i, pws[gid].pw_len); + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + blake2b_ctx_t ctx = ctx0; + + blake2b_update_global (&ctx, combs_buf[il_pos].i, combs_buf[il_pos].pw_len); + + blake2b_final (&ctx); + + const u32 r0 = h32_from_64_S (ctx.h[0]); + const u32 r1 = l32_from_64_S (ctx.h[0]); + const u32 r2 = h32_from_64_S (ctx.h[1]); + const u32 r3 = l32_from_64_S (ctx.h[1]); + + COMPARE_S_SCALAR (r0, r1, r2, r3); + } +} diff --git a/OpenCL/m00600_a3-optimized.cl b/OpenCL/m00600_a3-optimized.cl index 2fa9e46b6..20f9e7327 100644 --- a/OpenCL/m00600_a3-optimized.cl +++ b/OpenCL/m00600_a3-optimized.cl @@ -11,141 +11,22 @@ #include "inc_platform.cl" #include "inc_common.cl" #include "inc_simd.cl" +#include "inc_hash_blake2b.cl" #endif -typedef struct blake2 -{ - u64 h[8]; - u64 t[2]; - u64 f[2]; - u32 buflen; - u32 outlen; - -} blake2_t; - -#define BLAKE2B_FINAL 1 -#define BLAKE2B_UPDATE 0 - -#define BLAKE2B_G(k0,k1,a,b,c,d) \ - do { \ - a = a + b + m[(k0)]; \ - d = hc_rotr64 (d ^ a, 32); \ - c = c + d; \ - b = hc_rotr64 (b ^ c, 24); \ - a = a + b + m[(k1)]; \ - d = hc_rotr64 (d ^ a, 16); \ - c = c + d; \ - b = hc_rotr64 (b ^ c, 63); \ - } while (0) - -#define BLAKE2B_ROUND(c0,c1,c2,c3,c4,c5,c6,c7,c8,c9,ca,cb,cc,cd,ce,cf) \ - do { \ - BLAKE2B_G ((c0),(c1),v[ 0],v[ 4],v[ 8],v[12]); \ - BLAKE2B_G ((c2),(c3),v[ 1],v[ 5],v[ 9],v[13]); \ - BLAKE2B_G ((c4),(c5),v[ 2],v[ 6],v[10],v[14]); \ - BLAKE2B_G ((c6),(c7),v[ 3],v[ 7],v[11],v[15]); \ - BLAKE2B_G ((c8),(c9),v[ 0],v[ 5],v[10],v[15]); \ - BLAKE2B_G ((ca),(cb),v[ 1],v[ 6],v[11],v[12]); \ - BLAKE2B_G ((cc),(cd),v[ 2],v[ 7],v[ 8],v[13]); \ - BLAKE2B_G 
((ce),(cf),v[ 3],v[ 4],v[ 9],v[14]); \ -} while (0) - -DECLSPEC void blake2b_transform (u64x *h, u64x *t, u64x *f, u64x *m, u64x *v, const u32x *w0, const u32x *w1, const u32x *w2, const u32x *w3, const u32x out_len, const u8 isFinal) -{ - if (isFinal) - f[0] = -1; - - t[0] += hl32_to_64 (0, out_len); - - m[ 0] = hl32_to_64 (w0[1], w0[0]); - m[ 1] = hl32_to_64 (w0[3], w0[2]); - m[ 2] = hl32_to_64 (w1[1], w1[0]); - m[ 3] = hl32_to_64 (w1[3], w1[2]); - m[ 4] = hl32_to_64 (w2[1], w2[0]); - m[ 5] = hl32_to_64 (w2[3], w2[2]); - m[ 6] = hl32_to_64 (w3[1], w3[0]); - m[ 7] = hl32_to_64 (w3[3], w3[2]); - m[ 8] = 0; - m[ 9] = 0; - m[10] = 0; - m[11] = 0; - m[12] = 0; - m[13] = 0; - m[14] = 0; - m[15] = 0; - - v[ 0] = h[0]; - v[ 1] = h[1]; - v[ 2] = h[2]; - v[ 3] = h[3]; - v[ 4] = h[4]; - v[ 5] = h[5]; - v[ 6] = h[6]; - v[ 7] = h[7]; - v[ 8] = BLAKE2B_IV_00; - v[ 9] = BLAKE2B_IV_01; - v[10] = BLAKE2B_IV_02; - v[11] = BLAKE2B_IV_03; - v[12] = BLAKE2B_IV_04 ^ t[0]; - v[13] = BLAKE2B_IV_05 ^ t[1]; - v[14] = BLAKE2B_IV_06 ^ f[0]; - v[15] = BLAKE2B_IV_07 ^ f[1]; - - BLAKE2B_ROUND ( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - BLAKE2B_ROUND (14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3); - BLAKE2B_ROUND (11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4); - BLAKE2B_ROUND ( 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8); - BLAKE2B_ROUND ( 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13); - BLAKE2B_ROUND ( 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9); - BLAKE2B_ROUND (12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11); - BLAKE2B_ROUND (13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10); - BLAKE2B_ROUND ( 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5); - BLAKE2B_ROUND (10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13 , 0); - BLAKE2B_ROUND ( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - BLAKE2B_ROUND (14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3); - - h[0] = h[0] ^ v[0] ^ v[ 8]; - h[1] = h[1] ^ v[1] ^ v[ 9]; - 
h[2] = h[2] ^ v[2] ^ v[10]; - h[3] = h[3] ^ v[3] ^ v[11]; - h[4] = h[4] ^ v[4] ^ v[12]; - h[5] = h[5] ^ v[5] ^ v[13]; - h[6] = h[6] ^ v[6] ^ v[14]; - h[7] = h[7] ^ v[7] ^ v[15]; -} - -KERNEL_FQ void m00600_m04 (KERN_ATTR_VECTOR_ESALT (blake2_t)) +DECLSPEC void m00600m (u32 *w, const u32 pw_len, KERN_ATTR_VECTOR ()) { /** * modifier */ const u64 gid = get_global_id (0); - const u64 lid = get_local_id (0); - - u64 tmp_h[8]; - u64 tmp_t[2]; - u64 tmp_f[2]; - - tmp_h[0] = esalt_bufs[digests_offset].h[0]; - tmp_h[1] = esalt_bufs[digests_offset].h[1]; - tmp_h[2] = esalt_bufs[digests_offset].h[2]; - tmp_h[3] = esalt_bufs[digests_offset].h[3]; - tmp_h[4] = esalt_bufs[digests_offset].h[4]; - tmp_h[5] = esalt_bufs[digests_offset].h[5]; - tmp_h[6] = esalt_bufs[digests_offset].h[6]; - tmp_h[7] = esalt_bufs[digests_offset].h[7]; - - tmp_t[0] = esalt_bufs[digests_offset].t[0]; - tmp_t[1] = esalt_bufs[digests_offset].t[1]; - tmp_f[0] = esalt_bufs[digests_offset].f[0]; - tmp_f[1] = esalt_bufs[digests_offset].f[1]; /** * loop */ - u32 w0l = pws[gid].i[0]; + u32 w0l = w[0]; for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) { @@ -158,202 +39,88 @@ KERNEL_FQ void m00600_m04 (KERN_ATTR_VECTOR_ESALT (blake2_t)) u32x w3[4]; w0[0] = w0x; - w0[1] = pws[gid].i[ 1]; - w0[2] = pws[gid].i[ 2]; - w0[3] = pws[gid].i[ 3]; - w1[0] = 0; - w1[1] = 0; - w1[2] = 0; - w1[3] = 0; - w2[0] = 0; - w2[1] = 0; - w2[2] = 0; - w2[3] = 0; - w3[0] = 0; - w3[1] = 0; - w3[2] = 0; - w3[3] = 0; - - u32x out_len = pws[gid].pw_len; - - u64x digest[8]; + w0[1] = w[ 1]; + w0[2] = w[ 2]; + w0[3] = w[ 3]; + w1[0] = w[ 4]; + w1[1] = w[ 5]; + w1[2] = w[ 6]; + w1[3] = w[ 7]; + w2[0] = w[ 8]; + w2[1] = w[ 9]; + w2[2] = w[10]; + w2[3] = w[11]; + w3[0] = w[12]; + w3[1] = w[13]; + w3[2] = w[14]; + w3[3] = w[15]; + u64x m[16]; - u64x v[16]; + + m[ 0] = hl32_to_64 (w0[1], w0[0]); + m[ 1] = hl32_to_64 (w0[3], w0[2]); + m[ 2] = hl32_to_64 (w1[1], w1[0]); + m[ 3] = hl32_to_64 (w1[3], w1[2]); + m[ 4] = hl32_to_64 (w2[1], 
w2[0]); + m[ 5] = hl32_to_64 (w2[3], w2[2]); + m[ 6] = hl32_to_64 (w3[1], w3[0]); + m[ 7] = hl32_to_64 (w3[3], w3[2]); + m[ 8] = 0; + m[ 9] = 0; + m[10] = 0; + m[11] = 0; + m[12] = 0; + m[13] = 0; + m[14] = 0; + m[15] = 0; u64x h[8]; - u64x t[2]; - u64x f[2]; - - h[0] = tmp_h[0]; - h[1] = tmp_h[1]; - h[2] = tmp_h[2]; - h[3] = tmp_h[3]; - h[4] = tmp_h[4]; - h[5] = tmp_h[5]; - h[6] = tmp_h[6]; - h[7] = tmp_h[7]; - - t[0] = tmp_t[0]; - t[1] = tmp_t[1]; - f[0] = tmp_f[0]; - f[1] = tmp_f[1]; - - blake2b_transform (h, t, f, m, v, w0, w1, w2, w3, out_len, BLAKE2B_FINAL); - - digest[0] = h[0]; - digest[1] = h[1]; - digest[2] = h[2]; - digest[3] = h[3]; - digest[4] = h[4]; - digest[5] = h[5]; - digest[6] = h[6]; - digest[7] = h[7]; - - const u32x r0 = h32_from_64 (digest[0]); - const u32x r1 = l32_from_64 (digest[0]); - const u32x r2 = h32_from_64 (digest[1]); - const u32x r3 = l32_from_64 (digest[1]); + + h[0] = BLAKE2B_IV_00 ^ 0x01010040; + h[1] = BLAKE2B_IV_01; + h[2] = BLAKE2B_IV_02; + h[3] = BLAKE2B_IV_03; + h[4] = BLAKE2B_IV_04; + h[5] = BLAKE2B_IV_05; + h[6] = BLAKE2B_IV_06; + h[7] = BLAKE2B_IV_07; + + blake2b_transform_vector (h, m, pw_len, BLAKE2B_FINAL); + + const u32x r0 = h32_from_64 (h[0]); + const u32x r1 = l32_from_64 (h[0]); + const u32x r2 = h32_from_64 (h[1]); + const u32x r3 = l32_from_64 (h[1]); COMPARE_M_SIMD (r0, r1, r2, r3); } } -KERNEL_FQ void m00600_m08 (KERN_ATTR_VECTOR_ESALT (blake2_t)) +DECLSPEC void m00600s (u32 *w, const u32 pw_len, KERN_ATTR_VECTOR ()) { /** * modifier */ const u64 gid = get_global_id (0); - const u64 lid = get_local_id (0); - - u64 tmp_h[8]; - u64 tmp_t[2]; - u64 tmp_f[2]; - - tmp_h[0] = esalt_bufs[digests_offset].h[0]; - tmp_h[1] = esalt_bufs[digests_offset].h[1]; - tmp_h[2] = esalt_bufs[digests_offset].h[2]; - tmp_h[3] = esalt_bufs[digests_offset].h[3]; - tmp_h[4] = esalt_bufs[digests_offset].h[4]; - tmp_h[5] = esalt_bufs[digests_offset].h[5]; - tmp_h[6] = esalt_bufs[digests_offset].h[6]; - tmp_h[7] = 
esalt_bufs[digests_offset].h[7]; - - tmp_t[0] = esalt_bufs[digests_offset].t[0]; - tmp_t[1] = esalt_bufs[digests_offset].t[1]; - tmp_f[0] = esalt_bufs[digests_offset].f[0]; - tmp_f[1] = esalt_bufs[digests_offset].f[1]; /** - * loop + * digest */ - u32 w0l = pws[gid].i[0]; - - for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) + const u32 search[4] = { - const u32x w0r = words_buf_r[il_pos / VECT_SIZE]; - const u32x w0x = w0l | w0r; - - u32x w0[4]; - u32x w1[4]; - u32x w2[4]; - u32x w3[4]; - - w0[0] = w0x; - w0[1] = pws[gid].i[ 1]; - w0[2] = pws[gid].i[ 2]; - w0[3] = pws[gid].i[ 3]; - w1[0] = pws[gid].i[ 4]; - w1[1] = pws[gid].i[ 5]; - w1[2] = pws[gid].i[ 6]; - w1[3] = pws[gid].i[ 7]; - w2[0] = 0; - w2[1] = 0; - w2[2] = 0; - w2[3] = 0; - w3[0] = 0; - w3[1] = 0; - w3[2] = 0; - w3[3] = 0; - - u32x out_len = pws[gid].pw_len; - - u64x digest[8]; - u64x m[16]; - u64x v[16]; - - u64x h[8]; - u64x t[2]; - u64x f[2]; - - h[0] = tmp_h[0]; - h[1] = tmp_h[1]; - h[2] = tmp_h[2]; - h[3] = tmp_h[3]; - h[4] = tmp_h[4]; - h[5] = tmp_h[5]; - h[6] = tmp_h[6]; - h[7] = tmp_h[7]; - - t[0] = tmp_t[0]; - t[1] = tmp_t[1]; - f[0] = tmp_f[0]; - f[1] = tmp_f[1]; - - blake2b_transform (h, t, f, m, v, w0, w1, w2, w3, out_len, BLAKE2B_FINAL); - - digest[0] = h[0]; - digest[1] = h[1]; - digest[2] = h[2]; - digest[3] = h[3]; - digest[4] = h[4]; - digest[5] = h[5]; - digest[6] = h[6]; - digest[7] = h[7]; - - const u32x r0 = h32_from_64 (digest[0]); - const u32x r1 = l32_from_64 (digest[0]); - const u32x r2 = h32_from_64 (digest[1]); - const u32x r3 = l32_from_64 (digest[1]); - - COMPARE_M_SIMD (r0, r1, r2, r3); - } -} - -KERNEL_FQ void m00600_m16 (KERN_ATTR_VECTOR_ESALT (blake2_t)) -{ - /** - * modifier - */ - - const u64 gid = get_global_id (0); - const u64 lid = get_local_id (0); - - u64 tmp_h[8]; - u64 tmp_t[2]; - u64 tmp_f[2]; - - tmp_h[0] = esalt_bufs[digests_offset].h[0]; - tmp_h[1] = esalt_bufs[digests_offset].h[1]; - tmp_h[2] = esalt_bufs[digests_offset].h[2]; - tmp_h[3] = 
esalt_bufs[digests_offset].h[3]; - tmp_h[4] = esalt_bufs[digests_offset].h[4]; - tmp_h[5] = esalt_bufs[digests_offset].h[5]; - tmp_h[6] = esalt_bufs[digests_offset].h[6]; - tmp_h[7] = esalt_bufs[digests_offset].h[7]; - - tmp_t[0] = esalt_bufs[digests_offset].t[0]; - tmp_t[1] = esalt_bufs[digests_offset].t[1]; - tmp_f[0] = esalt_bufs[digests_offset].f[0]; - tmp_f[1] = esalt_bufs[digests_offset].f[1]; + digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + digests_buf[digests_offset].digest_buf[DGST_R2], + digests_buf[digests_offset].digest_buf[DGST_R3] + }; /** * loop */ - u32 w0l = pws[gid].i[0]; + u32 w0l = w[0]; for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) { @@ -366,410 +133,287 @@ KERNEL_FQ void m00600_m16 (KERN_ATTR_VECTOR_ESALT (blake2_t)) u32x w3[4]; w0[0] = w0x; - w0[1] = pws[gid].i[ 1]; - w0[2] = pws[gid].i[ 2]; - w0[3] = pws[gid].i[ 3]; - w1[0] = pws[gid].i[ 4]; - w1[1] = pws[gid].i[ 5]; - w1[2] = pws[gid].i[ 6]; - w1[3] = pws[gid].i[ 7]; - w2[0] = pws[gid].i[ 8]; - w2[1] = pws[gid].i[ 9]; - w2[2] = pws[gid].i[10]; - w2[3] = pws[gid].i[11]; - w3[0] = pws[gid].i[12]; - w3[1] = pws[gid].i[13]; - w3[2] = pws[gid].i[14]; - w3[3] = pws[gid].i[15]; - - u32x out_len = pws[gid].pw_len; - - u64x digest[8]; + w0[1] = w[ 1]; + w0[2] = w[ 2]; + w0[3] = w[ 3]; + w1[0] = w[ 4]; + w1[1] = w[ 5]; + w1[2] = w[ 6]; + w1[3] = w[ 7]; + w2[0] = w[ 8]; + w2[1] = w[ 9]; + w2[2] = w[10]; + w2[3] = w[11]; + w3[0] = w[12]; + w3[1] = w[13]; + w3[2] = w[14]; + w3[3] = w[15]; + u64x m[16]; - u64x v[16]; + + m[ 0] = hl32_to_64 (w0[1], w0[0]); + m[ 1] = hl32_to_64 (w0[3], w0[2]); + m[ 2] = hl32_to_64 (w1[1], w1[0]); + m[ 3] = hl32_to_64 (w1[3], w1[2]); + m[ 4] = hl32_to_64 (w2[1], w2[0]); + m[ 5] = hl32_to_64 (w2[3], w2[2]); + m[ 6] = hl32_to_64 (w3[1], w3[0]); + m[ 7] = hl32_to_64 (w3[3], w3[2]); + m[ 8] = 0; + m[ 9] = 0; + m[10] = 0; + m[11] = 0; + m[12] = 0; + m[13] = 0; + m[14] = 0; + m[15] = 0; u64x h[8]; - u64x t[2]; - 
u64x f[2]; - - h[0] = tmp_h[0]; - h[1] = tmp_h[1]; - h[2] = tmp_h[2]; - h[3] = tmp_h[3]; - h[4] = tmp_h[4]; - h[5] = tmp_h[5]; - h[6] = tmp_h[6]; - h[7] = tmp_h[7]; - - t[0] = tmp_t[0]; - t[1] = tmp_t[1]; - f[0] = tmp_f[0]; - f[1] = tmp_f[1]; - - blake2b_transform (h, t, f, m, v, w0, w1, w2, w3, out_len, BLAKE2B_FINAL); - - digest[0] = h[0]; - digest[1] = h[1]; - digest[2] = h[2]; - digest[3] = h[3]; - digest[4] = h[4]; - digest[5] = h[5]; - digest[6] = h[6]; - digest[7] = h[7]; - - const u32x r0 = h32_from_64 (digest[0]); - const u32x r1 = l32_from_64 (digest[0]); - const u32x r2 = h32_from_64 (digest[1]); - const u32x r3 = l32_from_64 (digest[1]); - COMPARE_M_SIMD (r0, r1, r2, r3); + h[0] = BLAKE2B_IV_00 ^ 0x01010040; + h[1] = BLAKE2B_IV_01; + h[2] = BLAKE2B_IV_02; + h[3] = BLAKE2B_IV_03; + h[4] = BLAKE2B_IV_04; + h[5] = BLAKE2B_IV_05; + h[6] = BLAKE2B_IV_06; + h[7] = BLAKE2B_IV_07; + + blake2b_transform_vector (h, m, pw_len, BLAKE2B_FINAL); + + const u32x r0 = h32_from_64 (h[0]); + const u32x r1 = l32_from_64 (h[0]); + const u32x r2 = h32_from_64 (h[1]); + const u32x r3 = l32_from_64 (h[1]); + + COMPARE_S_SIMD (r0, r1, r2, r3); } } -KERNEL_FQ void m00600_s04 (KERN_ATTR_VECTOR_ESALT (blake2_t)) +KERNEL_FQ void m00600_m04 (KERN_ATTR_VECTOR ()) { /** - * modifier + * base */ const u64 gid = get_global_id (0); - const u64 lid = get_local_id (0); - - u64 tmp_h[8]; - u64 tmp_t[2]; - u64 tmp_f[2]; - - tmp_h[0] = esalt_bufs[digests_offset].h[0]; - tmp_h[1] = esalt_bufs[digests_offset].h[1]; - tmp_h[2] = esalt_bufs[digests_offset].h[2]; - tmp_h[3] = esalt_bufs[digests_offset].h[3]; - tmp_h[4] = esalt_bufs[digests_offset].h[4]; - tmp_h[5] = esalt_bufs[digests_offset].h[5]; - tmp_h[6] = esalt_bufs[digests_offset].h[6]; - tmp_h[7] = esalt_bufs[digests_offset].h[7]; - - tmp_t[0] = esalt_bufs[digests_offset].t[0]; - tmp_t[1] = esalt_bufs[digests_offset].t[1]; - tmp_f[0] = esalt_bufs[digests_offset].f[0]; - tmp_f[1] = esalt_bufs[digests_offset].f[1]; + + if (gid >= gid_max) 
return; + + u32 w[16]; + + w[ 0] = pws[gid].i[ 0]; + w[ 1] = pws[gid].i[ 1]; + w[ 2] = pws[gid].i[ 2]; + w[ 3] = pws[gid].i[ 3]; + w[ 4] = 0; + w[ 5] = 0; + w[ 6] = 0; + w[ 7] = 0; + w[ 8] = 0; + w[ 9] = 0; + w[10] = 0; + w[11] = 0; + w[12] = 0; + w[13] = 0; + w[14] = 0; + w[15] = 0; + + const u32 pw_len = pws[gid].pw_len & 63; /** - * digest + * main */ - const u32 search[4] = - { - digests_buf[digests_offset].digest_buf[DGST_R0], - digests_buf[digests_offset].digest_buf[DGST_R1], - digests_buf[digests_offset].digest_buf[DGST_R2], - digests_buf[digests_offset].digest_buf[DGST_R3] - }; + m00600m (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_extra0_buf, d_extra1_buf, d_extra2_buf, d_extra3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset, combs_mode, gid_max); +} +KERNEL_FQ void m00600_m08 (KERN_ATTR_VECTOR ()) +{ /** - * loop + * base */ - u32 w0l = pws[gid].i[0]; - - for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) - { - const u32x w0r = words_buf_r[il_pos / VECT_SIZE]; - const u32x w0x = w0l | w0r; - - u32x w0[4]; - u32x w1[4]; - u32x w2[4]; - u32x w3[4]; + const u64 gid = get_global_id (0); - w0[0] = w0x; - w0[1] = pws[gid].i[ 1]; - w0[2] = pws[gid].i[ 2]; - w0[3] = pws[gid].i[ 3]; - w1[0] = 0; - w1[1] = 0; - w1[2] = 0; - w1[3] = 0; - w2[0] = 0; - w2[1] = 0; - w2[2] = 0; - w2[3] = 0; - w3[0] = 0; - w3[1] = 0; - w3[2] = 0; - w3[3] = 0; - - u32x out_len = pws[gid].pw_len; - - u64x digest[8]; - u64x m[16]; - u64x v[16]; + if (gid >= gid_max) return; + + u32 w[16]; + + w[ 0] = pws[gid].i[ 0]; + w[ 1] = pws[gid].i[ 1]; + w[ 2] = pws[gid].i[ 2]; + w[ 3] = pws[gid].i[ 3]; + w[ 4] = pws[gid].i[ 4]; + w[ 5] = pws[gid].i[ 5]; + w[ 6] = pws[gid].i[ 6]; + w[ 7] = 
pws[gid].i[ 7]; + w[ 8] = 0; + w[ 9] = 0; + w[10] = 0; + w[11] = 0; + w[12] = 0; + w[13] = 0; + w[14] = 0; + w[15] = 0; + + const u32 pw_len = pws[gid].pw_len & 63; - u64x h[8]; - u64x t[2]; - u64x f[2]; - - h[0] = tmp_h[0]; - h[1] = tmp_h[1]; - h[2] = tmp_h[2]; - h[3] = tmp_h[3]; - h[4] = tmp_h[4]; - h[5] = tmp_h[5]; - h[6] = tmp_h[6]; - h[7] = tmp_h[7]; - - t[0] = tmp_t[0]; - t[1] = tmp_t[1]; - f[0] = tmp_f[0]; - f[1] = tmp_f[1]; - - blake2b_transform (h, t, f, m, v, w0, w1, w2, w3, out_len, BLAKE2B_FINAL); - - digest[0] = h[0]; - digest[1] = h[1]; - digest[2] = h[2]; - digest[3] = h[3]; - digest[4] = h[4]; - digest[5] = h[5]; - digest[6] = h[6]; - digest[7] = h[7]; - - const u32x r0 = h32_from_64 (digest[0]); - const u32x r1 = l32_from_64 (digest[0]); - const u32x r2 = h32_from_64 (digest[1]); - const u32x r3 = l32_from_64 (digest[1]); + /** + * main + */ - COMPARE_S_SIMD (r0, r1, r2, r3); - } + m00600m (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_extra0_buf, d_extra1_buf, d_extra2_buf, d_extra3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset, combs_mode, gid_max); } -KERNEL_FQ void m00600_s08 (KERN_ATTR_VECTOR_ESALT (blake2_t)) +KERNEL_FQ void m00600_m16 (KERN_ATTR_VECTOR ()) { /** - * modifier + * base */ const u64 gid = get_global_id (0); - const u64 lid = get_local_id (0); - - u64 tmp_h[8]; - u64 tmp_t[2]; - u64 tmp_f[2]; - - tmp_h[0] = esalt_bufs[digests_offset].h[0]; - tmp_h[1] = esalt_bufs[digests_offset].h[1]; - tmp_h[2] = esalt_bufs[digests_offset].h[2]; - tmp_h[3] = esalt_bufs[digests_offset].h[3]; - tmp_h[4] = esalt_bufs[digests_offset].h[4]; - tmp_h[5] = esalt_bufs[digests_offset].h[5]; - tmp_h[6] = esalt_bufs[digests_offset].h[6]; - tmp_h[7] = 
esalt_bufs[digests_offset].h[7]; - - tmp_t[0] = esalt_bufs[digests_offset].t[0]; - tmp_t[1] = esalt_bufs[digests_offset].t[1]; - tmp_f[0] = esalt_bufs[digests_offset].f[0]; - tmp_f[1] = esalt_bufs[digests_offset].f[1]; + + if (gid >= gid_max) return; + + u32 w[16]; + + w[ 0] = pws[gid].i[ 0]; + w[ 1] = pws[gid].i[ 1]; + w[ 2] = pws[gid].i[ 2]; + w[ 3] = pws[gid].i[ 3]; + w[ 4] = pws[gid].i[ 4]; + w[ 5] = pws[gid].i[ 5]; + w[ 6] = pws[gid].i[ 6]; + w[ 7] = pws[gid].i[ 7]; + w[ 8] = pws[gid].i[ 8]; + w[ 9] = pws[gid].i[ 9]; + w[10] = pws[gid].i[10]; + w[11] = pws[gid].i[11]; + w[12] = pws[gid].i[12]; + w[13] = pws[gid].i[13]; + w[14] = pws[gid].i[14]; + w[15] = pws[gid].i[15]; + + const u32 pw_len = pws[gid].pw_len & 63; /** - * digest + * main */ - const u32 search[4] = - { - digests_buf[digests_offset].digest_buf[DGST_R0], - digests_buf[digests_offset].digest_buf[DGST_R1], - digests_buf[digests_offset].digest_buf[DGST_R2], - digests_buf[digests_offset].digest_buf[DGST_R3] - }; + m00600m (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_extra0_buf, d_extra1_buf, d_extra2_buf, d_extra3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset, combs_mode, gid_max); +} +KERNEL_FQ void m00600_s04 (KERN_ATTR_VECTOR ()) +{ /** - * loop + * base */ - u32 w0l = pws[gid].i[0]; - - for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) - { - const u32x w0r = words_buf_r[il_pos / VECT_SIZE]; - const u32x w0x = w0l | w0r; - - u32x w0[4]; - u32x w1[4]; - u32x w2[4]; - u32x w3[4]; + const u64 gid = get_global_id (0); - w0[0] = w0x; - w0[1] = pws[gid].i[ 1]; - w0[2] = pws[gid].i[ 2]; - w0[3] = pws[gid].i[ 3]; - w1[0] = pws[gid].i[ 4]; - w1[1] = pws[gid].i[ 5]; - w1[2] = pws[gid].i[ 6]; - 
w1[3] = pws[gid].i[ 7]; - w2[0] = 0; - w2[1] = 0; - w2[2] = 0; - w2[3] = 0; - w3[0] = 0; - w3[1] = 0; - w3[2] = 0; - w3[3] = 0; - - u32x out_len = pws[gid].pw_len; - - u64x digest[8]; - u64x m[16]; - u64x v[16]; + if (gid >= gid_max) return; + + u32 w[16]; + + w[ 0] = pws[gid].i[ 0]; + w[ 1] = pws[gid].i[ 1]; + w[ 2] = pws[gid].i[ 2]; + w[ 3] = pws[gid].i[ 3]; + w[ 4] = 0; + w[ 5] = 0; + w[ 6] = 0; + w[ 7] = 0; + w[ 8] = 0; + w[ 9] = 0; + w[10] = 0; + w[11] = 0; + w[12] = 0; + w[13] = 0; + w[14] = 0; + w[15] = 0; + + const u32 pw_len = pws[gid].pw_len & 63; - u64x h[8]; - u64x t[2]; - u64x f[2]; - - h[0] = tmp_h[0]; - h[1] = tmp_h[1]; - h[2] = tmp_h[2]; - h[3] = tmp_h[3]; - h[4] = tmp_h[4]; - h[5] = tmp_h[5]; - h[6] = tmp_h[6]; - h[7] = tmp_h[7]; - - t[0] = tmp_t[0]; - t[1] = tmp_t[1]; - f[0] = tmp_f[0]; - f[1] = tmp_f[1]; - - blake2b_transform (h, t, f, m, v, w0, w1, w2, w3, out_len, BLAKE2B_FINAL); - - digest[0] = h[0]; - digest[1] = h[1]; - digest[2] = h[2]; - digest[3] = h[3]; - digest[4] = h[4]; - digest[5] = h[5]; - digest[6] = h[6]; - digest[7] = h[7]; - - const u32x r0 = h32_from_64 (digest[0]); - const u32x r1 = l32_from_64 (digest[0]); - const u32x r2 = h32_from_64 (digest[1]); - const u32x r3 = l32_from_64 (digest[1]); + /** + * main + */ - COMPARE_S_SIMD (r0, r1, r2, r3); - } + m00600s (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_extra0_buf, d_extra1_buf, d_extra2_buf, d_extra3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset, combs_mode, gid_max); } -KERNEL_FQ void m00600_s16 (KERN_ATTR_VECTOR_ESALT (blake2_t)) +KERNEL_FQ void m00600_s08 (KERN_ATTR_VECTOR ()) { /** - * modifier + * base */ const u64 gid = get_global_id (0); - const u64 lid = 
get_local_id (0); - - u64 tmp_h[8]; - u64 tmp_t[2]; - u64 tmp_f[2]; - - tmp_h[0] = esalt_bufs[digests_offset].h[0]; - tmp_h[1] = esalt_bufs[digests_offset].h[1]; - tmp_h[2] = esalt_bufs[digests_offset].h[2]; - tmp_h[3] = esalt_bufs[digests_offset].h[3]; - tmp_h[4] = esalt_bufs[digests_offset].h[4]; - tmp_h[5] = esalt_bufs[digests_offset].h[5]; - tmp_h[6] = esalt_bufs[digests_offset].h[6]; - tmp_h[7] = esalt_bufs[digests_offset].h[7]; - - tmp_t[0] = esalt_bufs[digests_offset].t[0]; - tmp_t[1] = esalt_bufs[digests_offset].t[1]; - tmp_f[0] = esalt_bufs[digests_offset].f[0]; - tmp_f[1] = esalt_bufs[digests_offset].f[1]; + + if (gid >= gid_max) return; + + u32 w[16]; + + w[ 0] = pws[gid].i[ 0]; + w[ 1] = pws[gid].i[ 1]; + w[ 2] = pws[gid].i[ 2]; + w[ 3] = pws[gid].i[ 3]; + w[ 4] = pws[gid].i[ 4]; + w[ 5] = pws[gid].i[ 5]; + w[ 6] = pws[gid].i[ 6]; + w[ 7] = pws[gid].i[ 7]; + w[ 8] = 0; + w[ 9] = 0; + w[10] = 0; + w[11] = 0; + w[12] = 0; + w[13] = 0; + w[14] = 0; + w[15] = 0; + + const u32 pw_len = pws[gid].pw_len & 63; /** - * digest + * main */ - const u32 search[4] = - { - digests_buf[digests_offset].digest_buf[DGST_R0], - digests_buf[digests_offset].digest_buf[DGST_R1], - digests_buf[digests_offset].digest_buf[DGST_R2], - digests_buf[digests_offset].digest_buf[DGST_R3] - }; + m00600s (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_extra0_buf, d_extra1_buf, d_extra2_buf, d_extra3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset, combs_mode, gid_max); +} +KERNEL_FQ void m00600_s16 (KERN_ATTR_VECTOR ()) +{ /** - * loop + * base */ - u32 w0l = pws[gid].i[0]; - - for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) - { - const u32x w0r = words_buf_r[il_pos / 
VECT_SIZE]; - const u32x w0x = w0l | w0r; - - u32x w0[4]; - u32x w1[4]; - u32x w2[4]; - u32x w3[4]; + const u64 gid = get_global_id (0); - w0[0] = w0x; - w0[1] = pws[gid].i[ 1]; - w0[2] = pws[gid].i[ 2]; - w0[3] = pws[gid].i[ 3]; - w1[0] = pws[gid].i[ 4]; - w1[1] = pws[gid].i[ 5]; - w1[2] = pws[gid].i[ 6]; - w1[3] = pws[gid].i[ 7]; - w2[0] = pws[gid].i[ 8]; - w2[1] = pws[gid].i[ 9]; - w2[2] = pws[gid].i[10]; - w2[3] = pws[gid].i[11]; - w3[0] = pws[gid].i[12]; - w3[1] = pws[gid].i[13]; - w3[2] = pws[gid].i[14]; - w3[3] = pws[gid].i[15]; - - u32x out_len = pws[gid].pw_len; - - u64x digest[8]; - u64x m[16]; - u64x v[16]; + if (gid >= gid_max) return; + + u32 w[16]; + + w[ 0] = pws[gid].i[ 0]; + w[ 1] = pws[gid].i[ 1]; + w[ 2] = pws[gid].i[ 2]; + w[ 3] = pws[gid].i[ 3]; + w[ 4] = pws[gid].i[ 4]; + w[ 5] = pws[gid].i[ 5]; + w[ 6] = pws[gid].i[ 6]; + w[ 7] = pws[gid].i[ 7]; + w[ 8] = pws[gid].i[ 8]; + w[ 9] = pws[gid].i[ 9]; + w[10] = pws[gid].i[10]; + w[11] = pws[gid].i[11]; + w[12] = pws[gid].i[12]; + w[13] = pws[gid].i[13]; + w[14] = pws[gid].i[14]; + w[15] = pws[gid].i[15]; + + const u32 pw_len = pws[gid].pw_len & 63; - u64x h[8]; - u64x t[2]; - u64x f[2]; - - h[0] = tmp_h[0]; - h[1] = tmp_h[1]; - h[2] = tmp_h[2]; - h[3] = tmp_h[3]; - h[4] = tmp_h[4]; - h[5] = tmp_h[5]; - h[6] = tmp_h[6]; - h[7] = tmp_h[7]; - - t[0] = tmp_t[0]; - t[1] = tmp_t[1]; - f[0] = tmp_f[0]; - f[1] = tmp_f[1]; - - blake2b_transform (h, t, f, m, v, w0, w1, w2, w3, out_len, BLAKE2B_FINAL); - - digest[0] = h[0]; - digest[1] = h[1]; - digest[2] = h[2]; - digest[3] = h[3]; - digest[4] = h[4]; - digest[5] = h[5]; - digest[6] = h[6]; - digest[7] = h[7]; - - const u32x r0 = h32_from_64 (digest[0]); - const u32x r1 = l32_from_64 (digest[0]); - const u32x r2 = h32_from_64 (digest[1]); - const u32x r3 = l32_from_64 (digest[1]); + /** + * main + */ - COMPARE_S_SIMD (r0, r1, r2, r3); - } + m00600s (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, 
bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_extra0_buf, d_extra1_buf, d_extra2_buf, d_extra3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset, combs_mode, gid_max); } diff --git a/OpenCL/m00600_a3-pure.cl b/OpenCL/m00600_a3-pure.cl new file mode 100644 index 000000000..6f19658b5 --- /dev/null +++ b/OpenCL/m00600_a3-pure.cl @@ -0,0 +1,131 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +#define NEW_SIMD_CODE + +#ifdef KERNEL_STATIC +#include "inc_vendor.h" +#include "inc_types.h" +#include "inc_platform.cl" +#include "inc_common.cl" +#include "inc_simd.cl" +#include "inc_hash_blake2b.cl" +#endif + +KERNEL_FQ void m00600_mxx (KERN_ATTR_VECTOR ()) +{ + /** + * modifier + */ + + const u64 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + u32x w[64] = { 0 }; + + for (u32 i = 0, idx = 0; i < pw_len; i += 4, idx += 1) + { + w[idx] = pws[gid].i[idx]; + } + + /** + * loop + */ + + u32x w0l = w[0]; + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) + { + const u32x w0r = words_buf_r[il_pos / VECT_SIZE]; + + const u32x w0 = w0l | w0r; + + w[0] = w0; + + blake2b_ctx_vector_t ctx; + + blake2b_init_vector (&ctx); + blake2b_update_vector (&ctx, w, pw_len); + blake2b_final_vector (&ctx); + + const u32x r0 = h32_from_64 (ctx.h[0]); + const u32x r1 = l32_from_64 (ctx.h[0]); + const u32x r2 = h32_from_64 (ctx.h[1]); + const u32x r3 = l32_from_64 (ctx.h[1]); + + COMPARE_M_SIMD (r0, r1, r2, r3); + } +} + +KERNEL_FQ void m00600_sxx (KERN_ATTR_VECTOR ()) +{ + /** + * modifier + */ + + const u64 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * digest + */ + + const u32 search[4] = + { + digests_buf[digests_offset].digest_buf[DGST_R0], + 
digests_buf[digests_offset].digest_buf[DGST_R1], + digests_buf[digests_offset].digest_buf[DGST_R2], + digests_buf[digests_offset].digest_buf[DGST_R3] + }; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + u32x w[64] = { 0 }; + + for (u32 i = 0, idx = 0; i < pw_len; i += 4, idx += 1) + { + w[idx] = pws[gid].i[idx]; + } + + /** + * loop + */ + + u32x w0l = w[0]; + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) + { + const u32x w0r = words_buf_r[il_pos / VECT_SIZE]; + + const u32x w0 = w0l | w0r; + + w[0] = w0; + + blake2b_ctx_vector_t ctx; + + blake2b_init_vector (&ctx); + blake2b_update_vector (&ctx, w, pw_len); + blake2b_final_vector (&ctx); + + const u32x r0 = h32_from_64 (ctx.h[0]); + const u32x r1 = l32_from_64 (ctx.h[0]); + const u32x r2 = h32_from_64 (ctx.h[1]); + const u32x r3 = l32_from_64 (ctx.h[1]); + + COMPARE_S_SIMD (r0, r1, r2, r3); + } +} diff --git a/docs/changes.txt b/docs/changes.txt index 420c2102d..3d9745087 100644 --- a/docs/changes.txt +++ b/docs/changes.txt @@ -1,5 +1,11 @@ * changes v6.0.0 -> v6.0.x +## +## Algorithms +## + +- Added pure kernels for hash-mode 600 (BLAKE2b-512) + ## ## Improvements ## diff --git a/src/modules/module_00600.c b/src/modules/module_00600.c index 3a2b13610..32e5a6550 100644 --- a/src/modules/module_00600.c +++ b/src/modules/module_00600.c @@ -42,31 +42,12 @@ u32 module_salt_type (MAYBE_UNUSED const hashconfig_t *hashconfig, const char *module_st_hash (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) { return ST_HASH; } const char *module_st_pass (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) { return ST_PASS; } -typedef struct blake2 -{ - u64 h[8]; - u64 t[2]; - u64 f[2]; - u32 buflen; - u32 outlen; - -} blake2_t; - static const char *SIGNATURE_BLAKE2B = "$BLAKE2$"; -u64 
module_esalt_size (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) -{ - const u64 esalt_size = (const u64) sizeof (blake2_t); - - return esalt_size; -} - int module_hash_decode (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED void *digest_buf, MAYBE_UNUSED salt_t *salt, MAYBE_UNUSED void *esalt_buf, MAYBE_UNUSED void *hook_salt_buf, MAYBE_UNUSED hashinfo_t *hash_info, const char *line_buf, MAYBE_UNUSED const int line_len) { u64 *digest = (u64 *) digest_buf; - blake2_t *blake2 = (blake2_t *) esalt_buf; - token_t token; token.token_cnt = 2; @@ -97,24 +78,6 @@ int module_hash_decode (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSE digest[6] = hex_to_u64 (hash_pos + 96); digest[7] = hex_to_u64 (hash_pos + 112); - // Initialize BLAKE2 Params and State - - memset (blake2, 0, sizeof (blake2_t)); - - blake2->h[0] = BLAKE2B_IV_00; - blake2->h[1] = BLAKE2B_IV_01; - blake2->h[2] = BLAKE2B_IV_02; - blake2->h[3] = BLAKE2B_IV_03; - blake2->h[4] = BLAKE2B_IV_04; - blake2->h[5] = BLAKE2B_IV_05; - blake2->h[6] = BLAKE2B_IV_06; - blake2->h[7] = BLAKE2B_IV_07; - - // blake2->h[0] ^= 0x0000000001010040; // digest_lenght = 0x40, depth = 0x01, fanout = 0x01 - blake2->h[0] ^= 0x40 << 0; - blake2->h[0] ^= 0x01 << 16; - blake2->h[0] ^= 0x01 << 24; - return (PARSER_OK); } @@ -161,7 +124,7 @@ void module_init (module_ctx_t *module_ctx) module_ctx->module_dgst_pos3 = module_dgst_pos3; module_ctx->module_dgst_size = module_dgst_size; module_ctx->module_dictstat_disable = MODULE_DEFAULT; - module_ctx->module_esalt_size = module_esalt_size; + module_ctx->module_esalt_size = MODULE_DEFAULT; module_ctx->module_extra_buffer_size = MODULE_DEFAULT; module_ctx->module_extra_tmp_size = MODULE_DEFAULT; module_ctx->module_forced_outfile_format = MODULE_DEFAULT; diff --git a/tools/test_modules/m00600.pm b/tools/test_modules/m00600.pm index a96be2372..8d1883470 100644 --- 
a/tools/test_modules/m00600.pm +++ b/tools/test_modules/m00600.pm @@ -10,7 +10,7 @@ use warnings; use Digest::BLAKE2 qw (blake2b_hex); -sub module_constraints { [[-1, -1], [-1, -1], [0, 55], [-1, -1], [-1, -1]] } +sub module_constraints { [[0, 256], [-1, -1], [0, 55], [-1, -1], [-1, -1]] } sub module_generate_hash {