diff --git a/OpenCL/inc_common.cl b/OpenCL/inc_common.cl index 6c163280c..edadc89b9 100644 --- a/OpenCL/inc_common.cl +++ b/OpenCL/inc_common.cl @@ -2310,7 +2310,7 @@ inline void switch_buffer_by_offset_le (u32x w0[4], u32x w1[4], u32x w2[4], u32x switch (offset / 4) { - case 0: + case 0: w3[3] = amd_bytealign (w3[3], w3[2], offset_minus_4); w3[2] = amd_bytealign (w3[2], w3[1], offset_minus_4); w3[1] = amd_bytealign (w3[1], w3[0], offset_minus_4); @@ -2350,7 +2350,7 @@ inline void switch_buffer_by_offset_le (u32x w0[4], u32x w1[4], u32x w2[4], u32x break; - case 1: + case 1: w3[3] = amd_bytealign (w3[2], w3[1], offset_minus_4); w3[2] = amd_bytealign (w3[1], w3[0], offset_minus_4); w3[1] = amd_bytealign (w3[0], w2[3], offset_minus_4); @@ -2389,7 +2389,7 @@ inline void switch_buffer_by_offset_le (u32x w0[4], u32x w1[4], u32x w2[4], u32x break; - case 2: + case 2: w3[3] = amd_bytealign (w3[1], w3[0], offset_minus_4); w3[2] = amd_bytealign (w3[0], w2[3], offset_minus_4); w3[1] = amd_bytealign (w2[3], w2[2], offset_minus_4); @@ -2427,7 +2427,7 @@ inline void switch_buffer_by_offset_le (u32x w0[4], u32x w1[4], u32x w2[4], u32x break; - case 3: + case 3: w3[3] = amd_bytealign (w3[0], w2[3], offset_minus_4); w3[2] = amd_bytealign (w2[3], w2[2], offset_minus_4); w3[1] = amd_bytealign (w2[2], w2[1], offset_minus_4); @@ -2464,7 +2464,7 @@ inline void switch_buffer_by_offset_le (u32x w0[4], u32x w1[4], u32x w2[4], u32x break; - case 4: + case 4: w3[3] = amd_bytealign (w2[3], w2[2], offset_minus_4); w3[2] = amd_bytealign (w2[2], w2[1], offset_minus_4); w3[1] = amd_bytealign (w2[1], w2[0], offset_minus_4); @@ -2500,7 +2500,7 @@ inline void switch_buffer_by_offset_le (u32x w0[4], u32x w1[4], u32x w2[4], u32x break; - case 5: + case 5: w3[3] = amd_bytealign (w2[2], w2[1], offset_minus_4); w3[2] = amd_bytealign (w2[1], w2[0], offset_minus_4); w3[1] = amd_bytealign (w2[0], w1[3], offset_minus_4); @@ -2535,7 +2535,7 @@ inline void switch_buffer_by_offset_le (u32x w0[4], u32x w1[4], u32x w2[4], u32x break; - case 6: + case 6: w3[3] = amd_bytealign (w2[1], w2[0], offset_minus_4); w3[2] = amd_bytealign (w2[0], w1[3], offset_minus_4); w3[1] = amd_bytealign (w1[3], w1[2], offset_minus_4); @@ -2569,7 +2569,7 @@ inline void switch_buffer_by_offset_le (u32x w0[4], u32x w1[4], u32x w2[4], u32x break; - case 7: + case 7: w3[3] = amd_bytealign (w2[0], w1[3], offset_minus_4); w3[2] = amd_bytealign (w1[3], w1[2], offset_minus_4); w3[1] = amd_bytealign (w1[2], w1[1], offset_minus_4); @@ -2602,7 +2602,7 @@ inline void switch_buffer_by_offset_le (u32x w0[4], u32x w1[4], u32x w2[4], u32x break; - case 8: + case 8: w3[3] = amd_bytealign (w1[3], w1[2], offset_minus_4); w3[2] = amd_bytealign (w1[2], w1[1], offset_minus_4); w3[1] = amd_bytealign (w1[1], w1[0], offset_minus_4); @@ -2634,7 +2634,7 @@ inline void switch_buffer_by_offset_le (u32x w0[4], u32x w1[4], u32x w2[4], u32x break; - case 9: + case 9: w3[3] = amd_bytealign (w1[2], w1[1], offset_minus_4); w3[2] = amd_bytealign (w1[1], w1[0], offset_minus_4); w3[1] = amd_bytealign (w1[0], w0[3], offset_minus_4); @@ -2839,7 +2839,7 @@ inline void switch_buffer_by_offset_le (u32x w0[4], u32x w1[4], u32x w2[4], u32x switch (offset / 4) { - case 0: + case 0: w3[3] = __byte_perm (w3[2], w3[3], selector); w3[2] = __byte_perm (w3[1], w3[2], selector); w3[1] = __byte_perm (w3[0], w3[1], selector); @@ -2856,9 +2856,10 @@ inline void switch_buffer_by_offset_le (u32x w0[4], u32x w1[4], u32x w2[4], u32x w0[2] = __byte_perm (w0[1], w0[2], selector); w0[1] = __byte_perm (w0[0], w0[1], selector); w0[0] = __byte_perm ( 0, w0[0], selector); + break; - case 1: + case 1: w3[3] = __byte_perm (w3[1], w3[2], selector); w3[2] = __byte_perm (w3[0], w3[1], selector); w3[1] = __byte_perm (w2[3], w3[0], selector); @@ -2875,9 +2876,10 @@ inline void switch_buffer_by_offset_le (u32x w0[4], u32x w1[4], u32x w2[4], u32x w0[2] = __byte_perm (w0[0], w0[1], selector); w0[1] = __byte_perm ( 0, w0[0], selector); w0[0] = 0; + break; - case 2: + case 2: w3[3] = __byte_perm (w3[0], w3[1], selector); w3[2] = __byte_perm (w2[3], w3[0], selector); w3[1] = __byte_perm (w2[2], w2[3], selector); @@ -2894,9 +2896,10 @@ inline void switch_buffer_by_offset_le (u32x w0[4], u32x w1[4], u32x w2[4], u32x w0[2] = __byte_perm ( 0, w0[0], selector); w0[1] = 0; w0[0] = 0; + break; - case 3: + case 3: w3[3] = __byte_perm (w2[3], w3[0], selector); w3[2] = __byte_perm (w2[2], w2[3], selector); w3[1] = __byte_perm (w2[1], w2[2], selector); @@ -2913,9 +2916,10 @@ inline void switch_buffer_by_offset_le (u32x w0[4], u32x w1[4], u32x w2[4], u32x w0[2] = 0; w0[1] = 0; w0[0] = 0; + break; - case 4: + case 4: w3[3] = __byte_perm (w2[2], w2[3], selector); w3[2] = __byte_perm (w2[1], w2[2], selector); w3[1] = __byte_perm (w2[0], w2[1], selector); @@ -2932,9 +2936,10 @@ inline void switch_buffer_by_offset_le (u32x w0[4], u32x w1[4], u32x w2[4], u32x w0[2] = 0; w0[1] = 0; w0[0] = 0; + break; - case 5: + case 5: w3[3] = __byte_perm (w2[1], w2[2], selector); w3[2] = __byte_perm (w2[0], w2[1], selector); w3[1] = __byte_perm (w1[3], w2[0], selector); @@ -2951,9 +2956,10 @@ inline void switch_buffer_by_offset_le (u32x w0[4], u32x w1[4], u32x w2[4], u32x w0[2] = 0; w0[1] = 0; w0[0] = 0; + break; - case 6: + case 6: w3[3] = __byte_perm (w2[0], w2[1], selector); w3[2] = __byte_perm (w1[3], w2[0], selector); w3[1] = __byte_perm (w1[2], w1[3], selector); @@ -2970,9 +2976,10 @@ inline void switch_buffer_by_offset_le (u32x w0[4], u32x w1[4], u32x w2[4], u32x w0[2] = 0; w0[1] = 0; w0[0] = 0; + break; - case 7: + case 7: w3[3] = __byte_perm (w1[3], w2[0], selector); w3[2] = __byte_perm (w1[2], w1[3], selector); w3[1] = __byte_perm (w1[1], w1[2], selector); @@ -2989,9 +2996,10 @@ inline void switch_buffer_by_offset_le (u32x w0[4], u32x w1[4], u32x w2[4], u32x w0[2] = 0; w0[1] = 0; w0[0] = 0; + break; - case 8: + case 8: w3[3] = __byte_perm (w1[2], w1[3], selector); w3[2] = __byte_perm (w1[1], w1[2], selector); w3[1] = __byte_perm (w1[0], w1[1], selector); @@ -3008,9 +3016,10 @@ inline void switch_buffer_by_offset_le (u32x w0[4], u32x w1[4], u32x w2[4], u32x w0[2] = 0; w0[1] = 0; w0[0] = 0; + break; - case 9: + case 9: w3[3] = __byte_perm (w1[1], w1[2], selector); w3[2] = __byte_perm (w1[0], w1[1], selector); w3[1] = __byte_perm (w0[3], w1[0], selector); @@ -3027,6 +3036,7 @@ inline void switch_buffer_by_offset_le (u32x w0[4], u32x w1[4], u32x w2[4], u32x w0[2] = 0; w0[1] = 0; w0[0] = 0; + break; case 10: @@ -3046,6 +3056,7 @@ inline void switch_buffer_by_offset_le (u32x w0[4], u32x w1[4], u32x w2[4], u32x w0[2] = 0; w0[1] = 0; w0[0] = 0; + break; case 11: @@ -3065,6 +3076,7 @@ inline void switch_buffer_by_offset_le (u32x w0[4], u32x w1[4], u32x w2[4], u32x w0[2] = 0; w0[1] = 0; w0[0] = 0; + break; case 12: @@ -3084,6 +3096,7 @@ inline void switch_buffer_by_offset_le (u32x w0[4], u32x w1[4], u32x w2[4], u32x w0[2] = 0; w0[1] = 0; w0[0] = 0; + break; case 13: @@ -3103,6 +3116,7 @@ inline void switch_buffer_by_offset_le (u32x w0[4], u32x w1[4], u32x w2[4], u32x w0[2] = 0; w0[1] = 0; w0[0] = 0; + break; case 14: @@ -3122,6 +3136,7 @@ inline void switch_buffer_by_offset_le (u32x w0[4], u32x w1[4], u32x w2[4], u32x w0[2] = 0; w0[1] = 0; w0[0] = 0; + break; case 15: @@ -3141,6 +3156,7 @@ inline void switch_buffer_by_offset_le (u32x w0[4], u32x w1[4], u32x w2[4], u32x w0[2] = 0; w0[1] = 0; w0[0] = 0; + break; } #endif @@ -3953,7 +3969,7 @@ inline void switch_buffer_by_offset_be (u32x w0[4], u32x w1[4], u32x w2[4], u32x #if defined IS_AMD || defined IS_GENERIC switch (offset / 4) { - case 0: + case 0: w3[3] = amd_bytealign (w3[2], w3[3], offset); w3[2] = amd_bytealign (w3[1], w3[2], offset); w3[1] = amd_bytealign (w3[0], w3[1], offset); @@ -3970,9 +3986,10 @@ inline void switch_buffer_by_offset_be (u32x w0[4], u32x w1[4], u32x w2[4], u32x w0[2] = amd_bytealign (w0[1], w0[2], offset); w0[1] = amd_bytealign (w0[0], w0[1], offset); w0[0] = amd_bytealign ( 0, w0[0], offset); + break; - case 1: + case 1: w3[3] = amd_bytealign (w3[1], w3[2], offset); w3[2] = amd_bytealign (w3[0], w3[1], offset); w3[1] = amd_bytealign (w2[3], w3[0], offset); @@ -3989,9 +4006,10 @@ inline void switch_buffer_by_offset_be (u32x w0[4], u32x w1[4], u32x w2[4], u32x w0[2] = amd_bytealign (w0[0], w0[1], offset); w0[1] = amd_bytealign ( 0, w0[0], offset); w0[0] = 0; + break; - case 2: + case 2: w3[3] = amd_bytealign (w3[0], w3[1], offset); w3[2] = amd_bytealign (w2[3], w3[0], offset); w3[1] = amd_bytealign (w2[2], w2[3], offset); @@ -4008,9 +4026,10 @@ inline void switch_buffer_by_offset_be (u32x w0[4], u32x w1[4], u32x w2[4], u32x w0[2] = amd_bytealign ( 0, w0[0], offset); w0[1] = 0; w0[0] = 0; + break; - case 3: + case 3: w3[3] = amd_bytealign (w2[3], w3[0], offset); w3[2] = amd_bytealign (w2[2], w2[3], offset); w3[1] = amd_bytealign (w2[1], w2[2], offset); @@ -4027,9 +4046,10 @@ inline void switch_buffer_by_offset_be (u32x w0[4], u32x w1[4], u32x w2[4], u32x w0[2] = 0; w0[1] = 0; w0[0] = 0; + break; - case 4: + case 4: w3[3] = amd_bytealign (w2[2], w2[3], offset); w3[2] = amd_bytealign (w2[1], w2[2], offset); w3[1] = amd_bytealign (w2[0], w2[1], offset); @@ -4046,9 +4066,10 @@ inline void switch_buffer_by_offset_be (u32x w0[4], u32x w1[4], u32x w2[4], u32x w0[2] = 0; w0[1] = 0; w0[0] = 0; + break; - case 5: + case 5: w3[3] = amd_bytealign (w2[1], w2[2], offset); w3[2] = amd_bytealign (w2[0], w2[1], offset); w3[1] = amd_bytealign (w1[3], w2[0], offset); @@ -4065,9 +4086,10 @@ inline void switch_buffer_by_offset_be (u32x w0[4], u32x w1[4], u32x w2[4], u32x w0[2] = 0; w0[1] = 0; w0[0] = 0; + break; - case 6: + case 6: w3[3] = amd_bytealign (w2[0], w2[1], offset); w3[2] = amd_bytealign (w1[3], w2[0], offset); w3[1] = amd_bytealign (w1[2], w1[3], offset); @@ -4084,9 +4106,10 @@ inline void switch_buffer_by_offset_be (u32x w0[4], u32x w1[4], u32x w2[4], u32x w0[2] = 0; w0[1] = 0; w0[0] = 0; + break; - case 7: + case 7: w3[3] = amd_bytealign (w1[3], w2[0], offset); w3[2] = amd_bytealign (w1[2], w1[3], offset); w3[1] = amd_bytealign (w1[1], w1[2], offset); @@ -4103,9 +4126,10 @@ inline void switch_buffer_by_offset_be (u32x w0[4], u32x w1[4], u32x w2[4], u32x w0[2] = 0; w0[1] = 0; w0[0] = 0; + break; - case 8: + case 8: w3[3] = amd_bytealign (w1[2], w1[3], offset); w3[2] = amd_bytealign (w1[1], w1[2], offset); w3[1] = amd_bytealign (w1[0], w1[1], offset); @@ -4122,9 +4146,10 @@ inline void switch_buffer_by_offset_be (u32x w0[4], u32x w1[4], u32x w2[4], u32x w0[2] = 0; w0[1] = 0; w0[0] = 0; + break; - case 9: + case 9: w3[3] = amd_bytealign (w1[1], w1[2], offset); w3[2] = amd_bytealign (w1[0], w1[1], offset); w3[1] = amd_bytealign (w0[3], w1[0], offset); @@ -4141,6 +4166,7 @@ inline void switch_buffer_by_offset_be (u32x w0[4], u32x w1[4], u32x w2[4], u32x w0[2] = 0; w0[1] = 0; w0[0] = 0; + break; case 10: @@ -4160,6 +4186,7 @@ inline void switch_buffer_by_offset_be (u32x w0[4], u32x w1[4], u32x w2[4], u32x w0[2] = 0; w0[1] = 0; w0[0] = 0; + break; case 11: @@ -4179,6 +4206,7 @@ inline void switch_buffer_by_offset_be (u32x w0[4], u32x w1[4], u32x w2[4], u32x w0[2] = 0; w0[1] = 0; w0[0] = 0; + break; case 12: @@ -4198,6 +4226,7 @@ inline void switch_buffer_by_offset_be (u32x w0[4], u32x w1[4], u32x w2[4], u32x w0[2] = 0; w0[1] = 0; w0[0] = 0; + break; case 13: @@ -4217,6 +4246,7 @@ inline void switch_buffer_by_offset_be (u32x w0[4], u32x w1[4], u32x w2[4], u32x w0[2] = 0; w0[1] = 0; w0[0] = 0; + break; case 14: @@ -4236,6 +4266,7 @@ inline void switch_buffer_by_offset_be (u32x w0[4], u32x w1[4], u32x w2[4], u32x w0[2] = 0; w0[1] = 0; w0[0] = 0; + break; case 15: @@ -4255,6 +4286,7 @@ inline void switch_buffer_by_offset_be (u32x w0[4], u32x w1[4], u32x w2[4], u32x w0[2] = 0; w0[1] = 0; w0[0] = 0; + break; } #endif @@ -4264,7 +4296,7 @@ inline void switch_buffer_by_offset_be (u32x w0[4], u32x w1[4], u32x w2[4], u32x switch (offset / 4) { - case 0: + case 0: w3[3] = __byte_perm (w3[3], w3[2], selector); w3[2] = __byte_perm (w3[2], w3[1], selector); w3[1] = __byte_perm (w3[1], w3[0], selector); @@ -4281,9 +4313,10 @@ inline void switch_buffer_by_offset_be (u32x w0[4], u32x w1[4], u32x w2[4], u32x w0[2] = __byte_perm (w0[2], w0[1], selector); w0[1] = __byte_perm (w0[1], w0[0], selector); w0[0] = __byte_perm (w0[0], 0, selector); + break; - case 1: + case 1: w3[3] = __byte_perm (w3[2], w3[1], selector); w3[2] = __byte_perm (w3[1], w3[0], selector); w3[1] = __byte_perm (w3[0], w2[3], selector); @@ -4300,9 +4333,10 @@ inline void switch_buffer_by_offset_be (u32x w0[4], u32x w1[4], u32x w2[4], u32x w0[2] = __byte_perm (w0[1], w0[0], selector); w0[1] = __byte_perm (w0[0], 0, selector); w0[0] = 0; + break; - case 2: + case 2: w3[3] = __byte_perm (w3[1], w3[0], selector); w3[2] = __byte_perm (w3[0], w2[3], selector); w3[1] = __byte_perm (w2[3], w2[2], selector); @@ -4319,9 +4353,10 @@ inline void switch_buffer_by_offset_be (u32x w0[4], u32x w1[4], u32x w2[4], u32x w0[2] = __byte_perm (w0[0], 0, selector); w0[1] = 0; w0[0] = 0; + break; - case 3: + case 3: w3[3] = __byte_perm (w3[0], w2[3], selector); w3[2] = __byte_perm (w2[3], w2[2], selector); w3[1] = __byte_perm (w2[2], w2[1], selector); @@ -4338,9 +4373,10 @@ inline void switch_buffer_by_offset_be (u32x w0[4], u32x w1[4], u32x w2[4], u32x w0[2] = 0; w0[1] = 0; w0[0] = 0; + break; - case 4: + case 4: w3[3] = __byte_perm (w2[3], w2[2], selector); w3[2] = __byte_perm (w2[2], w2[1], selector); w3[1] = __byte_perm (w2[1], w2[0], selector); @@ -4357,9 +4393,10 @@ inline void switch_buffer_by_offset_be (u32x w0[4], u32x w1[4], u32x w2[4], u32x w0[2] = 0; w0[1] = 0; w0[0] = 0; + break; - case 5: + case 5: w3[3] = __byte_perm (w2[2], w2[1], selector); w3[2] = __byte_perm (w2[1], w2[0], selector); w3[1] = __byte_perm (w2[0], w1[3], selector); @@ -4376,9 +4413,10 @@ inline void switch_buffer_by_offset_be (u32x w0[4], u32x w1[4], u32x w2[4], u32x w0[2] = 0; w0[1] = 0; w0[0] = 0; + break; - case 6: + case 6: w3[3] = __byte_perm (w2[1], w2[0], selector); w3[2] = __byte_perm (w2[0], w1[3], selector); w3[1] = __byte_perm (w1[3], w1[2], selector); @@ -4395,9 +4433,10 @@ inline void switch_buffer_by_offset_be (u32x w0[4], u32x w1[4], u32x w2[4], u32x w0[2] = 0; w0[1] = 0; w0[0] = 0; + break; - case 7: + case 7: w3[3] = __byte_perm (w2[0], w1[3], selector); w3[2] = __byte_perm (w1[3], w1[2], selector); w3[1] = __byte_perm (w1[2], w1[1], selector); @@ -4414,9 +4453,10 @@ inline void switch_buffer_by_offset_be (u32x w0[4], u32x w1[4], u32x w2[4], u32x w0[2] = 0; w0[1] = 0; w0[0] = 0; + break; - case 8: + case 8: w3[3] = __byte_perm (w1[3], w1[2], selector); w3[2] = __byte_perm (w1[2], w1[1], selector); w3[1] = __byte_perm (w1[1], w1[0], selector); @@ -4433,9 +4473,10 @@ inline void switch_buffer_by_offset_be (u32x w0[4], u32x w1[4], u32x w2[4], u32x w0[2] = 0; w0[1] = 0; w0[0] = 0; + break; - case 9: + case 9: w3[3] = __byte_perm (w1[2], w1[1], selector); w3[2] = __byte_perm (w1[1], w1[0], selector); w3[1] = __byte_perm (w1[0], w0[3], selector); @@ -4452,6 +4493,7 @@ inline void switch_buffer_by_offset_be (u32x w0[4], u32x w1[4], u32x w2[4], u32x w0[2] = 0; w0[1] = 0; w0[0] = 0; + break; case 10: @@ -4471,6 +4513,7 @@ inline void switch_buffer_by_offset_be (u32x w0[4], u32x w1[4], u32x w2[4], u32x w0[2] = 0; w0[1] = 0; w0[0] = 0; + break; case 11: @@ -4490,6 +4533,7 @@ inline void switch_buffer_by_offset_be (u32x w0[4], u32x w1[4], u32x w2[4], u32x w0[2] = 0; w0[1] = 0; w0[0] = 0; + break; case 12: @@ -4509,6 +4553,7 @@ inline void switch_buffer_by_offset_be (u32x w0[4], u32x w1[4], u32x w2[4], u32x w0[2] = 0; w0[1] = 0; w0[0] = 0; + break; case 13: @@ -4528,6 +4573,7 @@ inline void switch_buffer_by_offset_be (u32x w0[4], u32x w1[4], u32x w2[4], u32x w0[2] = 0; w0[1] = 0; w0[0] = 0; + break; case 14: @@ -4547,6 +4593,7 @@ inline void switch_buffer_by_offset_be (u32x w0[4], u32x w1[4], u32x w2[4], u32x w0[2] = 0; w0[1] = 0; w0[0] = 0; + break; case 15: @@ -4566,6 +4613,7 @@ inline void switch_buffer_by_offset_be (u32x w0[4], u32x w1[4], u32x w2[4], u32x w0[2] = 0; w0[1] = 0; w0[0] = 0; + break; } #endif @@ -4573,9 +4621,10 @@ inline void switch_buffer_by_offset_be (u32x w0[4], u32x w1[4], u32x w2[4], u32x inline void switch_buffer_by_offset_carry_be (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u32x c0[4], u32x c1[4], u32x c2[4], u32x c3[4], const u32 offset) { + #if defined IS_AMD || defined IS_GENERIC switch (offset / 4) { - case 0: + case 0: c0[0] = amd_bytealign (w3[3], 0, offset); w3[3] = amd_bytealign (w3[2], w3[3], offset); w3[2] = amd_bytealign (w3[1], w3[2], offset); @@ -4593,9 +4642,10 @@ inline void switch_buffer_by_offset_carry_be (u32x w0[4], u32x w1[4], u32x w2[4] w0[2] = amd_bytealign (w0[1], w0[2], offset); w0[1] = amd_bytealign (w0[0], w0[1], offset); w0[0] = amd_bytealign ( 0, w0[0], offset); + break; - case 1: + case 1: c0[1] = amd_bytealign (w3[3], 0, offset); c0[0] = amd_bytealign (w3[2], w3[3], offset); w3[3] = amd_bytealign (w3[1], w3[2], offset); @@ -4614,9 +4664,10 @@ inline void switch_buffer_by_offset_carry_be (u32x w0[4], u32x w1[4], u32x w2[4] w0[2] = amd_bytealign (w0[0], w0[1], offset); w0[1] = amd_bytealign ( 0, w0[0], offset); w0[0] = 0; + break; - case 2: + case 2: c0[2] = amd_bytealign (w3[3], 0, offset); c0[1] = amd_bytealign (w3[2], w3[3], offset); c0[0] = amd_bytealign (w3[1], w3[2], offset); @@ -4636,9 +4687,10 @@ inline void switch_buffer_by_offset_carry_be (u32x w0[4], u32x w1[4], u32x w2[4] w0[2] = amd_bytealign ( 0, w0[0], offset); w0[1] = 0; w0[0] = 0; + break; - case 3: + case 3: c0[3] = amd_bytealign (w3[3], 0, offset); c0[2] = amd_bytealign (w3[2], w3[3], offset); c0[1] = amd_bytealign (w3[1], w3[2], offset); @@ -4659,9 +4711,10 @@ inline void switch_buffer_by_offset_carry_be (u32x w0[4], u32x w1[4], u32x w2[4] w0[2] = 0; w0[1] = 0; w0[0] = 0; + break; - case 4: + case 4: c1[0] = amd_bytealign (w3[3], 0, offset); c0[3] = amd_bytealign (w3[2], w3[3], offset); c0[2] = amd_bytealign (w3[1], w3[2], offset); @@ -4683,9 +4736,10 @@ inline void switch_buffer_by_offset_carry_be (u32x w0[4], u32x w1[4], u32x w2[4] w0[2] = 0; w0[1] = 0; w0[0] = 0; + break; - case 5: + case 5: c1[1] = amd_bytealign (w3[3], 0, offset); c1[0] = amd_bytealign (w3[2], w3[3], offset); c0[3] = amd_bytealign (w3[1], w3[2], offset); @@ -4708,9 +4762,10 @@ inline void switch_buffer_by_offset_carry_be (u32x w0[4], u32x w1[4], u32x w2[4] w0[2] = 0; w0[1] = 0; w0[0] = 0; + break; - case 6: + case 6: c1[2] = amd_bytealign (w3[3], 0, offset); c1[1] = amd_bytealign (w3[2], w3[3], offset); c1[0] = amd_bytealign (w3[1], w3[2], offset); @@ -4734,9 +4789,10 @@ inline void switch_buffer_by_offset_carry_be (u32x w0[4], u32x w1[4], u32x w2[4] w0[2] = 0; w0[1] = 0; w0[0] = 0; + break; - case 7: + case 7: c1[3] = amd_bytealign (w3[3], 0, offset); c1[2] = amd_bytealign (w3[2], w3[3], offset); c1[1] = amd_bytealign (w3[1], w3[2], offset); @@ -4761,9 +4817,10 @@ inline void switch_buffer_by_offset_carry_be (u32x w0[4], u32x w1[4], u32x w2[4] w0[2] = 0; w0[1] = 0; w0[0] = 0; + break; - case 8: + case 8: c2[0] = amd_bytealign (w3[3], 0, offset); c1[3] = amd_bytealign (w3[2], w3[3], offset); c1[2] = amd_bytealign (w3[1], w3[2], offset); @@ -4789,9 +4846,10 @@ inline void switch_buffer_by_offset_carry_be (u32x w0[4], u32x w1[4], u32x w2[4] w0[2] = 0; w0[1] = 0; w0[0] = 0; + break; - case 9: + case 9: c2[1] = amd_bytealign (w3[3], 0, offset); c2[0] = amd_bytealign (w3[2], w3[3], offset); c1[3] = amd_bytealign (w3[1], w3[2], offset); @@ -4818,6 +4876,7 @@ inline void switch_buffer_by_offset_carry_be (u32x w0[4], u32x w1[4], u32x w2[4] w0[2] = 0; w0[1] = 0; w0[0] = 0; + break; case 10: @@ -4848,6 +4907,7 @@ inline void switch_buffer_by_offset_carry_be (u32x w0[4], u32x w1[4], u32x w2[4] w0[2] = 0; w0[1] = 0; w0[0] = 0; + break; case 11: @@ -4879,6 +4939,7 @@ inline void switch_buffer_by_offset_carry_be (u32x w0[4], u32x w1[4], u32x w2[4] w0[2] = 0; w0[1] = 0; w0[0] = 0; + break; case 12: @@ -4911,6 +4972,7 @@ inline void switch_buffer_by_offset_carry_be (u32x w0[4], u32x w1[4], u32x w2[4] w0[2] = 0; w0[1] = 0; w0[0] = 0; + break; case 13: @@ -4944,6 +5006,7 @@ inline void switch_buffer_by_offset_carry_be (u32x w0[4], u32x w1[4], u32x w2[4] w0[2] = 0; w0[1] = 0; w0[0] = 0; + break; case 14: @@ -4978,6 +5041,7 @@ inline void switch_buffer_by_offset_carry_be (u32x w0[4], u32x w1[4], u32x w2[4] w0[2] = 0; w0[1] = 0; w0[0] = 0; + break; case 15: @@ -5013,320 +5077,785 @@ inline void switch_buffer_by_offset_carry_be (u32x w0[4], u32x w1[4], u32x w2[4] w0[2] = 0; w0[1] = 0; w0[0] = 0; + break; } -} + #endif -inline void overwrite_at_le (u32x sw[16], const u32x w0, const u32 salt_len) -{ - #if defined cl_amd_media_ops - switch (salt_len) + #ifdef IS_NV + const int selector = (0x76543210 >> ((offset & 3) * 4)) & 0xffff; + + switch (offset / 4) { - case 0: sw[0] = w0; - break; - case 1: sw[0] = amd_bytealign (w0, sw[0] << 24, 3); - sw[1] = amd_bytealign (sw[1] >> 8, w0, 3); - break; - case 2: sw[0] = amd_bytealign (w0, sw[0] << 16, 2); - sw[1] = amd_bytealign (sw[1] >> 16, w0, 2); - break; - case 3: sw[0] = amd_bytealign (w0, sw[0] << 8, 1); - sw[1] = amd_bytealign (sw[1] >> 24, w0, 1); - break; - case 4: sw[1] = w0; - break; - case 5: sw[1] = amd_bytealign (w0, sw[1] << 24, 3); - sw[2] = amd_bytealign (sw[2] >> 8, w0, 3); - break; - case 6: sw[1] = amd_bytealign (w0, sw[1] << 16, 2); - sw[2] = amd_bytealign (sw[2] >> 16, w0, 2); - break; - case 7: sw[1] = amd_bytealign (w0, sw[1] << 8, 1); - sw[2] = amd_bytealign (sw[2] >> 24, w0, 1); - break; - case 8: sw[2] = w0; - break; - case 9: sw[2] = amd_bytealign (w0, sw[2] << 24, 3); - sw[3] = amd_bytealign (sw[3] >> 8, w0, 3); - break; - case 10: sw[2] = amd_bytealign (w0, sw[2] << 16, 2); - sw[3] = amd_bytealign (sw[3] >> 16, w0, 2); - break; - case 11: sw[2] = amd_bytealign (w0, sw[2] << 8, 1); - sw[3] = amd_bytealign (sw[3] >> 24, w0, 1); - break; - case 12: sw[3] = w0; - break; - case 13: sw[3] = amd_bytealign (w0, sw[3] << 24, 3); - sw[4] = amd_bytealign (sw[4] >> 8, w0, 3); - break; - case 14: sw[3] = amd_bytealign (w0, sw[3] << 16, 2); - sw[4] = amd_bytealign (sw[4] >> 16, w0, 2); - break; - case 15: sw[3] = amd_bytealign (w0, sw[3] << 8, 1); - sw[4] = amd_bytealign (sw[4] >> 24, w0, 1); - break; - case 16: sw[4] = w0; - break; - case 17: sw[4] = amd_bytealign (w0, sw[4] << 24, 3); - sw[5] = amd_bytealign (sw[5] >> 8, w0, 3); - break; - case 18: sw[4] = amd_bytealign (w0, sw[4] << 16, 2); - sw[5] = amd_bytealign (sw[5] >> 16, w0, 2); - break; - case 19: sw[4] = amd_bytealign (w0, sw[4] << 8, 1); - sw[5] = amd_bytealign (sw[5] >> 24, w0, 1); - break; - case 20: sw[5] = w0; - break; - case 21: sw[5] = amd_bytealign (w0, sw[5] << 24, 3); - sw[6] = amd_bytealign (sw[6] >> 8, w0, 3); - break; - case 22: sw[5] = amd_bytealign (w0, sw[5] << 16, 2); - sw[6] = amd_bytealign (sw[6] >> 16, w0, 2); - break; - case 23: sw[5] = amd_bytealign (w0, sw[5] << 8, 1); - sw[6] = amd_bytealign (sw[6] >> 24, w0, 1); - break; - case 24: sw[6] = w0; - break; - case 25: sw[6] = amd_bytealign (w0, sw[6] << 24, 3); - sw[7] = amd_bytealign (sw[7] >> 8, w0, 3); - break; - case 26: sw[6] = amd_bytealign (w0, sw[6] << 16, 2); - sw[7] = amd_bytealign (sw[7] >> 16, w0, 2); - break; - case 27: sw[6] = amd_bytealign (w0, sw[6] << 8, 1); - sw[7] = amd_bytealign (sw[7] >> 24, w0, 1); - break; - case 28: sw[7] = w0; - break; - case 29: sw[7] = amd_bytealign (w0, sw[7] << 24, 3); - sw[8] = amd_bytealign (sw[8] >> 8, w0, 3); - break; - case 30: sw[7] = amd_bytealign (w0, sw[7] << 16, 2); - sw[8] = amd_bytealign (sw[8] >> 16, w0, 2); - break; - case 31: sw[7] = amd_bytealign (w0, sw[7] << 8, 1); - sw[8] = amd_bytealign (sw[8] >> 24, w0, 1); - break; - } - #else - switch (salt_len) - { - case 0: sw[0] = w0; - break; - case 1: sw[0] = (sw[0] & 0x000000ff) | (w0 << 8); - sw[1] = (sw[1] & 0xffffff00) | (w0 >> 24); - break; - case 2: sw[0] = (sw[0] & 0x0000ffff) | (w0 << 16); - sw[1] = (sw[1] & 0xffff0000) | (w0 >> 16); - break; - case 3: sw[0] = (sw[0] & 0x00ffffff) | (w0 << 24); - sw[1] = (sw[1] & 0xff000000) | (w0 >> 8); - break; - case 4: sw[1] = w0; - break; - case 5: sw[1] = (sw[1] & 0x000000ff) | (w0 << 8); - sw[2] = (sw[2] & 0xffffff00) | (w0 >> 24); - break; - case 6: sw[1] = (sw[1] & 0x0000ffff) | (w0 << 16); - sw[2] = (sw[2] & 0xffff0000) | (w0 >> 16); - break; - case 7: sw[1] = (sw[1] & 0x00ffffff) | (w0 << 24); - sw[2] = (sw[2] & 0xff000000) | (w0 >> 8); - break; - case 8: sw[2] = w0; - break; - case 9: sw[2] = (sw[2] & 0x000000ff) | (w0 << 8); - sw[3] = (sw[3] & 0xffffff00) | (w0 >> 24); - break; - case 10: sw[2] = (sw[2] & 0x0000ffff) | (w0 << 16); - sw[3] = (sw[3] & 0xffff0000) | (w0 >> 16); - break; - case 11: sw[2] = (sw[2] & 0x00ffffff) | (w0 << 24); - sw[3] = (sw[3] & 0xff000000) | (w0 >> 8); - break; - case 12: sw[3] = w0; - break; - case 13: sw[3] = (sw[3] & 0x000000ff) | (w0 << 8); - sw[4] = (sw[4] & 0xffffff00) | (w0 >> 24); - break; - case 14: sw[3] = (sw[3] & 0x0000ffff) | (w0 << 16); - sw[4] = (sw[4] & 0xffff0000) | (w0 >> 16); - break; - case 15: sw[3] = (sw[3] & 0x00ffffff) | (w0 << 24); - sw[4] = (sw[4] & 0xff000000) | (w0 >> 8); - break; - case 16: sw[4] = w0; - break; - case 17: sw[4] = (sw[4] & 0x000000ff) | (w0 << 8); - sw[5] = (sw[5] & 0xffffff00) | (w0 >> 24); - break; - case 18: sw[4] = (sw[4] & 0x0000ffff) | (w0 << 16); - sw[5] = (sw[5] & 0xffff0000) | (w0 >> 16); - break; - case 19: sw[4] = (sw[4] & 0x00ffffff) | (w0 << 24); - sw[5] = (sw[5] & 0xff000000) | (w0 >> 8); - break; - case 20: sw[5] = w0; - break; - case 21: sw[5] = (sw[5] & 0x000000ff) | (w0 << 8); - sw[6] = (sw[6] & 0xffffff00) | (w0 >> 24); - break; - case 22: sw[5] = (sw[5] & 0x0000ffff) | (w0 << 16); - sw[6] = (sw[6] & 0xffff0000) | (w0 >> 16); - break; - case 23: sw[5] = (sw[5] & 0x00ffffff) | (w0 << 24); - sw[6] = (sw[6] & 0xff000000) | (w0 >> 8); - break; - case 24: sw[6] = w0; - break; - case 25: sw[6] = (sw[6] & 0x000000ff) | (w0 << 8); - sw[7] = (sw[7] & 0xffffff00) | (w0 >> 24); - break; - case 26: sw[6] = (sw[6] & 0x0000ffff) | (w0 << 16); - sw[7] = (sw[7] & 0xffff0000) | (w0 >> 16); - break; - case 27: sw[6] = (sw[6] & 0x00ffffff) | (w0 << 24); - sw[7] = (sw[7] & 0xff000000) | (w0 >> 8); - break; - case 28: sw[7] = w0; - break; - case 29: sw[7] = (sw[7] & 0x000000ff) | (w0 << 8); - sw[8] = (sw[8] & 0xffffff00) | (w0 >> 24); - break; - case 30: sw[7] = (sw[7] & 0x0000ffff) | (w0 << 16); - sw[8] = (sw[8] & 0xffff0000) | (w0 >> 16); - break; - case 31: sw[7] = (sw[7] & 0x00ffffff) | (w0 << 24); - sw[8] = (sw[8] & 0xff000000) | (w0 >> 8); - break; - } - #endif -} + case 0: + c0[0] = __byte_perm ( 0, w3[3], selector); + w3[3] = __byte_perm (w3[3], w3[2], selector); + w3[2] = __byte_perm (w3[2], w3[1], selector); + w3[1] = __byte_perm (w3[1], w3[0], selector); + w3[0] = __byte_perm (w3[0], w2[3], selector); + w2[3] = __byte_perm (w2[3], w2[2], selector); + w2[2] = __byte_perm (w2[2], w2[1], selector); + w2[1] = __byte_perm (w2[1], w2[0], selector); + w2[0] = __byte_perm (w2[0], w1[3], selector); + w1[3] = __byte_perm (w1[3], w1[2], selector); + w1[2] = __byte_perm (w1[2], w1[1], selector); + w1[1] = __byte_perm (w1[1], w1[0], selector); + w1[0] = __byte_perm (w1[0], w0[3], selector); + w0[3] = __byte_perm (w0[3], w0[2], selector); + w0[2] = __byte_perm (w0[2], w0[1], selector); + w0[1] = __byte_perm (w0[1], w0[0], selector); + w0[0] = __byte_perm (w0[0], 0, selector); -inline void overwrite_at_be (u32x sw[16], const u32x w0, const u32 salt_len) -{ - // would be nice to have optimization based on amd_bytealign as with _le counterpart + break; - switch (salt_len) - { - case 0: sw[0] = w0; - break; - case 1: sw[0] = (sw[0] & 0xff000000) | (w0 >> 8); - sw[1] = (sw[1] & 0x00ffffff) | (w0 << 24); - break; - case 2: sw[0] = (sw[0] & 0xffff0000) | (w0 >> 16); - sw[1] = (sw[1] & 0x0000ffff) | (w0 << 16); - break; - case 3: sw[0] = (sw[0] & 0xffffff00) | (w0 >> 24); - sw[1] = (sw[1] & 0x000000ff) | (w0 << 8); - break; - case 4: sw[1] = w0; - break; - case 5: sw[1] = (sw[1] & 0xff000000) | (w0 >> 8); - sw[2] = (sw[2] & 0x00ffffff) | (w0 << 24); - break; - case 6: sw[1] = (sw[1] & 0xffff0000) | (w0 >> 16); - sw[2] = (sw[2] & 0x0000ffff) | (w0 << 16); - break; - case 7: sw[1] = (sw[1] & 0xffffff00) | (w0 >> 24); - sw[2] = (sw[2] & 0x000000ff) | (w0 << 8); - break; - case 8: sw[2] = w0; - break; - case 9: sw[2] = (sw[2] & 0xff000000) | (w0 >> 8); - sw[3] = (sw[3] & 0x00ffffff) | (w0 << 24); - break; - case 10: sw[2] = (sw[2] & 0xffff0000) | (w0 >> 16); - sw[3] = (sw[3] & 0x0000ffff) | (w0 << 16); - break; - case 11: sw[2] = (sw[2] & 0xffffff00) | (w0 >> 24); - sw[3] = (sw[3] & 0x000000ff) | (w0 << 8); - break; - case 12: sw[3] = w0; - break; - case 13: sw[3] = (sw[3] & 0xff000000) | (w0 >> 8); - sw[4] = (sw[4] & 0x00ffffff) | (w0 << 24); - break; - case 14: sw[3] = (sw[3] & 0xffff0000) | (w0 >> 16); - sw[4] = (sw[4] & 0x0000ffff) | (w0 << 16); - break; - case 15: sw[3] = (sw[3] & 0xffffff00) | (w0 >> 24); - sw[4] = (sw[4] & 0x000000ff) | (w0 << 8); - break; - case 16: sw[4] = w0; - break; - case 17: sw[4] = (sw[4] & 0xff000000) | (w0 >> 8); - sw[5] = (sw[5] & 0x00ffffff) | (w0 << 24); - break; - case 18: sw[4] = (sw[4] & 0xffff0000) | (w0 >> 16); - sw[5] = (sw[5] & 0x0000ffff) | (w0 << 16); - break; - case 19: sw[4] = (sw[4] & 0xffffff00) | (w0 >> 24); - sw[5] = (sw[5] & 0x000000ff) | (w0 << 8); - break; - case 20: sw[5] = w0; - break; - case 21: sw[5] = (sw[5] & 0xff000000) | (w0 >> 8); - sw[6] = (sw[6] & 0x00ffffff) | (w0 << 24); - break; - case 22: sw[5] = (sw[5] & 0xffff0000) | (w0 >> 16); - sw[6] = (sw[6] & 0x0000ffff) | (w0 << 16); - break; - case 23: sw[5] = (sw[5] & 0xffffff00) | (w0 >> 24); - sw[6] = (sw[6] & 0x000000ff) | (w0 << 8); - break; - case 24: sw[6] = w0; - break; - case 25: sw[6] = (sw[6] & 0xff000000) | (w0 >> 8); - sw[7] = (sw[7] & 0x00ffffff) | (w0 << 24); - break; - case 26: sw[6] = (sw[6] & 0xffff0000) | (w0 >> 16); - sw[7] = (sw[7] & 0x0000ffff) | (w0 << 16); - break; - case 27: sw[6] = (sw[6] & 0xffffff00) | (w0 >> 24); - sw[7] = (sw[7] & 0x000000ff) | (w0 << 8); - break; - case 28: sw[7] = w0; - break; - case 29: sw[7] = (sw[7] & 0xff000000) | (w0 >> 8); - sw[8] = (sw[8] & 0x00ffffff) | (w0 << 24); - break; - case 30: sw[7] = (sw[7] & 0xffff0000) | (w0 >> 16); - sw[8] = (sw[8] & 0x0000ffff) | (w0 << 16); - break; - case 31: sw[7] = (sw[7] & 0xffffff00) | (w0 >> 24); - sw[8] = (sw[8] & 0x000000ff) | (w0 << 8); - break; - } -} + case 1: + c0[1] = __byte_perm ( 0, w3[3], selector); + c0[0] = __byte_perm (w3[3], w3[2], selector); + w3[3] = __byte_perm (w3[2], w3[1], selector); + w3[2] = __byte_perm (w3[1], w3[0], selector); + w3[1] = __byte_perm (w3[0], w2[3], selector); + w3[0] = __byte_perm (w2[3], w2[2], selector); + w2[3] = __byte_perm (w2[2], w2[1], selector); + w2[2] = __byte_perm (w2[1], w2[0], selector); + w2[1] = __byte_perm (w2[0], w1[3], selector); + w2[0] = __byte_perm (w1[3], w1[2], selector); + w1[3] = __byte_perm (w1[2], w1[1], selector); + w1[2] = __byte_perm (w1[1], w1[0], selector); + w1[1] = __byte_perm (w1[0], w0[3], selector); + w1[0] = __byte_perm (w0[3], w0[2], selector); + w0[3] = __byte_perm (w0[2], w0[1], selector); + w0[2] = __byte_perm (w0[1], w0[0], selector); + w0[1] = __byte_perm (w0[0], 0, selector); + w0[0] = 0; -inline void overwrite_at_le_4x4 (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const u32x wx, const u32 salt_len) -{ - #if defined cl_amd_media_ops - switch (salt_len) - { - case 0: w0[0] = wx; - break; - case 1: w0[0] = amd_bytealign (wx, w0[0] << 24, 3); - w0[1] = amd_bytealign (w0[1] >> 8, wx, 3); - break; - case 2: w0[0] = amd_bytealign (wx, w0[0] << 16, 2); - w0[1] = amd_bytealign (w0[1] >> 16, wx, 2); - break; - case 3: w0[0] = amd_bytealign (wx, w0[0] << 8, 1); - w0[1] = amd_bytealign (w0[1] >> 24, wx, 1); - break; - case 4: w0[1] = wx; - break; - case 5: w0[1] = amd_bytealign (wx, w0[1] << 24, 3); - w0[2] = amd_bytealign (w0[2] >> 8, wx, 3); - break; - case 6: w0[1] = amd_bytealign (wx, w0[1] << 16, 2); - w0[2] = amd_bytealign (w0[2] >> 16, wx, 2); - break; + break; + + case 2: + c0[2] = __byte_perm ( 0, w3[3], selector); + c0[1] = __byte_perm (w3[3], w3[2], selector); + c0[0] = __byte_perm (w3[2], w3[1], selector); + w3[3] = __byte_perm (w3[1], w3[0], selector); + w3[2] = __byte_perm (w3[0], w2[3], selector); + w3[1] = __byte_perm (w2[3], w2[2], selector); + w3[0] = __byte_perm (w2[2], w2[1], selector); + w2[3] = __byte_perm (w2[1], w2[0], selector); + w2[2] = __byte_perm (w2[0], w1[3], selector); + w2[1] = __byte_perm (w1[3], w1[2], selector); + w2[0] = __byte_perm (w1[2], w1[1], selector); + w1[3] = __byte_perm (w1[1], w1[0], selector); + w1[2] = __byte_perm (w1[0], w0[3], selector); + w1[1] = __byte_perm (w0[3], w0[2], selector); + w1[0] = __byte_perm (w0[2], w0[1], selector); + w0[3] = __byte_perm (w0[1], w0[0], selector); + w0[2] = __byte_perm (w0[0], 0, selector); + w0[1] = 0; + w0[0] = 0; + + break; + + case 3: + c0[3] = __byte_perm ( 0, w3[3], selector); + c0[2] = __byte_perm (w3[3], w3[2], selector); + c0[1] = __byte_perm (w3[2], w3[1], selector); + c0[0] = __byte_perm (w3[1], w3[0], selector); + w3[3] = __byte_perm (w3[0], w2[3], selector); + w3[2] = __byte_perm (w2[3], w2[2], selector); + w3[1] = __byte_perm (w2[2], w2[1], selector); + w3[0] = __byte_perm (w2[1], w2[0], selector); + w2[3] = __byte_perm (w2[0], w1[3], selector); + w2[2] = __byte_perm (w1[3], w1[2], selector); + w2[1] = __byte_perm (w1[2], w1[1], selector); + w2[0] = __byte_perm (w1[1], w1[0], selector); + w1[3] = __byte_perm (w1[0], w0[3], selector); + w1[2] = __byte_perm (w0[3], w0[2], selector); + w1[1] = __byte_perm (w0[2], w0[1], selector); + w1[0] = __byte_perm (w0[1], w0[0], selector); + w0[3] = __byte_perm (w0[0], 0, selector); + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 4: + c1[0] = __byte_perm ( 0, w3[3], selector); + c0[3] = __byte_perm (w3[3], w3[2], selector); + c0[2] = __byte_perm (w3[2], w3[1], selector); + c0[1] = __byte_perm (w3[1], w3[0], selector); + c0[0] = __byte_perm (w3[0], w2[3], selector); + w3[3] = __byte_perm (w2[3], w2[2], selector); + w3[2] = __byte_perm (w2[2], w2[1], selector); + w3[1] = __byte_perm (w2[1], w2[0], selector); + w3[0] = __byte_perm (w2[0], w1[3], selector); + w2[3] = __byte_perm (w1[3], w1[2], selector); + w2[2] = __byte_perm (w1[2], w1[1], selector); + w2[1] = __byte_perm (w1[1], w1[0], selector); + w2[0] = __byte_perm (w1[0], w0[3], selector); + w1[3] = __byte_perm (w0[3], w0[2], selector); + w1[2] = __byte_perm (w0[2], w0[1], selector); + w1[1] = __byte_perm (w0[1], w0[0], selector); + w1[0] = __byte_perm (w0[0], 0, selector); + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 5: + c1[1] = __byte_perm ( 0, w3[3], selector); + c1[0] = __byte_perm (w3[3], w3[2], selector); + c0[3] = __byte_perm (w3[2], w3[1], selector); + c0[2] = __byte_perm (w3[1], w3[0], selector); + c0[1] = __byte_perm (w3[0], w2[3], selector); + c0[0] = __byte_perm (w2[3], w2[2], selector); + w3[3] = __byte_perm (w2[2], w2[1], selector); + w3[2] = __byte_perm (w2[1], w2[0], selector); + w3[1] = __byte_perm (w2[0], w1[3], selector); + w3[0] = __byte_perm (w1[3], w1[2], selector); + w2[3] = __byte_perm (w1[2], w1[1], selector); + w2[2] = __byte_perm (w1[1], w1[0], selector); + w2[1] = __byte_perm (w1[0], w0[3], selector); + w2[0] = __byte_perm (w0[3], w0[2], selector); + w1[3] = __byte_perm (w0[2], w0[1], selector); + w1[2] = __byte_perm (w0[1], w0[0], selector); + w1[1] = __byte_perm (w0[0], 0, selector); + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 6: + c1[2] = __byte_perm ( 0, w3[3], selector); + c1[1] = __byte_perm (w3[3], w3[2], selector); + c1[0] = __byte_perm (w3[2], w3[1], selector); + c0[3] = __byte_perm (w3[1], w3[0], selector); + c0[2] = __byte_perm (w3[0], w2[3], selector); + c0[1] = __byte_perm (w2[3], w2[2], selector); + c0[0] = __byte_perm (w2[2], w2[1], selector); + w3[3] = __byte_perm (w2[1], w2[0], selector); + w3[2] = __byte_perm (w2[0], w1[3], selector); + w3[1] = __byte_perm (w1[3], w1[2], selector); + w3[0] = __byte_perm (w1[2], w1[1], selector); + w2[3] = __byte_perm (w1[1], w1[0], selector); + w2[2] = __byte_perm (w1[0], w0[3], selector); + w2[1] = __byte_perm (w0[3], w0[2], selector); + w2[0] = __byte_perm (w0[2], w0[1], selector); + w1[3] = __byte_perm (w0[1], w0[0], selector); + w1[2] = __byte_perm (w0[0], 0, selector); + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 7: + c1[3] = __byte_perm ( 0, w3[3], selector); + c1[2] = __byte_perm (w3[3], w3[2], selector); + c1[1] = __byte_perm (w3[2], w3[1], selector); + c1[0] = __byte_perm (w3[1], w3[0], selector); + c0[3] = __byte_perm (w3[0], w2[3], selector); + c0[2] = __byte_perm (w2[3], w2[2], selector); + c0[1] = __byte_perm (w2[2], w2[1], selector); + c0[0] = __byte_perm (w2[1], w2[0], selector); + w3[3] = __byte_perm (w2[0], w1[3], selector); + w3[2] = __byte_perm (w1[3], w1[2], selector); + w3[1] = __byte_perm (w1[2], w1[1], selector); + w3[0] = __byte_perm (w1[1], w1[0], selector); + w2[3] = __byte_perm (w1[0], w0[3], selector); + w2[2] = __byte_perm (w0[3], w0[2], selector); + w2[1] = __byte_perm (w0[2], w0[1], selector); + w2[0] = __byte_perm (w0[1], w0[0], selector); + w1[3] = __byte_perm (w0[0], 0, selector); + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 8: + c2[0] = __byte_perm ( 0, w3[3], selector); + c1[3] = __byte_perm (w3[3], w3[2], selector); + c1[2] = __byte_perm (w3[2], w3[1], selector); + c1[1] = __byte_perm (w3[1], w3[0], selector); + c1[0] = __byte_perm (w3[0], w2[3], selector); + c0[3] = __byte_perm (w2[3], w2[2], selector); + c0[2] = __byte_perm (w2[2], w2[1], selector); + c0[1] = __byte_perm (w2[1], w2[0], selector); + c0[0] = __byte_perm (w2[0], w1[3], selector); + w3[3] = __byte_perm (w1[3], w1[2], selector); + w3[2] = __byte_perm (w1[2], w1[1], selector); + w3[1] = __byte_perm (w1[1], w1[0], selector); + w3[0] = __byte_perm (w1[0], w0[3], selector); + w2[3] = __byte_perm (w0[3], w0[2], selector); + w2[2] = __byte_perm (w0[2], w0[1], selector); + w2[1] = __byte_perm (w0[1], w0[0], selector); + w2[0] = __byte_perm (w0[0], 0, selector); + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 9: + c2[1] = __byte_perm ( 0, w3[3], selector); + c2[0] = __byte_perm (w3[3], w3[2], selector); + c1[3] = __byte_perm (w3[2], w3[1], selector); + c1[2] = __byte_perm (w3[1], w3[0], selector); + c1[1] = __byte_perm (w3[0], w2[3], selector); + c1[0] = __byte_perm (w2[3], w2[2], selector); + c0[3] = __byte_perm (w2[2], w2[1], selector); + c0[2] = __byte_perm (w2[1], w2[0], selector); + c0[1] = __byte_perm (w2[0], w1[3], selector); + c0[0] = __byte_perm (w1[3], w1[2], selector); + w3[3] = __byte_perm (w1[2], w1[1], selector); + w3[2] = __byte_perm (w1[1], w1[0], selector); + w3[1] = __byte_perm (w1[0], w0[3], selector); + w3[0] = __byte_perm (w0[3], w0[2], selector); + w2[3] = __byte_perm (w0[2], w0[1], selector); + w2[2] = __byte_perm (w0[1], w0[0], selector); + w2[1] = __byte_perm (w0[0], 0, selector); + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 10: + c2[2] = __byte_perm ( 0, w3[3], selector); + c2[1] = __byte_perm (w3[3], w3[2], selector); + c2[0] = __byte_perm (w3[2], w3[1], selector); + c1[3] = __byte_perm (w3[1], w3[0], selector); + c1[2] = __byte_perm (w3[0], w2[3], selector); + c1[1] = __byte_perm (w2[3], w2[2], selector); + c1[0] = __byte_perm (w2[2], w2[1], selector); + c0[3] = __byte_perm (w2[1], w2[0], selector); + c0[2] = __byte_perm (w2[0], w1[3], selector); + c0[1] = __byte_perm (w1[3], w1[2], selector); + c0[0] = __byte_perm (w1[2], w1[1], selector); + w3[3] = __byte_perm (w1[1], w1[0], selector); + w3[2] = __byte_perm (w1[0], w0[3], selector); + w3[1] = __byte_perm (w0[3], w0[2], selector); + w3[0] = __byte_perm (w0[2], w0[1], selector); + w2[3] = __byte_perm (w0[1], w0[0], selector); + w2[2] = __byte_perm (w0[0], 0, selector); + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 11: + c2[3] = __byte_perm ( 0, w3[3], selector); + c2[2] = __byte_perm (w3[3], w3[2], selector); + c2[1] = __byte_perm (w3[2], w3[1], selector); + c2[0] = __byte_perm (w3[1], w3[0], selector); + c1[3] = __byte_perm (w3[0], w2[3], selector); + c1[2] = __byte_perm (w2[3], w2[2], selector); + c1[1] = __byte_perm (w2[2], w2[1], selector); + c1[0] = __byte_perm (w2[1], w2[0], selector); + c0[3] = __byte_perm (w2[0], w1[3], selector); + c0[2] = __byte_perm (w1[3], w1[2], selector); + c0[1] = __byte_perm (w1[2], w1[1], selector); + c0[0] = __byte_perm (w1[1], w1[0], selector); + w3[3] = __byte_perm (w1[0], w0[3], selector); + w3[2] = __byte_perm (w0[3], w0[2], selector); + w3[1] = __byte_perm (w0[2], w0[1], selector); + w3[0] = __byte_perm (w0[1], w0[0], selector); + w2[3] = __byte_perm (w0[0], 0, selector); + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 12: + c3[0] = __byte_perm ( 0, w3[3], selector); + c2[3] = __byte_perm (w3[3], w3[2], selector); + c2[2] = __byte_perm (w3[2], w3[1], selector); + c2[1] = __byte_perm (w3[1], w3[0], selector); + c2[0] = __byte_perm (w3[0], w2[3], selector); + c1[3] = __byte_perm (w2[3], w2[2], selector); + c1[2] = __byte_perm (w2[2], w2[1], selector); + c1[1] = __byte_perm (w2[1], w2[0], selector); + c1[0] = __byte_perm (w2[0], w1[3], selector); + c0[3] = __byte_perm (w1[3], w1[2], selector); + c0[2] = __byte_perm (w1[2], w1[1], selector); + c0[1] = __byte_perm (w1[1], w1[0], selector); + c0[0] = __byte_perm (w1[0], w0[3], selector); + w3[3] = __byte_perm (w0[3], w0[2], selector); + w3[2] = __byte_perm (w0[2], w0[1], selector); + w3[1] = __byte_perm (w0[1], w0[0], selector); + w3[0] = __byte_perm (w0[0], 0, selector); + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 13: + c3[1] = __byte_perm ( 0, w3[3], selector); + c3[0] = __byte_perm (w3[3], w3[2], selector); + c2[3] = __byte_perm (w3[2], w3[1], selector); + c2[2] = __byte_perm (w3[1], w3[0], selector); + c2[1] = __byte_perm (w3[0], w2[3], selector); + c2[0] = __byte_perm (w2[3], w2[2], selector); + c1[3] = __byte_perm (w2[2], w2[1], selector); + c1[2] = __byte_perm (w2[1], w2[0], selector); + c1[1] = __byte_perm (w2[0], w1[3], selector); + c1[0] = __byte_perm (w1[3], w1[2], selector); + c0[3] = __byte_perm (w1[2], w1[1], selector); + c0[2] = __byte_perm (w1[1], w1[0], selector); + c0[1] = __byte_perm (w1[0], w0[3], selector); + c0[0] = __byte_perm (w0[3], w0[2], selector); + w3[3] = __byte_perm (w0[2], w0[1], selector); + w3[2] = __byte_perm (w0[1], w0[0], selector); + w3[1] = __byte_perm (w0[0], 0, selector); + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 14: + c3[2] = __byte_perm ( 0, w3[3], selector); + c3[1] = __byte_perm (w3[3], w3[2], selector); + c3[0] = __byte_perm (w3[2], w3[1], selector); + c2[3] = __byte_perm (w3[1], w3[0], selector); + c2[2] = __byte_perm (w3[0], w2[3], selector); + c2[1] = __byte_perm (w2[3], w2[2], selector); + c2[0] = __byte_perm (w2[2], w2[1], selector); + c1[3] = __byte_perm (w2[1], w2[0], selector); + c1[2] = __byte_perm (w2[0], w1[3], selector); + c1[1] = __byte_perm (w1[3], w1[2], selector); + c1[0] = __byte_perm (w1[2], w1[1], selector); + c0[3] = __byte_perm (w1[1], w1[0], selector); + c0[2] = __byte_perm (w1[0], w0[3], selector); + c0[1] = __byte_perm (w0[3], w0[2], selector); + c0[0] = __byte_perm (w0[2], w0[1], selector); + w3[3] = __byte_perm (w0[1], w0[0], selector); + w3[2] = __byte_perm (w0[0], 0, selector); + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 15: + c3[3] = __byte_perm ( 0, w3[3], selector); + c3[2] = __byte_perm (w3[3], w3[2], selector); + c3[1] = __byte_perm (w3[2], w3[1], selector); + c3[0] = __byte_perm (w3[1], w3[0], selector); + c2[3] = __byte_perm (w3[0], w2[3], selector); + c2[2] = __byte_perm (w2[3], w2[2], selector); + c2[1] = __byte_perm (w2[2], w2[1], selector); + c2[0] = __byte_perm (w2[1], w2[0], selector); + c1[3] = __byte_perm (w2[0], w1[3], selector); + c1[2] = __byte_perm (w1[3], w1[2], selector); + c1[1] = __byte_perm (w1[2], w1[1], selector); + c1[0] = __byte_perm (w1[1], w1[0], selector); + c0[3] = __byte_perm (w1[0], w0[3], selector); + c0[2] = __byte_perm (w0[3], w0[2], selector); + c0[1] = __byte_perm (w0[2], w0[1], selector); + c0[0] = __byte_perm (w0[1], w0[0], selector); + w3[3] = __byte_perm (w0[0], 0, selector); + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + } + #endif +} + +inline void overwrite_at_le (u32x sw[16], const u32x w0, const u32 salt_len) +{ + #if defined cl_amd_media_ops + switch (salt_len) + { + case 0: sw[0] = w0; + break; + case 1: sw[0] = amd_bytealign (w0, sw[0] << 24, 3); + sw[1] = amd_bytealign (sw[1] >> 8, w0, 3); + break; + case 2: sw[0] = amd_bytealign (w0, sw[0] << 16, 2); + sw[1] = amd_bytealign (sw[1] >> 16, w0, 2); + break; + case 3: sw[0] = amd_bytealign (w0, sw[0] << 8, 1); + sw[1] = amd_bytealign (sw[1] >> 24, w0, 1); + break; + case 4: sw[1] = w0; + break; + case 5: sw[1] = amd_bytealign (w0, sw[1] << 24, 3); + sw[2] = amd_bytealign (sw[2] >> 8, w0, 3); + break; + case 6: sw[1] = amd_bytealign (w0, sw[1] << 16, 2); + sw[2] = amd_bytealign (sw[2] >> 16, w0, 2); + break; + case 7: sw[1] = amd_bytealign (w0, sw[1] << 8, 1); + sw[2] = amd_bytealign (sw[2] >> 24, w0, 1); + break; + case 8: sw[2] = w0; + break; + case 9: sw[2] = amd_bytealign (w0, sw[2] << 24, 3); + sw[3] = amd_bytealign (sw[3] >> 8, w0, 3); + break; + case 10: sw[2] = amd_bytealign (w0, sw[2] << 16, 2); + sw[3] = amd_bytealign (sw[3] >> 16, w0, 2); + break; + case 11: sw[2] = amd_bytealign (w0, sw[2] << 8, 1); + sw[3] = amd_bytealign (sw[3] >> 24, w0, 1); + break; + case 12: sw[3] = w0; + break; + case 13: sw[3] = amd_bytealign (w0, sw[3] << 24, 3); + sw[4] = amd_bytealign (sw[4] >> 8, w0, 3); + break; + case 14: sw[3] = amd_bytealign (w0, sw[3] << 16, 2); + sw[4] = amd_bytealign (sw[4] >> 16, w0, 2); + break; + case 15: sw[3] = amd_bytealign (w0, sw[3] << 8, 1); + sw[4] = amd_bytealign (sw[4] >> 24, w0, 1); + break; + case 16: sw[4] = w0; + break; + case 17: sw[4] = amd_bytealign (w0, sw[4] << 24, 3); + sw[5] = amd_bytealign (sw[5] >> 8, w0, 3); + break; + case 18: sw[4] = amd_bytealign (w0, sw[4] << 16, 2); + sw[5] = amd_bytealign (sw[5] >> 16, w0, 2); + break; + case 19: sw[4] = amd_bytealign (w0, sw[4] << 8, 1); + sw[5] = amd_bytealign (sw[5] >> 24, w0, 1); + break; + case 20: sw[5] = w0; + break; + case 21: sw[5] = amd_bytealign (w0, sw[5] << 24, 3); + sw[6] = amd_bytealign (sw[6] >> 8, w0, 3); + break; + case 22: sw[5] = amd_bytealign (w0, sw[5] << 16, 2); + sw[6] = amd_bytealign (sw[6] >> 16, w0, 2); + break; + case 23: sw[5] = amd_bytealign (w0, sw[5] << 8, 1); + sw[6] = amd_bytealign (sw[6] >> 24, w0, 1); + break; + case 24: sw[6] = w0; + break; + case 25: sw[6] = amd_bytealign (w0, sw[6] << 24, 3); + sw[7] = amd_bytealign (sw[7] >> 8, w0, 3); + break; + case 26: sw[6] = amd_bytealign (w0, sw[6] << 16, 2); + sw[7] = amd_bytealign (sw[7] >> 16, w0, 2); + break; + case 27: sw[6] = amd_bytealign (w0, sw[6] << 8, 1); + sw[7] = amd_bytealign (sw[7] >> 24, w0, 1); + break; + case 28: sw[7] = w0; + break; + case 29: sw[7] = amd_bytealign (w0, sw[7] << 24, 3); + sw[8] = amd_bytealign (sw[8] >> 8, w0, 3); + break; + case 30: sw[7] = amd_bytealign (w0, sw[7] << 16, 2); + sw[8] = amd_bytealign (sw[8] >> 16, w0, 2); + break; + case 31: sw[7] = amd_bytealign (w0, sw[7] << 8, 1); + sw[8] = amd_bytealign (sw[8] >> 24, w0, 1); + break; + } + #else + switch (salt_len) + { + case 0: sw[0] = w0; + break; + case 1: sw[0] = (sw[0] & 0x000000ff) | (w0 << 8); + sw[1] = (sw[1] & 0xffffff00) | (w0 >> 24); + break; + case 2: sw[0] = (sw[0] & 0x0000ffff) | (w0 << 16); + sw[1] = (sw[1] & 0xffff0000) | (w0 >> 16); + break; + case 3: sw[0] = (sw[0] & 0x00ffffff) | (w0 << 24); + sw[1] = (sw[1] & 0xff000000) | (w0 >> 8); + break; + case 4: sw[1] = w0; + break; + case 5: sw[1] = (sw[1] & 0x000000ff) | (w0 << 8); + sw[2] = (sw[2] & 0xffffff00) | (w0 >> 24); + break; + case 6: sw[1] = (sw[1] & 0x0000ffff) | (w0 << 16); + sw[2] = (sw[2] & 0xffff0000) | (w0 >> 16); + break; + case 7: sw[1] = (sw[1] & 0x00ffffff) | (w0 << 24); + sw[2] = (sw[2] & 0xff000000) | (w0 >> 8); + break; + case 8: sw[2] = w0; + break; + case 9: sw[2] = (sw[2] & 0x000000ff) | (w0 << 8); + sw[3] = (sw[3] & 0xffffff00) | (w0 >> 24); + break; + case 10: sw[2] = (sw[2] & 0x0000ffff) | (w0 << 16); + sw[3] = (sw[3] & 0xffff0000) | (w0 >> 16); + break; + case 11: sw[2] = (sw[2] & 0x00ffffff) | (w0 << 24); + sw[3] = (sw[3] & 0xff000000) | (w0 >> 8); + break; + case 12: sw[3] = w0; + break; + case 13: sw[3] = (sw[3] & 0x000000ff) | (w0 << 8); + sw[4] = (sw[4] & 0xffffff00) | (w0 >> 24); + break; + case 14: sw[3] = (sw[3] & 0x0000ffff) | (w0 << 16); + sw[4] = (sw[4] & 0xffff0000) | (w0 >> 16); + break; + case 15: sw[3] = (sw[3] & 0x00ffffff) | (w0 << 24); + sw[4] = (sw[4] & 0xff000000) | (w0 >> 8); + break; + case 16: sw[4] = w0; + break; + case 17: sw[4] = (sw[4] & 0x000000ff) | (w0 << 8); + sw[5] = (sw[5] & 0xffffff00) | (w0 >> 24); + break; + case 18: sw[4] = (sw[4] & 0x0000ffff) | (w0 << 16); + sw[5] = (sw[5] & 0xffff0000) | (w0 >> 16); + break; + case 19: sw[4] = (sw[4] & 0x00ffffff) | (w0 << 24); + sw[5] = (sw[5] & 0xff000000) | (w0 >> 8); + break; + case 20: sw[5] = w0; + break; + case 21: sw[5] = (sw[5] & 0x000000ff) | (w0 << 8); + sw[6] = (sw[6] & 0xffffff00) | (w0 >> 24); + break; + case 22: sw[5] = (sw[5] & 0x0000ffff) | (w0 << 16); + sw[6] = (sw[6] & 0xffff0000) | (w0 >> 16); + break; + case 23: sw[5] = (sw[5] & 0x00ffffff) | (w0 << 24); + sw[6] = (sw[6] & 0xff000000) | (w0 >> 8); + break; + case 24: sw[6] = w0; + break; + case 25: sw[6] = (sw[6] & 0x000000ff) | (w0 << 8); + sw[7] = (sw[7] & 0xffffff00) | (w0 >> 24); + break; + case 26: sw[6] = (sw[6] & 0x0000ffff) | (w0 << 16); + sw[7] = (sw[7] & 0xffff0000) | (w0 >> 16); + break; + case 27: sw[6] = (sw[6] & 0x00ffffff) | (w0 << 24); + sw[7] = (sw[7] & 0xff000000) | (w0 >> 8); + break; + case 28: sw[7] = w0; + break; + case 29: sw[7] = (sw[7] & 0x000000ff) | (w0 << 8); + sw[8] = (sw[8] & 0xffffff00) | (w0 >> 24); + break; + case 30: sw[7] = (sw[7] & 0x0000ffff) | (w0 << 16); + sw[8] = (sw[8] & 0xffff0000) | (w0 >> 16); + break; + case 31: sw[7] = (sw[7] & 0x00ffffff) | (w0 << 24); + sw[8] = (sw[8] & 0xff000000) | (w0 >> 8); + break; + } + #endif +} + +inline void overwrite_at_be (u32x sw[16], const u32x w0, const u32 salt_len) +{ + // would be nice to have optimization based on amd_bytealign as with _le counterpart + + switch (salt_len) + { + case 0: sw[0] = w0; + break; + case 1: sw[0] = (sw[0] & 0xff000000) | (w0 >> 8); + sw[1] = (sw[1] & 0x00ffffff) | (w0 << 24); + break; + case 2: sw[0] = (sw[0] & 0xffff0000) | (w0 >> 16); + sw[1] = (sw[1] & 0x0000ffff) | (w0 << 16); + break; + case 3: sw[0] = (sw[0] & 0xffffff00) | (w0 >> 24); + sw[1] = (sw[1] & 0x000000ff) | (w0 << 8); + break; + case 4: sw[1] = w0; + break; + case 5: sw[1] = (sw[1] & 0xff000000) | (w0 >> 8); + sw[2] = (sw[2] & 0x00ffffff) | (w0 << 24); + break; + case 6: sw[1] = (sw[1] & 0xffff0000) | (w0 >> 16); + sw[2] = (sw[2] & 0x0000ffff) | (w0 << 16); + break; + case 7: sw[1] = (sw[1] & 0xffffff00) | (w0 >> 24); + sw[2] = (sw[2] & 0x000000ff) | (w0 << 8); + break; + case 8: sw[2] = w0; + break; + case 9: sw[2] = (sw[2] & 0xff000000) | (w0 >> 8); + sw[3] = (sw[3] & 0x00ffffff) | (w0 << 24); + break; + case 10: sw[2] = (sw[2] & 0xffff0000) | (w0 >> 16); + sw[3] = (sw[3] & 0x0000ffff) | (w0 << 16); + break; + case 11: sw[2] = (sw[2] & 0xffffff00) | (w0 >> 24); + sw[3] = (sw[3] & 0x000000ff) | (w0 << 8); + break; + case 12: sw[3] = w0; + break; + case 13: sw[3] = (sw[3] & 0xff000000) | (w0 >> 8); + sw[4] = (sw[4] & 0x00ffffff) | (w0 << 24); + break; + case 14: sw[3] = (sw[3] & 0xffff0000) | (w0 >> 16); + sw[4] = (sw[4] & 0x0000ffff) | (w0 << 16); + break; + case 15: sw[3] = (sw[3] & 0xffffff00) | (w0 >> 24); + sw[4] = (sw[4] & 0x000000ff) | (w0 << 8); + break; + case 16: sw[4] = w0; + break; + case 17: sw[4] = (sw[4] & 0xff000000) | (w0 >> 8); + sw[5] = (sw[5] & 0x00ffffff) | (w0 << 24); + break; + case 18: sw[4] = (sw[4] & 0xffff0000) | (w0 >> 16); + sw[5] = (sw[5] & 0x0000ffff) | (w0 << 16); + break; + case 19: sw[4] = (sw[4] & 0xffffff00) | (w0 >> 24); + sw[5] = (sw[5] & 0x000000ff) | (w0 << 8); + break; + case 20: sw[5] = w0; + break; + case 21: sw[5] = (sw[5] & 0xff000000) | (w0 >> 8); + sw[6] = (sw[6] & 0x00ffffff) | (w0 << 24); + break; + case 22: sw[5] = (sw[5] & 0xffff0000) | (w0 >> 16); + sw[6] = (sw[6] & 0x0000ffff) | (w0 << 16); + break; + case 23: sw[5] = (sw[5] & 0xffffff00) | (w0 >> 24); + sw[6] = (sw[6] & 0x000000ff) | (w0 << 8); + break; + case 24: sw[6] = w0; + break; + case 25: sw[6] = (sw[6] & 0xff000000) | (w0 >> 8); + sw[7] = (sw[7] & 0x00ffffff) | (w0 << 24); + break; + case 26: sw[6] = (sw[6] & 0xffff0000) | (w0 >> 16); + sw[7] = (sw[7] & 0x0000ffff) | (w0 << 16); + break; + case 27: sw[6] = (sw[6] & 0xffffff00) | (w0 >> 24); + sw[7] = (sw[7] & 0x000000ff) | (w0 << 8); + break; + case 28: sw[7] = w0; + break; + case 29: sw[7] = (sw[7] & 0xff000000) | (w0 >> 8); + sw[8] = (sw[8] & 0x00ffffff) | (w0 << 24); + break; + case 30: sw[7] = (sw[7] & 0xffff0000) | (w0 >> 16); + sw[8] = (sw[8] & 0x0000ffff) | (w0 << 16); + break; + case 31: sw[7] = (sw[7] & 0xffffff00) | (w0 >> 24); + sw[8] = (sw[8] & 0x000000ff) | (w0 << 8); + break; + } +} + +inline void overwrite_at_le_4x4 (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const u32x wx, const u32 salt_len) +{ + #if defined cl_amd_media_ops + switch (salt_len) + { + case 0: w0[0] = wx; + break; + case 1: w0[0] = amd_bytealign (wx, w0[0] << 24, 3); + w0[1] = amd_bytealign (w0[1] >> 8, wx, 3); + break; + case 2: w0[0] = amd_bytealign (wx, w0[0] << 16, 2); + w0[1] = amd_bytealign (w0[1] >> 16, wx, 2); + break; + case 3: w0[0] = amd_bytealign (wx, w0[0] << 8, 1); + w0[1] = amd_bytealign (w0[1] >> 24, wx, 1); + break; + case 4: w0[1] = wx; + break; + case 5: w0[1] = amd_bytealign (wx, w0[1] << 24, 3); + w0[2] = amd_bytealign (w0[2] >> 8, wx, 3); + break; + case 6: w0[1] = amd_bytealign (wx, w0[1] << 16, 2); + w0[2] = amd_bytealign (w0[2] >> 16, wx, 2); + break; case 7: w0[1] = amd_bytealign (wx, w0[1] << 8, 1); w0[2] = amd_bytealign (w0[2] >> 24, wx, 1); break; @@ -5611,1362 +6140,3243 @@ inline void overwrite_at_le_4x4 (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], break; case 44: w2[3] = wx; break; - case 45: w2[3] = (w2[3] & 0x000000ff) | (wx << 8); - w3[0] = (w3[0] & 0xffffff00) | (wx >> 24); + case 45: w2[3] = (w2[3] & 0x000000ff) | (wx << 8); + w3[0] = (w3[0] & 0xffffff00) | (wx >> 24); + break; + case 46: w2[3] = (w2[3] & 0x0000ffff) | (wx << 16); + w3[0] = (w3[0] & 0xffff0000) | (wx >> 16); + break; + case 47: w2[3] = (w2[3] & 0x00ffffff) | (wx << 24); + w3[0] = (w3[0] & 0xff000000) | (wx >> 8); + break; + case 48: w3[0] = wx; + break; + case 49: w3[0] = (w3[0] & 0x000000ff) | (wx << 8); + w3[1] = (w3[1] & 0xffffff00) | (wx >> 24); + break; + case 50: w3[0] = (w3[0] & 0x0000ffff) | (wx << 16); + w3[1] = (w3[1] & 0xffff0000) | (wx >> 16); + break; + case 51: w3[0] = (w3[0] & 0x00ffffff) | (wx << 24); + w3[1] = (w3[1] & 0xff000000) | (wx >> 8); + break; + case 52: w3[1] = wx; + break; + case 53: w3[1] = (w3[1] & 0x000000ff) | (wx << 8); + w3[2] = (w3[2] & 0xffffff00) | (wx >> 24); + break; + case 54: w3[1] = (w3[1] & 0x0000ffff) | (wx << 16); + w3[2] = (w3[2] & 0xffff0000) | (wx >> 16); + break; + case 55: w3[1] = (w3[1] & 0x00ffffff) | (wx << 24); + w3[2] = (w3[2] & 0xff000000) | (wx >> 8); + break; + case 56: w3[2] = wx; + break; + case 57: w3[2] = (w3[2] & 0x000000ff) | (wx << 8); + w3[3] = (w3[3] & 0xffffff00) | (wx >> 24); + break; + case 58: w3[2] = (w3[2] & 0x0000ffff) | (wx << 16); + w3[3] = (w3[3] & 0xffff0000) | (wx >> 16); + break; + case 59: w3[2] = (w3[2] & 0x00ffffff) | (wx << 24); + w3[3] = (w3[3] & 0xff000000) | (wx >> 8); + break; + case 60: w3[3] = wx; + break; + case 61: w3[3] = (w3[3] & 0x000000ff) | (wx << 8); + //w4[0] = (w4[0] & 0xffffff00) | (wx >> 24); + break; + case 62: w3[3] = (w3[3] & 0x0000ffff) | (wx << 16); + //w4[0] = (w4[0] & 0xffff0000) | (wx >> 16); + break; + case 63: w3[3] = (w3[3] & 0x00ffffff) | (wx << 24); + //w4[0] = (w4[0] & 0xff000000) | (wx >> 8); + break; + } + #endif +} + +inline void overwrite_at_be_4x4 (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const u32x wx, const u32 salt_len) +{ + // would be nice to have optimization based on amd_bytealign as with _le counterpart + + switch (salt_len) + { + case 0: w0[0] = wx; + break; + case 1: w0[0] = (w0[0] & 0xff000000) | (wx >> 8); + w0[1] = (w0[1] & 0x00ffffff) | (wx << 24); + break; + case 2: w0[0] = (w0[0] & 0xffff0000) | (wx >> 16); + w0[1] = (w0[1] & 0x0000ffff) | (wx << 16); + break; + case 3: w0[0] = (w0[0] & 0xffffff00) | (wx >> 24); + w0[1] = (w0[1] & 0x000000ff) | (wx << 8); + break; + case 4: w0[1] = wx; + break; + case 5: w0[1] = (w0[1] & 0xff000000) | (wx >> 8); + w0[2] = (w0[2] & 0x00ffffff) | (wx << 24); + break; + case 6: w0[1] = (w0[1] & 0xffff0000) | (wx >> 16); + w0[2] = (w0[2] & 0x0000ffff) | (wx << 16); + break; + case 7: w0[1] = (w0[1] & 0xffffff00) | (wx >> 24); + w0[2] = (w0[2] & 0x000000ff) | (wx << 8); + break; + case 8: w0[2] = wx; + break; + case 9: w0[2] = (w0[2] & 0xff000000) | (wx >> 8); + w0[3] = (w0[3] & 0x00ffffff) | (wx << 24); + break; + case 10: w0[2] = (w0[2] & 0xffff0000) | (wx >> 16); + w0[3] = (w0[3] & 0x0000ffff) | (wx << 16); + break; + case 11: w0[2] = (w0[2] & 0xffffff00) | (wx >> 24); + w0[3] = (w0[3] & 0x000000ff) | (wx << 8); + break; + case 12: w0[3] = wx; + break; + case 13: w0[3] = (w0[3] & 0xff000000) | (wx >> 8); + w1[0] = (w1[0] & 0x00ffffff) | (wx << 24); + break; + case 14: w0[3] = (w0[3] & 0xffff0000) | (wx >> 16); + w1[0] = (w1[0] & 0x0000ffff) | (wx << 16); + break; + case 15: w0[3] = (w0[3] & 0xffffff00) | (wx >> 24); + w1[0] = (w1[0] & 0x000000ff) | (wx << 8); + break; + case 16: w1[0] = wx; + break; + case 17: w1[0] = (w1[0] & 0xff000000) | (wx >> 8); + w1[1] = (w1[1] & 0x00ffffff) | (wx << 24); + break; + case 18: w1[0] = (w1[0] & 0xffff0000) | (wx >> 16); + w1[1] = (w1[1] & 0x0000ffff) | (wx << 16); + break; + case 19: w1[0] = (w1[0] & 0xffffff00) | (wx >> 24); + w1[1] = (w1[1] & 0x000000ff) | (wx << 8); + break; + case 20: w1[1] = wx; + break; + case 21: w1[1] = (w1[1] & 0xff000000) | (wx >> 8); + w1[2] = (w1[2] & 0x00ffffff) | (wx << 24); + break; + case 22: w1[1] = (w1[1] & 0xffff0000) | (wx >> 16); + w1[2] = (w1[2] & 0x0000ffff) | (wx << 16); + break; + case 23: w1[1] = (w1[1] & 0xffffff00) | (wx >> 24); + w1[2] = (w1[2] & 0x000000ff) | (wx << 8); + break; + case 24: w1[2] = wx; + break; + case 25: w1[2] = (w1[2] & 0xff000000) | (wx >> 8); + w1[3] = (w1[3] & 0x00ffffff) | (wx << 24); + break; + case 26: w1[2] = (w1[2] & 0xffff0000) | (wx >> 16); + w1[3] = (w1[3] & 0x0000ffff) | (wx << 16); + break; + case 27: w1[2] = (w1[2] & 0xffffff00) | (wx >> 24); + w1[3] = (w1[3] & 0x000000ff) | (wx << 8); + break; + case 28: w1[3] = wx; + break; + case 29: w1[3] = (w1[3] & 0xff000000) | (wx >> 8); + w2[0] = (w2[0] & 0x00ffffff) | (wx << 24); + break; + case 30: w1[3] = (w1[3] & 0xffff0000) | (wx >> 16); + w2[0] = (w2[0] & 0x0000ffff) | (wx << 16); + break; + case 31: w1[3] = (w1[3] & 0xffffff00) | (wx >> 24); + w2[0] = (w2[0] & 0x000000ff) | (wx << 8); + break; + case 32: w2[0] = wx; + break; + case 33: w2[0] = (w2[0] & 0xff000000) | (wx >> 8); + w2[1] = (w2[1] & 0x00ffffff) | (wx << 24); + break; + case 34: w2[0] = (w2[0] & 0xffff0000) | (wx >> 16); + w2[1] = (w2[1] & 0x0000ffff) | (wx << 16); + break; + case 35: w2[0] = (w2[0] & 0xffffff00) | (wx >> 24); + w2[1] = (w2[1] & 0x000000ff) | (wx << 8); + break; + case 36: w2[1] = wx; + break; + case 37: w2[1] = (w2[1] & 0xff000000) | (wx >> 8); + w2[2] = (w2[2] & 0x00ffffff) | (wx << 24); + break; + case 38: w2[1] = (w2[1] & 0xffff0000) | (wx >> 16); + w2[2] = (w2[2] & 0x0000ffff) | (wx << 16); + break; + case 39: w2[1] = (w2[1] & 0xffffff00) | (wx >> 24); + w2[2] = (w2[2] & 0x000000ff) | (wx << 8); + break; + case 40: w2[2] = wx; + break; + case 41: w2[2] = (w2[2] & 0xff000000) | (wx >> 8); + w2[3] = (w2[3] & 0x00ffffff) | (wx << 24); + break; + case 42: w2[2] = (w2[2] & 0xffff0000) | (wx >> 16); + w2[3] = (w2[3] & 0x0000ffff) | (wx << 16); + break; + case 43: w2[2] = (w2[2] & 0xffffff00) | (wx >> 24); + w2[3] = (w2[3] & 0x000000ff) | (wx << 8); + break; + case 44: w2[3] = wx; + break; + case 45: w2[3] = (w2[3] & 0xff000000) | (wx >> 8); + w3[0] = (w3[0] & 0x00ffffff) | (wx << 24); break; - case 46: w2[3] = (w2[3] & 0x0000ffff) | (wx << 16); - w3[0] = (w3[0] & 0xffff0000) | (wx >> 16); + case 46: w2[3] = (w2[3] & 0xffff0000) | (wx >> 16); + w3[0] = (w3[0] & 0x0000ffff) | (wx << 16); break; - case 47: w2[3] = (w2[3] & 0x00ffffff) | (wx << 24); - w3[0] = (w3[0] & 0xff000000) | (wx >> 8); + case 47: w2[3] = (w2[3] & 0xffffff00) | (wx >> 24); + w3[0] = (w3[0] & 0x000000ff) | (wx << 8); break; case 48: w3[0] = wx; break; - case 49: w3[0] = (w3[0] & 0x000000ff) | (wx << 8); - w3[1] = (w3[1] & 0xffffff00) | (wx >> 24); + case 49: w3[0] = (w3[0] & 0xff000000) | (wx >> 8); + w3[1] = (w3[1] & 0x00ffffff) | (wx << 24); break; - case 50: w3[0] = (w3[0] & 0x0000ffff) | (wx << 16); - w3[1] = (w3[1] & 0xffff0000) | (wx >> 16); + case 50: w3[0] = (w3[0] & 0xffff0000) | (wx >> 16); + w3[1] = (w3[1] & 0x0000ffff) | (wx << 16); break; - case 51: w3[0] = (w3[0] & 0x00ffffff) | (wx << 24); - w3[1] = (w3[1] & 0xff000000) | (wx >> 8); + case 51: w3[0] = (w3[0] & 0xffffff00) | (wx >> 24); + w3[1] = (w3[1] & 0x000000ff) | (wx << 8); break; case 52: w3[1] = wx; break; - case 53: w3[1] = (w3[1] & 0x000000ff) | (wx << 8); - w3[2] = (w3[2] & 0xffffff00) | (wx >> 24); + case 53: w3[1] = (w3[1] & 0xff000000) | (wx >> 8); + w3[2] = (w3[2] & 0x00ffffff) | (wx << 24); break; - case 54: w3[1] = (w3[1] & 0x0000ffff) | (wx << 16); - w3[2] = (w3[2] & 0xffff0000) | (wx >> 16); + case 54: w3[1] = (w3[1] & 0xffff0000) | (wx >> 16); + w3[2] = (w3[2] & 0x0000ffff) | (wx << 16); break; - case 55: w3[1] = (w3[1] & 0x00ffffff) | (wx << 24); - w3[2] = (w3[2] & 0xff000000) | (wx >> 8); + case 55: w3[1] = (w3[1] & 0xffffff00) | (wx >> 24); + w3[2] = (w3[2] & 0x000000ff) | (wx << 8); break; case 56: w3[2] = wx; break; - case 57: w3[2] = (w3[2] & 0x000000ff) | (wx << 8); - w3[3] = (w3[3] & 0xffffff00) | (wx >> 24); + case 57: w3[2] = (w3[2] & 0xff000000) | (wx >> 8); + w3[3] = (w3[3] & 0x00ffffff) | (wx << 24); break; - case 58: w3[2] = (w3[2] & 0x0000ffff) | (wx << 16); - w3[3] = (w3[3] & 0xffff0000) | (wx >> 16); + case 58: w3[2] = (w3[2] & 0xffff0000) | (wx >> 16); + w3[3] = (w3[3] & 0x0000ffff) | (wx << 16); break; - case 59: w3[2] = (w3[2] & 0x00ffffff) | (wx << 24); - w3[3] = (w3[3] & 0xff000000) | (wx >> 8); + case 59: w3[2] = (w3[2] & 0xffffff00) | (wx >> 24); + w3[3] = (w3[3] & 0x000000ff) | (wx << 8); break; case 60: w3[3] = wx; break; - case 61: w3[3] = (w3[3] & 0x000000ff) | (wx << 8); - //w4[0] = (w4[0] & 0xffffff00) | (wx >> 24); + case 61: w3[3] = (w3[3] & 0xff000000) | (wx >> 8); + //w4[0] = (w4[0] & 0x00ffffff) | (wx << 24); break; - case 62: w3[3] = (w3[3] & 0x0000ffff) | (wx << 16); - //w4[0] = (w4[0] & 0xffff0000) | (wx >> 16); + case 62: w3[3] = (w3[3] & 0xffff0000) | (wx >> 16); + //w4[0] = (w4[0] & 0x0000ffff) | (wx << 16); break; - case 63: w3[3] = (w3[3] & 0x00ffffff) | (wx << 24); - //w4[0] = (w4[0] & 0xff000000) | (wx >> 8); + case 63: w3[3] = (w3[3] & 0xffffff00) | (wx >> 24); + //w4[0] = (w4[0] & 0x000000ff) | (wx << 8); break; } - #endif } -inline void overwrite_at_be_4x4 (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const u32x wx, const u32 salt_len) +/** + * vector functions as scalar (for outer loop usage) + */ + +inline void append_0x01_1x4_S (u32 w0[4], const u32 offset) +{ + const u32 tmp = 0x01 << ((offset & 3) * 8); + + w0[0] |= (offset < 4) ? tmp : 0; + w0[1] |= ((offset >= 4) && (offset < 8)) ? tmp : 0; + w0[2] |= ((offset >= 8) && (offset < 12)) ? tmp : 0; + w0[3] |= (offset >= 12) ? tmp : 0; +} + +inline void append_0x01_2x4_S (u32 w0[4], u32 w1[4], const u32 offset) +{ + const u32 tmp = 0x01 << ((offset & 3) * 8); + + w0[0] |= (offset < 4) ? tmp : 0; + w0[1] |= ((offset >= 4) && (offset < 8)) ? tmp : 0; + w0[2] |= ((offset >= 8) && (offset < 12)) ? tmp : 0; + w0[3] |= ((offset >= 12) && (offset < 16)) ? tmp : 0; + w1[0] |= ((offset >= 16) && (offset < 20)) ? tmp : 0; + w1[1] |= ((offset >= 20) && (offset < 24)) ? tmp : 0; + w1[2] |= ((offset >= 24) && (offset < 28)) ? tmp : 0; + w1[3] |= (offset >= 28) ? tmp : 0; +} + +inline void append_0x01_3x4_S (u32 w0[4], u32 w1[4], u32 w2[4], const u32 offset) +{ + const u32 tmp = 0x01 << ((offset & 3) * 8); + + w0[0] |= (offset < 4) ? tmp : 0; + w0[1] |= ((offset >= 4) && (offset < 8)) ? tmp : 0; + w0[2] |= ((offset >= 8) && (offset < 12)) ? tmp : 0; + w0[3] |= ((offset >= 12) && (offset < 16)) ? tmp : 0; + w1[0] |= ((offset >= 16) && (offset < 20)) ? tmp : 0; + w1[1] |= ((offset >= 20) && (offset < 24)) ? tmp : 0; + w1[2] |= ((offset >= 24) && (offset < 28)) ? tmp : 0; + w1[3] |= ((offset >= 28) && (offset < 32)) ? tmp : 0; + w2[0] |= ((offset >= 32) && (offset < 36)) ? tmp : 0; + w2[1] |= ((offset >= 36) && (offset < 40)) ? tmp : 0; + w2[2] |= ((offset >= 40) && (offset < 44)) ? tmp : 0; + w2[3] |= (offset >= 44) ? tmp : 0; +} + +inline void append_0x01_4x4_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 offset) +{ + const u32 tmp = 0x01 << ((offset & 3) * 8); + + w0[0] |= (offset < 4) ? tmp : 0; + w0[1] |= ((offset >= 4) && (offset < 8)) ? tmp : 0; + w0[2] |= ((offset >= 8) && (offset < 12)) ? tmp : 0; + w0[3] |= ((offset >= 12) && (offset < 16)) ? tmp : 0; + w1[0] |= ((offset >= 16) && (offset < 20)) ? tmp : 0; + w1[1] |= ((offset >= 20) && (offset < 24)) ? tmp : 0; + w1[2] |= ((offset >= 24) && (offset < 28)) ? tmp : 0; + w1[3] |= ((offset >= 28) && (offset < 32)) ? tmp : 0; + w2[0] |= ((offset >= 32) && (offset < 36)) ? tmp : 0; + w2[1] |= ((offset >= 36) && (offset < 40)) ? tmp : 0; + w2[2] |= ((offset >= 40) && (offset < 44)) ? tmp : 0; + w2[3] |= ((offset >= 44) && (offset < 48)) ? tmp : 0; + w3[0] |= ((offset >= 48) && (offset < 52)) ? tmp : 0; + w3[1] |= ((offset >= 52) && (offset < 56)) ? tmp : 0; + w3[2] |= ((offset >= 56) && (offset < 60)) ? tmp : 0; + w3[3] |= (offset >= 60) ? tmp : 0; +} + +inline void append_0x02_1x4_S (u32 w0[4], const u32 offset) +{ + const u32 tmp = 0x02 << ((offset & 3) * 8); + + w0[0] |= (offset < 4) ? tmp : 0; + w0[1] |= ((offset >= 4) && (offset < 8)) ? tmp : 0; + w0[2] |= ((offset >= 8) && (offset < 12)) ? tmp : 0; + w0[3] |= (offset >= 12) ? tmp : 0; +} + +inline void append_0x02_2x4_S (u32 w0[4], u32 w1[4], const u32 offset) +{ + const u32 tmp = 0x02 << ((offset & 3) * 8); + + w0[0] |= (offset < 4) ? tmp : 0; + w0[1] |= ((offset >= 4) && (offset < 8)) ? tmp : 0; + w0[2] |= ((offset >= 8) && (offset < 12)) ? tmp : 0; + w0[3] |= ((offset >= 12) && (offset < 16)) ? tmp : 0; + w1[0] |= ((offset >= 16) && (offset < 20)) ? tmp : 0; + w1[1] |= ((offset >= 20) && (offset < 24)) ? tmp : 0; + w1[2] |= ((offset >= 24) && (offset < 28)) ? tmp : 0; + w1[3] |= (offset >= 28) ? tmp : 0; +} + +inline void append_0x02_3x4_S (u32 w0[4], u32 w1[4], u32 w2[4], const u32 offset) +{ + const u32 tmp = 0x02 << ((offset & 3) * 8); + + w0[0] |= (offset < 4) ? tmp : 0; + w0[1] |= ((offset >= 4) && (offset < 8)) ? tmp : 0; + w0[2] |= ((offset >= 8) && (offset < 12)) ? tmp : 0; + w0[3] |= ((offset >= 12) && (offset < 16)) ? tmp : 0; + w1[0] |= ((offset >= 16) && (offset < 20)) ? tmp : 0; + w1[1] |= ((offset >= 20) && (offset < 24)) ? tmp : 0; + w1[2] |= ((offset >= 24) && (offset < 28)) ? tmp : 0; + w1[3] |= ((offset >= 28) && (offset < 32)) ? tmp : 0; + w2[0] |= ((offset >= 32) && (offset < 36)) ? tmp : 0; + w2[1] |= ((offset >= 36) && (offset < 40)) ? tmp : 0; + w2[2] |= ((offset >= 40) && (offset < 44)) ? tmp : 0; + w2[3] |= (offset >= 44) ? tmp : 0; +} + +inline void append_0x02_4x4_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 offset) +{ + const u32 tmp = 0x02 << ((offset & 3) * 8); + + w0[0] |= (offset < 4) ? tmp : 0; + w0[1] |= ((offset >= 4) && (offset < 8)) ? tmp : 0; + w0[2] |= ((offset >= 8) && (offset < 12)) ? tmp : 0; + w0[3] |= ((offset >= 12) && (offset < 16)) ? tmp : 0; + w1[0] |= ((offset >= 16) && (offset < 20)) ? tmp : 0; + w1[1] |= ((offset >= 20) && (offset < 24)) ? tmp : 0; + w1[2] |= ((offset >= 24) && (offset < 28)) ? tmp : 0; + w1[3] |= ((offset >= 28) && (offset < 32)) ? tmp : 0; + w2[0] |= ((offset >= 32) && (offset < 36)) ? tmp : 0; + w2[1] |= ((offset >= 36) && (offset < 40)) ? tmp : 0; + w2[2] |= ((offset >= 40) && (offset < 44)) ? tmp : 0; + w2[3] |= ((offset >= 44) && (offset < 48)) ? tmp : 0; + w3[0] |= ((offset >= 48) && (offset < 52)) ? tmp : 0; + w3[1] |= ((offset >= 52) && (offset < 56)) ? tmp : 0; + w3[2] |= ((offset >= 56) && (offset < 60)) ? tmp : 0; + w3[3] |= (offset >= 60) ? tmp : 0; +} + +inline void append_0x80_1x4_S (u32 w0[4], const u32 offset) +{ + const u32 tmp = 0x80 << ((offset & 3) * 8); + + w0[0] |= (offset < 4) ? tmp : 0; + w0[1] |= ((offset >= 4) && (offset < 8)) ? tmp : 0; + w0[2] |= ((offset >= 8) && (offset < 12)) ? tmp : 0; + w0[3] |= (offset >= 12) ? tmp : 0; +} + +inline void append_0x80_2x4_S (u32 w0[4], u32 w1[4], const u32 offset) +{ + const u32 tmp = 0x80 << ((offset & 3) * 8); + + w0[0] |= (offset < 4) ? tmp : 0; + w0[1] |= ((offset >= 4) && (offset < 8)) ? tmp : 0; + w0[2] |= ((offset >= 8) && (offset < 12)) ? tmp : 0; + w0[3] |= ((offset >= 12) && (offset < 16)) ? tmp : 0; + w1[0] |= ((offset >= 16) && (offset < 20)) ? tmp : 0; + w1[1] |= ((offset >= 20) && (offset < 24)) ? tmp : 0; + w1[2] |= ((offset >= 24) && (offset < 28)) ? tmp : 0; + w1[3] |= (offset >= 28) ? tmp : 0; +} + +inline void append_0x80_3x4_S (u32 w0[4], u32 w1[4], u32 w2[4], const u32 offset) +{ + const u32 tmp = 0x80 << ((offset & 3) * 8); + + w0[0] |= (offset < 4) ? tmp : 0; + w0[1] |= ((offset >= 4) && (offset < 8)) ? tmp : 0; + w0[2] |= ((offset >= 8) && (offset < 12)) ? tmp : 0; + w0[3] |= ((offset >= 12) && (offset < 16)) ? tmp : 0; + w1[0] |= ((offset >= 16) && (offset < 20)) ? tmp : 0; + w1[1] |= ((offset >= 20) && (offset < 24)) ? tmp : 0; + w1[2] |= ((offset >= 24) && (offset < 28)) ? tmp : 0; + w1[3] |= ((offset >= 28) && (offset < 32)) ? tmp : 0; + w2[0] |= ((offset >= 32) && (offset < 36)) ? tmp : 0; + w2[1] |= ((offset >= 36) && (offset < 40)) ? tmp : 0; + w2[2] |= ((offset >= 40) && (offset < 44)) ? tmp : 0; + w2[3] |= (offset >= 44) ? tmp : 0; +} + +inline void append_0x80_4x4_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 offset) +{ + const u32 tmp = 0x80 << ((offset & 3) * 8); + + w0[0] |= (offset < 4) ? tmp : 0; + w0[1] |= ((offset >= 4) && (offset < 8)) ? tmp : 0; + w0[2] |= ((offset >= 8) && (offset < 12)) ? tmp : 0; + w0[3] |= ((offset >= 12) && (offset < 16)) ? tmp : 0; + w1[0] |= ((offset >= 16) && (offset < 20)) ? tmp : 0; + w1[1] |= ((offset >= 20) && (offset < 24)) ? tmp : 0; + w1[2] |= ((offset >= 24) && (offset < 28)) ? tmp : 0; + w1[3] |= ((offset >= 28) && (offset < 32)) ? tmp : 0; + w2[0] |= ((offset >= 32) && (offset < 36)) ? tmp : 0; + w2[1] |= ((offset >= 36) && (offset < 40)) ? tmp : 0; + w2[2] |= ((offset >= 40) && (offset < 44)) ? tmp : 0; + w2[3] |= ((offset >= 44) && (offset < 48)) ? tmp : 0; + w3[0] |= ((offset >= 48) && (offset < 52)) ? tmp : 0; + w3[1] |= ((offset >= 52) && (offset < 56)) ? tmp : 0; + w3[2] |= ((offset >= 56) && (offset < 60)) ? tmp : 0; + w3[3] |= (offset >= 60) ? tmp : 0; +} + +inline void append_0x80_8x4_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 w4[4], u32 w5[4], u32 w6[4], u32 w7[4], const u32 offset) +{ + switch (offset) + { + case 0: + w0[0] = 0x80; + break; + + case 1: + w0[0] = w0[0] | 0x8000; + break; + + case 2: + w0[0] = w0[0] | 0x800000; + break; + + case 3: + w0[0] = w0[0] | 0x80000000; + break; + + case 4: + w0[1] = 0x80; + break; + + case 5: + w0[1] = w0[1] | 0x8000; + break; + + case 6: + w0[1] = w0[1] | 0x800000; + break; + + case 7: + w0[1] = w0[1] | 0x80000000; + break; + + case 8: + w0[2] = 0x80; + break; + + case 9: + w0[2] = w0[2] | 0x8000; + break; + + case 10: + w0[2] = w0[2] | 0x800000; + break; + + case 11: + w0[2] = w0[2] | 0x80000000; + break; + + case 12: + w0[3] = 0x80; + break; + + case 13: + w0[3] = w0[3] | 0x8000; + break; + + case 14: + w0[3] = w0[3] | 0x800000; + break; + + case 15: + w0[3] = w0[3] | 0x80000000; + break; + + case 16: + w1[0] = 0x80; + break; + + case 17: + w1[0] = w1[0] | 0x8000; + break; + + case 18: + w1[0] = w1[0] | 0x800000; + break; + + case 19: + w1[0] = w1[0] | 0x80000000; + break; + + case 20: + w1[1] = 0x80; + break; + + case 21: + w1[1] = w1[1] | 0x8000; + break; + + case 22: + w1[1] = w1[1] | 0x800000; + break; + + case 23: + w1[1] = w1[1] | 0x80000000; + break; + + case 24: + w1[2] = 0x80; + break; + + case 25: + w1[2] = w1[2] | 0x8000; + break; + + case 26: + w1[2] = w1[2] | 0x800000; + break; + + case 27: + w1[2] = w1[2] | 0x80000000; + break; + + case 28: + w1[3] = 0x80; + break; + + case 29: + w1[3] = w1[3] | 0x8000; + break; + + case 30: + w1[3] = w1[3] | 0x800000; + break; + + case 31: + w1[3] = w1[3] | 0x80000000; + break; + + case 32: + w2[0] = 0x80; + break; + + case 33: + w2[0] = w2[0] | 0x8000; + break; + + case 34: + w2[0] = w2[0] | 0x800000; + break; + + case 35: + w2[0] = w2[0] | 0x80000000; + break; + + case 36: + w2[1] = 0x80; + break; + + case 37: + w2[1] = w2[1] | 0x8000; + break; + + case 38: + w2[1] = w2[1] | 0x800000; + break; + + case 39: + w2[1] = w2[1] | 0x80000000; + break; + + case 40: + w2[2] = 0x80; + break; + + case 41: + w2[2] = w2[2] | 0x8000; + break; + + case 42: + w2[2] = w2[2] | 0x800000; + break; + + case 43: + w2[2] = w2[2] | 0x80000000; + break; + + case 44: + w2[3] = 0x80; + break; + + case 45: + w2[3] = w2[3] | 0x8000; + break; + + case 46: + w2[3] = w2[3] | 0x800000; + break; + + case 47: + w2[3] = w2[3] | 0x80000000; + break; + + case 48: + w3[0] = 0x80; + break; + + case 49: + w3[0] = w3[0] | 0x8000; + break; + + case 50: + w3[0] = w3[0] | 0x800000; + break; + + case 51: + w3[0] = w3[0] | 0x80000000; + break; + + case 52: + w3[1] = 0x80; + break; + + case 53: + w3[1] = w3[1] | 0x8000; + break; + + case 54: + w3[1] = w3[1] | 0x800000; + break; + + case 55: + w3[1] = w3[1] | 0x80000000; + break; + + case 56: + w3[2] = 0x80; + break; + + case 57: + w3[2] = w3[2] | 0x8000; + break; + + case 58: + w3[2] = w3[2] | 0x800000; + break; + + case 59: + w3[2] = w3[2] | 0x80000000; + break; + + case 60: + w3[3] = 0x80; + break; + + case 61: + w3[3] = w3[3] | 0x8000; + break; + + case 62: + w3[3] = w3[3] | 0x800000; + break; + + case 63: + w3[3] = w3[3] | 0x80000000; + break; + + case 64: + w4[0] = 0x80; + break; + + case 65: + w4[0] = w4[0] | 0x8000; + break; + + case 66: + w4[0] = w4[0] | 0x800000; + break; + + case 67: + w4[0] = w4[0] | 0x80000000; + break; + + case 68: + w4[1] = 0x80; + break; + + case 69: + w4[1] = w4[1] | 0x8000; + break; + + case 70: + w4[1] = w4[1] | 0x800000; + break; + + case 71: + w4[1] = w4[1] | 0x80000000; + break; + + case 72: + w4[2] = 0x80; + break; + + case 73: + w4[2] = w4[2] | 0x8000; + break; + + case 74: + w4[2] = w4[2] | 0x800000; + break; + + case 75: + w4[2] = w4[2] | 0x80000000; + break; + + case 76: + w4[3] = 0x80; + break; + + case 77: + w4[3] = w4[3] | 0x8000; + break; + + case 78: + w4[3] = w4[3] | 0x800000; + break; + + case 79: + w4[3] = w4[3] | 0x80000000; + break; + + case 80: + w5[0] = 0x80; + break; + + case 81: + w5[0] = w5[0] | 0x8000; + break; + + case 82: + w5[0] = w5[0] | 0x800000; + break; + + case 83: + w5[0] = w5[0] | 0x80000000; + break; + + case 84: + w5[1] = 0x80; + break; + + case 85: + w5[1] = w5[1] | 0x8000; + break; + + case 86: + w5[1] = w5[1] | 0x800000; + break; + + case 87: + w5[1] = w5[1] | 0x80000000; + break; + + case 88: + w5[2] = 0x80; + break; + + case 89: + w5[2] = w5[2] | 0x8000; + break; + + case 90: + w5[2] = w5[2] | 0x800000; + break; + + case 91: + w5[2] = w5[2] | 0x80000000; + break; + + case 92: + w5[3] = 0x80; + break; + + case 93: + w5[3] = w5[3] | 0x8000; + break; + + case 94: + w5[3] = w5[3] | 0x800000; + break; + + case 95: + w5[3] = w5[3] | 0x80000000; + break; + + case 96: + w6[0] = 0x80; + break; + + case 97: + w6[0] = w6[0] | 0x8000; + break; + + case 98: + w6[0] = w6[0] | 0x800000; + break; + + case 99: + w6[0] = w6[0] | 0x80000000; + break; + + case 100: + w6[1] = 0x80; + break; + + case 101: + w6[1] = w6[1] | 0x8000; + break; + + case 102: + w6[1] = w6[1] | 0x800000; + break; + + case 103: + w6[1] = w6[1] | 0x80000000; + break; + + case 104: + w6[2] = 0x80; + break; + + case 105: + w6[2] = w6[2] | 0x8000; + break; + + case 106: + w6[2] = w6[2] | 0x800000; + break; + + case 107: + w6[2] = w6[2] | 0x80000000; + break; + + case 108: + w6[3] = 0x80; + break; + + case 109: + w6[3] = w6[3] | 0x8000; + break; + + case 110: + w6[3] = w6[3] | 0x800000; + break; + + case 111: + w6[3] = w6[3] | 0x80000000; + break; + + case 112: + w7[0] = 0x80; + break; + + case 113: + w7[0] = w7[0] | 0x8000; + break; + + case 114: + w7[0] = w7[0] | 0x800000; + break; + + case 115: + w7[0] = w7[0] | 0x80000000; + break; + + case 116: + w7[1] = 0x80; + break; + + case 117: + w7[1] = w7[1] | 0x8000; + break; + + case 118: + w7[1] = w7[1] | 0x800000; + break; + + case 119: + w7[1] = w7[1] | 0x80000000; + break; + + case 120: + w7[2] = 0x80; + break; + + case 121: + w7[2] = w7[2] | 0x8000; + break; + + case 122: + w7[2] = w7[2] | 0x800000; + break; + + case 123: + w7[2] = w7[2] | 0x80000000; + break; + + case 124: + w7[3] = 0x80; + break; + + case 125: + w7[3] = w7[3] | 0x8000; + break; + + case 126: + w7[3] = w7[3] | 0x800000; + break; + + case 127: + w7[3] = w7[3] | 0x80000000; + break; + } +} + +inline void truncate_block_S (u32 w[4], const u32 len) { - // would be nice to have optimization based on amd_bytealign as with _le counterpart - - switch (salt_len) + switch (len) { - case 0: w0[0] = wx; - break; - case 1: w0[0] = (w0[0] & 0xff000000) | (wx >> 8); - w0[1] = (w0[1] & 0x00ffffff) | (wx << 24); - break; - case 2: w0[0] = (w0[0] & 0xffff0000) | (wx >> 16); - w0[1] = (w0[1] & 0x0000ffff) | (wx << 16); - break; - case 3: w0[0] = (w0[0] & 0xffffff00) | (wx >> 24); - w0[1] = (w0[1] & 0x000000ff) | (wx << 8); - break; - case 4: w0[1] = wx; - break; - case 5: w0[1] = (w0[1] & 0xff000000) | (wx >> 8); - w0[2] = (w0[2] & 0x00ffffff) | (wx << 24); - break; - case 6: w0[1] = (w0[1] & 0xffff0000) | (wx >> 16); - w0[2] = (w0[2] & 0x0000ffff) | (wx << 16); - break; - case 7: w0[1] = (w0[1] & 0xffffff00) | (wx >> 24); - w0[2] = (w0[2] & 0x000000ff) | (wx << 8); - break; - case 8: w0[2] = wx; - break; - case 9: w0[2] = (w0[2] & 0xff000000) | (wx >> 8); - w0[3] = (w0[3] & 0x00ffffff) | (wx << 24); - break; - case 10: w0[2] = (w0[2] & 0xffff0000) | (wx >> 16); - w0[3] = (w0[3] & 0x0000ffff) | (wx << 16); - break; - case 11: w0[2] = (w0[2] & 0xffffff00) | (wx >> 24); - w0[3] = (w0[3] & 0x000000ff) | (wx << 8); - break; - case 12: w0[3] = wx; - break; - case 13: w0[3] = (w0[3] & 0xff000000) | (wx >> 8); - w1[0] = (w1[0] & 0x00ffffff) | (wx << 24); - break; - case 14: w0[3] = (w0[3] & 0xffff0000) | (wx >> 16); - w1[0] = (w1[0] & 0x0000ffff) | (wx << 16); - break; - case 15: w0[3] = (w0[3] & 0xffffff00) | (wx >> 24); - w1[0] = (w1[0] & 0x000000ff) | (wx << 8); - break; - case 16: w1[0] = wx; - break; - case 17: w1[0] = (w1[0] & 0xff000000) | (wx >> 8); - w1[1] = (w1[1] & 0x00ffffff) | (wx << 24); - break; - case 18: w1[0] = (w1[0] & 0xffff0000) | (wx >> 16); - w1[1] = (w1[1] & 0x0000ffff) | (wx << 16); - break; - case 19: w1[0] = (w1[0] & 0xffffff00) | (wx >> 24); - w1[1] = (w1[1] & 0x000000ff) | (wx << 8); - break; - case 20: w1[1] = wx; - break; - case 21: w1[1] = (w1[1] & 0xff000000) | (wx >> 8); - w1[2] = (w1[2] & 0x00ffffff) | (wx << 24); - break; - case 22: w1[1] = (w1[1] & 0xffff0000) | (wx >> 16); - w1[2] = (w1[2] & 0x0000ffff) | (wx << 16); - break; - case 23: w1[1] = (w1[1] & 0xffffff00) | (wx >> 24); - w1[2] = (w1[2] & 0x000000ff) | (wx << 8); - break; - case 24: w1[2] = wx; - break; - case 25: w1[2] = (w1[2] & 0xff000000) | (wx >> 8); - w1[3] = (w1[3] & 0x00ffffff) | (wx << 24); - break; - case 26: w1[2] = (w1[2] & 0xffff0000) | (wx >> 16); - w1[3] = (w1[3] & 0x0000ffff) | (wx << 16); - break; - case 27: w1[2] = (w1[2] & 0xffffff00) | (wx >> 24); - w1[3] = (w1[3] & 0x000000ff) | (wx << 8); - break; - case 28: w1[3] = wx; - break; - case 29: w1[3] = (w1[3] & 0xff000000) | (wx >> 8); - w2[0] = (w2[0] & 0x00ffffff) | (wx << 24); - break; - case 30: w1[3] = (w1[3] & 0xffff0000) | (wx >> 16); - w2[0] = (w2[0] & 0x0000ffff) | (wx << 16); - break; - case 31: w1[3] = (w1[3] & 0xffffff00) | (wx >> 24); - w2[0] = (w2[0] & 0x000000ff) | (wx << 8); - break; - case 32: w2[0] = wx; - break; - case 33: w2[0] = (w2[0] & 0xff000000) | (wx >> 8); - w2[1] = (w2[1] & 0x00ffffff) | (wx << 24); - break; - case 34: w2[0] = (w2[0] & 0xffff0000) | (wx >> 16); - w2[1] = (w2[1] & 0x0000ffff) | (wx << 16); - break; - case 35: w2[0] = (w2[0] & 0xffffff00) | (wx >> 24); - w2[1] = (w2[1] & 0x000000ff) | (wx << 8); - break; - case 36: w2[1] = wx; - break; - case 37: w2[1] = (w2[1] & 0xff000000) | (wx >> 8); - w2[2] = (w2[2] & 0x00ffffff) | (wx << 24); - break; - case 38: w2[1] = (w2[1] & 0xffff0000) | (wx >> 16); - w2[2] = (w2[2] & 0x0000ffff) | (wx << 16); - break; - case 39: w2[1] = (w2[1] & 0xffffff00) | (wx >> 24); - w2[2] = (w2[2] & 0x000000ff) | (wx << 8); - break; - case 40: w2[2] = wx; - break; - case 41: w2[2] = (w2[2] & 0xff000000) | (wx >> 8); - w2[3] = (w2[3] & 0x00ffffff) | (wx << 24); - break; - case 42: w2[2] = (w2[2] & 0xffff0000) | (wx >> 16); - w2[3] = (w2[3] & 0x0000ffff) | (wx << 16); - break; - case 43: w2[2] = (w2[2] & 0xffffff00) | (wx >> 24); - w2[3] = (w2[3] & 0x000000ff) | (wx << 8); - break; - case 44: w2[3] = wx; - break; - case 45: w2[3] = (w2[3] & 0xff000000) | (wx >> 8); - w3[0] = (w3[0] & 0x00ffffff) | (wx << 24); - break; - case 46: w2[3] = (w2[3] & 0xffff0000) | (wx >> 16); - w3[0] = (w3[0] & 0x0000ffff) | (wx << 16); - break; - case 47: w2[3] = (w2[3] & 0xffffff00) | (wx >> 24); - w3[0] = (w3[0] & 0x000000ff) | (wx << 8); - break; - case 48: w3[0] = wx; + case 0: w[0] &= 0; + w[1] &= 0; + w[2] &= 0; + w[3] &= 0; break; - case 49: w3[0] = (w3[0] & 0xff000000) | (wx >> 8); - w3[1] = (w3[1] & 0x00ffffff) | (wx << 24); + case 1: w[0] &= 0x000000FF; + w[1] &= 0; + w[2] &= 0; + w[3] &= 0; break; - case 50: w3[0] = (w3[0] & 0xffff0000) | (wx >> 16); - w3[1] = (w3[1] & 0x0000ffff) | (wx << 16); + case 2: w[0] &= 0x0000FFFF; + w[1] &= 0; + w[2] &= 0; + w[3] &= 0; break; - case 51: w3[0] = (w3[0] & 0xffffff00) | (wx >> 24); - w3[1] = (w3[1] & 0x000000ff) | (wx << 8); + case 3: w[0] &= 0x00FFFFFF; + w[1] &= 0; + w[2] &= 0; + w[3] &= 0; break; - case 52: w3[1] = wx; + case 4: w[1] &= 0; + w[2] &= 0; + w[3] &= 0; break; - case 53: w3[1] = (w3[1] & 0xff000000) | (wx >> 8); - w3[2] = (w3[2] & 0x00ffffff) | (wx << 24); + case 5: w[1] &= 0x000000FF; + w[2] &= 0; + w[3] &= 0; break; - case 54: w3[1] = (w3[1] & 0xffff0000) | (wx >> 16); - w3[2] = (w3[2] & 0x0000ffff) | (wx << 16); + case 6: w[1] &= 0x0000FFFF; + w[2] &= 0; + w[3] &= 0; break; - case 55: w3[1] = (w3[1] & 0xffffff00) | (wx >> 24); - w3[2] = (w3[2] & 0x000000ff) | (wx << 8); + case 7: w[1] &= 0x00FFFFFF; + w[2] &= 0; + w[3] &= 0; break; - case 56: w3[2] = wx; + case 8: w[2] &= 0; + w[3] &= 0; break; - case 57: w3[2] = (w3[2] & 0xff000000) | (wx >> 8); - w3[3] = (w3[3] & 0x00ffffff) | (wx << 24); + case 9: w[2] &= 0x000000FF; + w[3] &= 0; break; - case 58: w3[2] = (w3[2] & 0xffff0000) | (wx >> 16); - w3[3] = (w3[3] & 0x0000ffff) | (wx << 16); + case 10: w[2] &= 0x0000FFFF; + w[3] &= 0; break; - case 59: w3[2] = (w3[2] & 0xffffff00) | (wx >> 24); - w3[3] = (w3[3] & 0x000000ff) | (wx << 8); + case 11: w[2] &= 0x00FFFFFF; + w[3] &= 0; break; - case 60: w3[3] = wx; + case 12: w[3] &= 0; break; - case 61: w3[3] = (w3[3] & 0xff000000) | (wx >> 8); - //w4[0] = (w4[0] & 0x00ffffff) | (wx << 24); + case 13: w[3] &= 0x000000FF; break; - case 62: w3[3] = (w3[3] & 0xffff0000) | (wx >> 16); - //w4[0] = (w4[0] & 0x0000ffff) | (wx << 16); + case 14: w[3] &= 0x0000FFFF; break; - case 63: w3[3] = (w3[3] & 0xffffff00) | (wx >> 24); - //w4[0] = (w4[0] & 0x000000ff) | (wx << 8); + case 15: w[3] &= 0x00FFFFFF; break; } } -/** - * vector functions as scalar (for outer loop usage) - */ - -inline void append_0x01_1x4_S (u32 w0[4], const u32 offset) +inline void make_utf16be_S (const u32 in[4], u32 out1[4], u32 out2[4]) +{ + #ifdef IS_NV + out2[3] = __byte_perm_S (in[3], 0, 0x3727); + out2[2] = __byte_perm_S (in[3], 0, 0x1707); + out2[1] = __byte_perm_S (in[2], 0, 0x3727); + out2[0] = __byte_perm_S (in[2], 0, 0x1707); + out1[3] = __byte_perm_S (in[1], 0, 0x3727); + out1[2] = __byte_perm_S (in[1], 0, 0x1707); + out1[1] = __byte_perm_S (in[0], 0, 0x3727); + out1[0] = __byte_perm_S (in[0], 0, 0x1707); + #endif + + #if defined IS_AMD || defined IS_GENERIC + out2[3] = ((in[3] >> 0) & 0xFF000000) | ((in[3] >> 8) & 0x0000FF00); + out2[2] = ((in[3] << 16) & 0xFF000000) | ((in[3] << 8) & 0x0000FF00); + out2[1] = ((in[2] >> 0) & 0xFF000000) | ((in[2] >> 8) & 0x0000FF00); + out2[0] = ((in[2] << 16) & 0xFF000000) | ((in[2] << 8) & 0x0000FF00); + out1[3] = ((in[1] >> 0) & 0xFF000000) | ((in[1] >> 8) & 0x0000FF00); + out1[2] = ((in[1] << 16) & 0xFF000000) | ((in[1] << 8) & 0x0000FF00); + out1[1] = ((in[0] >> 0) & 0xFF000000) | ((in[0] >> 8) & 0x0000FF00); + out1[0] = ((in[0] << 16) & 0xFF000000) | ((in[0] << 8) & 0x0000FF00); + #endif +} + +inline void make_utf16le_S (const u32 in[4], u32 out1[4], u32 out2[4]) +{ + #ifdef IS_NV + out2[3] = __byte_perm_S (in[3], 0, 0x7372); + out2[2] = __byte_perm_S (in[3], 0, 0x7170); + out2[1] = __byte_perm_S (in[2], 0, 0x7372); + out2[0] = __byte_perm_S (in[2], 0, 0x7170); + out1[3] = __byte_perm_S (in[1], 0, 0x7372); + out1[2] = __byte_perm_S (in[1], 0, 0x7170); + out1[1] = __byte_perm_S (in[0], 0, 0x7372); + out1[0] = __byte_perm_S (in[0], 0, 0x7170); + #endif + + #if defined IS_AMD || defined IS_GENERIC + out2[3] = ((in[3] >> 8) & 0x00FF0000) | ((in[3] >> 16) & 0x000000FF); + out2[2] = ((in[3] << 8) & 0x00FF0000) | ((in[3] >> 0) & 0x000000FF); + out2[1] = ((in[2] >> 8) & 0x00FF0000) | ((in[2] >> 16) & 0x000000FF); + out2[0] = ((in[2] << 8) & 0x00FF0000) | ((in[2] >> 0) & 0x000000FF); + out1[3] = ((in[1] >> 8) & 0x00FF0000) | ((in[1] >> 16) & 0x000000FF); + out1[2] = ((in[1] << 8) & 0x00FF0000) | ((in[1] >> 0) & 0x000000FF); + out1[1] = ((in[0] >> 8) & 0x00FF0000) | ((in[0] >> 16) & 0x000000FF); + out1[0] = ((in[0] << 8) & 0x00FF0000) | ((in[0] >> 0) & 0x000000FF); + #endif +} + +inline void undo_utf16be_S (const u32 in1[4], const u32 in2[4], u32 out[4]) { - const u32 tmp = 0x01 << ((offset & 3) * 8); + #ifdef IS_NV + out[0] = __byte_perm_S (in1[0], in1[1], 0x4602); + out[1] = __byte_perm_S (in1[2], in1[3], 0x4602); + out[2] = __byte_perm_S (in2[0], in2[1], 0x4602); + out[3] = __byte_perm_S (in2[2], in2[3], 0x4602); + #endif - w0[0] |= (offset < 4) ? tmp : 0; - w0[1] |= ((offset >= 4) && (offset < 8)) ? tmp : 0; - w0[2] |= ((offset >= 8) && (offset < 12)) ? tmp : 0; - w0[3] |= (offset >= 12) ? tmp : 0; + #if defined IS_AMD || defined IS_GENERIC + out[0] = ((in1[0] & 0x0000ff00) >> 8) | ((in1[0] & 0xff000000) >> 16) + | ((in1[1] & 0x0000ff00) << 8) | ((in1[1] & 0xff000000) << 0); + out[1] = ((in1[2] & 0x0000ff00) >> 8) | ((in1[2] & 0xff000000) >> 16) + | ((in1[3] & 0x0000ff00) << 8) | ((in1[3] & 0xff000000) << 0); + out[2] = ((in2[0] & 0x0000ff00) >> 8) | ((in2[0] & 0xff000000) >> 16) + | ((in2[1] & 0x0000ff00) << 8) | ((in2[1] & 0xff000000) << 0); + out[3] = ((in2[2] & 0x0000ff00) >> 8) | ((in2[2] & 0xff000000) >> 16) + | ((in2[3] & 0x0000ff00) << 8) | ((in2[3] & 0xff000000) << 0); + #endif } -inline void append_0x01_2x4_S (u32 w0[4], u32 w1[4], const u32 offset) +inline void undo_utf16le_S (const u32 in1[4], const u32 in2[4], u32 out[4]) { - const u32 tmp = 0x01 << ((offset & 3) * 8); + #ifdef IS_NV + out[0] = __byte_perm_S (in1[0], in1[1], 0x6420); + out[1] = __byte_perm_S (in1[2], in1[3], 0x6420); + out[2] = __byte_perm_S (in2[0], in2[1], 0x6420); + out[3] = __byte_perm_S (in2[2], in2[3], 0x6420); + #endif - w0[0] |= (offset < 4) ? tmp : 0; - w0[1] |= ((offset >= 4) && (offset < 8)) ? tmp : 0; - w0[2] |= ((offset >= 8) && (offset < 12)) ? tmp : 0; - w0[3] |= ((offset >= 12) && (offset < 16)) ? tmp : 0; - w1[0] |= ((offset >= 16) && (offset < 20)) ? tmp : 0; - w1[1] |= ((offset >= 20) && (offset < 24)) ? tmp : 0; - w1[2] |= ((offset >= 24) && (offset < 28)) ? tmp : 0; - w1[3] |= (offset >= 28) ? tmp : 0; + #if defined IS_AMD || defined IS_GENERIC + out[0] = ((in1[0] & 0x000000ff) >> 0) | ((in1[0] & 0x00ff0000) >> 8) + | ((in1[1] & 0x000000ff) << 16) | ((in1[1] & 0x00ff0000) << 8); + out[1] = ((in1[2] & 0x000000ff) >> 0) | ((in1[2] & 0x00ff0000) >> 8) + | ((in1[3] & 0x000000ff) << 16) | ((in1[3] & 0x00ff0000) << 8); + out[2] = ((in2[0] & 0x000000ff) >> 0) | ((in2[0] & 0x00ff0000) >> 8) + | ((in2[1] & 0x000000ff) << 16) | ((in2[1] & 0x00ff0000) << 8); + out[3] = ((in2[2] & 0x000000ff) >> 0) | ((in2[2] & 0x00ff0000) >> 8) + | ((in2[3] & 0x000000ff) << 16) | ((in2[3] & 0x00ff0000) << 8); + #endif } -inline void append_0x01_3x4_S (u32 w0[4], u32 w1[4], u32 w2[4], const u32 offset) +inline void switch_buffer_by_offset_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 offset) { - const u32 tmp = 0x01 << ((offset & 3) * 8); + #if defined IS_AMD || defined IS_GENERIC + const int offset_mod_4 = offset & 3; - w0[0] |= (offset < 4) ? tmp : 0; - w0[1] |= ((offset >= 4) && (offset < 8)) ? tmp : 0; - w0[2] |= ((offset >= 8) && (offset < 12)) ? tmp : 0; - w0[3] |= ((offset >= 12) && (offset < 16)) ? tmp : 0; - w1[0] |= ((offset >= 16) && (offset < 20)) ? tmp : 0; - w1[1] |= ((offset >= 20) && (offset < 24)) ? tmp : 0; - w1[2] |= ((offset >= 24) && (offset < 28)) ? tmp : 0; - w1[3] |= ((offset >= 28) && (offset < 32)) ? tmp : 0; - w2[0] |= ((offset >= 32) && (offset < 36)) ? tmp : 0; - w2[1] |= ((offset >= 36) && (offset < 40)) ? tmp : 0; - w2[2] |= ((offset >= 40) && (offset < 44)) ? tmp : 0; - w2[3] |= (offset >= 44) ? tmp : 0; -} + const int offset_minus_4 = 4 - offset; -inline void append_0x01_4x4_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 offset) -{ - const u32 tmp = 0x01 << ((offset & 3) * 8); + switch (offset / 4) + { + case 0: + w3[3] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); + w3[2] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); + w3[1] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); + w3[0] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); + w2[3] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); + w2[2] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); + w2[1] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); + w2[0] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); + w1[3] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); + w1[2] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); + w1[1] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); + w1[0] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); + w0[3] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); + w0[2] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); + w0[1] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); + w0[0] = amd_bytealign_S (w0[0], 0, offset_minus_4); - w0[0] |= (offset < 4) ? tmp : 0; - w0[1] |= ((offset >= 4) && (offset < 8)) ? tmp : 0; - w0[2] |= ((offset >= 8) && (offset < 12)) ? tmp : 0; - w0[3] |= ((offset >= 12) && (offset < 16)) ? tmp : 0; - w1[0] |= ((offset >= 16) && (offset < 20)) ? tmp : 0; - w1[1] |= ((offset >= 20) && (offset < 24)) ? tmp : 0; - w1[2] |= ((offset >= 24) && (offset < 28)) ? tmp : 0; - w1[3] |= ((offset >= 28) && (offset < 32)) ? tmp : 0; - w2[0] |= ((offset >= 32) && (offset < 36)) ? tmp : 0; - w2[1] |= ((offset >= 36) && (offset < 40)) ? tmp : 0; - w2[2] |= ((offset >= 40) && (offset < 44)) ? tmp : 0; - w2[3] |= ((offset >= 44) && (offset < 48)) ? tmp : 0; - w3[0] |= ((offset >= 48) && (offset < 52)) ? tmp : 0; - w3[1] |= ((offset >= 52) && (offset < 56)) ? tmp : 0; - w3[2] |= ((offset >= 56) && (offset < 60)) ? tmp : 0; - w3[3] |= (offset >= 60) ? tmp : 0; -} + if (offset_mod_4 == 0) + { + w0[0] = w0[1]; + w0[1] = w0[2]; + w0[2] = w0[3]; + w0[3] = w1[0]; + w1[0] = w1[1]; + w1[1] = w1[2]; + w1[2] = w1[3]; + w1[3] = w2[0]; + w2[0] = w2[1]; + w2[1] = w2[2]; + w2[2] = w2[3]; + w2[3] = w3[0]; + w3[0] = w3[1]; + w3[1] = w3[2]; + w3[2] = w3[3]; + w3[3] = 0; + } -inline void append_0x02_1x4_S (u32 w0[4], const u32 offset) -{ - const u32 tmp = 0x02 << ((offset & 3) * 8); + break; - w0[0] |= (offset < 4) ? tmp : 0; - w0[1] |= ((offset >= 4) && (offset < 8)) ? tmp : 0; - w0[2] |= ((offset >= 8) && (offset < 12)) ? tmp : 0; - w0[3] |= (offset >= 12) ? tmp : 0; -} + case 1: + w3[3] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); + w3[2] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); + w3[1] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); + w3[0] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); + w2[3] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); + w2[2] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); + w2[1] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); + w2[0] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); + w1[3] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); + w1[2] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); + w1[1] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); + w1[0] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); + w0[3] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); + w0[2] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); + w0[1] = amd_bytealign_S (w0[0], 0, offset_minus_4); + w0[0] = 0; -inline void append_0x02_2x4_S (u32 w0[4], u32 w1[4], const u32 offset) -{ - const u32 tmp = 0x02 << ((offset & 3) * 8); + if (offset_mod_4 == 0) + { + w0[1] = w0[2]; + w0[2] = w0[3]; + w0[3] = w1[0]; + w1[0] = w1[1]; + w1[1] = w1[2]; + w1[2] = w1[3]; + w1[3] = w2[0]; + w2[0] = w2[1]; + w2[1] = w2[2]; + w2[2] = w2[3]; + w2[3] = w3[0]; + w3[0] = w3[1]; + w3[1] = w3[2]; + w3[2] = w3[3]; + w3[3] = 0; + } - w0[0] |= (offset < 4) ? tmp : 0; - w0[1] |= ((offset >= 4) && (offset < 8)) ? tmp : 0; - w0[2] |= ((offset >= 8) && (offset < 12)) ? tmp : 0; - w0[3] |= ((offset >= 12) && (offset < 16)) ? tmp : 0; - w1[0] |= ((offset >= 16) && (offset < 20)) ? tmp : 0; - w1[1] |= ((offset >= 20) && (offset < 24)) ? tmp : 0; - w1[2] |= ((offset >= 24) && (offset < 28)) ? tmp : 0; - w1[3] |= (offset >= 28) ? tmp : 0; -} + break; -inline void append_0x02_3x4_S (u32 w0[4], u32 w1[4], u32 w2[4], const u32 offset) -{ - const u32 tmp = 0x02 << ((offset & 3) * 8); + case 2: + w3[3] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); + w3[2] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); + w3[1] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); + w3[0] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); + w2[3] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); + w2[2] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); + w2[1] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); + w2[0] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); + w1[3] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); + w1[2] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); + w1[1] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); + w1[0] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); + w0[3] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); + w0[2] = amd_bytealign_S (w0[0], 0, offset_minus_4); + w0[1] = 0; + w0[0] = 0; - w0[0] |= (offset < 4) ? tmp : 0; - w0[1] |= ((offset >= 4) && (offset < 8)) ? tmp : 0; - w0[2] |= ((offset >= 8) && (offset < 12)) ? tmp : 0; - w0[3] |= ((offset >= 12) && (offset < 16)) ? tmp : 0; - w1[0] |= ((offset >= 16) && (offset < 20)) ? tmp : 0; - w1[1] |= ((offset >= 20) && (offset < 24)) ? tmp : 0; - w1[2] |= ((offset >= 24) && (offset < 28)) ? tmp : 0; - w1[3] |= ((offset >= 28) && (offset < 32)) ? tmp : 0; - w2[0] |= ((offset >= 32) && (offset < 36)) ? tmp : 0; - w2[1] |= ((offset >= 36) && (offset < 40)) ? tmp : 0; - w2[2] |= ((offset >= 40) && (offset < 44)) ? tmp : 0; - w2[3] |= (offset >= 44) ? tmp : 0; -} + if (offset_mod_4 == 0) + { + w0[2] = w0[3]; + w0[3] = w1[0]; + w1[0] = w1[1]; + w1[1] = w1[2]; + w1[2] = w1[3]; + w1[3] = w2[0]; + w2[0] = w2[1]; + w2[1] = w2[2]; + w2[2] = w2[3]; + w2[3] = w3[0]; + w3[0] = w3[1]; + w3[1] = w3[2]; + w3[2] = w3[3]; + w3[3] = 0; + } + + break; + + case 3: + w3[3] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); + w3[2] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); + w3[1] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); + w3[0] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); + w2[3] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); + w2[2] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); + w2[1] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); + w2[0] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); + w1[3] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); + w1[2] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); + w1[1] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); + w1[0] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); + w0[3] = amd_bytealign_S (w0[0], 0, offset_minus_4); + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; -inline void append_0x02_4x4_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 offset) -{ - const u32 tmp = 0x02 << ((offset & 3) * 8); + if (offset_mod_4 == 0) + { + w0[3] = w1[0]; + w1[0] = w1[1]; + w1[1] = w1[2]; + w1[2] = w1[3]; + w1[3] = w2[0]; + w2[0] = w2[1]; + w2[1] = w2[2]; + w2[2] = w2[3]; + w2[3] = w3[0]; + w3[0] = w3[1]; + w3[1] = w3[2]; + w3[2] = w3[3]; + w3[3] = 0; + } - w0[0] |= (offset < 4) ? tmp : 0; - w0[1] |= ((offset >= 4) && (offset < 8)) ? tmp : 0; - w0[2] |= ((offset >= 8) && (offset < 12)) ? tmp : 0; - w0[3] |= ((offset >= 12) && (offset < 16)) ? tmp : 0; - w1[0] |= ((offset >= 16) && (offset < 20)) ? tmp : 0; - w1[1] |= ((offset >= 20) && (offset < 24)) ? tmp : 0; - w1[2] |= ((offset >= 24) && (offset < 28)) ? tmp : 0; - w1[3] |= ((offset >= 28) && (offset < 32)) ? tmp : 0; - w2[0] |= ((offset >= 32) && (offset < 36)) ? tmp : 0; - w2[1] |= ((offset >= 36) && (offset < 40)) ? tmp : 0; - w2[2] |= ((offset >= 40) && (offset < 44)) ? tmp : 0; - w2[3] |= ((offset >= 44) && (offset < 48)) ? tmp : 0; - w3[0] |= ((offset >= 48) && (offset < 52)) ? tmp : 0; - w3[1] |= ((offset >= 52) && (offset < 56)) ? tmp : 0; - w3[2] |= ((offset >= 56) && (offset < 60)) ? tmp : 0; - w3[3] |= (offset >= 60) ? tmp : 0; -} + break; -inline void append_0x80_1x4_S (u32 w0[4], const u32 offset) -{ - const u32 tmp = 0x80 << ((offset & 3) * 8); + case 4: + w3[3] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); + w3[2] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); + w3[1] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); + w3[0] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); + w2[3] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); + w2[2] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); + w2[1] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); + w2[0] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); + w1[3] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); + w1[2] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); + w1[1] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); + w1[0] = amd_bytealign_S (w0[0], 0, offset_minus_4); + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; - w0[0] |= (offset < 4) ? tmp : 0; - w0[1] |= ((offset >= 4) && (offset < 8)) ? tmp : 0; - w0[2] |= ((offset >= 8) && (offset < 12)) ? tmp : 0; - w0[3] |= (offset >= 12) ? tmp : 0; -} + if (offset_mod_4 == 0) + { + w1[0] = w1[1]; + w1[1] = w1[2]; + w1[2] = w1[3]; + w1[3] = w2[0]; + w2[0] = w2[1]; + w2[1] = w2[2]; + w2[2] = w2[3]; + w2[3] = w3[0]; + w3[0] = w3[1]; + w3[1] = w3[2]; + w3[2] = w3[3]; + w3[3] = 0; + } -inline void append_0x80_2x4_S (u32 w0[4], u32 w1[4], const u32 offset) -{ - const u32 tmp = 0x80 << ((offset & 3) * 8); + break; - w0[0] |= (offset < 4) ? tmp : 0; - w0[1] |= ((offset >= 4) && (offset < 8)) ? tmp : 0; - w0[2] |= ((offset >= 8) && (offset < 12)) ? tmp : 0; - w0[3] |= ((offset >= 12) && (offset < 16)) ? tmp : 0; - w1[0] |= ((offset >= 16) && (offset < 20)) ? tmp : 0; - w1[1] |= ((offset >= 20) && (offset < 24)) ? tmp : 0; - w1[2] |= ((offset >= 24) && (offset < 28)) ? tmp : 0; - w1[3] |= (offset >= 28) ? tmp : 0; -} + case 5: + w3[3] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); + w3[2] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); + w3[1] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); + w3[0] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); + w2[3] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); + w2[2] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); + w2[1] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); + w2[0] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); + w1[3] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); + w1[2] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); + w1[1] = amd_bytealign_S (w0[0], 0, offset_minus_4); + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; -inline void append_0x80_3x4_S (u32 w0[4], u32 w1[4], u32 w2[4], const u32 offset) -{ - const u32 tmp = 0x80 << ((offset & 3) * 8); + if (offset_mod_4 == 0) + { + w1[1] = w1[2]; + w1[2] = w1[3]; + w1[3] = w2[0]; + w2[0] = w2[1]; + w2[1] = w2[2]; + w2[2] = w2[3]; + w2[3] = w3[0]; + w3[0] = w3[1]; + w3[1] = w3[2]; + w3[2] = w3[3]; + w3[3] = 0; + } - w0[0] |= (offset < 4) ? tmp : 0; - w0[1] |= ((offset >= 4) && (offset < 8)) ? tmp : 0; - w0[2] |= ((offset >= 8) && (offset < 12)) ? tmp : 0; - w0[3] |= ((offset >= 12) && (offset < 16)) ? tmp : 0; - w1[0] |= ((offset >= 16) && (offset < 20)) ? tmp : 0; - w1[1] |= ((offset >= 20) && (offset < 24)) ? tmp : 0; - w1[2] |= ((offset >= 24) && (offset < 28)) ? tmp : 0; - w1[3] |= ((offset >= 28) && (offset < 32)) ? tmp : 0; - w2[0] |= ((offset >= 32) && (offset < 36)) ? tmp : 0; - w2[1] |= ((offset >= 36) && (offset < 40)) ? tmp : 0; - w2[2] |= ((offset >= 40) && (offset < 44)) ? tmp : 0; - w2[3] |= (offset >= 44) ? tmp : 0; -} + break; -inline void append_0x80_4x4_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 offset) -{ - const u32 tmp = 0x80 << ((offset & 3) * 8); + case 6: + w3[3] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); + w3[2] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); + w3[1] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); + w3[0] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); + w2[3] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); + w2[2] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); + w2[1] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); + w2[0] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); + w1[3] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); + w1[2] = amd_bytealign_S (w0[0], 0, offset_minus_4); + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; - w0[0] |= (offset < 4) ? tmp : 0; - w0[1] |= ((offset >= 4) && (offset < 8)) ? tmp : 0; - w0[2] |= ((offset >= 8) && (offset < 12)) ? tmp : 0; - w0[3] |= ((offset >= 12) && (offset < 16)) ? tmp : 0; - w1[0] |= ((offset >= 16) && (offset < 20)) ? tmp : 0; - w1[1] |= ((offset >= 20) && (offset < 24)) ? tmp : 0; - w1[2] |= ((offset >= 24) && (offset < 28)) ? tmp : 0; - w1[3] |= ((offset >= 28) && (offset < 32)) ? tmp : 0; - w2[0] |= ((offset >= 32) && (offset < 36)) ? tmp : 0; - w2[1] |= ((offset >= 36) && (offset < 40)) ? tmp : 0; - w2[2] |= ((offset >= 40) && (offset < 44)) ? tmp : 0; - w2[3] |= ((offset >= 44) && (offset < 48)) ? tmp : 0; - w3[0] |= ((offset >= 48) && (offset < 52)) ? tmp : 0; - w3[1] |= ((offset >= 52) && (offset < 56)) ? tmp : 0; - w3[2] |= ((offset >= 56) && (offset < 60)) ? tmp : 0; - w3[3] |= (offset >= 60) ? tmp : 0; -} + if (offset_mod_4 == 0) + { + w1[2] = w1[3]; + w1[3] = w2[0]; + w2[0] = w2[1]; + w2[1] = w2[2]; + w2[2] = w2[3]; + w2[3] = w3[0]; + w3[0] = w3[1]; + w3[1] = w3[2]; + w3[2] = w3[3]; + w3[3] = 0; + } -inline void append_0x80_8x4_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 w4[4], u32 w5[4], u32 w6[4], u32 w7[4], const u32 offset) -{ - switch (offset) - { - case 0: - w0[0] = 0x80; break; - case 1: - w0[0] = w0[0] | 0x8000; - break; + case 7: + w3[3] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); + w3[2] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); + w3[1] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); + w3[0] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); + w2[3] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); + w2[2] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); + w2[1] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); + w2[0] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); + w1[3] = amd_bytealign_S (w0[0], 0, offset_minus_4); + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + if (offset_mod_4 == 0) + { + w1[3] = w2[0]; + w2[0] = w2[1]; + w2[1] = w2[2]; + w2[2] = w2[3]; + w2[3] = w3[0]; + w3[0] = w3[1]; + w3[1] = w3[2]; + w3[2] = w3[3]; + w3[3] = 0; + } - case 2: - w0[0] = w0[0] | 0x800000; break; - case 3: - w0[0] = w0[0] | 0x80000000; - break; + case 8: + w3[3] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); + w3[2] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); + w3[1] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); + w3[0] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); + w2[3] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); + w2[2] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); + w2[1] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); + w2[0] = amd_bytealign_S (w0[0], 0, offset_minus_4); + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + if (offset_mod_4 == 0) + { + w2[0] = w2[1]; + w2[1] = w2[2]; + w2[2] = w2[3]; + w2[3] = w3[0]; + w3[0] = w3[1]; + w3[1] = w3[2]; + w3[2] = w3[3]; + w3[3] = 0; + } - case 4: - w0[1] = 0x80; break; - case 5: - w0[1] = w0[1] | 0x8000; - break; + case 9: + w3[3] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); + w3[2] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); + w3[1] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); + w3[0] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); + w2[3] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); + w2[2] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); + w2[1] = amd_bytealign_S (w0[0], 0, offset_minus_4); + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; - case 6: - w0[1] = w0[1] | 0x800000; - break; + if (offset_mod_4 == 0) + { + w2[1] = w2[2]; + w2[2] = w2[3]; + w2[3] = w3[0]; + w3[0] = w3[1]; + w3[1] = w3[2]; + w3[2] = w3[3]; + w3[3] = 0; + } - case 7: - w0[1] = w0[1] | 0x80000000; break; - case 8: - w0[2] = 0x80; - break; + case 10: + w3[3] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); + w3[2] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); + w3[1] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); + w3[0] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); + w2[3] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); + w2[2] = amd_bytealign_S (w0[0], 0, offset_minus_4); + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; - case 9: - w0[2] = w0[2] | 0x8000; - break; + if (offset_mod_4 == 0) + { + w2[2] = w2[3]; + w2[3] = w3[0]; + w3[0] = w3[1]; + w3[1] = w3[2]; + w3[2] = w3[3]; + w3[3] = 0; + } - case 10: - w0[2] = w0[2] | 0x800000; break; case 11: - w0[2] = w0[2] | 0x80000000; + w3[3] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); + w3[2] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); + w3[1] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); + w3[0] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); + w2[3] = amd_bytealign_S (w0[0], 0, offset_minus_4); + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + if (offset_mod_4 == 0) + { + w2[3] = w3[0]; + w3[0] = w3[1]; + w3[1] = w3[2]; + w3[2] = w3[3]; + w3[3] = 0; + } + break; case 12: - w0[3] = 0x80; - break; + w3[3] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); + w3[2] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); + w3[1] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); + w3[0] = amd_bytealign_S (w0[0], 0, offset_minus_4); + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; - case 13: - w0[3] = w0[3] | 0x8000; - break; + if (offset_mod_4 == 0) + { + w3[0] = w3[1]; + w3[1] = w3[2]; + w3[2] = w3[3]; + w3[3] = 0; + } - case 14: - w0[3] = w0[3] | 0x800000; break; - case 15: - w0[3] = w0[3] | 0x80000000; - break; + case 13: + w3[3] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); + w3[2] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); + w3[1] = amd_bytealign_S (w0[0], 0, offset_minus_4); + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; - case 16: - w1[0] = 0x80; - break; + if (offset_mod_4 == 0) + { + w3[1] = w3[2]; + w3[2] = w3[3]; + w3[3] = 0; + } - case 17: - w1[0] = w1[0] | 0x8000; break; - case 18: - w1[0] = w1[0] | 0x800000; - break; + case 14: + w3[3] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); + w3[2] = amd_bytealign_S (w0[0], 0, offset_minus_4); + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; - case 19: - w1[0] = w1[0] | 0x80000000; - break; + if (offset_mod_4 == 0) + { + w3[2] = w3[3]; + w3[3] = 0; + } - case 20: - w1[1] = 0x80; break; - case 21: - w1[1] = w1[1] | 0x8000; - break; + case 15: + w3[3] = amd_bytealign_S (w0[0], 0, offset_minus_4); + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; - case 22: - w1[1] = w1[1] | 0x800000; - break; + if (offset_mod_4 == 0) + { + w3[3] = 0; + } - case 23: - w1[1] = w1[1] | 0x80000000; break; + } + #endif - case 24: - w1[2] = 0x80; - break; + #ifdef IS_NV + const int offset_minus_4 = 4 - (offset % 4); - case 25: - w1[2] = w1[2] | 0x8000; - break; + const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; + + switch (offset / 4) + { + case 0: + w3[3] = __byte_perm_S (w3[2], w3[3], selector); + w3[2] = __byte_perm_S (w3[1], w3[2], selector); + w3[1] = __byte_perm_S (w3[0], w3[1], selector); + w3[0] = __byte_perm_S (w2[3], w3[0], selector); + w2[3] = __byte_perm_S (w2[2], w2[3], selector); + w2[2] = __byte_perm_S (w2[1], w2[2], selector); + w2[1] = __byte_perm_S (w2[0], w2[1], selector); + w2[0] = __byte_perm_S (w1[3], w2[0], selector); + w1[3] = __byte_perm_S (w1[2], w1[3], selector); + w1[2] = __byte_perm_S (w1[1], w1[2], selector); + w1[1] = __byte_perm_S (w1[0], w1[1], selector); + w1[0] = __byte_perm_S (w0[3], w1[0], selector); + w0[3] = __byte_perm_S (w0[2], w0[3], selector); + w0[2] = __byte_perm_S (w0[1], w0[2], selector); + w0[1] = __byte_perm_S (w0[0], w0[1], selector); + w0[0] = __byte_perm_S ( 0, w0[0], selector); - case 26: - w1[2] = w1[2] | 0x800000; break; - case 27: - w1[2] = w1[2] | 0x80000000; - break; + case 1: + w3[3] = __byte_perm_S (w3[1], w3[2], selector); + w3[2] = __byte_perm_S (w3[0], w3[1], selector); + w3[1] = __byte_perm_S (w2[3], w3[0], selector); + w3[0] = __byte_perm_S (w2[2], w2[3], selector); + w2[3] = __byte_perm_S (w2[1], w2[2], selector); + w2[2] = __byte_perm_S (w2[0], w2[1], selector); + w2[1] = __byte_perm_S (w1[3], w2[0], selector); + w2[0] = __byte_perm_S (w1[2], w1[3], selector); + w1[3] = __byte_perm_S (w1[1], w1[2], selector); + w1[2] = __byte_perm_S (w1[0], w1[1], selector); + w1[1] = __byte_perm_S (w0[3], w1[0], selector); + w1[0] = __byte_perm_S (w0[2], w0[3], selector); + w0[3] = __byte_perm_S (w0[1], w0[2], selector); + w0[2] = __byte_perm_S (w0[0], w0[1], selector); + w0[1] = __byte_perm_S ( 0, w0[0], selector); + w0[0] = 0; - case 28: - w1[3] = 0x80; break; - case 29: - w1[3] = w1[3] | 0x8000; - break; + case 2: + w3[3] = __byte_perm_S (w3[0], w3[1], selector); + w3[2] = __byte_perm_S (w2[3], w3[0], selector); + w3[1] = __byte_perm_S (w2[2], w2[3], selector); + w3[0] = __byte_perm_S (w2[1], w2[2], selector); + w2[3] = __byte_perm_S (w2[0], w2[1], selector); + w2[2] = __byte_perm_S (w1[3], w2[0], selector); + w2[1] = __byte_perm_S (w1[2], w1[3], selector); + w2[0] = __byte_perm_S (w1[1], w1[2], selector); + w1[3] = __byte_perm_S (w1[0], w1[1], selector); + w1[2] = __byte_perm_S (w0[3], w1[0], selector); + w1[1] = __byte_perm_S (w0[2], w0[3], selector); + w1[0] = __byte_perm_S (w0[1], w0[2], selector); + w0[3] = __byte_perm_S (w0[0], w0[1], selector); + w0[2] = __byte_perm_S ( 0, w0[0], selector); + w0[1] = 0; + w0[0] = 0; - case 30: - w1[3] = w1[3] | 0x800000; break; - case 31: - w1[3] = w1[3] | 0x80000000; - break; + case 3: + w3[3] = __byte_perm_S (w2[3], w3[0], selector); + w3[2] = __byte_perm_S (w2[2], w2[3], selector); + w3[1] = __byte_perm_S (w2[1], w2[2], selector); + w3[0] = __byte_perm_S (w2[0], w2[1], selector); + w2[3] = __byte_perm_S (w1[3], w2[0], selector); + w2[2] = __byte_perm_S (w1[2], w1[3], selector); + w2[1] = __byte_perm_S (w1[1], w1[2], selector); + w2[0] = __byte_perm_S (w1[0], w1[1], selector); + w1[3] = __byte_perm_S (w0[3], w1[0], selector); + w1[2] = __byte_perm_S (w0[2], w0[3], selector); + w1[1] = __byte_perm_S (w0[1], w0[2], selector); + w1[0] = __byte_perm_S (w0[0], w0[1], selector); + w0[3] = __byte_perm_S ( 0, w0[0], selector); + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; - case 32: - w2[0] = 0x80; break; - case 33: - w2[0] = w2[0] | 0x8000; - break; + case 4: + w3[3] = __byte_perm_S (w2[2], w2[3], selector); + w3[2] = __byte_perm_S (w2[1], w2[2], selector); + w3[1] = __byte_perm_S (w2[0], w2[1], selector); + w3[0] = __byte_perm_S (w1[3], w2[0], selector); + w2[3] = __byte_perm_S (w1[2], w1[3], selector); + w2[2] = __byte_perm_S (w1[1], w1[2], selector); + w2[1] = __byte_perm_S (w1[0], w1[1], selector); + w2[0] = __byte_perm_S (w0[3], w1[0], selector); + w1[3] = __byte_perm_S (w0[2], w0[3], selector); + w1[2] = __byte_perm_S (w0[1], w0[2], selector); + w1[1] = __byte_perm_S (w0[0], w0[1], selector); + w1[0] = __byte_perm_S ( 0, w0[0], selector); + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; - case 34: - w2[0] = w2[0] | 0x800000; break; - case 35: - w2[0] = w2[0] | 0x80000000; - break; + case 5: + w3[3] = __byte_perm_S (w2[1], w2[2], selector); + w3[2] = __byte_perm_S (w2[0], w2[1], selector); + w3[1] = __byte_perm_S (w1[3], w2[0], selector); + w3[0] = __byte_perm_S (w1[2], w1[3], selector); + w2[3] = __byte_perm_S (w1[1], w1[2], selector); + w2[2] = __byte_perm_S (w1[0], w1[1], selector); + w2[1] = __byte_perm_S (w0[3], w1[0], selector); + w2[0] = __byte_perm_S (w0[2], w0[3], selector); + w1[3] = __byte_perm_S (w0[1], w0[2], selector); + w1[2] = __byte_perm_S (w0[0], w0[1], selector); + w1[1] = __byte_perm_S ( 0, w0[0], selector); + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; - case 36: - w2[1] = 0x80; break; - case 37: - w2[1] = w2[1] | 0x8000; - break; + case 6: + w3[3] = __byte_perm_S (w2[0], w2[1], selector); + w3[2] = __byte_perm_S (w1[3], w2[0], selector); + w3[1] = __byte_perm_S (w1[2], w1[3], selector); + w3[0] = __byte_perm_S (w1[1], w1[2], selector); + w2[3] = __byte_perm_S (w1[0], w1[1], selector); + w2[2] = __byte_perm_S (w0[3], w1[0], selector); + w2[1] = __byte_perm_S (w0[2], w0[3], selector); + w2[0] = __byte_perm_S (w0[1], w0[2], selector); + w1[3] = __byte_perm_S (w0[0], w0[1], selector); + w1[2] = __byte_perm_S ( 0, w0[0], selector); + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; - case 38: - w2[1] = w2[1] | 0x800000; break; - case 39: - w2[1] = w2[1] | 0x80000000; - break; + case 7: + w3[3] = __byte_perm_S (w1[3], w2[0], selector); + w3[2] = __byte_perm_S (w1[2], w1[3], selector); + w3[1] = __byte_perm_S (w1[1], w1[2], selector); + w3[0] = __byte_perm_S (w1[0], w1[1], selector); + w2[3] = __byte_perm_S (w0[3], w1[0], selector); + w2[2] = __byte_perm_S (w0[2], w0[3], selector); + w2[1] = __byte_perm_S (w0[1], w0[2], selector); + w2[0] = __byte_perm_S (w0[0], w0[1], selector); + w1[3] = __byte_perm_S ( 0, w0[0], selector); + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; - case 40: - w2[2] = 0x80; break; - case 41: - w2[2] = w2[2] | 0x8000; - break; + case 8: + w3[3] = __byte_perm_S (w1[2], w1[3], selector); + w3[2] = __byte_perm_S (w1[1], w1[2], selector); + w3[1] = __byte_perm_S (w1[0], w1[1], selector); + w3[0] = __byte_perm_S (w0[3], w1[0], selector); + w2[3] = __byte_perm_S (w0[2], w0[3], selector); + w2[2] = __byte_perm_S (w0[1], w0[2], selector); + w2[1] = __byte_perm_S (w0[0], w0[1], selector); + w2[0] = __byte_perm_S ( 0, w0[0], selector); + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; - case 42: - w2[2] = w2[2] | 0x800000; break; - case 43: - w2[2] = w2[2] | 0x80000000; - break; + case 9: + w3[3] = __byte_perm_S (w1[1], w1[2], selector); + w3[2] = __byte_perm_S (w1[0], w1[1], selector); + w3[1] = __byte_perm_S (w0[3], w1[0], selector); + w3[0] = __byte_perm_S (w0[2], w0[3], selector); + w2[3] = __byte_perm_S (w0[1], w0[2], selector); + w2[2] = __byte_perm_S (w0[0], w0[1], selector); + w2[1] = __byte_perm_S ( 0, w0[0], selector); + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; - case 44: - w2[3] = 0x80; break; - case 45: - w2[3] = w2[3] | 0x8000; - break; + case 10: + w3[3] = __byte_perm_S (w1[0], w1[1], selector); + w3[2] = __byte_perm_S (w0[3], w1[0], selector); + w3[1] = __byte_perm_S (w0[2], w0[3], selector); + w3[0] = __byte_perm_S (w0[1], w0[2], selector); + w2[3] = __byte_perm_S (w0[0], w0[1], selector); + w2[2] = __byte_perm_S ( 0, w0[0], selector); + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; - case 46: - w2[3] = w2[3] | 0x800000; break; - case 47: - w2[3] = w2[3] | 0x80000000; - break; + case 11: + w3[3] = __byte_perm_S (w0[3], w1[0], selector); + w3[2] = __byte_perm_S (w0[2], w0[3], selector); + w3[1] = __byte_perm_S (w0[1], w0[2], selector); + w3[0] = __byte_perm_S (w0[0], w0[1], selector); + w2[3] = __byte_perm_S ( 0, w0[0], selector); + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; - case 48: - w3[0] = 0x80; break; - case 49: - w3[0] = w3[0] | 0x8000; - break; + case 12: + w3[3] = __byte_perm_S (w0[2], w0[3], selector); + w3[2] = __byte_perm_S (w0[1], w0[2], selector); + w3[1] = __byte_perm_S (w0[0], w0[1], selector); + w3[0] = __byte_perm_S ( 0, w0[0], selector); + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; - case 50: - w3[0] = w3[0] | 0x800000; break; - case 51: - w3[0] = w3[0] | 0x80000000; - break; + case 13: + w3[3] = __byte_perm_S (w0[1], w0[2], selector); + w3[2] = __byte_perm_S (w0[0], w0[1], selector); + w3[1] = __byte_perm_S ( 0, w0[0], selector); + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; - case 52: - w3[1] = 0x80; break; - case 53: - w3[1] = w3[1] | 0x8000; - break; + case 14: + w3[3] = __byte_perm_S (w0[0], w0[1], selector); + w3[2] = __byte_perm_S ( 0, w0[0], selector); + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; - case 54: - w3[1] = w3[1] | 0x800000; break; - case 55: - w3[1] = w3[1] | 0x80000000; - break; + case 15: + w3[3] = __byte_perm_S ( 0, w0[0], selector); + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; - case 56: - w3[2] = 0x80; break; + } + #endif +} - case 57: - w3[2] = w3[2] | 0x8000; - break; +inline void switch_buffer_by_offset_carry_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 c0[4], u32 c1[4], u32 c2[4], u32 c3[4], const u32 offset) +{ + const int offset_mod_4 = offset & 3; - case 58: - w3[2] = w3[2] | 0x800000; - break; + const int offset_minus_4 = 4 - offset; - case 59: - w3[2] = w3[2] | 0x80000000; - break; + switch (offset / 4) + { + case 0: + c0[0] = amd_bytealign_S ( 0, w3[3], offset_minus_4); + w3[3] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); + w3[2] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); + w3[1] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); + w3[0] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); + w2[3] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); + w2[2] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); + w2[1] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); + w2[0] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); + w1[3] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); + w1[2] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); + w1[1] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); + w1[0] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); + w0[3] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); + w0[2] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); + w0[1] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); + w0[0] = amd_bytealign_S (w0[0], 0, offset_minus_4); - case 60: - w3[3] = 0x80; - break; + if (offset_mod_4 == 0) + { + w0[0] = w0[1]; + w0[1] = w0[2]; + w0[2] = w0[3]; + w0[3] = w1[0]; + w1[0] = w1[1]; + w1[1] = w1[2]; + w1[2] = w1[3]; + w1[3] = w2[0]; + w2[0] = w2[1]; + w2[1] = w2[2]; + w2[2] = w2[3]; + w2[3] = w3[0]; + w3[0] = w3[1]; + w3[1] = w3[2]; + w3[2] = w3[3]; + w3[3] = c0[0]; + c0[0] = 0; + } - case 61: - w3[3] = w3[3] | 0x8000; break; - case 62: - w3[3] = w3[3] | 0x800000; - break; + case 1: + c0[1] = amd_bytealign_S ( 0, w3[3], offset_minus_4); + c0[0] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); + w3[3] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); + w3[2] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); + w3[1] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); + w3[0] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); + w2[3] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); + w2[2] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); + w2[1] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); + w2[0] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); + w1[3] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); + w1[2] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); + w1[1] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); + w1[0] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); + w0[3] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); + w0[2] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); + w0[1] = amd_bytealign_S (w0[0], 0, offset_minus_4); + w0[0] = 0; - case 63: - w3[3] = w3[3] | 0x80000000; - break; + if (offset_mod_4 == 0) + { + w0[1] = w0[2]; + w0[2] = w0[3]; + w0[3] = w1[0]; + w1[0] = w1[1]; + w1[1] = w1[2]; + w1[2] = w1[3]; + w1[3] = w2[0]; + w2[0] = w2[1]; + w2[1] = w2[2]; + w2[2] = w2[3]; + w2[3] = w3[0]; + w3[0] = w3[1]; + w3[1] = w3[2]; + w3[2] = w3[3]; + w3[3] = c0[0]; + c0[0] = c0[1]; + c0[1] = 0; + } - case 64: - w4[0] = 0x80; break; - case 65: - w4[0] = w4[0] | 0x8000; - break; + case 2: + c0[2] = amd_bytealign_S ( 0, w3[3], offset_minus_4); + c0[1] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); + c0[0] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); + w3[3] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); + w3[2] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); + w3[1] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); + w3[0] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); + w2[3] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); + w2[2] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); + w2[1] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); + w2[0] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); + w1[3] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); + w1[2] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); + w1[1] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); + w1[0] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); + w0[3] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); + w0[2] = amd_bytealign_S (w0[0], 0, offset_minus_4); + w0[1] = 0; + w0[0] = 0; - case 66: - w4[0] = w4[0] | 0x800000; - break; + if (offset_mod_4 == 0) + { + w0[2] = w0[3]; + w0[3] = w1[0]; + w1[0] = w1[1]; + w1[1] = w1[2]; + w1[2] = w1[3]; + w1[3] = w2[0]; + w2[0] = w2[1]; + w2[1] = w2[2]; + w2[2] = w2[3]; + w2[3] = w3[0]; + w3[0] = w3[1]; + w3[1] = w3[2]; + w3[2] = w3[3]; + w3[3] = c0[0]; + c0[0] = c0[1]; + c0[1] = c0[2]; + c0[2] = 0; + } - case 67: - w4[0] = w4[0] | 0x80000000; break; - case 68: - w4[1] = 0x80; - break; + case 3: + c0[3] = amd_bytealign_S ( 0, w3[3], offset_minus_4); + c0[2] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); + c0[1] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); + c0[0] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); + w3[3] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); + w3[2] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); + w3[1] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); + w3[0] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); + w2[3] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); + w2[2] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); + w2[1] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); + w2[0] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); + w1[3] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); + w1[2] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); + w1[1] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); + w1[0] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); + w0[3] = amd_bytealign_S (w0[0], 0, offset_minus_4); + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; - case 69: - w4[1] = w4[1] | 0x8000; - break; + if (offset_mod_4 == 0) + { + w0[3] = w1[0]; + w1[0] = w1[1]; + w1[1] = w1[2]; + w1[2] = w1[3]; + w1[3] = w2[0]; + w2[0] = w2[1]; + w2[1] = w2[2]; + w2[2] = w2[3]; + w2[3] = w3[0]; + w3[0] = w3[1]; + w3[1] = w3[2]; + w3[2] = w3[3]; + w3[3] = c0[0]; + c0[0] = c0[1]; + c0[1] = c0[2]; + c0[2] = c0[3]; + c0[3] = 0; + } - case 70: - w4[1] = w4[1] | 0x800000; break; - case 71: - w4[1] = w4[1] | 0x80000000; - break; + case 4: + c1[0] = amd_bytealign_S ( 0, w3[3], offset_minus_4); + c0[3] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); + c0[2] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); + c0[1] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); + c0[0] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); + w3[3] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); + w3[2] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); + w3[1] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); + w3[0] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); + w2[3] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); + w2[2] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); + w2[1] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); + w2[0] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); + w1[3] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); + w1[2] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); + w1[1] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); + w1[0] = amd_bytealign_S (w0[0], 0, offset_minus_4); + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; - case 72: - w4[2] = 0x80; - break; + if (offset_mod_4 == 0) + { + w1[0] = w1[1]; + w1[1] = w1[2]; + w1[2] = w1[3]; + w1[3] = w2[0]; + w2[0] = w2[1]; + w2[1] = w2[2]; + w2[2] = w2[3]; + w2[3] = w3[0]; + w3[0] = w3[1]; + w3[1] = w3[2]; + w3[2] = w3[3]; + w3[3] = c0[0]; + c0[0] = c0[1]; + c0[1] = c0[2]; + c0[2] = c0[3]; + c0[3] = c1[0]; + c1[0] = 0; + } - case 73: - w4[2] = w4[2] | 0x8000; break; - case 74: - w4[2] = w4[2] | 0x800000; - break; + case 5: + c1[1] = amd_bytealign_S ( 0, w3[3], offset_minus_4); + c1[0] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); + c0[3] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); + c0[2] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); + c0[1] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); + c0[0] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); + w3[3] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); + w3[2] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); + w3[1] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); + w3[0] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); + w2[3] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); + w2[2] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); + w2[1] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); + w2[0] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); + w1[3] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); + w1[2] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); + w1[1] = amd_bytealign_S (w0[0], 0, offset_minus_4); + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; - case 75: - w4[2] = w4[2] | 0x80000000; - break; + if (offset_mod_4 == 0) + { + w1[1] = w1[2]; + w1[2] = w1[3]; + w1[3] = w2[0]; + w2[0] = w2[1]; + w2[1] = w2[2]; + w2[2] = w2[3]; + w2[3] = w3[0]; + w3[0] = w3[1]; + w3[1] = w3[2]; + w3[2] = w3[3]; + w3[3] = c0[0]; + c0[0] = c0[1]; + c0[1] = c0[2]; + c0[2] = c0[3]; + c0[3] = c1[0]; + c1[0] = c1[1]; + c1[1] = 0; + } - case 76: - w4[3] = 0x80; break; - case 77: - w4[3] = w4[3] | 0x8000; - break; + case 6: + c1[2] = amd_bytealign_S ( 0, w3[3], offset_minus_4); + c1[1] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); + c1[0] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); + c0[3] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); + c0[2] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); + c0[1] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); + c0[0] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); + w3[3] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); + w3[2] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); + w3[1] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); + w3[0] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); + w2[3] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); + w2[2] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); + w2[1] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); + w2[0] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); + w1[3] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); + w1[2] = amd_bytealign_S (w0[0], 0, offset_minus_4); + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; - case 78: - w4[3] = w4[3] | 0x800000; - break; + if (offset_mod_4 == 0) + { + w1[2] = w1[3]; + w1[3] = w2[0]; + w2[0] = w2[1]; + w2[1] = w2[2]; + w2[2] = w2[3]; + w2[3] = w3[0]; + w3[0] = w3[1]; + w3[1] = w3[2]; + w3[2] = w3[3]; + w3[3] = c0[0]; + c0[0] = c0[1]; + c0[1] = c0[2]; + c0[2] = c0[3]; + c0[3] = c1[0]; + c1[0] = c1[1]; + c1[1] = c1[2]; + c1[2] = 0; + } - case 79: - w4[3] = w4[3] | 0x80000000; break; - case 80: - w5[0] = 0x80; - break; + case 7: + c1[3] = amd_bytealign_S ( 0, w3[3], offset_minus_4); + c1[2] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); + c1[1] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); + c1[0] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); + c0[3] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); + c0[2] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); + c0[1] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); + c0[0] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); + w3[3] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); + w3[2] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); + w3[1] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); + w3[0] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); + w2[3] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); + w2[2] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); + w2[1] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); + w2[0] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); + w1[3] = amd_bytealign_S (w0[0], 0, offset_minus_4); + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; - case 81: - w5[0] = w5[0] | 0x8000; - break; + if (offset_mod_4 == 0) + { + w1[3] = w2[0]; + w2[0] = w2[1]; + w2[1] = w2[2]; + w2[2] = w2[3]; + w2[3] = w3[0]; + w3[0] = w3[1]; + w3[1] = w3[2]; + w3[2] = w3[3]; + w3[3] = c0[0]; + c0[0] = c0[1]; + c0[1] = c0[2]; + c0[2] = c0[3]; + c0[3] = c1[0]; + c1[0] = c1[1]; + c1[1] = c1[2]; + c1[2] = c1[3]; + c1[3] = 0; + } - case 82: - w5[0] = w5[0] | 0x800000; break; - case 83: - w5[0] = w5[0] | 0x80000000; - break; + case 8: + c2[0] = amd_bytealign_S ( 0, w3[3], offset_minus_4); + c1[3] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); + c1[2] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); + c1[1] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); + c1[0] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); + c0[3] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); + c0[2] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); + c0[1] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); + c0[0] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); + w3[3] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); + w3[2] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); + w3[1] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); + w3[0] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); + w2[3] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); + w2[2] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); + w2[1] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); + w2[0] = amd_bytealign_S (w0[0], 0, offset_minus_4); + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; - case 84: - w5[1] = 0x80; - break; + if (offset_mod_4 == 0) + { + w2[0] = w2[1]; + w2[1] = w2[2]; + w2[2] = w2[3]; + w2[3] = w3[0]; + w3[0] = w3[1]; + w3[1] = w3[2]; + w3[2] = w3[3]; + w3[3] = c0[0]; + c0[0] = c0[1]; + c0[1] = c0[2]; + c0[2] = c0[3]; + c0[3] = c1[0]; + c1[0] = c1[1]; + c1[1] = c1[2]; + c1[2] = c1[3]; + c1[3] = c2[0]; + c2[0] = 0; + } - case 85: - w5[1] = w5[1] | 0x8000; break; - case 86: - w5[1] = w5[1] | 0x800000; - break; + case 9: + c2[1] = amd_bytealign_S ( 0, w3[3], offset_minus_4); + c2[0] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); + c1[3] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); + c1[2] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); + c1[1] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); + c1[0] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); + c0[3] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); + c0[2] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); + c0[1] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); + c0[0] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); + w3[3] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); + w3[2] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); + w3[1] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); + w3[0] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); + w2[3] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); + w2[2] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); + w2[1] = amd_bytealign_S (w0[0], 0, offset_minus_4); + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; - case 87: - w5[1] = w5[1] | 0x80000000; - break; + if (offset_mod_4 == 0) + { + w2[1] = w2[2]; + w2[2] = w2[3]; + w2[3] = w3[0]; + w3[0] = w3[1]; + w3[1] = w3[2]; + w3[2] = w3[3]; + w3[3] = c0[0]; + c0[0] = c0[1]; + c0[1] = c0[2]; + c0[2] = c0[3]; + c0[3] = c1[0]; + c1[0] = c1[1]; + c1[1] = c1[2]; + c1[2] = c1[3]; + c1[3] = c2[0]; + c2[0] = c2[1]; + c2[1] = 0; + } - case 88: - w5[2] = 0x80; break; - case 89: - w5[2] = w5[2] | 0x8000; - break; + case 10: + c2[2] = amd_bytealign_S ( 0, w3[3], offset_minus_4); + c2[1] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); + c2[0] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); + c1[3] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); + c1[2] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); + c1[1] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); + c1[0] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); + c0[3] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); + c0[2] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); + c0[1] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); + c0[0] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); + w3[3] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); + w3[2] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); + w3[1] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); + w3[0] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); + w2[3] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); + w2[2] = amd_bytealign_S (w0[0], 0, offset_minus_4); + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; - case 90: - w5[2] = w5[2] | 0x800000; - break; + if (offset_mod_4 == 0) + { + w2[2] = w2[3]; + w2[3] = w3[0]; + w3[0] = w3[1]; + w3[1] = w3[2]; + w3[2] = w3[3]; + w3[3] = c0[0]; + c0[0] = c0[1]; + c0[1] = c0[2]; + c0[2] = c0[3]; + c0[3] = c1[0]; + c1[0] = c1[1]; + c1[1] = c1[2]; + c1[2] = c1[3]; + c1[3] = c2[0]; + c2[0] = c2[1]; + c2[1] = c2[2]; + c2[2] = 0; + } - case 91: - w5[2] = w5[2] | 0x80000000; break; - case 92: - w5[3] = 0x80; - break; + case 11: + c2[3] = amd_bytealign_S ( 0, w3[3], offset_minus_4); + c2[2] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); + c2[1] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); + c2[0] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); + c1[3] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); + c1[2] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); + c1[1] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); + c1[0] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); + c0[3] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); + c0[2] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); + c0[1] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); + c0[0] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); + w3[3] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); + w3[2] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); + w3[1] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); + w3[0] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); + w2[3] = amd_bytealign_S (w0[0], 0, offset_minus_4); + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; - case 93: - w5[3] = w5[3] | 0x8000; - break; + if (offset_mod_4 == 0) + { + w2[3] = w3[0]; + w3[0] = w3[1]; + w3[1] = w3[2]; + w3[2] = w3[3]; + w3[3] = c0[0]; + c0[0] = c0[1]; + c0[1] = c0[2]; + c0[2] = c0[3]; + c0[3] = c1[0]; + c1[0] = c1[1]; + c1[1] = c1[2]; + c1[2] = c1[3]; + c1[3] = c2[0]; + c2[0] = c2[1]; + c2[1] = c2[2]; + c2[2] = c2[3]; + c2[3] = 0; + } - case 94: - w5[3] = w5[3] | 0x800000; break; - case 95: - w5[3] = w5[3] | 0x80000000; - break; + case 12: + c3[0] = amd_bytealign_S ( 0, w3[3], offset_minus_4); + c2[3] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); + c2[2] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); + c2[1] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); + c2[0] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); + c1[3] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); + c1[2] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); + c1[1] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); + c1[0] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); + c0[3] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); + c0[2] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); + c0[1] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); + c0[0] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); + w3[3] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); + w3[2] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); + w3[1] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); + w3[0] = amd_bytealign_S (w0[0], 0, offset_minus_4); + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; - case 96: - w6[0] = 0x80; - break; + if (offset_mod_4 == 0) + { + w3[0] = w3[1]; + w3[1] = w3[2]; + w3[2] = w3[3]; + w3[3] = c0[0]; + c0[0] = c0[1]; + c0[1] = c0[2]; + c0[2] = c0[3]; + c0[3] = c1[0]; + c1[0] = c1[1]; + c1[1] = c1[2]; + c1[2] = c1[3]; + c1[3] = c2[0]; + c2[0] = c2[1]; + c2[1] = c2[2]; + c2[2] = c2[3]; + c2[3] = c3[0]; + c3[0] = 0; + } - case 97: - w6[0] = w6[0] | 0x8000; break; - case 98: - w6[0] = w6[0] | 0x800000; - break; + case 13: + c3[1] = amd_bytealign_S ( 0, w3[3], offset_minus_4); + c3[0] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); + c2[3] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); + c2[2] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); + c2[1] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); + c2[0] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); + c1[3] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); + c1[2] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); + c1[1] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); + c1[0] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); + c0[3] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); + c0[2] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); + c0[1] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); + c0[0] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); + w3[3] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); + w3[2] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); + w3[1] = amd_bytealign_S (w0[0], 0, offset_minus_4); + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; - case 99: - w6[0] = w6[0] | 0x80000000; - break; + if (offset_mod_4 == 0) + { + w3[1] = w3[2]; + w3[2] = w3[3]; + w3[3] = c0[0]; + c0[0] = c0[1]; + c0[1] = c0[2]; + c0[2] = c0[3]; + c0[3] = c1[0]; + c1[0] = c1[1]; + c1[1] = c1[2]; + c1[2] = c1[3]; + c1[3] = c2[0]; + c2[0] = c2[1]; + c2[1] = c2[2]; + c2[2] = c2[3]; + c2[3] = c3[0]; + c3[0] = c3[1]; + c3[1] = 0; + } - case 100: - w6[1] = 0x80; break; - case 101: - w6[1] = w6[1] | 0x8000; - break; + case 14: + c3[2] = amd_bytealign_S ( 0, w3[3], offset_minus_4); + c3[1] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); + c3[0] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); + c2[3] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); + c2[2] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); + c2[1] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); + c2[0] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); + c1[3] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); + c1[2] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); + c1[1] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); + c1[0] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); + c0[3] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); + c0[2] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); + c0[1] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); + c0[0] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); + w3[3] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); + w3[2] = amd_bytealign_S (w0[0], 0, offset_minus_4); + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; - case 102: - w6[1] = w6[1] | 0x800000; - break; + if (offset_mod_4 == 0) + { + w3[2] = w3[3]; + w3[3] = c0[0]; + c0[0] = c0[1]; + c0[1] = c0[2]; + c0[2] = c0[3]; + c0[3] = c1[0]; + c1[0] = c1[1]; + c1[1] = c1[2]; + c1[2] = c1[3]; + c1[3] = c2[0]; + c2[0] = c2[1]; + c2[1] = c2[2]; + c2[2] = c2[3]; + c2[3] = c3[0]; + c3[0] = c3[1]; + c3[1] = c3[2]; + c3[2] = 0; + } - case 103: - w6[1] = w6[1] | 0x80000000; break; - case 104: - w6[2] = 0x80; - break; + case 15: + c3[3] = amd_bytealign_S ( 0, w3[3], offset_minus_4); + c3[2] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); + c3[1] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); + c3[0] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); + c2[3] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); + c2[2] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); + c2[1] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); + c2[0] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); + c1[3] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); + c1[2] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); + c1[1] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); + c1[0] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); + c0[3] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); + c0[2] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); + c0[1] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); + c0[0] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); + w3[3] = amd_bytealign_S (w0[0], 0, offset_minus_4); + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; - case 105: - w6[2] = w6[2] | 0x8000; - break; + if (offset_mod_4 == 0) + { + w3[3] = c0[0]; + c0[0] = c0[1]; + c0[1] = c0[2]; + c0[2] = c0[3]; + c0[3] = c1[0]; + c1[0] = c1[1]; + c1[1] = c1[2]; + c1[2] = c1[3]; + c1[3] = c2[0]; + c2[0] = c2[1]; + c2[1] = c2[2]; + c2[2] = c2[3]; + c2[3] = c3[0]; + c3[0] = c3[1]; + c3[1] = c3[2]; + c3[2] = c3[3]; + c3[3] = 0; + } - case 106: - w6[2] = w6[2] | 0x800000; break; + } +} - case 107: - w6[2] = w6[2] | 0x80000000; - break; +inline void switch_buffer_by_offset_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 offset) +{ + #if defined IS_AMD || defined IS_GENERIC + switch (offset / 4) + { + case 0: + w3[3] = amd_bytealign_S (w3[2], w3[3], offset); + w3[2] = amd_bytealign_S (w3[1], w3[2], offset); + w3[1] = amd_bytealign_S (w3[0], w3[1], offset); + w3[0] = amd_bytealign_S (w2[3], w3[0], offset); + w2[3] = amd_bytealign_S (w2[2], w2[3], offset); + w2[2] = amd_bytealign_S (w2[1], w2[2], offset); + w2[1] = amd_bytealign_S (w2[0], w2[1], offset); + w2[0] = amd_bytealign_S (w1[3], w2[0], offset); + w1[3] = amd_bytealign_S (w1[2], w1[3], offset); + w1[2] = amd_bytealign_S (w1[1], w1[2], offset); + w1[1] = amd_bytealign_S (w1[0], w1[1], offset); + w1[0] = amd_bytealign_S (w0[3], w1[0], offset); + w0[3] = amd_bytealign_S (w0[2], w0[3], offset); + w0[2] = amd_bytealign_S (w0[1], w0[2], offset); + w0[1] = amd_bytealign_S (w0[0], w0[1], offset); + w0[0] = amd_bytealign_S ( 0, w0[0], offset); - case 108: - w6[3] = 0x80; break; - case 109: - w6[3] = w6[3] | 0x8000; - break; + case 1: + w3[3] = amd_bytealign_S (w3[1], w3[2], offset); + w3[2] = amd_bytealign_S (w3[0], w3[1], offset); + w3[1] = amd_bytealign_S (w2[3], w3[0], offset); + w3[0] = amd_bytealign_S (w2[2], w2[3], offset); + w2[3] = amd_bytealign_S (w2[1], w2[2], offset); + w2[2] = amd_bytealign_S (w2[0], w2[1], offset); + w2[1] = amd_bytealign_S (w1[3], w2[0], offset); + w2[0] = amd_bytealign_S (w1[2], w1[3], offset); + w1[3] = amd_bytealign_S (w1[1], w1[2], offset); + w1[2] = amd_bytealign_S (w1[0], w1[1], offset); + w1[1] = amd_bytealign_S (w0[3], w1[0], offset); + w1[0] = amd_bytealign_S (w0[2], w0[3], offset); + w0[3] = amd_bytealign_S (w0[1], w0[2], offset); + w0[2] = amd_bytealign_S (w0[0], w0[1], offset); + w0[1] = amd_bytealign_S ( 0, w0[0], offset); + w0[0] = 0; - case 110: - w6[3] = w6[3] | 0x800000; break; - case 111: - w6[3] = w6[3] | 0x80000000; - break; + case 2: + w3[3] = amd_bytealign_S (w3[0], w3[1], offset); + w3[2] = amd_bytealign_S (w2[3], w3[0], offset); + w3[1] = amd_bytealign_S (w2[2], w2[3], offset); + w3[0] = amd_bytealign_S (w2[1], w2[2], offset); + w2[3] = amd_bytealign_S (w2[0], w2[1], offset); + w2[2] = amd_bytealign_S (w1[3], w2[0], offset); + w2[1] = amd_bytealign_S (w1[2], w1[3], offset); + w2[0] = amd_bytealign_S (w1[1], w1[2], offset); + w1[3] = amd_bytealign_S (w1[0], w1[1], offset); + w1[2] = amd_bytealign_S (w0[3], w1[0], offset); + w1[1] = amd_bytealign_S (w0[2], w0[3], offset); + w1[0] = amd_bytealign_S (w0[1], w0[2], offset); + w0[3] = amd_bytealign_S (w0[0], w0[1], offset); + w0[2] = amd_bytealign_S ( 0, w0[0], offset); + w0[1] = 0; + w0[0] = 0; - case 112: - w7[0] = 0x80; break; - case 113: - w7[0] = w7[0] | 0x8000; - break; + case 3: + w3[3] = amd_bytealign_S (w2[3], w3[0], offset); + w3[2] = amd_bytealign_S (w2[2], w2[3], offset); + w3[1] = amd_bytealign_S (w2[1], w2[2], offset); + w3[0] = amd_bytealign_S (w2[0], w2[1], offset); + w2[3] = amd_bytealign_S (w1[3], w2[0], offset); + w2[2] = amd_bytealign_S (w1[2], w1[3], offset); + w2[1] = amd_bytealign_S (w1[1], w1[2], offset); + w2[0] = amd_bytealign_S (w1[0], w1[1], offset); + w1[3] = amd_bytealign_S (w0[3], w1[0], offset); + w1[2] = amd_bytealign_S (w0[2], w0[3], offset); + w1[1] = amd_bytealign_S (w0[1], w0[2], offset); + w1[0] = amd_bytealign_S (w0[0], w0[1], offset); + w0[3] = amd_bytealign_S ( 0, w0[0], offset); + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; - case 114: - w7[0] = w7[0] | 0x800000; break; - case 115: - w7[0] = w7[0] | 0x80000000; - break; + case 4: + w3[3] = amd_bytealign_S (w2[2], w2[3], offset); + w3[2] = amd_bytealign_S (w2[1], w2[2], offset); + w3[1] = amd_bytealign_S (w2[0], w2[1], offset); + w3[0] = amd_bytealign_S (w1[3], w2[0], offset); + w2[3] = amd_bytealign_S (w1[2], w1[3], offset); + w2[2] = amd_bytealign_S (w1[1], w1[2], offset); + w2[1] = amd_bytealign_S (w1[0], w1[1], offset); + w2[0] = amd_bytealign_S (w0[3], w1[0], offset); + w1[3] = amd_bytealign_S (w0[2], w0[3], offset); + w1[2] = amd_bytealign_S (w0[1], w0[2], offset); + w1[1] = amd_bytealign_S (w0[0], w0[1], offset); + w1[0] = amd_bytealign_S ( 0, w0[0], offset); + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; - case 116: - w7[1] = 0x80; break; - case 117: - w7[1] = w7[1] | 0x8000; - break; + case 5: + w3[3] = amd_bytealign_S (w2[1], w2[2], offset); + w3[2] = amd_bytealign_S (w2[0], w2[1], offset); + w3[1] = amd_bytealign_S (w1[3], w2[0], offset); + w3[0] = amd_bytealign_S (w1[2], w1[3], offset); + w2[3] = amd_bytealign_S (w1[1], w1[2], offset); + w2[2] = amd_bytealign_S (w1[0], w1[1], offset); + w2[1] = amd_bytealign_S (w0[3], w1[0], offset); + w2[0] = amd_bytealign_S (w0[2], w0[3], offset); + w1[3] = amd_bytealign_S (w0[1], w0[2], offset); + w1[2] = amd_bytealign_S (w0[0], w0[1], offset); + w1[1] = amd_bytealign_S ( 0, w0[0], offset); + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; - case 118: - w7[1] = w7[1] | 0x800000; break; - case 119: - w7[1] = w7[1] | 0x80000000; - break; + case 6: + w3[3] = amd_bytealign_S (w2[0], w2[1], offset); + w3[2] = amd_bytealign_S (w1[3], w2[0], offset); + w3[1] = amd_bytealign_S (w1[2], w1[3], offset); + w3[0] = amd_bytealign_S (w1[1], w1[2], offset); + w2[3] = amd_bytealign_S (w1[0], w1[1], offset); + w2[2] = amd_bytealign_S (w0[3], w1[0], offset); + w2[1] = amd_bytealign_S (w0[2], w0[3], offset); + w2[0] = amd_bytealign_S (w0[1], w0[2], offset); + w1[3] = amd_bytealign_S (w0[0], w0[1], offset); + w1[2] = amd_bytealign_S ( 0, w0[0], offset); + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; - case 120: - w7[2] = 0x80; break; - case 121: - w7[2] = w7[2] | 0x8000; - break; + case 7: + w3[3] = amd_bytealign_S (w1[3], w2[0], offset); + w3[2] = amd_bytealign_S (w1[2], w1[3], offset); + w3[1] = amd_bytealign_S (w1[1], w1[2], offset); + w3[0] = amd_bytealign_S (w1[0], w1[1], offset); + w2[3] = amd_bytealign_S (w0[3], w1[0], offset); + w2[2] = amd_bytealign_S (w0[2], w0[3], offset); + w2[1] = amd_bytealign_S (w0[1], w0[2], offset); + w2[0] = amd_bytealign_S (w0[0], w0[1], offset); + w1[3] = amd_bytealign_S ( 0, w0[0], offset); + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; - case 122: - w7[2] = w7[2] | 0x800000; break; - case 123: - w7[2] = w7[2] | 0x80000000; - break; + case 8: + w3[3] = amd_bytealign_S (w1[2], w1[3], offset); + w3[2] = amd_bytealign_S (w1[1], w1[2], offset); + w3[1] = amd_bytealign_S (w1[0], w1[1], offset); + w3[0] = amd_bytealign_S (w0[3], w1[0], offset); + w2[3] = amd_bytealign_S (w0[2], w0[3], offset); + w2[2] = amd_bytealign_S (w0[1], w0[2], offset); + w2[1] = amd_bytealign_S (w0[0], w0[1], offset); + w2[0] = amd_bytealign_S ( 0, w0[0], offset); + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; - case 124: - w7[3] = 0x80; break; - case 125: - w7[3] = w7[3] | 0x8000; - break; + case 9: + w3[3] = amd_bytealign_S (w1[1], w1[2], offset); + w3[2] = amd_bytealign_S (w1[0], w1[1], offset); + w3[1] = amd_bytealign_S (w0[3], w1[0], offset); + w3[0] = amd_bytealign_S (w0[2], w0[3], offset); + w2[3] = amd_bytealign_S (w0[1], w0[2], offset); + w2[2] = amd_bytealign_S (w0[0], w0[1], offset); + w2[1] = amd_bytealign_S ( 0, w0[0], offset); + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; - case 126: - w7[3] = w7[3] | 0x800000; break; - case 127: - w7[3] = w7[3] | 0x80000000; + case 10: + w3[3] = amd_bytealign_S (w1[0], w1[1], offset); + w3[2] = amd_bytealign_S (w0[3], w1[0], offset); + w3[1] = amd_bytealign_S (w0[2], w0[3], offset); + w3[0] = amd_bytealign_S (w0[1], w0[2], offset); + w2[3] = amd_bytealign_S (w0[0], w0[1], offset); + w2[2] = amd_bytealign_S ( 0, w0[0], offset); + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + break; - } -} -inline void truncate_block_S (u32 w[4], const u32 len) -{ - switch (len) - { - case 0: w[0] &= 0; - w[1] &= 0; - w[2] &= 0; - w[3] &= 0; - break; - case 1: w[0] &= 0x000000FF; - w[1] &= 0; - w[2] &= 0; - w[3] &= 0; - break; - case 2: w[0] &= 0x0000FFFF; - w[1] &= 0; - w[2] &= 0; - w[3] &= 0; - break; - case 3: w[0] &= 0x00FFFFFF; - w[1] &= 0; - w[2] &= 0; - w[3] &= 0; - break; - case 4: w[1] &= 0; - w[2] &= 0; - w[3] &= 0; - break; - case 5: w[1] &= 0x000000FF; - w[2] &= 0; - w[3] &= 0; - break; - case 6: w[1] &= 0x0000FFFF; - w[2] &= 0; - w[3] &= 0; - break; - case 7: w[1] &= 0x00FFFFFF; - w[2] &= 0; - w[3] &= 0; - break; - case 8: w[2] &= 0; - w[3] &= 0; - break; - case 9: w[2] &= 0x000000FF; - w[3] &= 0; - break; - case 10: w[2] &= 0x0000FFFF; - w[3] &= 0; - break; - case 11: w[2] &= 0x00FFFFFF; - w[3] &= 0; - break; - case 12: w[3] &= 0; - break; - case 13: w[3] &= 0x000000FF; - break; - case 14: w[3] &= 0x0000FFFF; - break; - case 15: w[3] &= 0x00FFFFFF; - break; - } -} + case 11: + w3[3] = amd_bytealign_S (w0[3], w1[0], offset); + w3[2] = amd_bytealign_S (w0[2], w0[3], offset); + w3[1] = amd_bytealign_S (w0[1], w0[2], offset); + w3[0] = amd_bytealign_S (w0[0], w0[1], offset); + w2[3] = amd_bytealign_S ( 0, w0[0], offset); + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; -inline void make_utf16be_S (const u32 in[4], u32 out1[4], u32 out2[4]) -{ - #ifdef IS_NV - out2[3] = __byte_perm_S (in[3], 0, 0x3727); - out2[2] = __byte_perm_S (in[3], 0, 0x1707); - out2[1] = __byte_perm_S (in[2], 0, 0x3727); - out2[0] = __byte_perm_S (in[2], 0, 0x1707); - out1[3] = __byte_perm_S (in[1], 0, 0x3727); - out1[2] = __byte_perm_S (in[1], 0, 0x1707); - out1[1] = __byte_perm_S (in[0], 0, 0x3727); - out1[0] = __byte_perm_S (in[0], 0, 0x1707); - #endif + break; - #if defined IS_AMD || defined IS_GENERIC - out2[3] = ((in[3] >> 0) & 0xFF000000) | ((in[3] >> 8) & 0x0000FF00); - out2[2] = ((in[3] << 16) & 0xFF000000) | ((in[3] << 8) & 0x0000FF00); - out2[1] = ((in[2] >> 0) & 0xFF000000) | ((in[2] >> 8) & 0x0000FF00); - out2[0] = ((in[2] << 16) & 0xFF000000) | ((in[2] << 8) & 0x0000FF00); - out1[3] = ((in[1] >> 0) & 0xFF000000) | ((in[1] >> 8) & 0x0000FF00); - out1[2] = ((in[1] << 16) & 0xFF000000) | ((in[1] << 8) & 0x0000FF00); - out1[1] = ((in[0] >> 0) & 0xFF000000) | ((in[0] >> 8) & 0x0000FF00); - out1[0] = ((in[0] << 16) & 0xFF000000) | ((in[0] << 8) & 0x0000FF00); - #endif -} + case 12: + w3[3] = amd_bytealign_S (w0[2], w0[3], offset); + w3[2] = amd_bytealign_S (w0[1], w0[2], offset); + w3[1] = amd_bytealign_S (w0[0], w0[1], offset); + w3[0] = amd_bytealign_S ( 0, w0[0], offset); + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; -inline void make_utf16le_S (const u32 in[4], u32 out1[4], u32 out2[4]) -{ - #ifdef IS_NV - out2[3] = __byte_perm_S (in[3], 0, 0x7372); - out2[2] = __byte_perm_S (in[3], 0, 0x7170); - out2[1] = __byte_perm_S (in[2], 0, 0x7372); - out2[0] = __byte_perm_S (in[2], 0, 0x7170); - out1[3] = __byte_perm_S (in[1], 0, 0x7372); - out1[2] = __byte_perm_S (in[1], 0, 0x7170); - out1[1] = __byte_perm_S (in[0], 0, 0x7372); - out1[0] = __byte_perm_S (in[0], 0, 0x7170); - #endif + break; - #if defined IS_AMD || defined IS_GENERIC - out2[3] = ((in[3] >> 8) & 0x00FF0000) | ((in[3] >> 16) & 0x000000FF); - out2[2] = ((in[3] << 8) & 0x00FF0000) | ((in[3] >> 0) & 0x000000FF); - out2[1] = ((in[2] >> 8) & 0x00FF0000) | ((in[2] >> 16) & 0x000000FF); - out2[0] = ((in[2] << 8) & 0x00FF0000) | ((in[2] >> 0) & 0x000000FF); - out1[3] = ((in[1] >> 8) & 0x00FF0000) | ((in[1] >> 16) & 0x000000FF); - out1[2] = ((in[1] << 8) & 0x00FF0000) | ((in[1] >> 0) & 0x000000FF); - out1[1] = ((in[0] >> 8) & 0x00FF0000) | ((in[0] >> 16) & 0x000000FF); - out1[0] = ((in[0] << 8) & 0x00FF0000) | ((in[0] >> 0) & 0x000000FF); - #endif -} + case 13: + w3[3] = amd_bytealign_S (w0[1], w0[2], offset); + w3[2] = amd_bytealign_S (w0[0], w0[1], offset); + w3[1] = amd_bytealign_S ( 0, w0[0], offset); + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; -inline void undo_utf16be_S (const u32 in1[4], const u32 in2[4], u32 out[4]) -{ - #ifdef IS_NV - out[0] = __byte_perm_S (in1[0], in1[1], 0x4602); - out[1] = __byte_perm_S (in1[2], in1[3], 0x4602); - out[2] = __byte_perm_S (in2[0], in2[1], 0x4602); - out[3] = __byte_perm_S (in2[2], in2[3], 0x4602); - #endif + break; - #if defined IS_AMD || defined IS_GENERIC - out[0] = ((in1[0] & 0x0000ff00) >> 8) | ((in1[0] & 0xff000000) >> 16) - | ((in1[1] & 0x0000ff00) << 8) | ((in1[1] & 0xff000000) << 0); - out[1] = ((in1[2] & 0x0000ff00) >> 8) | ((in1[2] & 0xff000000) >> 16) - | ((in1[3] & 0x0000ff00) << 8) | ((in1[3] & 0xff000000) << 0); - out[2] = ((in2[0] & 0x0000ff00) >> 8) | ((in2[0] & 0xff000000) >> 16) - | ((in2[1] & 0x0000ff00) << 8) | ((in2[1] & 0xff000000) << 0); - out[3] = ((in2[2] & 0x0000ff00) >> 8) | ((in2[2] & 0xff000000) >> 16) - | ((in2[3] & 0x0000ff00) << 8) | ((in2[3] & 0xff000000) << 0); - #endif -} + case 14: + w3[3] = amd_bytealign_S (w0[0], w0[1], offset); + w3[2] = amd_bytealign_S ( 0, w0[0], offset); + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; -inline void undo_utf16le_S (const u32 in1[4], const u32 in2[4], u32 out[4]) -{ - #ifdef IS_NV - out[0] = __byte_perm_S (in1[0], in1[1], 0x6420); - out[1] = __byte_perm_S (in1[2], in1[3], 0x6420); - out[2] = __byte_perm_S (in2[0], in2[1], 0x6420); - out[3] = __byte_perm_S (in2[2], in2[3], 0x6420); - #endif + break; - #if defined IS_AMD || defined IS_GENERIC - out[0] = ((in1[0] & 0x000000ff) >> 0) | ((in1[0] & 0x00ff0000) >> 8) - | ((in1[1] & 0x000000ff) << 16) | ((in1[1] & 0x00ff0000) << 8); - out[1] = ((in1[2] & 0x000000ff) >> 0) | ((in1[2] & 0x00ff0000) >> 8) - | ((in1[3] & 0x000000ff) << 16) | ((in1[3] & 0x00ff0000) << 8); - out[2] = ((in2[0] & 0x000000ff) >> 0) | ((in2[0] & 0x00ff0000) >> 8) - | ((in2[1] & 0x000000ff) << 16) | ((in2[1] & 0x00ff0000) << 8); - out[3] = ((in2[2] & 0x000000ff) >> 0) | ((in2[2] & 0x00ff0000) >> 8) - | ((in2[3] & 0x000000ff) << 16) | ((in2[3] & 0x00ff0000) << 8); - #endif -} + case 15: + w3[3] = amd_bytealign_S ( 0, w0[0], offset); + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; -inline void switch_buffer_by_offset_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 offset) -{ - #if defined IS_AMD || defined IS_GENERIC - const int offset_mod_4 = offset & 3; + break; + } + #endif - const int offset_minus_4 = 4 - offset; + #ifdef IS_NV + const int selector = (0x76543210 >> ((offset & 3) * 4)) & 0xffff; switch (offset / 4) { case 0: - w3[3] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); - w3[2] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); - w3[1] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); - w3[0] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); - w2[3] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); - w2[2] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); - w2[1] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - w2[0] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - w1[3] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w1[2] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w1[1] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w1[0] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w0[3] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w0[2] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w0[1] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w0[0] = amd_bytealign_S (w0[0], 0, offset_minus_4); - - if (offset_mod_4 == 0) - { - w0[0] = w0[1]; - w0[1] = w0[2]; - w0[2] = w0[3]; - w0[3] = w1[0]; - w1[0] = w1[1]; - w1[1] = w1[2]; - w1[2] = w1[3]; - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = 0; - } + w3[3] = __byte_perm_S (w3[3], w3[2], selector); + w3[2] = __byte_perm_S (w3[2], w3[1], selector); + w3[1] = __byte_perm_S (w3[1], w3[0], selector); + w3[0] = __byte_perm_S (w3[0], w2[3], selector); + w2[3] = __byte_perm_S (w2[3], w2[2], selector); + w2[2] = __byte_perm_S (w2[2], w2[1], selector); + w2[1] = __byte_perm_S (w2[1], w2[0], selector); + w2[0] = __byte_perm_S (w2[0], w1[3], selector); + w1[3] = __byte_perm_S (w1[3], w1[2], selector); + w1[2] = __byte_perm_S (w1[2], w1[1], selector); + w1[1] = __byte_perm_S (w1[1], w1[0], selector); + w1[0] = __byte_perm_S (w1[0], w0[3], selector); + w0[3] = __byte_perm_S (w0[3], w0[2], selector); + w0[2] = __byte_perm_S (w0[2], w0[1], selector); + w0[1] = __byte_perm_S (w0[1], w0[0], selector); + w0[0] = __byte_perm_S (w0[0], 0, selector); break; case 1: - w3[3] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); - w3[2] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); - w3[1] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); - w3[0] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); - w2[3] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); - w2[2] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - w2[1] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - w2[0] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w1[3] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w1[2] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w1[1] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w1[0] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w0[3] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w0[2] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w0[1] = amd_bytealign_S (w0[0], 0, offset_minus_4); - w0[0] = 0; - - if (offset_mod_4 == 0) - { - w0[1] = w0[2]; - w0[2] = w0[3]; - w0[3] = w1[0]; - w1[0] = w1[1]; - w1[1] = w1[2]; - w1[2] = w1[3]; - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = 0; - } + w3[3] = __byte_perm_S (w3[2], w3[1], selector); + w3[2] = __byte_perm_S (w3[1], w3[0], selector); + w3[1] = __byte_perm_S (w3[0], w2[3], selector); + w3[0] = __byte_perm_S (w2[3], w2[2], selector); + w2[3] = __byte_perm_S (w2[2], w2[1], selector); + w2[2] = __byte_perm_S (w2[1], w2[0], selector); + w2[1] = __byte_perm_S (w2[0], w1[3], selector); + w2[0] = __byte_perm_S (w1[3], w1[2], selector); + w1[3] = __byte_perm_S (w1[2], w1[1], selector); + w1[2] = __byte_perm_S (w1[1], w1[0], selector); + w1[1] = __byte_perm_S (w1[0], w0[3], selector); + w1[0] = __byte_perm_S (w0[3], w0[2], selector); + w0[3] = __byte_perm_S (w0[2], w0[1], selector); + w0[2] = __byte_perm_S (w0[1], w0[0], selector); + w0[1] = __byte_perm_S (w0[0], 0, selector); + w0[0] = 0; break; case 2: - w3[3] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); - w3[2] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); - w3[1] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); - w3[0] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); - w2[3] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - w2[2] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - w2[1] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w2[0] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w1[3] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w1[2] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w1[1] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w1[0] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w0[3] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w0[2] = amd_bytealign_S (w0[0], 0, offset_minus_4); + w3[3] = __byte_perm_S (w3[1], w3[0], selector); + w3[2] = __byte_perm_S (w3[0], w2[3], selector); + w3[1] = __byte_perm_S (w2[3], w2[2], selector); + w3[0] = __byte_perm_S (w2[2], w2[1], selector); + w2[3] = __byte_perm_S (w2[1], w2[0], selector); + w2[2] = __byte_perm_S (w2[0], w1[3], selector); + w2[1] = __byte_perm_S (w1[3], w1[2], selector); + w2[0] = __byte_perm_S (w1[2], w1[1], selector); + w1[3] = __byte_perm_S (w1[1], w1[0], selector); + w1[2] = __byte_perm_S (w1[0], w0[3], selector); + w1[1] = __byte_perm_S (w0[3], w0[2], selector); + w1[0] = __byte_perm_S (w0[2], w0[1], selector); + w0[3] = __byte_perm_S (w0[1], w0[0], selector); + w0[2] = __byte_perm_S (w0[0], 0, selector); w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w0[2] = w0[3]; - w0[3] = w1[0]; - w1[0] = w1[1]; - w1[1] = w1[2]; - w1[2] = w1[3]; - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = 0; - } - break; case 3: - w3[3] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); - w3[2] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); - w3[1] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); - w3[0] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - w2[3] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - w2[2] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w2[1] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w2[0] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w1[3] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w1[2] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w1[1] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w1[0] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w0[3] = amd_bytealign_S (w0[0], 0, offset_minus_4); + w3[3] = __byte_perm_S (w3[0], w2[3], selector); + w3[2] = __byte_perm_S (w2[3], w2[2], selector); + w3[1] = __byte_perm_S (w2[2], w2[1], selector); + w3[0] = __byte_perm_S (w2[1], w2[0], selector); + w2[3] = __byte_perm_S (w2[0], w1[3], selector); + w2[2] = __byte_perm_S (w1[3], w1[2], selector); + w2[1] = __byte_perm_S (w1[2], w1[1], selector); + w2[0] = __byte_perm_S (w1[1], w1[0], selector); + w1[3] = __byte_perm_S (w1[0], w0[3], selector); + w1[2] = __byte_perm_S (w0[3], w0[2], selector); + w1[1] = __byte_perm_S (w0[2], w0[1], selector); + w1[0] = __byte_perm_S (w0[1], w0[0], selector); + w0[3] = __byte_perm_S (w0[0], 0, selector); w0[2] = 0; w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w0[3] = w1[0]; - w1[0] = w1[1]; - w1[1] = w1[2]; - w1[2] = w1[3]; - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = 0; - } - break; case 4: - w3[3] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); - w3[2] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); - w3[1] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - w3[0] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - w2[3] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w2[2] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w2[1] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w2[0] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w1[3] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w1[2] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w1[1] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w1[0] = amd_bytealign_S (w0[0], 0, offset_minus_4); + w3[3] = __byte_perm_S (w2[3], w2[2], selector); + w3[2] = __byte_perm_S (w2[2], w2[1], selector); + w3[1] = __byte_perm_S (w2[1], w2[0], selector); + w3[0] = __byte_perm_S (w2[0], w1[3], selector); + w2[3] = __byte_perm_S (w1[3], w1[2], selector); + w2[2] = __byte_perm_S (w1[2], w1[1], selector); + w2[1] = __byte_perm_S (w1[1], w1[0], selector); + w2[0] = __byte_perm_S (w1[0], w0[3], selector); + w1[3] = __byte_perm_S (w0[3], w0[2], selector); + w1[2] = __byte_perm_S (w0[2], w0[1], selector); + w1[1] = __byte_perm_S (w0[1], w0[0], selector); + w1[0] = __byte_perm_S (w0[0], 0, selector); w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w1[0] = w1[1]; - w1[1] = w1[2]; - w1[2] = w1[3]; - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = 0; - } - break; case 5: - w3[3] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); - w3[2] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - w3[1] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - w3[0] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w2[3] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w2[2] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w2[1] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w2[0] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w1[3] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w1[2] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w1[1] = amd_bytealign_S (w0[0], 0, offset_minus_4); + w3[3] = __byte_perm_S (w2[2], w2[1], selector); + w3[2] = __byte_perm_S (w2[1], w2[0], selector); + w3[1] = __byte_perm_S (w2[0], w1[3], selector); + w3[0] = __byte_perm_S (w1[3], w1[2], selector); + w2[3] = __byte_perm_S (w1[2], w1[1], selector); + w2[2] = __byte_perm_S (w1[1], w1[0], selector); + w2[1] = __byte_perm_S (w1[0], w0[3], selector); + w2[0] = __byte_perm_S (w0[3], w0[2], selector); + w1[3] = __byte_perm_S (w0[2], w0[1], selector); + w1[2] = __byte_perm_S (w0[1], w0[0], selector); + w1[1] = __byte_perm_S (w0[0], 0, selector); w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w1[1] = w1[2]; - w1[2] = w1[3]; - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = 0; - } - break; case 6: - w3[3] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - w3[2] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - w3[1] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w3[0] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w2[3] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w2[2] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w2[1] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w2[0] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w1[3] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w1[2] = amd_bytealign_S (w0[0], 0, offset_minus_4); + w3[3] = __byte_perm_S (w2[1], w2[0], selector); + w3[2] = __byte_perm_S (w2[0], w1[3], selector); + w3[1] = __byte_perm_S (w1[3], w1[2], selector); + w3[0] = __byte_perm_S (w1[2], w1[1], selector); + w2[3] = __byte_perm_S (w1[1], w1[0], selector); + w2[2] = __byte_perm_S (w1[0], w0[3], selector); + w2[1] = __byte_perm_S (w0[3], w0[2], selector); + w2[0] = __byte_perm_S (w0[2], w0[1], selector); + w1[3] = __byte_perm_S (w0[1], w0[0], selector); + w1[2] = __byte_perm_S (w0[0], 0, selector); w1[1] = 0; w1[0] = 0; w0[3] = 0; @@ -6974,32 +9384,18 @@ inline void switch_buffer_by_offset_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w1[2] = w1[3]; - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = 0; - } - break; case 7: - w3[3] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - w3[2] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w3[1] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w3[0] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w2[3] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w2[2] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w2[1] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w2[0] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w1[3] = amd_bytealign_S (w0[0], 0, offset_minus_4); + w3[3] = __byte_perm_S (w2[0], w1[3], selector); + w3[2] = __byte_perm_S (w1[3], w1[2], selector); + w3[1] = __byte_perm_S (w1[2], w1[1], selector); + w3[0] = __byte_perm_S (w1[1], w1[0], selector); + w2[3] = __byte_perm_S (w1[0], w0[3], selector); + w2[2] = __byte_perm_S (w0[3], w0[2], selector); + w2[1] = __byte_perm_S (w0[2], w0[1], selector); + w2[0] = __byte_perm_S (w0[1], w0[0], selector); + w1[3] = __byte_perm_S (w0[0], 0, selector); w1[2] = 0; w1[1] = 0; w1[0] = 0; @@ -7008,30 +9404,17 @@ inline void switch_buffer_by_offset_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = 0; - } - break; - - case 8: - w3[3] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w3[2] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w3[1] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w3[0] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w2[3] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w2[2] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w2[1] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w2[0] = amd_bytealign_S (w0[0], 0, offset_minus_4); + + case 8: + w3[3] = __byte_perm_S (w1[3], w1[2], selector); + w3[2] = __byte_perm_S (w1[2], w1[1], selector); + w3[1] = __byte_perm_S (w1[1], w1[0], selector); + w3[0] = __byte_perm_S (w1[0], w0[3], selector); + w2[3] = __byte_perm_S (w0[3], w0[2], selector); + w2[2] = __byte_perm_S (w0[2], w0[1], selector); + w2[1] = __byte_perm_S (w0[1], w0[0], selector); + w2[0] = __byte_perm_S (w0[0], 0, selector); w1[3] = 0; w1[2] = 0; w1[1] = 0; @@ -7041,28 +9424,16 @@ inline void switch_buffer_by_offset_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = 0; - } - break; case 9: - w3[3] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w3[2] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w3[1] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w3[0] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w2[3] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w2[2] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w2[1] = amd_bytealign_S (w0[0], 0, offset_minus_4); + w3[3] = __byte_perm_S (w1[2], w1[1], selector); + w3[2] = __byte_perm_S (w1[1], w1[0], selector); + w3[1] = __byte_perm_S (w1[0], w0[3], selector); + w3[0] = __byte_perm_S (w0[3], w0[2], selector); + w2[3] = __byte_perm_S (w0[2], w0[1], selector); + w2[2] = __byte_perm_S (w0[1], w0[0], selector); + w2[1] = __byte_perm_S (w0[0], 0, selector); w2[0] = 0; w1[3] = 0; w1[2] = 0; @@ -7073,26 +9444,15 @@ inline void switch_buffer_by_offset_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = 0; - } - break; case 10: - w3[3] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w3[2] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w3[1] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w3[0] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w2[3] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w2[2] = amd_bytealign_S (w0[0], 0, offset_minus_4); + w3[3] = __byte_perm_S (w1[1], w1[0], selector); + w3[2] = __byte_perm_S (w1[0], w0[3], selector); + w3[1] = __byte_perm_S (w0[3], w0[2], selector); + w3[0] = __byte_perm_S (w0[2], w0[1], selector); + w2[3] = __byte_perm_S (w0[1], w0[0], selector); + w2[2] = __byte_perm_S (w0[0], 0, selector); w2[1] = 0; w2[0] = 0; w1[3] = 0; @@ -7104,24 +9464,14 @@ inline void switch_buffer_by_offset_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = 0; - } - break; case 11: - w3[3] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w3[2] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w3[1] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w3[0] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w2[3] = amd_bytealign_S (w0[0], 0, offset_minus_4); + w3[3] = __byte_perm_S (w1[0], w0[3], selector); + w3[2] = __byte_perm_S (w0[3], w0[2], selector); + w3[1] = __byte_perm_S (w0[2], w0[1], selector); + w3[0] = __byte_perm_S (w0[1], w0[0], selector); + w2[3] = __byte_perm_S (w0[0], 0, selector); w2[2] = 0; w2[1] = 0; w2[0] = 0; @@ -7134,22 +9484,13 @@ inline void switch_buffer_by_offset_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = 0; - } - break; case 12: - w3[3] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w3[2] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w3[1] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w3[0] = amd_bytealign_S (w0[0], 0, offset_minus_4); + w3[3] = __byte_perm_S (w0[3], w0[2], selector); + w3[2] = __byte_perm_S (w0[2], w0[1], selector); + w3[1] = __byte_perm_S (w0[1], w0[0], selector); + w3[0] = __byte_perm_S (w0[0], 0, selector); w2[3] = 0; w2[2] = 0; w2[1] = 0; @@ -7163,20 +9504,12 @@ inline void switch_buffer_by_offset_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = 0; - } - break; case 13: - w3[3] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w3[2] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w3[1] = amd_bytealign_S (w0[0], 0, offset_minus_4); + w3[3] = __byte_perm_S (w0[2], w0[1], selector); + w3[2] = __byte_perm_S (w0[1], w0[0], selector); + w3[1] = __byte_perm_S (w0[0], 0, selector); w3[0] = 0; w2[3] = 0; w2[2] = 0; @@ -7191,18 +9524,11 @@ inline void switch_buffer_by_offset_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = 0; - } - break; case 14: - w3[3] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w3[2] = amd_bytealign_S (w0[0], 0, offset_minus_4); + w3[3] = __byte_perm_S (w0[1], w0[0], selector); + w3[2] = __byte_perm_S (w0[0], 0, selector); w3[1] = 0; w3[0] = 0; w2[3] = 0; @@ -7218,16 +9544,10 @@ inline void switch_buffer_by_offset_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w3[2] = w3[3]; - w3[3] = 0; - } - break; case 15: - w3[3] = amd_bytealign_S (w0[0], 0, offset_minus_4); + w3[3] = __byte_perm_S (w0[0], 0, selector); w3[2] = 0; w3[1] = 0; w3[0] = 0; @@ -7244,96 +9564,100 @@ inline void switch_buffer_by_offset_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w3[3] = 0; - } - break; } #endif +} - #ifdef IS_NV - const int offset_minus_4 = 4 - (offset % 4); - - const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; - +inline void switch_buffer_by_offset_carry_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 c0[4], u32 c1[4], u32 c2[4], u32 c3[4], const u32 offset) +{ + #if defined IS_AMD || defined IS_GENERIC switch (offset / 4) { case 0: - w3[3] = __byte_perm_S (w3[2], w3[3], selector); - w3[2] = __byte_perm_S (w3[1], w3[2], selector); - w3[1] = __byte_perm_S (w3[0], w3[1], selector); - w3[0] = __byte_perm_S (w2[3], w3[0], selector); - w2[3] = __byte_perm_S (w2[2], w2[3], selector); - w2[2] = __byte_perm_S (w2[1], w2[2], selector); - w2[1] = __byte_perm_S (w2[0], w2[1], selector); - w2[0] = __byte_perm_S (w1[3], w2[0], selector); - w1[3] = __byte_perm_S (w1[2], w1[3], selector); - w1[2] = __byte_perm_S (w1[1], w1[2], selector); - w1[1] = __byte_perm_S (w1[0], w1[1], selector); - w1[0] = __byte_perm_S (w0[3], w1[0], selector); - w0[3] = __byte_perm_S (w0[2], w0[3], selector); - w0[2] = __byte_perm_S (w0[1], w0[2], selector); - w0[1] = __byte_perm_S (w0[0], w0[1], selector); - w0[0] = __byte_perm_S ( 0, w0[0], selector); + c0[0] = amd_bytealign_S (w3[3], 0, offset); + w3[3] = amd_bytealign_S (w3[2], w3[3], offset); + w3[2] = amd_bytealign_S (w3[1], w3[2], offset); + w3[1] = amd_bytealign_S (w3[0], w3[1], offset); + w3[0] = amd_bytealign_S (w2[3], w3[0], offset); + w2[3] = amd_bytealign_S (w2[2], w2[3], offset); + w2[2] = amd_bytealign_S (w2[1], w2[2], offset); + w2[1] = amd_bytealign_S (w2[0], w2[1], offset); + w2[0] = amd_bytealign_S (w1[3], w2[0], offset); + w1[3] = amd_bytealign_S (w1[2], w1[3], offset); + w1[2] = amd_bytealign_S (w1[1], w1[2], offset); + w1[1] = amd_bytealign_S (w1[0], w1[1], offset); + w1[0] = amd_bytealign_S (w0[3], w1[0], offset); + w0[3] = amd_bytealign_S (w0[2], w0[3], offset); + w0[2] = amd_bytealign_S (w0[1], w0[2], offset); + w0[1] = amd_bytealign_S (w0[0], w0[1], offset); + w0[0] = amd_bytealign_S ( 0, w0[0], offset); break; case 1: - w3[3] = __byte_perm_S (w3[1], w3[2], selector); - w3[2] = __byte_perm_S (w3[0], w3[1], selector); - w3[1] = __byte_perm_S (w2[3], w3[0], selector); - w3[0] = __byte_perm_S (w2[2], w2[3], selector); - w2[3] = __byte_perm_S (w2[1], w2[2], selector); - w2[2] = __byte_perm_S (w2[0], w2[1], selector); - w2[1] = __byte_perm_S (w1[3], w2[0], selector); - w2[0] = __byte_perm_S (w1[2], w1[3], selector); - w1[3] = __byte_perm_S (w1[1], w1[2], selector); - w1[2] = __byte_perm_S (w1[0], w1[1], selector); - w1[1] = __byte_perm_S (w0[3], w1[0], selector); - w1[0] = __byte_perm_S (w0[2], w0[3], selector); - w0[3] = __byte_perm_S (w0[1], w0[2], selector); - w0[2] = __byte_perm_S (w0[0], w0[1], selector); - w0[1] = __byte_perm_S ( 0, w0[0], selector); + c0[1] = amd_bytealign_S (w3[3], 0, offset); + c0[0] = amd_bytealign_S (w3[2], w3[3], offset); + w3[3] = amd_bytealign_S (w3[1], w3[2], offset); + w3[2] = amd_bytealign_S (w3[0], w3[1], offset); + w3[1] = amd_bytealign_S (w2[3], w3[0], offset); + w3[0] = amd_bytealign_S (w2[2], w2[3], offset); + w2[3] = amd_bytealign_S (w2[1], w2[2], offset); + w2[2] = amd_bytealign_S (w2[0], w2[1], offset); + w2[1] = amd_bytealign_S (w1[3], w2[0], offset); + w2[0] = amd_bytealign_S (w1[2], w1[3], offset); + w1[3] = amd_bytealign_S (w1[1], w1[2], offset); + w1[2] = amd_bytealign_S (w1[0], w1[1], offset); + w1[1] = amd_bytealign_S (w0[3], w1[0], offset); + w1[0] = amd_bytealign_S (w0[2], w0[3], offset); + w0[3] = amd_bytealign_S (w0[1], w0[2], offset); + w0[2] = amd_bytealign_S (w0[0], w0[1], offset); + w0[1] = amd_bytealign_S ( 0, w0[0], offset); w0[0] = 0; break; case 2: - w3[3] = __byte_perm_S (w3[0], w3[1], selector); - w3[2] = __byte_perm_S (w2[3], w3[0], selector); - w3[1] = __byte_perm_S (w2[2], w2[3], selector); - w3[0] = __byte_perm_S (w2[1], w2[2], selector); - w2[3] = __byte_perm_S (w2[0], w2[1], selector); - w2[2] = __byte_perm_S (w1[3], w2[0], selector); - w2[1] = __byte_perm_S (w1[2], w1[3], selector); - w2[0] = __byte_perm_S (w1[1], w1[2], selector); - w1[3] = __byte_perm_S (w1[0], w1[1], selector); - w1[2] = __byte_perm_S (w0[3], w1[0], selector); - w1[1] = __byte_perm_S (w0[2], w0[3], selector); - w1[0] = __byte_perm_S (w0[1], w0[2], selector); - w0[3] = __byte_perm_S (w0[0], w0[1], selector); - w0[2] = __byte_perm_S ( 0, w0[0], selector); + c0[2] = amd_bytealign_S (w3[3], 0, offset); + c0[1] = amd_bytealign_S (w3[2], w3[3], offset); + c0[0] = amd_bytealign_S (w3[1], w3[2], offset); + w3[3] = amd_bytealign_S (w3[0], w3[1], offset); + w3[2] = amd_bytealign_S (w2[3], w3[0], offset); + w3[1] = amd_bytealign_S (w2[2], w2[3], offset); + w3[0] = amd_bytealign_S (w2[1], w2[2], offset); + w2[3] = amd_bytealign_S (w2[0], w2[1], offset); + w2[2] = amd_bytealign_S (w1[3], w2[0], offset); + w2[1] = amd_bytealign_S (w1[2], w1[3], offset); + w2[0] = amd_bytealign_S (w1[1], w1[2], offset); + w1[3] = amd_bytealign_S (w1[0], w1[1], offset); + w1[2] = amd_bytealign_S (w0[3], w1[0], offset); + w1[1] = amd_bytealign_S (w0[2], w0[3], offset); + w1[0] = amd_bytealign_S (w0[1], w0[2], offset); + w0[3] = amd_bytealign_S (w0[0], w0[1], offset); + w0[2] = amd_bytealign_S ( 0, w0[0], offset); w0[1] = 0; w0[0] = 0; break; case 3: - w3[3] = __byte_perm_S (w2[3], w3[0], selector); - w3[2] = __byte_perm_S (w2[2], w2[3], selector); - w3[1] = __byte_perm_S (w2[1], w2[2], selector); - w3[0] = __byte_perm_S (w2[0], w2[1], selector); - w2[3] = __byte_perm_S (w1[3], w2[0], selector); - w2[2] = __byte_perm_S (w1[2], w1[3], selector); - w2[1] = __byte_perm_S (w1[1], w1[2], selector); - w2[0] = __byte_perm_S (w1[0], w1[1], selector); - w1[3] = __byte_perm_S (w0[3], w1[0], selector); - w1[2] = __byte_perm_S (w0[2], w0[3], selector); - w1[1] = __byte_perm_S (w0[1], w0[2], selector); - w1[0] = __byte_perm_S (w0[0], w0[1], selector); - w0[3] = __byte_perm_S ( 0, w0[0], selector); + c0[3] = amd_bytealign_S (w3[3], 0, offset); + c0[2] = amd_bytealign_S (w3[2], w3[3], offset); + c0[1] = amd_bytealign_S (w3[1], w3[2], offset); + c0[0] = amd_bytealign_S (w3[0], w3[1], offset); + w3[3] = amd_bytealign_S (w2[3], w3[0], offset); + w3[2] = amd_bytealign_S (w2[2], w2[3], offset); + w3[1] = amd_bytealign_S (w2[1], w2[2], offset); + w3[0] = amd_bytealign_S (w2[0], w2[1], offset); + w2[3] = amd_bytealign_S (w1[3], w2[0], offset); + w2[2] = amd_bytealign_S (w1[2], w1[3], offset); + w2[1] = amd_bytealign_S (w1[1], w1[2], offset); + w2[0] = amd_bytealign_S (w1[0], w1[1], offset); + w1[3] = amd_bytealign_S (w0[3], w1[0], offset); + w1[2] = amd_bytealign_S (w0[2], w0[3], offset); + w1[1] = amd_bytealign_S (w0[1], w0[2], offset); + w1[0] = amd_bytealign_S (w0[0], w0[1], offset); + w0[3] = amd_bytealign_S ( 0, w0[0], offset); w0[2] = 0; w0[1] = 0; w0[0] = 0; @@ -7341,18 +9665,23 @@ inline void switch_buffer_by_offset_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w break; case 4: - w3[3] = __byte_perm_S (w2[2], w2[3], selector); - w3[2] = __byte_perm_S (w2[1], w2[2], selector); - w3[1] = __byte_perm_S (w2[0], w2[1], selector); - w3[0] = __byte_perm_S (w1[3], w2[0], selector); - w2[3] = __byte_perm_S (w1[2], w1[3], selector); - w2[2] = __byte_perm_S (w1[1], w1[2], selector); - w2[1] = __byte_perm_S (w1[0], w1[1], selector); - w2[0] = __byte_perm_S (w0[3], w1[0], selector); - w1[3] = __byte_perm_S (w0[2], w0[3], selector); - w1[2] = __byte_perm_S (w0[1], w0[2], selector); - w1[1] = __byte_perm_S (w0[0], w0[1], selector); - w1[0] = __byte_perm_S ( 0, w0[0], selector); + c1[0] = amd_bytealign_S (w3[3], 0, offset); + c0[3] = amd_bytealign_S (w3[2], w3[3], offset); + c0[2] = amd_bytealign_S (w3[1], w3[2], offset); + c0[1] = amd_bytealign_S (w3[0], w3[1], offset); + c0[0] = amd_bytealign_S (w2[3], w3[0], offset); + w3[3] = amd_bytealign_S (w2[2], w2[3], offset); + w3[2] = amd_bytealign_S (w2[1], w2[2], offset); + w3[1] = amd_bytealign_S (w2[0], w2[1], offset); + w3[0] = amd_bytealign_S (w1[3], w2[0], offset); + w2[3] = amd_bytealign_S (w1[2], w1[3], offset); + w2[2] = amd_bytealign_S (w1[1], w1[2], offset); + w2[1] = amd_bytealign_S (w1[0], w1[1], offset); + w2[0] = amd_bytealign_S (w0[3], w1[0], offset); + w1[3] = amd_bytealign_S (w0[2], w0[3], offset); + w1[2] = amd_bytealign_S (w0[1], w0[2], offset); + w1[1] = amd_bytealign_S (w0[0], w0[1], offset); + w1[0] = amd_bytealign_S ( 0, w0[0], offset); w0[3] = 0; w0[2] = 0; w0[1] = 0; @@ -7361,17 +9690,23 @@ inline void switch_buffer_by_offset_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w break; case 5: - w3[3] = __byte_perm_S (w2[1], w2[2], selector); - w3[2] = __byte_perm_S (w2[0], w2[1], selector); - w3[1] = __byte_perm_S (w1[3], w2[0], selector); - w3[0] = __byte_perm_S (w1[2], w1[3], selector); - w2[3] = __byte_perm_S (w1[1], w1[2], selector); - w2[2] = __byte_perm_S (w1[0], w1[1], selector); - w2[1] = __byte_perm_S (w0[3], w1[0], selector); - w2[0] = __byte_perm_S (w0[2], w0[3], selector); - w1[3] = __byte_perm_S (w0[1], w0[2], selector); - w1[2] = __byte_perm_S (w0[0], w0[1], selector); - w1[1] = __byte_perm_S ( 0, w0[0], selector); + c1[1] = amd_bytealign_S (w3[3], 0, offset); + c1[0] = amd_bytealign_S (w3[2], w3[3], offset); + c0[3] = amd_bytealign_S (w3[1], w3[2], offset); + c0[2] = amd_bytealign_S (w3[0], w3[1], offset); + c0[1] = amd_bytealign_S (w2[3], w3[0], offset); + c0[0] = amd_bytealign_S (w2[2], w2[3], offset); + w3[3] = amd_bytealign_S (w2[1], w2[2], offset); + w3[2] = amd_bytealign_S (w2[0], w2[1], offset); + w3[1] = amd_bytealign_S (w1[3], w2[0], offset); + w3[0] = amd_bytealign_S (w1[2], w1[3], offset); + w2[3] = amd_bytealign_S (w1[1], w1[2], offset); + w2[2] = amd_bytealign_S (w1[0], w1[1], offset); + w2[1] = amd_bytealign_S (w0[3], w1[0], offset); + w2[0] = amd_bytealign_S (w0[2], w0[3], offset); + w1[3] = amd_bytealign_S (w0[1], w0[2], offset); + w1[2] = amd_bytealign_S (w0[0], w0[1], offset); + w1[1] = amd_bytealign_S ( 0, w0[0], offset); w1[0] = 0; w0[3] = 0; w0[2] = 0; @@ -7381,16 +9716,23 @@ inline void switch_buffer_by_offset_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w break; case 6: - w3[3] = __byte_perm_S (w2[0], w2[1], selector); - w3[2] = __byte_perm_S (w1[3], w2[0], selector); - w3[1] = __byte_perm_S (w1[2], w1[3], selector); - w3[0] = __byte_perm_S (w1[1], w1[2], selector); - w2[3] = __byte_perm_S (w1[0], w1[1], selector); - w2[2] = __byte_perm_S (w0[3], w1[0], selector); - w2[1] = __byte_perm_S (w0[2], w0[3], selector); - w2[0] = __byte_perm_S (w0[1], w0[2], selector); - w1[3] = __byte_perm_S (w0[0], w0[1], selector); - w1[2] = __byte_perm_S ( 0, w0[0], selector); + c1[2] = amd_bytealign_S (w3[3], 0, offset); + c1[1] = amd_bytealign_S (w3[2], w3[3], offset); + c1[0] = amd_bytealign_S (w3[1], w3[2], offset); + c0[3] = amd_bytealign_S (w3[0], w3[1], offset); + c0[2] = amd_bytealign_S (w2[3], w3[0], offset); + c0[1] = amd_bytealign_S (w2[2], w2[3], offset); + c0[0] = amd_bytealign_S (w2[1], w2[2], offset); + w3[3] = amd_bytealign_S (w2[0], w2[1], offset); + w3[2] = amd_bytealign_S (w1[3], w2[0], offset); + w3[1] = amd_bytealign_S (w1[2], w1[3], offset); + w3[0] = amd_bytealign_S (w1[1], w1[2], offset); + w2[3] = amd_bytealign_S (w1[0], w1[1], offset); + w2[2] = amd_bytealign_S (w0[3], w1[0], offset); + w2[1] = amd_bytealign_S (w0[2], w0[3], offset); + w2[0] = amd_bytealign_S (w0[1], w0[2], offset); + w1[3] = amd_bytealign_S (w0[0], w0[1], offset); + w1[2] = amd_bytealign_S ( 0, w0[0], offset); w1[1] = 0; w1[0] = 0; w0[3] = 0; @@ -7401,15 +9743,23 @@ inline void switch_buffer_by_offset_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w break; case 7: - w3[3] = __byte_perm_S (w1[3], w2[0], selector); - w3[2] = __byte_perm_S (w1[2], w1[3], selector); - w3[1] = __byte_perm_S (w1[1], w1[2], selector); - w3[0] = __byte_perm_S (w1[0], w1[1], selector); - w2[3] = __byte_perm_S (w0[3], w1[0], selector); - w2[2] = __byte_perm_S (w0[2], w0[3], selector); - w2[1] = __byte_perm_S (w0[1], w0[2], selector); - w2[0] = __byte_perm_S (w0[0], w0[1], selector); - w1[3] = __byte_perm_S ( 0, w0[0], selector); + c1[3] = amd_bytealign_S (w3[3], 0, offset); + c1[2] = amd_bytealign_S (w3[2], w3[3], offset); + c1[1] = amd_bytealign_S (w3[1], w3[2], offset); + c1[0] = amd_bytealign_S (w3[0], w3[1], offset); + c0[3] = amd_bytealign_S (w2[3], w3[0], offset); + c0[2] = amd_bytealign_S (w2[2], w2[3], offset); + c0[1] = amd_bytealign_S (w2[1], w2[2], offset); + c0[0] = amd_bytealign_S (w2[0], w2[1], offset); + w3[3] = amd_bytealign_S (w1[3], w2[0], offset); + w3[2] = amd_bytealign_S (w1[2], w1[3], offset); + w3[1] = amd_bytealign_S (w1[1], w1[2], offset); + w3[0] = amd_bytealign_S (w1[0], w1[1], offset); + w2[3] = amd_bytealign_S (w0[3], w1[0], offset); + w2[2] = amd_bytealign_S (w0[2], w0[3], offset); + w2[1] = amd_bytealign_S (w0[1], w0[2], offset); + w2[0] = amd_bytealign_S (w0[0], w0[1], offset); + w1[3] = amd_bytealign_S ( 0, w0[0], offset); w1[2] = 0; w1[1] = 0; w1[0] = 0; @@ -7421,14 +9771,23 @@ inline void switch_buffer_by_offset_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w break; case 8: - w3[3] = __byte_perm_S (w1[2], w1[3], selector); - w3[2] = __byte_perm_S (w1[1], w1[2], selector); - w3[1] = __byte_perm_S (w1[0], w1[1], selector); - w3[0] = __byte_perm_S (w0[3], w1[0], selector); - w2[3] = __byte_perm_S (w0[2], w0[3], selector); - w2[2] = __byte_perm_S (w0[1], w0[2], selector); - w2[1] = __byte_perm_S (w0[0], w0[1], selector); - w2[0] = __byte_perm_S ( 0, w0[0], selector); + c2[0] = amd_bytealign_S (w3[3], 0, offset); + c1[3] = amd_bytealign_S (w3[2], w3[3], offset); + c1[2] = amd_bytealign_S (w3[1], w3[2], offset); + c1[1] = amd_bytealign_S (w3[0], w3[1], offset); + c1[0] = amd_bytealign_S (w2[3], w3[0], offset); + c0[3] = amd_bytealign_S (w2[2], w2[3], offset); + c0[2] = amd_bytealign_S (w2[1], w2[2], offset); + c0[1] = amd_bytealign_S (w2[0], w2[1], offset); + c0[0] = amd_bytealign_S (w1[3], w2[0], offset); + w3[3] = amd_bytealign_S (w1[2], w1[3], offset); + w3[2] = amd_bytealign_S (w1[1], w1[2], offset); + w3[1] = amd_bytealign_S (w1[0], w1[1], offset); + w3[0] = amd_bytealign_S (w0[3], w1[0], offset); + w2[3] = amd_bytealign_S (w0[2], w0[3], offset); + w2[2] = amd_bytealign_S (w0[1], w0[2], offset); + w2[1] = amd_bytealign_S (w0[0], w0[1], offset); + w2[0] = amd_bytealign_S ( 0, w0[0], offset); w1[3] = 0; w1[2] = 0; w1[1] = 0; @@ -7441,13 +9800,23 @@ inline void switch_buffer_by_offset_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w break; case 9: - w3[3] = __byte_perm_S (w1[1], w1[2], selector); - w3[2] = __byte_perm_S (w1[0], w1[1], selector); - w3[1] = __byte_perm_S (w0[3], w1[0], selector); - w3[0] = __byte_perm_S (w0[2], w0[3], selector); - w2[3] = __byte_perm_S (w0[1], w0[2], selector); - w2[2] = __byte_perm_S (w0[0], w0[1], selector); - w2[1] = __byte_perm_S ( 0, w0[0], selector); + c2[1] = amd_bytealign_S (w3[3], 0, offset); + c2[0] = amd_bytealign_S (w3[2], w3[3], offset); + c1[3] = amd_bytealign_S (w3[1], w3[2], offset); + c1[2] = amd_bytealign_S (w3[0], w3[1], offset); + c1[1] = amd_bytealign_S (w2[3], w3[0], offset); + c1[0] = amd_bytealign_S (w2[2], w2[3], offset); + c0[3] = amd_bytealign_S (w2[1], w2[2], offset); + c0[2] = amd_bytealign_S (w2[0], w2[1], offset); + c0[1] = amd_bytealign_S (w1[3], w2[0], offset); + c0[0] = amd_bytealign_S (w1[2], w1[3], offset); + w3[3] = amd_bytealign_S (w1[1], w1[2], offset); + w3[2] = amd_bytealign_S (w1[0], w1[1], offset); + w3[1] = amd_bytealign_S (w0[3], w1[0], offset); + w3[0] = amd_bytealign_S (w0[2], w0[3], offset); + w2[3] = amd_bytealign_S (w0[1], w0[2], offset); + w2[2] = amd_bytealign_S (w0[0], w0[1], offset); + w2[1] = amd_bytealign_S ( 0, w0[0], offset); w2[0] = 0; w1[3] = 0; w1[2] = 0; @@ -7461,12 +9830,23 @@ inline void switch_buffer_by_offset_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w break; case 10: - w3[3] = __byte_perm_S (w1[0], w1[1], selector); - w3[2] = __byte_perm_S (w0[3], w1[0], selector); - w3[1] = __byte_perm_S (w0[2], w0[3], selector); - w3[0] = __byte_perm_S (w0[1], w0[2], selector); - w2[3] = __byte_perm_S (w0[0], w0[1], selector); - w2[2] = __byte_perm_S ( 0, w0[0], selector); + c2[2] = amd_bytealign_S (w3[3], 0, offset); + c2[1] = amd_bytealign_S (w3[2], w3[3], offset); + c2[0] = amd_bytealign_S (w3[1], w3[2], offset); + c1[3] = amd_bytealign_S (w3[0], w3[1], offset); + c1[2] = amd_bytealign_S (w2[3], w3[0], offset); + c1[1] = amd_bytealign_S (w2[2], w2[3], offset); + c1[0] = amd_bytealign_S (w2[1], w2[2], offset); + c0[3] = amd_bytealign_S (w2[0], w2[1], offset); + c0[2] = amd_bytealign_S (w1[3], w2[0], offset); + c0[1] = amd_bytealign_S (w1[2], w1[3], offset); + c0[0] = amd_bytealign_S (w1[1], w1[2], offset); + w3[3] = amd_bytealign_S (w1[0], w1[1], offset); + w3[2] = amd_bytealign_S (w0[3], w1[0], offset); + w3[1] = amd_bytealign_S (w0[2], w0[3], offset); + w3[0] = amd_bytealign_S (w0[1], w0[2], offset); + w2[3] = amd_bytealign_S (w0[0], w0[1], offset); + w2[2] = amd_bytealign_S ( 0, w0[0], offset); w2[1] = 0; w2[0] = 0; w1[3] = 0; @@ -7481,11 +9861,23 @@ inline void switch_buffer_by_offset_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w break; case 11: - w3[3] = __byte_perm_S (w0[3], w1[0], selector); - w3[2] = __byte_perm_S (w0[2], w0[3], selector); - w3[1] = __byte_perm_S (w0[1], w0[2], selector); - w3[0] = __byte_perm_S (w0[0], w0[1], selector); - w2[3] = __byte_perm_S ( 0, w0[0], selector); + c2[3] = amd_bytealign_S (w3[3], 0, offset); + c2[2] = amd_bytealign_S (w3[2], w3[3], offset); + c2[1] = amd_bytealign_S (w3[1], w3[2], offset); + c2[0] = amd_bytealign_S (w3[0], w3[1], offset); + c1[3] = amd_bytealign_S (w2[3], w3[0], offset); + c1[2] = amd_bytealign_S (w2[2], w2[3], offset); + c1[1] = amd_bytealign_S (w2[1], w2[2], offset); + c1[0] = amd_bytealign_S (w2[0], w2[1], offset); + c0[3] = amd_bytealign_S (w1[3], w2[0], offset); + c0[2] = amd_bytealign_S (w1[2], w1[3], offset); + c0[1] = amd_bytealign_S (w1[1], w1[2], offset); + c0[0] = amd_bytealign_S (w1[0], w1[1], offset); + w3[3] = amd_bytealign_S (w0[3], w1[0], offset); + w3[2] = amd_bytealign_S (w0[2], w0[3], offset); + w3[1] = amd_bytealign_S (w0[1], w0[2], offset); + w3[0] = amd_bytealign_S (w0[0], w0[1], offset); + w2[3] = amd_bytealign_S ( 0, w0[0], offset); w2[2] = 0; w2[1] = 0; w2[0] = 0; @@ -7501,10 +9893,23 @@ inline void switch_buffer_by_offset_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w break; case 12: - w3[3] = __byte_perm_S (w0[2], w0[3], selector); - w3[2] = __byte_perm_S (w0[1], w0[2], selector); - w3[1] = __byte_perm_S (w0[0], w0[1], selector); - w3[0] = __byte_perm_S ( 0, w0[0], selector); + c3[0] = amd_bytealign_S (w3[3], 0, offset); + c2[3] = amd_bytealign_S (w3[2], w3[3], offset); + c2[2] = amd_bytealign_S (w3[1], w3[2], offset); + c2[1] = amd_bytealign_S (w3[0], w3[1], offset); + c2[0] = amd_bytealign_S (w2[3], w3[0], offset); + c1[3] = amd_bytealign_S (w2[2], w2[3], offset); + c1[2] = amd_bytealign_S (w2[1], w2[2], offset); + c1[1] = amd_bytealign_S (w2[0], w2[1], offset); + c1[0] = amd_bytealign_S (w1[3], w2[0], offset); + c0[3] = amd_bytealign_S (w1[2], w1[3], offset); + c0[2] = amd_bytealign_S (w1[1], w1[2], offset); + c0[1] = amd_bytealign_S (w1[0], w1[1], offset); + c0[0] = amd_bytealign_S (w0[3], w1[0], offset); + w3[3] = amd_bytealign_S (w0[2], w0[3], offset); + w3[2] = amd_bytealign_S (w0[1], w0[2], offset); + w3[1] = amd_bytealign_S (w0[0], w0[1], offset); + w3[0] = amd_bytealign_S ( 0, w0[0], offset); w2[3] = 0; w2[2] = 0; w2[1] = 0; @@ -7521,9 +9926,23 @@ inline void switch_buffer_by_offset_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w break; case 13: - w3[3] = __byte_perm_S (w0[1], w0[2], selector); - w3[2] = __byte_perm_S (w0[0], w0[1], selector); - w3[1] = __byte_perm_S ( 0, w0[0], selector); + c3[1] = amd_bytealign_S (w3[3], 0, offset); + c3[0] = amd_bytealign_S (w3[2], w3[3], offset); + c2[3] = amd_bytealign_S (w3[1], w3[2], offset); + c2[2] = amd_bytealign_S (w3[0], w3[1], offset); + c2[1] = amd_bytealign_S (w2[3], w3[0], offset); + c2[0] = amd_bytealign_S (w2[2], w2[3], offset); + c1[3] = amd_bytealign_S (w2[1], w2[2], offset); + c1[2] = amd_bytealign_S (w2[0], w2[1], offset); + c1[1] = amd_bytealign_S (w1[3], w2[0], offset); + c1[0] = amd_bytealign_S (w1[2], w1[3], offset); + c0[3] = amd_bytealign_S (w1[1], w1[2], offset); + c0[2] = amd_bytealign_S (w1[0], w1[1], offset); + c0[1] = amd_bytealign_S (w0[3], w1[0], offset); + c0[0] = amd_bytealign_S (w0[2], w0[3], offset); + w3[3] = amd_bytealign_S (w0[1], w0[2], offset); + w3[2] = amd_bytealign_S (w0[0], w0[1], offset); + w3[1] = amd_bytealign_S ( 0, w0[0], offset); w3[0] = 0; w2[3] = 0; w2[2] = 0; @@ -7541,8 +9960,23 @@ inline void switch_buffer_by_offset_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w break; case 14: - w3[3] = __byte_perm_S (w0[0], w0[1], selector); - w3[2] = __byte_perm_S ( 0, w0[0], selector); + c3[2] = amd_bytealign_S (w3[3], 0, offset); + c3[1] = amd_bytealign_S (w3[2], w3[3], offset); + c3[0] = amd_bytealign_S (w3[1], w3[2], offset); + c2[3] = amd_bytealign_S (w3[0], w3[1], offset); + c2[2] = amd_bytealign_S (w2[3], w3[0], offset); + c2[1] = amd_bytealign_S (w2[2], w2[3], offset); + c2[0] = amd_bytealign_S (w2[1], w2[2], offset); + c1[3] = amd_bytealign_S (w2[0], w2[1], offset); + c1[2] = amd_bytealign_S (w1[3], w2[0], offset); + c1[1] = amd_bytealign_S (w1[2], w1[3], offset); + c1[0] = amd_bytealign_S (w1[1], w1[2], offset); + c0[3] = amd_bytealign_S (w1[0], w1[1], offset); + c0[2] = amd_bytealign_S (w0[3], w1[0], offset); + c0[1] = amd_bytealign_S (w0[2], w0[3], offset); + c0[0] = amd_bytealign_S (w0[1], w0[2], offset); + w3[3] = amd_bytealign_S (w0[0], w0[1], offset); + w3[2] = amd_bytealign_S ( 0, w0[0], offset); w3[1] = 0; w3[0] = 0; w2[3] = 0; @@ -7561,7 +9995,23 @@ inline void switch_buffer_by_offset_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w break; case 15: - w3[3] = __byte_perm_S ( 0, w0[0], selector); + c3[3] = amd_bytealign_S (w3[3], 0, offset); + c3[2] = amd_bytealign_S (w3[2], w3[3], offset); + c3[1] = amd_bytealign_S (w3[1], w3[2], offset); + c3[0] = amd_bytealign_S (w3[0], w3[1], offset); + c2[3] = amd_bytealign_S (w2[3], w3[0], offset); + c2[2] = amd_bytealign_S (w2[2], w2[3], offset); + c2[1] = amd_bytealign_S (w2[1], w2[2], offset); + c2[0] = amd_bytealign_S (w2[0], w2[1], offset); + c1[3] = amd_bytealign_S (w1[3], w2[0], offset); + c1[2] = amd_bytealign_S (w1[2], w1[3], offset); + c1[1] = amd_bytealign_S (w1[1], w1[2], offset); + c1[0] = amd_bytealign_S (w1[0], w1[1], offset); + c0[3] = amd_bytealign_S (w0[3], w1[0], offset); + c0[2] = amd_bytealign_S (w0[2], w0[3], offset); + c0[1] = amd_bytealign_S (w0[1], w0[2], offset); + c0[0] = amd_bytealign_S (w0[0], w0[1], offset); + w3[3] = amd_bytealign_S ( 0, w0[0], offset); w3[2] = 0; w3[1] = 0; w3[0] = 0; @@ -7581,350 +10031,171 @@ inline void switch_buffer_by_offset_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w break; } #endif -} -inline void switch_buffer_by_offset_carry_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 c0[4], u32 c1[4], u32 c2[4], u32 c3[4], const u32 offset) -{ - const int offset_mod_4 = offset & 3; - - const int offset_minus_4 = 4 - offset; + #ifdef IS_NV + const int selector = (0x76543210 >> ((offset & 3) * 4)) & 0xffff; switch (offset / 4) { - case 0: - c0[0] = amd_bytealign_S ( 0, w3[3], offset_minus_4); - w3[3] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); - w3[2] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); - w3[1] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); - w3[0] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); - w2[3] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); - w2[2] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); - w2[1] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - w2[0] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - w1[3] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w1[2] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w1[1] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w1[0] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w0[3] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w0[2] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w0[1] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w0[0] = amd_bytealign_S (w0[0], 0, offset_minus_4); - - if (offset_mod_4 == 0) - { - w0[0] = w0[1]; - w0[1] = w0[2]; - w0[2] = w0[3]; - w0[3] = w1[0]; - w1[0] = w1[1]; - w1[1] = w1[2]; - w1[2] = w1[3]; - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = c0[0]; - c0[0] = 0; - } - - break; - - case 1: - c0[1] = amd_bytealign_S ( 0, w3[3], offset_minus_4); - c0[0] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); - w3[3] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); - w3[2] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); - w3[1] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); - w3[0] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); - w2[3] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); - w2[2] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - w2[1] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - w2[0] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w1[3] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w1[2] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w1[1] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w1[0] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w0[3] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w0[2] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w0[1] = amd_bytealign_S (w0[0], 0, offset_minus_4); - w0[0] = 0; - - if (offset_mod_4 == 0) - { - w0[1] = w0[2]; - w0[2] = w0[3]; - w0[3] = w1[0]; - w1[0] = w1[1]; - w1[1] = w1[2]; - w1[2] = w1[3]; - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = c0[0]; - c0[0] = c0[1]; - c0[1] = 0; - } + case 0: + c0[0] = __byte_perm_S ( 0, w3[3], selector); + w3[3] = __byte_perm_S (w3[3], w3[2], selector); + w3[2] = __byte_perm_S (w3[2], w3[1], selector); + w3[1] = __byte_perm_S (w3[1], w3[0], selector); + w3[0] = __byte_perm_S (w3[0], w2[3], selector); + w2[3] = __byte_perm_S (w2[3], w2[2], selector); + w2[2] = __byte_perm_S (w2[2], w2[1], selector); + w2[1] = __byte_perm_S (w2[1], w2[0], selector); + w2[0] = __byte_perm_S (w2[0], w1[3], selector); + w1[3] = __byte_perm_S (w1[3], w1[2], selector); + w1[2] = __byte_perm_S (w1[2], w1[1], selector); + w1[1] = __byte_perm_S (w1[1], w1[0], selector); + w1[0] = __byte_perm_S (w1[0], w0[3], selector); + w0[3] = __byte_perm_S (w0[3], w0[2], selector); + w0[2] = __byte_perm_S (w0[2], w0[1], selector); + w0[1] = __byte_perm_S (w0[1], w0[0], selector); + w0[0] = __byte_perm_S (w0[0], 0, selector); break; - case 2: - c0[2] = amd_bytealign_S ( 0, w3[3], offset_minus_4); - c0[1] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); - c0[0] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); - w3[3] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); - w3[2] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); - w3[1] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); - w3[0] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); - w2[3] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - w2[2] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - w2[1] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w2[0] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w1[3] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w1[2] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w1[1] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w1[0] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w0[3] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w0[2] = amd_bytealign_S (w0[0], 0, offset_minus_4); - w0[1] = 0; + case 1: + c0[1] = __byte_perm_S ( 0, w3[3], selector); + c0[0] = __byte_perm_S (w3[3], w3[2], selector); + w3[3] = __byte_perm_S (w3[2], w3[1], selector); + w3[2] = __byte_perm_S (w3[1], w3[0], selector); + w3[1] = __byte_perm_S (w3[0], w2[3], selector); + w3[0] = __byte_perm_S (w2[3], w2[2], selector); + w2[3] = __byte_perm_S (w2[2], w2[1], selector); + w2[2] = __byte_perm_S (w2[1], w2[0], selector); + w2[1] = __byte_perm_S (w2[0], w1[3], selector); + w2[0] = __byte_perm_S (w1[3], w1[2], selector); + w1[3] = __byte_perm_S (w1[2], w1[1], selector); + w1[2] = __byte_perm_S (w1[1], w1[0], selector); + w1[1] = __byte_perm_S (w1[0], w0[3], selector); + w1[0] = __byte_perm_S (w0[3], w0[2], selector); + w0[3] = __byte_perm_S (w0[2], w0[1], selector); + w0[2] = __byte_perm_S (w0[1], w0[0], selector); + w0[1] = __byte_perm_S (w0[0], 0, selector); w0[0] = 0; - if (offset_mod_4 == 0) - { - w0[2] = w0[3]; - w0[3] = w1[0]; - w1[0] = w1[1]; - w1[1] = w1[2]; - w1[2] = w1[3]; - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = c0[0]; - c0[0] = c0[1]; - c0[1] = c0[2]; - c0[2] = 0; - } - break; - case 3: - c0[3] = amd_bytealign_S ( 0, w3[3], offset_minus_4); - c0[2] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); - c0[1] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); - c0[0] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); - w3[3] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); - w3[2] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); - w3[1] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); - w3[0] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - w2[3] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - w2[2] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w2[1] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w2[0] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w1[3] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w1[2] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w1[1] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w1[0] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w0[3] = amd_bytealign_S (w0[0], 0, offset_minus_4); - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; - - if (offset_mod_4 == 0) - { - w0[3] = w1[0]; - w1[0] = w1[1]; - w1[1] = w1[2]; - w1[2] = w1[3]; - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = c0[0]; - c0[0] = c0[1]; - c0[1] = c0[2]; - c0[2] = c0[3]; - c0[3] = 0; - } + case 2: + c0[2] = __byte_perm_S ( 0, w3[3], selector); + c0[1] = __byte_perm_S (w3[3], w3[2], selector); + c0[0] = __byte_perm_S (w3[2], w3[1], selector); + w3[3] = __byte_perm_S (w3[1], w3[0], selector); + w3[2] = __byte_perm_S (w3[0], w2[3], selector); + w3[1] = __byte_perm_S (w2[3], w2[2], selector); + w3[0] = __byte_perm_S (w2[2], w2[1], selector); + w2[3] = __byte_perm_S (w2[1], w2[0], selector); + w2[2] = __byte_perm_S (w2[0], w1[3], selector); + w2[1] = __byte_perm_S (w1[3], w1[2], selector); + w2[0] = __byte_perm_S (w1[2], w1[1], selector); + w1[3] = __byte_perm_S (w1[1], w1[0], selector); + w1[2] = __byte_perm_S (w1[0], w0[3], selector); + w1[1] = __byte_perm_S (w0[3], w0[2], selector); + w1[0] = __byte_perm_S (w0[2], w0[1], selector); + w0[3] = __byte_perm_S (w0[1], w0[0], selector); + w0[2] = __byte_perm_S (w0[0], 0, selector); + w0[1] = 0; + w0[0] = 0; break; - case 4: - c1[0] = amd_bytealign_S ( 0, w3[3], offset_minus_4); - c0[3] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); - c0[2] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); - c0[1] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); - c0[0] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); - w3[3] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); - w3[2] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); - w3[1] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - w3[0] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - w2[3] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w2[2] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w2[1] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w2[0] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w1[3] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w1[2] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w1[1] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w1[0] = amd_bytealign_S (w0[0], 0, offset_minus_4); - w0[3] = 0; + case 3: + c0[3] = __byte_perm_S ( 0, w3[3], selector); + c0[2] = __byte_perm_S (w3[3], w3[2], selector); + c0[1] = __byte_perm_S (w3[2], w3[1], selector); + c0[0] = __byte_perm_S (w3[1], w3[0], selector); + w3[3] = __byte_perm_S (w3[0], w2[3], selector); + w3[2] = __byte_perm_S (w2[3], w2[2], selector); + w3[1] = __byte_perm_S (w2[2], w2[1], selector); + w3[0] = __byte_perm_S (w2[1], w2[0], selector); + w2[3] = __byte_perm_S (w2[0], w1[3], selector); + w2[2] = __byte_perm_S (w1[3], w1[2], selector); + w2[1] = __byte_perm_S (w1[2], w1[1], selector); + w2[0] = __byte_perm_S (w1[1], w1[0], selector); + w1[3] = __byte_perm_S (w1[0], w0[3], selector); + w1[2] = __byte_perm_S (w0[3], w0[2], selector); + w1[1] = __byte_perm_S (w0[2], w0[1], selector); + w1[0] = __byte_perm_S (w0[1], w0[0], selector); + w0[3] = __byte_perm_S (w0[0], 0, selector); w0[2] = 0; w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w1[0] = w1[1]; - w1[1] = w1[2]; - w1[2] = w1[3]; - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = c0[0]; - c0[0] = c0[1]; - c0[1] = c0[2]; - c0[2] = c0[3]; - c0[3] = c1[0]; - c1[0] = 0; - } - break; - case 5: - c1[1] = amd_bytealign_S ( 0, w3[3], offset_minus_4); - c1[0] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); - c0[3] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); - c0[2] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); - c0[1] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); - c0[0] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); - w3[3] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); - w3[2] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - w3[1] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - w3[0] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w2[3] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w2[2] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w2[1] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w2[0] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w1[3] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w1[2] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w1[1] = amd_bytealign_S (w0[0], 0, offset_minus_4); - w1[0] = 0; + case 4: + c1[0] = __byte_perm_S ( 0, w3[3], selector); + c0[3] = __byte_perm_S (w3[3], w3[2], selector); + c0[2] = __byte_perm_S (w3[2], w3[1], selector); + c0[1] = __byte_perm_S (w3[1], w3[0], selector); + c0[0] = __byte_perm_S (w3[0], w2[3], selector); + w3[3] = __byte_perm_S (w2[3], w2[2], selector); + w3[2] = __byte_perm_S (w2[2], w2[1], selector); + w3[1] = __byte_perm_S (w2[1], w2[0], selector); + w3[0] = __byte_perm_S (w2[0], w1[3], selector); + w2[3] = __byte_perm_S (w1[3], w1[2], selector); + w2[2] = __byte_perm_S (w1[2], w1[1], selector); + w2[1] = __byte_perm_S (w1[1], w1[0], selector); + w2[0] = __byte_perm_S (w1[0], w0[3], selector); + w1[3] = __byte_perm_S (w0[3], w0[2], selector); + w1[2] = __byte_perm_S (w0[2], w0[1], selector); + w1[1] = __byte_perm_S (w0[1], w0[0], selector); + w1[0] = __byte_perm_S (w0[0], 0, selector); w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w1[1] = w1[2]; - w1[2] = w1[3]; - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = c0[0]; - c0[0] = c0[1]; - c0[1] = c0[2]; - c0[2] = c0[3]; - c0[3] = c1[0]; - c1[0] = c1[1]; - c1[1] = 0; - } - break; - case 6: - c1[2] = amd_bytealign_S ( 0, w3[3], offset_minus_4); - c1[1] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); - c1[0] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); - c0[3] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); - c0[2] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); - c0[1] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); - c0[0] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); - w3[3] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - w3[2] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - w3[1] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w3[0] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w2[3] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w2[2] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w2[1] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w2[0] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w1[3] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w1[2] = amd_bytealign_S (w0[0], 0, offset_minus_4); - w1[1] = 0; + case 5: + c1[1] = __byte_perm_S ( 0, w3[3], selector); + c1[0] = __byte_perm_S (w3[3], w3[2], selector); + c0[3] = __byte_perm_S (w3[2], w3[1], selector); + c0[2] = __byte_perm_S (w3[1], w3[0], selector); + c0[1] = __byte_perm_S (w3[0], w2[3], selector); + c0[0] = __byte_perm_S (w2[3], w2[2], selector); + w3[3] = __byte_perm_S (w2[2], w2[1], selector); + w3[2] = __byte_perm_S (w2[1], w2[0], selector); + w3[1] = __byte_perm_S (w2[0], w1[3], selector); + w3[0] = __byte_perm_S (w1[3], w1[2], selector); + w2[3] = __byte_perm_S (w1[2], w1[1], selector); + w2[2] = __byte_perm_S (w1[1], w1[0], selector); + w2[1] = __byte_perm_S (w1[0], w0[3], selector); + w2[0] = __byte_perm_S (w0[3], w0[2], selector); + w1[3] = __byte_perm_S (w0[2], w0[1], selector); + w1[2] = __byte_perm_S (w0[1], w0[0], selector); + w1[1] = __byte_perm_S (w0[0], 0, selector); w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w1[2] = w1[3]; - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = c0[0]; - c0[0] = c0[1]; - c0[1] = c0[2]; - c0[2] = c0[3]; - c0[3] = c1[0]; - c1[0] = c1[1]; - c1[1] = c1[2]; - c1[2] = 0; - } - break; - case 7: - c1[3] = amd_bytealign_S ( 0, w3[3], offset_minus_4); - c1[2] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); - c1[1] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); - c1[0] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); - c0[3] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); - c0[2] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); - c0[1] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); - c0[0] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - w3[3] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - w3[2] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w3[1] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w3[0] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w2[3] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w2[2] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w2[1] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w2[0] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w1[3] = amd_bytealign_S (w0[0], 0, offset_minus_4); - w1[2] = 0; + case 6: + c1[2] = __byte_perm_S ( 0, w3[3], selector); + c1[1] = __byte_perm_S (w3[3], w3[2], selector); + c1[0] = __byte_perm_S (w3[2], w3[1], selector); + c0[3] = __byte_perm_S (w3[1], w3[0], selector); + c0[2] = __byte_perm_S (w3[0], w2[3], selector); + c0[1] = __byte_perm_S (w2[3], w2[2], selector); + c0[0] = __byte_perm_S (w2[2], w2[1], selector); + w3[3] = __byte_perm_S (w2[1], w2[0], selector); + w3[2] = __byte_perm_S (w2[0], w1[3], selector); + w3[1] = __byte_perm_S (w1[3], w1[2], selector); + w3[0] = __byte_perm_S (w1[2], w1[1], selector); + w2[3] = __byte_perm_S (w1[1], w1[0], selector); + w2[2] = __byte_perm_S (w1[0], w0[3], selector); + w2[1] = __byte_perm_S (w0[3], w0[2], selector); + w2[0] = __byte_perm_S (w0[2], w0[1], selector); + w1[3] = __byte_perm_S (w0[1], w0[0], selector); + w1[2] = __byte_perm_S (w0[0], 0, selector); w1[1] = 0; w1[0] = 0; w0[3] = 0; @@ -7932,47 +10203,54 @@ inline void switch_buffer_by_offset_carry_le_S (u32 w0[4], u32 w1[4], u32 w2[4], w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = c0[0]; - c0[0] = c0[1]; - c0[1] = c0[2]; - c0[2] = c0[3]; - c0[3] = c1[0]; - c1[0] = c1[1]; - c1[1] = c1[2]; - c1[2] = c1[3]; - c1[3] = 0; - } + break; + + case 7: + c1[3] = __byte_perm_S ( 0, w3[3], selector); + c1[2] = __byte_perm_S (w3[3], w3[2], selector); + c1[1] = __byte_perm_S (w3[2], w3[1], selector); + c1[0] = __byte_perm_S (w3[1], w3[0], selector); + c0[3] = __byte_perm_S (w3[0], w2[3], selector); + c0[2] = __byte_perm_S (w2[3], w2[2], selector); + c0[1] = __byte_perm_S (w2[2], w2[1], selector); + c0[0] = __byte_perm_S (w2[1], w2[0], selector); + w3[3] = __byte_perm_S (w2[0], w1[3], selector); + w3[2] = __byte_perm_S (w1[3], w1[2], selector); + w3[1] = __byte_perm_S (w1[2], w1[1], selector); + w3[0] = __byte_perm_S (w1[1], w1[0], selector); + w2[3] = __byte_perm_S (w1[0], w0[3], selector); + w2[2] = __byte_perm_S (w0[3], w0[2], selector); + w2[1] = __byte_perm_S (w0[2], w0[1], selector); + w2[0] = __byte_perm_S (w0[1], w0[0], selector); + w1[3] = __byte_perm_S (w0[0], 0, selector); + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; break; - case 8: - c2[0] = amd_bytealign_S ( 0, w3[3], offset_minus_4); - c1[3] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); - c1[2] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); - c1[1] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); - c1[0] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); - c0[3] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); - c0[2] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); - c0[1] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - c0[0] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - w3[3] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w3[2] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w3[1] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w3[0] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w2[3] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w2[2] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w2[1] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w2[0] = amd_bytealign_S (w0[0], 0, offset_minus_4); + case 8: + c2[0] = __byte_perm_S ( 0, w3[3], selector); + c1[3] = __byte_perm_S (w3[3], w3[2], selector); + c1[2] = __byte_perm_S (w3[2], w3[1], selector); + c1[1] = __byte_perm_S (w3[1], w3[0], selector); + c1[0] = __byte_perm_S (w3[0], w2[3], selector); + c0[3] = __byte_perm_S (w2[3], w2[2], selector); + c0[2] = __byte_perm_S (w2[2], w2[1], selector); + c0[1] = __byte_perm_S (w2[1], w2[0], selector); + c0[0] = __byte_perm_S (w2[0], w1[3], selector); + w3[3] = __byte_perm_S (w1[3], w1[2], selector); + w3[2] = __byte_perm_S (w1[2], w1[1], selector); + w3[1] = __byte_perm_S (w1[1], w1[0], selector); + w3[0] = __byte_perm_S (w1[0], w0[3], selector); + w2[3] = __byte_perm_S (w0[3], w0[2], selector); + w2[2] = __byte_perm_S (w0[2], w0[1], selector); + w2[1] = __byte_perm_S (w0[1], w0[0], selector); + w2[0] = __byte_perm_S (w0[0], 0, selector); w1[3] = 0; w1[2] = 0; w1[1] = 0; @@ -7982,47 +10260,26 @@ inline void switch_buffer_by_offset_carry_le_S (u32 w0[4], u32 w1[4], u32 w2[4], w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = c0[0]; - c0[0] = c0[1]; - c0[1] = c0[2]; - c0[2] = c0[3]; - c0[3] = c1[0]; - c1[0] = c1[1]; - c1[1] = c1[2]; - c1[2] = c1[3]; - c1[3] = c2[0]; - c2[0] = 0; - } - break; - case 9: - c2[1] = amd_bytealign_S ( 0, w3[3], offset_minus_4); - c2[0] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); - c1[3] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); - c1[2] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); - c1[1] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); - c1[0] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); - c0[3] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); - c0[2] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - c0[1] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - c0[0] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w3[3] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w3[2] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w3[1] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w3[0] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w2[3] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w2[2] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w2[1] = amd_bytealign_S (w0[0], 0, offset_minus_4); + case 9: + c2[1] = __byte_perm_S ( 0, w3[3], selector); + c2[0] = __byte_perm_S (w3[3], w3[2], selector); + c1[3] = __byte_perm_S (w3[2], w3[1], selector); + c1[2] = __byte_perm_S (w3[1], w3[0], selector); + c1[1] = __byte_perm_S (w3[0], w2[3], selector); + c1[0] = __byte_perm_S (w2[3], w2[2], selector); + c0[3] = __byte_perm_S (w2[2], w2[1], selector); + c0[2] = __byte_perm_S (w2[1], w2[0], selector); + c0[1] = __byte_perm_S (w2[0], w1[3], selector); + c0[0] = __byte_perm_S (w1[3], w1[2], selector); + w3[3] = __byte_perm_S (w1[2], w1[1], selector); + w3[2] = __byte_perm_S (w1[1], w1[0], selector); + w3[1] = __byte_perm_S (w1[0], w0[3], selector); + w3[0] = __byte_perm_S (w0[3], w0[2], selector); + w2[3] = __byte_perm_S (w0[2], w0[1], selector); + w2[2] = __byte_perm_S (w0[1], w0[0], selector); + w2[1] = __byte_perm_S (w0[0], 0, selector); w2[0] = 0; w1[3] = 0; w1[2] = 0; @@ -8033,47 +10290,26 @@ inline void switch_buffer_by_offset_carry_le_S (u32 w0[4], u32 w1[4], u32 w2[4], w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = c0[0]; - c0[0] = c0[1]; - c0[1] = c0[2]; - c0[2] = c0[3]; - c0[3] = c1[0]; - c1[0] = c1[1]; - c1[1] = c1[2]; - c1[2] = c1[3]; - c1[3] = c2[0]; - c2[0] = c2[1]; - c2[1] = 0; - } - break; case 10: - c2[2] = amd_bytealign_S ( 0, w3[3], offset_minus_4); - c2[1] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); - c2[0] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); - c1[3] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); - c1[2] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); - c1[1] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); - c1[0] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); - c0[3] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - c0[2] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - c0[1] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - c0[0] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w3[3] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w3[2] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w3[1] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w3[0] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w2[3] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w2[2] = amd_bytealign_S (w0[0], 0, offset_minus_4); + c2[2] = __byte_perm_S ( 0, w3[3], selector); + c2[1] = __byte_perm_S (w3[3], w3[2], selector); + c2[0] = __byte_perm_S (w3[2], w3[1], selector); + c1[3] = __byte_perm_S (w3[1], w3[0], selector); + c1[2] = __byte_perm_S (w3[0], w2[3], selector); + c1[1] = __byte_perm_S (w2[3], w2[2], selector); + c1[0] = __byte_perm_S (w2[2], w2[1], selector); + c0[3] = __byte_perm_S (w2[1], w2[0], selector); + c0[2] = __byte_perm_S (w2[0], w1[3], selector); + c0[1] = __byte_perm_S (w1[3], w1[2], selector); + c0[0] = __byte_perm_S (w1[2], w1[1], selector); + w3[3] = __byte_perm_S (w1[1], w1[0], selector); + w3[2] = __byte_perm_S (w1[0], w0[3], selector); + w3[1] = __byte_perm_S (w0[3], w0[2], selector); + w3[0] = __byte_perm_S (w0[2], w0[1], selector); + w2[3] = __byte_perm_S (w0[1], w0[0], selector); + w2[2] = __byte_perm_S (w0[0], 0, selector); w2[1] = 0; w2[0] = 0; w1[3] = 0; @@ -8085,47 +10321,26 @@ inline void switch_buffer_by_offset_carry_le_S (u32 w0[4], u32 w1[4], u32 w2[4], w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = c0[0]; - c0[0] = c0[1]; - c0[1] = c0[2]; - c0[2] = c0[3]; - c0[3] = c1[0]; - c1[0] = c1[1]; - c1[1] = c1[2]; - c1[2] = c1[3]; - c1[3] = c2[0]; - c2[0] = c2[1]; - c2[1] = c2[2]; - c2[2] = 0; - } - break; case 11: - c2[3] = amd_bytealign_S ( 0, w3[3], offset_minus_4); - c2[2] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); - c2[1] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); - c2[0] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); - c1[3] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); - c1[2] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); - c1[1] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); - c1[0] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - c0[3] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - c0[2] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - c0[1] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - c0[0] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w3[3] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w3[2] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w3[1] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w3[0] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w2[3] = amd_bytealign_S (w0[0], 0, offset_minus_4); + c2[3] = __byte_perm_S ( 0, w3[3], selector); + c2[2] = __byte_perm_S (w3[3], w3[2], selector); + c2[1] = __byte_perm_S (w3[2], w3[1], selector); + c2[0] = __byte_perm_S (w3[1], w3[0], selector); + c1[3] = __byte_perm_S (w3[0], w2[3], selector); + c1[2] = __byte_perm_S (w2[3], w2[2], selector); + c1[1] = __byte_perm_S (w2[2], w2[1], selector); + c1[0] = __byte_perm_S (w2[1], w2[0], selector); + c0[3] = __byte_perm_S (w2[0], w1[3], selector); + c0[2] = __byte_perm_S (w1[3], w1[2], selector); + c0[1] = __byte_perm_S (w1[2], w1[1], selector); + c0[0] = __byte_perm_S (w1[1], w1[0], selector); + w3[3] = __byte_perm_S (w1[0], w0[3], selector); + w3[2] = __byte_perm_S (w0[3], w0[2], selector); + w3[1] = __byte_perm_S (w0[2], w0[1], selector); + w3[0] = __byte_perm_S (w0[1], w0[0], selector); + w2[3] = __byte_perm_S (w0[0], 0, selector); w2[2] = 0; w2[1] = 0; w2[0] = 0; @@ -8138,47 +10353,60 @@ inline void switch_buffer_by_offset_carry_le_S (u32 w0[4], u32 w1[4], u32 w2[4], w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = c0[0]; - c0[0] = c0[1]; - c0[1] = c0[2]; - c0[2] = c0[3]; - c0[3] = c1[0]; - c1[0] = c1[1]; - c1[1] = c1[2]; - c1[2] = c1[3]; - c1[3] = c2[0]; - c2[0] = c2[1]; - c2[1] = c2[2]; - c2[2] = c2[3]; - c2[3] = 0; - } + break; + + case 12: + c3[0] = __byte_perm_S ( 0, w3[3], selector); + c2[3] = __byte_perm_S (w3[3], w3[2], selector); + c2[2] = __byte_perm_S (w3[2], w3[1], selector); + c2[1] = __byte_perm_S (w3[1], w3[0], selector); + c2[0] = __byte_perm_S (w3[0], w2[3], selector); + c1[3] = __byte_perm_S (w2[3], w2[2], selector); + c1[2] = __byte_perm_S (w2[2], w2[1], selector); + c1[1] = __byte_perm_S (w2[1], w2[0], selector); + c1[0] = __byte_perm_S (w2[0], w1[3], selector); + c0[3] = __byte_perm_S (w1[3], w1[2], selector); + c0[2] = __byte_perm_S (w1[2], w1[1], selector); + c0[1] = __byte_perm_S (w1[1], w1[0], selector); + c0[0] = __byte_perm_S (w1[0], w0[3], selector); + w3[3] = __byte_perm_S (w0[3], w0[2], selector); + w3[2] = __byte_perm_S (w0[2], w0[1], selector); + w3[1] = __byte_perm_S (w0[1], w0[0], selector); + w3[0] = __byte_perm_S (w0[0], 0, selector); + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; break; - case 12: - c3[0] = amd_bytealign_S ( 0, w3[3], offset_minus_4); - c2[3] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); - c2[2] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); - c2[1] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); - c2[0] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); - c1[3] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); - c1[2] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); - c1[1] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - c1[0] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - c0[3] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - c0[2] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - c0[1] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - c0[0] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w3[3] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w3[2] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w3[1] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w3[0] = amd_bytealign_S (w0[0], 0, offset_minus_4); + case 13: + c3[1] = __byte_perm_S ( 0, w3[3], selector); + c3[0] = __byte_perm_S (w3[3], w3[2], selector); + c2[3] = __byte_perm_S (w3[2], w3[1], selector); + c2[2] = __byte_perm_S (w3[1], w3[0], selector); + c2[1] = __byte_perm_S (w3[0], w2[3], selector); + c2[0] = __byte_perm_S (w2[3], w2[2], selector); + c1[3] = __byte_perm_S (w2[2], w2[1], selector); + c1[2] = __byte_perm_S (w2[1], w2[0], selector); + c1[1] = __byte_perm_S (w2[0], w1[3], selector); + c1[0] = __byte_perm_S (w1[3], w1[2], selector); + c0[3] = __byte_perm_S (w1[2], w1[1], selector); + c0[2] = __byte_perm_S (w1[1], w1[0], selector); + c0[1] = __byte_perm_S (w1[0], w0[3], selector); + c0[0] = __byte_perm_S (w0[3], w0[2], selector); + w3[3] = __byte_perm_S (w0[2], w0[1], selector); + w3[2] = __byte_perm_S (w0[1], w0[0], selector); + w3[1] = __byte_perm_S (w0[0], 0, selector); + w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; @@ -8192,47 +10420,27 @@ inline void switch_buffer_by_offset_carry_le_S (u32 w0[4], u32 w1[4], u32 w2[4], w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = c0[0]; - c0[0] = c0[1]; - c0[1] = c0[2]; - c0[2] = c0[3]; - c0[3] = c1[0]; - c1[0] = c1[1]; - c1[1] = c1[2]; - c1[2] = c1[3]; - c1[3] = c2[0]; - c2[0] = c2[1]; - c2[1] = c2[2]; - c2[2] = c2[3]; - c2[3] = c3[0]; - c3[0] = 0; - } - break; - case 13: - c3[1] = amd_bytealign_S ( 0, w3[3], offset_minus_4); - c3[0] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); - c2[3] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); - c2[2] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); - c2[1] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); - c2[0] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); - c1[3] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); - c1[2] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - c1[1] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - c1[0] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - c0[3] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - c0[2] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - c0[1] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - c0[0] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w3[3] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w3[2] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w3[1] = amd_bytealign_S (w0[0], 0, offset_minus_4); + case 14: + c3[2] = __byte_perm_S ( 0, w3[3], selector); + c3[1] = __byte_perm_S (w3[3], w3[2], selector); + c3[0] = __byte_perm_S (w3[2], w3[1], selector); + c2[3] = __byte_perm_S (w3[1], w3[0], selector); + c2[2] = __byte_perm_S (w3[0], w2[3], selector); + c2[1] = __byte_perm_S (w2[3], w2[2], selector); + c2[0] = __byte_perm_S (w2[2], w2[1], selector); + c1[3] = __byte_perm_S (w2[1], w2[0], selector); + c1[2] = __byte_perm_S (w2[0], w1[3], selector); + c1[1] = __byte_perm_S (w1[3], w1[2], selector); + c1[0] = __byte_perm_S (w1[2], w1[1], selector); + c0[3] = __byte_perm_S (w1[1], w1[0], selector); + c0[2] = __byte_perm_S (w1[0], w0[3], selector); + c0[1] = __byte_perm_S (w0[3], w0[2], selector); + c0[0] = __byte_perm_S (w0[2], w0[1], selector); + w3[3] = __byte_perm_S (w0[1], w0[0], selector); + w3[2] = __byte_perm_S (w0[0], 0, selector); + w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; @@ -8247,47 +10455,27 @@ inline void switch_buffer_by_offset_carry_le_S (u32 w0[4], u32 w1[4], u32 w2[4], w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = c0[0]; - c0[0] = c0[1]; - c0[1] = c0[2]; - c0[2] = c0[3]; - c0[3] = c1[0]; - c1[0] = c1[1]; - c1[1] = c1[2]; - c1[2] = c1[3]; - c1[3] = c2[0]; - c2[0] = c2[1]; - c2[1] = c2[2]; - c2[2] = c2[3]; - c2[3] = c3[0]; - c3[0] = c3[1]; - c3[1] = 0; - } - break; - case 14: - c3[2] = amd_bytealign_S ( 0, w3[3], offset_minus_4); - c3[1] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); - c3[0] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); - c2[3] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); - c2[2] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); - c2[1] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); - c2[0] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); - c1[3] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - c1[2] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - c1[1] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - c1[0] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - c0[3] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - c0[2] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - c0[1] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - c0[0] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w3[3] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w3[2] = amd_bytealign_S (w0[0], 0, offset_minus_4); + case 15: + c3[3] = __byte_perm_S ( 0, w3[3], selector); + c3[2] = __byte_perm_S (w3[3], w3[2], selector); + c3[1] = __byte_perm_S (w3[2], w3[1], selector); + c3[0] = __byte_perm_S (w3[1], w3[0], selector); + c2[3] = __byte_perm_S (w3[0], w2[3], selector); + c2[2] = __byte_perm_S (w2[3], w2[2], selector); + c2[1] = __byte_perm_S (w2[2], w2[1], selector); + c2[0] = __byte_perm_S (w2[1], w2[0], selector); + c1[3] = __byte_perm_S (w2[0], w1[3], selector); + c1[2] = __byte_perm_S (w1[3], w1[2], selector); + c1[1] = __byte_perm_S (w1[2], w1[1], selector); + c1[0] = __byte_perm_S (w1[1], w1[0], selector); + c0[3] = __byte_perm_S (w1[0], w0[3], selector); + c0[2] = __byte_perm_S (w0[3], w0[2], selector); + c0[1] = __byte_perm_S (w0[2], w0[1], selector); + c0[0] = __byte_perm_S (w0[1], w0[0], selector); + w3[3] = __byte_perm_S (w0[0], 0, selector); + w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; @@ -8303,224 +10491,464 @@ inline void switch_buffer_by_offset_carry_le_S (u32 w0[4], u32 w1[4], u32 w2[4], w0[1] = 0; w0[0] = 0; + break; + } + #endif +} + +inline void switch_buffer_by_offset_8x4_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 w4[4], u32 w5[4], u32 w6[4], u32 w7[4], const u32 offset) +{ + #if defined IS_AMD || defined IS_GENERIC + const int offset_mod_4 = offset & 3; + + const int offset_minus_4 = 4 - offset; + + switch (offset / 4) + { + case 0: + w7[3] = amd_bytealign_S (w7[3], w7[2], offset_minus_4); + w7[2] = amd_bytealign_S (w7[2], w7[1], offset_minus_4); + w7[1] = amd_bytealign_S (w7[1], w7[0], offset_minus_4); + w7[0] = amd_bytealign_S (w7[0], w6[3], offset_minus_4); + w6[3] = amd_bytealign_S (w6[3], w6[2], offset_minus_4); + w6[2] = amd_bytealign_S (w6[2], w6[1], offset_minus_4); + w6[1] = amd_bytealign_S (w6[1], w6[0], offset_minus_4); + w6[0] = amd_bytealign_S (w6[0], w5[3], offset_minus_4); + w5[3] = amd_bytealign_S (w5[3], w5[2], offset_minus_4); + w5[2] = amd_bytealign_S (w5[2], w5[1], offset_minus_4); + w5[1] = amd_bytealign_S (w5[1], w5[0], offset_minus_4); + w5[0] = amd_bytealign_S (w5[0], w5[3], offset_minus_4); + w4[3] = amd_bytealign_S (w4[3], w4[2], offset_minus_4); + w4[2] = amd_bytealign_S (w4[2], w4[1], offset_minus_4); + w4[1] = amd_bytealign_S (w4[1], w4[0], offset_minus_4); + w4[0] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); + w3[3] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); + w3[2] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); + w3[1] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); + w3[0] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); + w2[3] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); + w2[2] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); + w2[1] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); + w2[0] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); + w1[3] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); + w1[2] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); + w1[1] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); + w1[0] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); + w0[3] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); + w0[2] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); + w0[1] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); + w0[0] = amd_bytealign_S (w0[0], 0, offset_minus_4); + if (offset_mod_4 == 0) { + w0[0] = w0[1]; + w0[1] = w0[2]; + w0[2] = w0[3]; + w0[3] = w1[0]; + w1[0] = w1[1]; + w1[1] = w1[2]; + w1[2] = w1[3]; + w1[3] = w2[0]; + w2[0] = w2[1]; + w2[1] = w2[2]; + w2[2] = w2[3]; + w2[3] = w3[0]; + w3[0] = w3[1]; + w3[1] = w3[2]; w3[2] = w3[3]; - w3[3] = c0[0]; - c0[0] = c0[1]; - c0[1] = c0[2]; - c0[2] = c0[3]; - c0[3] = c1[0]; - c1[0] = c1[1]; - c1[1] = c1[2]; - c1[2] = c1[3]; - c1[3] = c2[0]; - c2[0] = c2[1]; - c2[1] = c2[2]; - c2[2] = c2[3]; - c2[3] = c3[0]; - c3[0] = c3[1]; - c3[1] = c3[2]; - c3[2] = 0; + w3[3] = w4[0]; + w4[0] = w4[1]; + w4[1] = w4[2]; + w4[2] = w4[3]; + w4[3] = w5[0]; + w5[0] = w5[1]; + w5[1] = w5[2]; + w5[2] = w5[3]; + w5[3] = w6[0]; + w6[0] = w6[1]; + w6[1] = w6[2]; + w6[2] = w6[3]; + w6[3] = w7[0]; + w7[0] = w7[1]; + w7[1] = w7[2]; + w7[2] = w7[3]; + w7[3] = 0; } break; - case 15: - c3[3] = amd_bytealign_S ( 0, w3[3], offset_minus_4); - c3[2] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); - c3[1] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); - c3[0] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); - c2[3] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); - c2[2] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); - c2[1] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); - c2[0] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - c1[3] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - c1[2] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - c1[1] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - c1[0] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - c0[3] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - c0[2] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - c0[1] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - c0[0] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w3[3] = amd_bytealign_S (w0[0], 0, offset_minus_4); - w3[2] = 0; - w3[1] = 0; - w3[0] = 0; - w2[3] = 0; - w2[2] = 0; - w2[1] = 0; - w2[0] = 0; - w1[3] = 0; - w1[2] = 0; - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; + case 1: + w7[3] = amd_bytealign_S (w7[2], w7[1], offset_minus_4); + w7[2] = amd_bytealign_S (w7[1], w7[0], offset_minus_4); + w7[1] = amd_bytealign_S (w7[0], w6[3], offset_minus_4); + w7[0] = amd_bytealign_S (w6[3], w6[2], offset_minus_4); + w6[3] = amd_bytealign_S (w6[2], w6[1], offset_minus_4); + w6[2] = amd_bytealign_S (w6[1], w6[0], offset_minus_4); + w6[1] = amd_bytealign_S (w6[0], w5[3], offset_minus_4); + w6[0] = amd_bytealign_S (w5[3], w5[2], offset_minus_4); + w5[3] = amd_bytealign_S (w5[2], w5[1], offset_minus_4); + w5[2] = amd_bytealign_S (w5[1], w5[0], offset_minus_4); + w5[1] = amd_bytealign_S (w5[0], w5[3], offset_minus_4); + w5[0] = amd_bytealign_S (w4[3], w4[2], offset_minus_4); + w4[3] = amd_bytealign_S (w4[2], w4[1], offset_minus_4); + w4[2] = amd_bytealign_S (w4[1], w4[0], offset_minus_4); + w4[1] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); + w4[0] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); + w3[3] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); + w3[2] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); + w3[1] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); + w3[0] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); + w2[3] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); + w2[2] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); + w2[1] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); + w2[0] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); + w1[3] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); + w1[2] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); + w1[1] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); + w1[0] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); + w0[3] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); + w0[2] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); + w0[1] = amd_bytealign_S (w0[0], 0, offset_minus_4); w0[0] = 0; if (offset_mod_4 == 0) { - w3[3] = c0[0]; - c0[0] = c0[1]; - c0[1] = c0[2]; - c0[2] = c0[3]; - c0[3] = c1[0]; - c1[0] = c1[1]; - c1[1] = c1[2]; - c1[2] = c1[3]; - c1[3] = c2[0]; - c2[0] = c2[1]; - c2[1] = c2[2]; - c2[2] = c2[3]; - c2[3] = c3[0]; - c3[0] = c3[1]; - c3[1] = c3[2]; - c3[2] = c3[3]; - c3[3] = 0; + w0[1] = w0[2]; + w0[2] = w0[3]; + w0[3] = w1[0]; + w1[0] = w1[1]; + w1[1] = w1[2]; + w1[2] = w1[3]; + w1[3] = w2[0]; + w2[0] = w2[1]; + w2[1] = w2[2]; + w2[2] = w2[3]; + w2[3] = w3[0]; + w3[0] = w3[1]; + w3[1] = w3[2]; + w3[2] = w3[3]; + w3[3] = w4[0]; + w4[0] = w4[1]; + w4[1] = w4[2]; + w4[2] = w4[3]; + w4[3] = w5[0]; + w5[0] = w5[1]; + w5[1] = w5[2]; + w5[2] = w5[3]; + w5[3] = w6[0]; + w6[0] = w6[1]; + w6[1] = w6[2]; + w6[2] = w6[3]; + w6[3] = w7[0]; + w7[0] = w7[1]; + w7[1] = w7[2]; + w7[2] = w7[3]; + w7[3] = 0; } - break; - } -} - -inline void switch_buffer_by_offset_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 offset) -{ - #if defined IS_AMD || defined IS_GENERIC - switch (offset / 4) - { - case 0: - w3[3] = amd_bytealign_S (w3[2], w3[3], offset); - w3[2] = amd_bytealign_S (w3[1], w3[2], offset); - w3[1] = amd_bytealign_S (w3[0], w3[1], offset); - w3[0] = amd_bytealign_S (w2[3], w3[0], offset); - w2[3] = amd_bytealign_S (w2[2], w2[3], offset); - w2[2] = amd_bytealign_S (w2[1], w2[2], offset); - w2[1] = amd_bytealign_S (w2[0], w2[1], offset); - w2[0] = amd_bytealign_S (w1[3], w2[0], offset); - w1[3] = amd_bytealign_S (w1[2], w1[3], offset); - w1[2] = amd_bytealign_S (w1[1], w1[2], offset); - w1[1] = amd_bytealign_S (w1[0], w1[1], offset); - w1[0] = amd_bytealign_S (w0[3], w1[0], offset); - w0[3] = amd_bytealign_S (w0[2], w0[3], offset); - w0[2] = amd_bytealign_S (w0[1], w0[2], offset); - w0[1] = amd_bytealign_S (w0[0], w0[1], offset); - w0[0] = amd_bytealign_S ( 0, w0[0], offset); - - break; - - case 1: - w3[3] = amd_bytealign_S (w3[1], w3[2], offset); - w3[2] = amd_bytealign_S (w3[0], w3[1], offset); - w3[1] = amd_bytealign_S (w2[3], w3[0], offset); - w3[0] = amd_bytealign_S (w2[2], w2[3], offset); - w2[3] = amd_bytealign_S (w2[1], w2[2], offset); - w2[2] = amd_bytealign_S (w2[0], w2[1], offset); - w2[1] = amd_bytealign_S (w1[3], w2[0], offset); - w2[0] = amd_bytealign_S (w1[2], w1[3], offset); - w1[3] = amd_bytealign_S (w1[1], w1[2], offset); - w1[2] = amd_bytealign_S (w1[0], w1[1], offset); - w1[1] = amd_bytealign_S (w0[3], w1[0], offset); - w1[0] = amd_bytealign_S (w0[2], w0[3], offset); - w0[3] = amd_bytealign_S (w0[1], w0[2], offset); - w0[2] = amd_bytealign_S (w0[0], w0[1], offset); - w0[1] = amd_bytealign_S ( 0, w0[0], offset); - w0[0] = 0; - break; - case 2: - w3[3] = amd_bytealign_S (w3[0], w3[1], offset); - w3[2] = amd_bytealign_S (w2[3], w3[0], offset); - w3[1] = amd_bytealign_S (w2[2], w2[3], offset); - w3[0] = amd_bytealign_S (w2[1], w2[2], offset); - w2[3] = amd_bytealign_S (w2[0], w2[1], offset); - w2[2] = amd_bytealign_S (w1[3], w2[0], offset); - w2[1] = amd_bytealign_S (w1[2], w1[3], offset); - w2[0] = amd_bytealign_S (w1[1], w1[2], offset); - w1[3] = amd_bytealign_S (w1[0], w1[1], offset); - w1[2] = amd_bytealign_S (w0[3], w1[0], offset); - w1[1] = amd_bytealign_S (w0[2], w0[3], offset); - w1[0] = amd_bytealign_S (w0[1], w0[2], offset); - w0[3] = amd_bytealign_S (w0[0], w0[1], offset); - w0[2] = amd_bytealign_S ( 0, w0[0], offset); + case 2: + w7[3] = amd_bytealign_S (w7[1], w7[0], offset_minus_4); + w7[2] = amd_bytealign_S (w7[0], w6[3], offset_minus_4); + w7[1] = amd_bytealign_S (w6[3], w6[2], offset_minus_4); + w7[0] = amd_bytealign_S (w6[2], w6[1], offset_minus_4); + w6[3] = amd_bytealign_S (w6[1], w6[0], offset_minus_4); + w6[2] = amd_bytealign_S (w6[0], w5[3], offset_minus_4); + w6[1] = amd_bytealign_S (w5[3], w5[2], offset_minus_4); + w6[0] = amd_bytealign_S (w5[2], w5[1], offset_minus_4); + w5[3] = amd_bytealign_S (w5[1], w5[0], offset_minus_4); + w5[2] = amd_bytealign_S (w5[0], w5[3], offset_minus_4); + w5[1] = amd_bytealign_S (w4[3], w4[2], offset_minus_4); + w5[0] = amd_bytealign_S (w4[2], w4[1], offset_minus_4); + w4[3] = amd_bytealign_S (w4[1], w4[0], offset_minus_4); + w4[2] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); + w4[1] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); + w4[0] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); + w3[3] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); + w3[2] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); + w3[1] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); + w3[0] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); + w2[3] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); + w2[2] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); + w2[1] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); + w2[0] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); + w1[3] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); + w1[2] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); + w1[1] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); + w1[0] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); + w0[3] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); + w0[2] = amd_bytealign_S (w0[0], 0, offset_minus_4); w0[1] = 0; w0[0] = 0; - break; - - case 3: - w3[3] = amd_bytealign_S (w2[3], w3[0], offset); - w3[2] = amd_bytealign_S (w2[2], w2[3], offset); - w3[1] = amd_bytealign_S (w2[1], w2[2], offset); - w3[0] = amd_bytealign_S (w2[0], w2[1], offset); - w2[3] = amd_bytealign_S (w1[3], w2[0], offset); - w2[2] = amd_bytealign_S (w1[2], w1[3], offset); - w2[1] = amd_bytealign_S (w1[1], w1[2], offset); - w2[0] = amd_bytealign_S (w1[0], w1[1], offset); - w1[3] = amd_bytealign_S (w0[3], w1[0], offset); - w1[2] = amd_bytealign_S (w0[2], w0[3], offset); - w1[1] = amd_bytealign_S (w0[1], w0[2], offset); - w1[0] = amd_bytealign_S (w0[0], w0[1], offset); - w0[3] = amd_bytealign_S ( 0, w0[0], offset); + if (offset_mod_4 == 0) + { + w0[2] = w0[3]; + w0[3] = w1[0]; + w1[0] = w1[1]; + w1[1] = w1[2]; + w1[2] = w1[3]; + w1[3] = w2[0]; + w2[0] = w2[1]; + w2[1] = w2[2]; + w2[2] = w2[3]; + w2[3] = w3[0]; + w3[0] = w3[1]; + w3[1] = w3[2]; + w3[2] = w3[3]; + w3[3] = w4[0]; + w4[0] = w4[1]; + w4[1] = w4[2]; + w4[2] = w4[3]; + w4[3] = w5[0]; + w5[0] = w5[1]; + w5[1] = w5[2]; + w5[2] = w5[3]; + w5[3] = w6[0]; + w6[0] = w6[1]; + w6[1] = w6[2]; + w6[2] = w6[3]; + w6[3] = w7[0]; + w7[0] = w7[1]; + w7[1] = w7[2]; + w7[2] = w7[3]; + w7[3] = 0; + } + + break; + + case 3: + w7[3] = amd_bytealign_S (w7[0], w6[3], offset_minus_4); + w7[2] = amd_bytealign_S (w6[3], w6[2], offset_minus_4); + w7[1] = amd_bytealign_S (w6[2], w6[1], offset_minus_4); + w7[0] = amd_bytealign_S (w6[1], w6[0], offset_minus_4); + w6[3] = amd_bytealign_S (w6[0], w5[3], offset_minus_4); + w6[2] = amd_bytealign_S (w5[3], w5[2], offset_minus_4); + w6[1] = amd_bytealign_S (w5[2], w5[1], offset_minus_4); + w6[0] = amd_bytealign_S (w5[1], w5[0], offset_minus_4); + w5[3] = amd_bytealign_S (w5[0], w5[3], offset_minus_4); + w5[2] = amd_bytealign_S (w4[3], w4[2], offset_minus_4); + w5[1] = amd_bytealign_S (w4[2], w4[1], offset_minus_4); + w5[0] = amd_bytealign_S (w4[1], w4[0], offset_minus_4); + w4[3] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); + w4[2] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); + w4[1] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); + w4[0] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); + w3[3] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); + w3[2] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); + w3[1] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); + w3[0] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); + w2[3] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); + w2[2] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); + w2[1] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); + w2[0] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); + w1[3] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); + w1[2] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); + w1[1] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); + w1[0] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); + w0[3] = amd_bytealign_S (w0[0], 0, offset_minus_4); w0[2] = 0; w0[1] = 0; w0[0] = 0; + if (offset_mod_4 == 0) + { + w0[3] = w1[0]; + w1[0] = w1[1]; + w1[1] = w1[2]; + w1[2] = w1[3]; + w1[3] = w2[0]; + w2[0] = w2[1]; + w2[1] = w2[2]; + w2[2] = w2[3]; + w2[3] = w3[0]; + w3[0] = w3[1]; + w3[1] = w3[2]; + w3[2] = w3[3]; + w3[3] = w4[0]; + w4[0] = w4[1]; + w4[1] = w4[2]; + w4[2] = w4[3]; + w4[3] = w5[0]; + w5[0] = w5[1]; + w5[1] = w5[2]; + w5[2] = w5[3]; + w5[3] = w6[0]; + w6[0] = w6[1]; + w6[1] = w6[2]; + w6[2] = w6[3]; + w6[3] = w7[0]; + w7[0] = w7[1]; + w7[1] = w7[2]; + w7[2] = w7[3]; + w7[3] = 0; + } + break; - case 4: - w3[3] = amd_bytealign_S (w2[2], w2[3], offset); - w3[2] = amd_bytealign_S (w2[1], w2[2], offset); - w3[1] = amd_bytealign_S (w2[0], w2[1], offset); - w3[0] = amd_bytealign_S (w1[3], w2[0], offset); - w2[3] = amd_bytealign_S (w1[2], w1[3], offset); - w2[2] = amd_bytealign_S (w1[1], w1[2], offset); - w2[1] = amd_bytealign_S (w1[0], w1[1], offset); - w2[0] = amd_bytealign_S (w0[3], w1[0], offset); - w1[3] = amd_bytealign_S (w0[2], w0[3], offset); - w1[2] = amd_bytealign_S (w0[1], w0[2], offset); - w1[1] = amd_bytealign_S (w0[0], w0[1], offset); - w1[0] = amd_bytealign_S ( 0, w0[0], offset); + case 4: + w7[3] = amd_bytealign_S (w6[3], w6[2], offset_minus_4); + w7[2] = amd_bytealign_S (w6[2], w6[1], offset_minus_4); + w7[1] = amd_bytealign_S (w6[1], w6[0], offset_minus_4); + w7[0] = amd_bytealign_S (w6[0], w5[3], offset_minus_4); + w6[3] = amd_bytealign_S (w5[3], w5[2], offset_minus_4); + w6[2] = amd_bytealign_S (w5[2], w5[1], offset_minus_4); + w6[1] = amd_bytealign_S (w5[1], w5[0], offset_minus_4); + w6[0] = amd_bytealign_S (w5[0], w5[3], offset_minus_4); + w5[3] = amd_bytealign_S (w4[3], w4[2], offset_minus_4); + w5[2] = amd_bytealign_S (w4[2], w4[1], offset_minus_4); + w5[1] = amd_bytealign_S (w4[1], w4[0], offset_minus_4); + w5[0] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); + w4[3] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); + w4[2] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); + w4[1] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); + w4[0] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); + w3[3] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); + w3[2] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); + w3[1] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); + w3[0] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); + w2[3] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); + w2[2] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); + w2[1] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); + w2[0] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); + w1[3] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); + w1[2] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); + w1[1] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); + w1[0] = amd_bytealign_S (w0[0], 0, offset_minus_4); w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; + if (offset_mod_4 == 0) + { + w1[0] = w1[1]; + w1[1] = w1[2]; + w1[2] = w1[3]; + w1[3] = w2[0]; + w2[0] = w2[1]; + w2[1] = w2[2]; + w2[2] = w2[3]; + w2[3] = w3[0]; + w3[0] = w3[1]; + w3[1] = w3[2]; + w3[2] = w3[3]; + w3[3] = w4[0]; + w4[0] = w4[1]; + w4[1] = w4[2]; + w4[2] = w4[3]; + w4[3] = w5[0]; + w5[0] = w5[1]; + w5[1] = w5[2]; + w5[2] = w5[3]; + w5[3] = w6[0]; + w6[0] = w6[1]; + w6[1] = w6[2]; + w6[2] = w6[3]; + w6[3] = w7[0]; + w7[0] = w7[1]; + w7[1] = w7[2]; + w7[2] = w7[3]; + w7[3] = 0; + } + break; - case 5: - w3[3] = amd_bytealign_S (w2[1], w2[2], offset); - w3[2] = amd_bytealign_S (w2[0], w2[1], offset); - w3[1] = amd_bytealign_S (w1[3], w2[0], offset); - w3[0] = amd_bytealign_S (w1[2], w1[3], offset); - w2[3] = amd_bytealign_S (w1[1], w1[2], offset); - w2[2] = amd_bytealign_S (w1[0], w1[1], offset); - w2[1] = amd_bytealign_S (w0[3], w1[0], offset); - w2[0] = amd_bytealign_S (w0[2], w0[3], offset); - w1[3] = amd_bytealign_S (w0[1], w0[2], offset); - w1[2] = amd_bytealign_S (w0[0], w0[1], offset); - w1[1] = amd_bytealign_S ( 0, w0[0], offset); + case 5: + w7[3] = amd_bytealign_S (w6[2], w6[1], offset_minus_4); + w7[2] = amd_bytealign_S (w6[1], w6[0], offset_minus_4); + w7[1] = amd_bytealign_S (w6[0], w5[3], offset_minus_4); + w7[0] = amd_bytealign_S (w5[3], w5[2], offset_minus_4); + w6[3] = amd_bytealign_S (w5[2], w5[1], offset_minus_4); + w6[2] = amd_bytealign_S (w5[1], w5[0], offset_minus_4); + w6[1] = amd_bytealign_S (w5[0], w5[3], offset_minus_4); + w6[0] = amd_bytealign_S (w4[3], w4[2], offset_minus_4); + w5[3] = amd_bytealign_S (w4[2], w4[1], offset_minus_4); + w5[2] = amd_bytealign_S (w4[1], w4[0], offset_minus_4); + w5[1] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); + w5[0] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); + w4[3] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); + w4[2] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); + w4[1] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); + w4[0] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); + w3[3] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); + w3[2] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); + w3[1] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); + w3[0] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); + w2[3] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); + w2[2] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); + w2[1] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); + w2[0] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); + w1[3] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); + w1[2] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); + w1[1] = amd_bytealign_S (w0[0], 0, offset_minus_4); w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; + if (offset_mod_4 == 0) + { + w1[1] = w1[2]; + w1[2] = w1[3]; + w1[3] = w2[0]; + w2[0] = w2[1]; + w2[1] = w2[2]; + w2[2] = w2[3]; + w2[3] = w3[0]; + w3[0] = w3[1]; + w3[1] = w3[2]; + w3[2] = w3[3]; + w3[3] = w4[0]; + w4[0] = w4[1]; + w4[1] = w4[2]; + w4[2] = w4[3]; + w4[3] = w5[0]; + w5[0] = w5[1]; + w5[1] = w5[2]; + w5[2] = w5[3]; + w5[3] = w6[0]; + w6[0] = w6[1]; + w6[1] = w6[2]; + w6[2] = w6[3]; + w6[3] = w7[0]; + w7[0] = w7[1]; + w7[1] = w7[2]; + w7[2] = w7[3]; + w7[3] = 0; + } + break; - case 6: - w3[3] = amd_bytealign_S (w2[0], w2[1], offset); - w3[2] = amd_bytealign_S (w1[3], w2[0], offset); - w3[1] = amd_bytealign_S (w1[2], w1[3], offset); - w3[0] = amd_bytealign_S (w1[1], w1[2], offset); - w2[3] = amd_bytealign_S (w1[0], w1[1], offset); - w2[2] = amd_bytealign_S (w0[3], w1[0], offset); - w2[1] = amd_bytealign_S (w0[2], w0[3], offset); - w2[0] = amd_bytealign_S (w0[1], w0[2], offset); - w1[3] = amd_bytealign_S (w0[0], w0[1], offset); - w1[2] = amd_bytealign_S ( 0, w0[0], offset); + case 6: + w7[3] = amd_bytealign_S (w6[1], w6[0], offset_minus_4); + w7[2] = amd_bytealign_S (w6[0], w5[3], offset_minus_4); + w7[1] = amd_bytealign_S (w5[3], w5[2], offset_minus_4); + w7[0] = amd_bytealign_S (w5[2], w5[1], offset_minus_4); + w6[3] = amd_bytealign_S (w5[1], w5[0], offset_minus_4); + w6[2] = amd_bytealign_S (w5[0], w5[3], offset_minus_4); + w6[1] = amd_bytealign_S (w4[3], w4[2], offset_minus_4); + w6[0] = amd_bytealign_S (w4[2], w4[1], offset_minus_4); + w5[3] = amd_bytealign_S (w4[1], w4[0], offset_minus_4); + w5[2] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); + w5[1] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); + w5[0] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); + w4[3] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); + w4[2] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); + w4[1] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); + w4[0] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); + w3[3] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); + w3[2] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); + w3[1] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); + w3[0] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); + w2[3] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); + w2[2] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); + w2[1] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); + w2[0] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); + w1[3] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); + w1[2] = amd_bytealign_S (w0[0], 0, offset_minus_4); w1[1] = 0; w1[0] = 0; w0[3] = 0; @@ -8528,18 +10956,64 @@ inline void switch_buffer_by_offset_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w w0[1] = 0; w0[0] = 0; + if (offset_mod_4 == 0) + { + w1[2] = w1[3]; + w1[3] = w2[0]; + w2[0] = w2[1]; + w2[1] = w2[2]; + w2[2] = w2[3]; + w2[3] = w3[0]; + w3[0] = w3[1]; + w3[1] = w3[2]; + w3[2] = w3[3]; + w3[3] = w4[0]; + w4[0] = w4[1]; + w4[1] = w4[2]; + w4[2] = w4[3]; + w4[3] = w5[0]; + w5[0] = w5[1]; + w5[1] = w5[2]; + w5[2] = w5[3]; + w5[3] = w6[0]; + w6[0] = w6[1]; + w6[1] = w6[2]; + w6[2] = w6[3]; + w6[3] = w7[0]; + w7[0] = w7[1]; + w7[1] = w7[2]; + w7[2] = w7[3]; + w7[3] = 0; + } + break; - case 7: - w3[3] = amd_bytealign_S (w1[3], w2[0], offset); - w3[2] = amd_bytealign_S (w1[2], w1[3], offset); - w3[1] = amd_bytealign_S (w1[1], w1[2], offset); - w3[0] = amd_bytealign_S (w1[0], w1[1], offset); - w2[3] = amd_bytealign_S (w0[3], w1[0], offset); - w2[2] = amd_bytealign_S (w0[2], w0[3], offset); - w2[1] = amd_bytealign_S (w0[1], w0[2], offset); - w2[0] = amd_bytealign_S (w0[0], w0[1], offset); - w1[3] = amd_bytealign_S ( 0, w0[0], offset); + case 7: + w7[3] = amd_bytealign_S (w6[0], w5[3], offset_minus_4); + w7[2] = amd_bytealign_S (w5[3], w5[2], offset_minus_4); + w7[1] = amd_bytealign_S (w5[2], w5[1], offset_minus_4); + w7[0] = amd_bytealign_S (w5[1], w5[0], offset_minus_4); + w6[3] = amd_bytealign_S (w5[0], w5[3], offset_minus_4); + w6[2] = amd_bytealign_S (w4[3], w4[2], offset_minus_4); + w6[1] = amd_bytealign_S (w4[2], w4[1], offset_minus_4); + w6[0] = amd_bytealign_S (w4[1], w4[0], offset_minus_4); + w5[3] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); + w5[2] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); + w5[1] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); + w5[0] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); + w4[3] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); + w4[2] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); + w4[1] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); + w4[0] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); + w3[3] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); + w3[2] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); + w3[1] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); + w3[0] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); + w2[3] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); + w2[2] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); + w2[1] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); + w2[0] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); + w1[3] = amd_bytealign_S (w0[0], 0, offset_minus_4); w1[2] = 0; w1[1] = 0; w1[0] = 0; @@ -8548,17 +11022,62 @@ inline void switch_buffer_by_offset_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w w0[1] = 0; w0[0] = 0; + if (offset_mod_4 == 0) + { + w1[3] = w2[0]; + w2[0] = w2[1]; + w2[1] = w2[2]; + w2[2] = w2[3]; + w2[3] = w3[0]; + w3[0] = w3[1]; + w3[1] = w3[2]; + w3[2] = w3[3]; + w3[3] = w4[0]; + w4[0] = w4[1]; + w4[1] = w4[2]; + w4[2] = w4[3]; + w4[3] = w5[0]; + w5[0] = w5[1]; + w5[1] = w5[2]; + w5[2] = w5[3]; + w5[3] = w6[0]; + w6[0] = w6[1]; + w6[1] = w6[2]; + w6[2] = w6[3]; + w6[3] = w7[0]; + w7[0] = w7[1]; + w7[1] = w7[2]; + w7[2] = w7[3]; + w7[3] = 0; + } + break; - case 8: - w3[3] = amd_bytealign_S (w1[2], w1[3], offset); - w3[2] = amd_bytealign_S (w1[1], w1[2], offset); - w3[1] = amd_bytealign_S (w1[0], w1[1], offset); - w3[0] = amd_bytealign_S (w0[3], w1[0], offset); - w2[3] = amd_bytealign_S (w0[2], w0[3], offset); - w2[2] = amd_bytealign_S (w0[1], w0[2], offset); - w2[1] = amd_bytealign_S (w0[0], w0[1], offset); - w2[0] = amd_bytealign_S ( 0, w0[0], offset); + case 8: + w7[3] = amd_bytealign_S (w5[3], w5[2], offset_minus_4); + w7[2] = amd_bytealign_S (w5[2], w5[1], offset_minus_4); + w7[1] = amd_bytealign_S (w5[1], w5[0], offset_minus_4); + w7[0] = amd_bytealign_S (w5[0], w5[3], offset_minus_4); + w6[3] = amd_bytealign_S (w4[3], w4[2], offset_minus_4); + w6[2] = amd_bytealign_S (w4[2], w4[1], offset_minus_4); + w6[1] = amd_bytealign_S (w4[1], w4[0], offset_minus_4); + w6[0] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); + w5[3] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); + w5[2] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); + w5[1] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); + w5[0] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); + w4[3] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); + w4[2] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); + w4[1] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); + w4[0] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); + w3[3] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); + w3[2] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); + w3[1] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); + w3[0] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); + w2[3] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); + w2[2] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); + w2[1] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); + w2[0] = amd_bytealign_S (w0[0], 0, offset_minus_4); w1[3] = 0; w1[2] = 0; w1[1] = 0; @@ -8568,16 +11087,60 @@ inline void switch_buffer_by_offset_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w w0[1] = 0; w0[0] = 0; + if (offset_mod_4 == 0) + { + w2[0] = w2[1]; + w2[1] = w2[2]; + w2[2] = w2[3]; + w2[3] = w3[0]; + w3[0] = w3[1]; + w3[1] = w3[2]; + w3[2] = w3[3]; + w3[3] = w4[0]; + w4[0] = w4[1]; + w4[1] = w4[2]; + w4[2] = w4[3]; + w4[3] = w5[0]; + w5[0] = w5[1]; + w5[1] = w5[2]; + w5[2] = w5[3]; + w5[3] = w6[0]; + w6[0] = w6[1]; + w6[1] = w6[2]; + w6[2] = w6[3]; + w6[3] = w7[0]; + w7[0] = w7[1]; + w7[1] = w7[2]; + w7[2] = w7[3]; + w7[3] = 0; + } + break; - case 9: - w3[3] = amd_bytealign_S (w1[1], w1[2], offset); - w3[2] = amd_bytealign_S (w1[0], w1[1], offset); - w3[1] = amd_bytealign_S (w0[3], w1[0], offset); - w3[0] = amd_bytealign_S (w0[2], w0[3], offset); - w2[3] = amd_bytealign_S (w0[1], w0[2], offset); - w2[2] = amd_bytealign_S (w0[0], w0[1], offset); - w2[1] = amd_bytealign_S ( 0, w0[0], offset); + case 9: + w7[3] = amd_bytealign_S (w5[2], w5[1], offset_minus_4); + w7[2] = amd_bytealign_S (w5[1], w5[0], offset_minus_4); + w7[1] = amd_bytealign_S (w5[0], w5[3], offset_minus_4); + w7[0] = amd_bytealign_S (w4[3], w4[2], offset_minus_4); + w6[3] = amd_bytealign_S (w4[2], w4[1], offset_minus_4); + w6[2] = amd_bytealign_S (w4[1], w4[0], offset_minus_4); + w6[1] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); + w6[0] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); + w5[3] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); + w5[2] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); + w5[1] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); + w5[0] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); + w4[3] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); + w4[2] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); + w4[1] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); + w4[0] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); + w3[3] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); + w3[2] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); + w3[1] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); + w3[0] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); + w2[3] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); + w2[2] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); + w2[1] = amd_bytealign_S (w0[0], 0, offset_minus_4); w2[0] = 0; w1[3] = 0; w1[2] = 0; @@ -8588,15 +11151,58 @@ inline void switch_buffer_by_offset_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w w0[1] = 0; w0[0] = 0; + if (offset_mod_4 == 0) + { + w2[1] = w2[2]; + w2[2] = w2[3]; + w2[3] = w3[0]; + w3[0] = w3[1]; + w3[1] = w3[2]; + w3[2] = w3[3]; + w3[3] = w4[0]; + w4[0] = w4[1]; + w4[1] = w4[2]; + w4[2] = w4[3]; + w4[3] = w5[0]; + w5[0] = w5[1]; + w5[1] = w5[2]; + w5[2] = w5[3]; + w5[3] = w6[0]; + w6[0] = w6[1]; + w6[1] = w6[2]; + w6[2] = w6[3]; + w6[3] = w7[0]; + w7[0] = w7[1]; + w7[1] = w7[2]; + w7[2] = w7[3]; + w7[3] = 0; + } + break; case 10: - w3[3] = amd_bytealign_S (w1[0], w1[1], offset); - w3[2] = amd_bytealign_S (w0[3], w1[0], offset); - w3[1] = amd_bytealign_S (w0[2], w0[3], offset); - w3[0] = amd_bytealign_S (w0[1], w0[2], offset); - w2[3] = amd_bytealign_S (w0[0], w0[1], offset); - w2[2] = amd_bytealign_S ( 0, w0[0], offset); + w7[3] = amd_bytealign_S (w5[1], w5[0], offset_minus_4); + w7[2] = amd_bytealign_S (w5[0], w5[3], offset_minus_4); + w7[1] = amd_bytealign_S (w4[3], w4[2], offset_minus_4); + w7[0] = amd_bytealign_S (w4[2], w4[1], offset_minus_4); + w6[3] = amd_bytealign_S (w4[1], w4[0], offset_minus_4); + w6[2] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); + w6[1] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); + w6[0] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); + w5[3] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); + w5[2] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); + w5[1] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); + w5[0] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); + w4[3] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); + w4[2] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); + w4[1] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); + w4[0] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); + w3[3] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); + w3[2] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); + w3[1] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); + w3[0] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); + w2[3] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); + w2[2] = amd_bytealign_S (w0[0], 0, offset_minus_4); w2[1] = 0; w2[0] = 0; w1[3] = 0; @@ -8608,14 +11214,56 @@ inline void switch_buffer_by_offset_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w w0[1] = 0; w0[0] = 0; + if (offset_mod_4 == 0) + { + w2[2] = w2[3]; + w2[3] = w3[0]; + w3[0] = w3[1]; + w3[1] = w3[2]; + w3[2] = w3[3]; + w3[3] = w4[0]; + w4[0] = w4[1]; + w4[1] = w4[2]; + w4[2] = w4[3]; + w4[3] = w5[0]; + w5[0] = w5[1]; + w5[1] = w5[2]; + w5[2] = w5[3]; + w5[3] = w6[0]; + w6[0] = w6[1]; + w6[1] = w6[2]; + w6[2] = w6[3]; + w6[3] = w7[0]; + w7[0] = w7[1]; + w7[1] = w7[2]; + w7[2] = w7[3]; + w7[3] = 0; + } + break; case 11: - w3[3] = amd_bytealign_S (w0[3], w1[0], offset); - w3[2] = amd_bytealign_S (w0[2], w0[3], offset); - w3[1] = amd_bytealign_S (w0[1], w0[2], offset); - w3[0] = amd_bytealign_S (w0[0], w0[1], offset); - w2[3] = amd_bytealign_S ( 0, w0[0], offset); + w7[3] = amd_bytealign_S (w5[0], w5[3], offset_minus_4); + w7[2] = amd_bytealign_S (w4[3], w4[2], offset_minus_4); + w7[1] = amd_bytealign_S (w4[2], w4[1], offset_minus_4); + w7[0] = amd_bytealign_S (w4[1], w4[0], offset_minus_4); + w6[3] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); + w6[2] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); + w6[1] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); + w6[0] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); + w5[3] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); + w5[2] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); + w5[1] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); + w5[0] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); + w4[3] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); + w4[2] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); + w4[1] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); + w4[0] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); + w3[3] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); + w3[2] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); + w3[1] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); + w3[0] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); + w2[3] = amd_bytealign_S (w0[0], 0, offset_minus_4); w2[2] = 0; w2[1] = 0; w2[0] = 0; @@ -8628,13 +11276,54 @@ inline void switch_buffer_by_offset_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w w0[1] = 0; w0[0] = 0; + if (offset_mod_4 == 0) + { + w2[3] = w3[0]; + w3[0] = w3[1]; + w3[1] = w3[2]; + w3[2] = w3[3]; + w3[3] = w4[0]; + w4[0] = w4[1]; + w4[1] = w4[2]; + w4[2] = w4[3]; + w4[3] = w5[0]; + w5[0] = w5[1]; + w5[1] = w5[2]; + w5[2] = w5[3]; + w5[3] = w6[0]; + w6[0] = w6[1]; + w6[1] = w6[2]; + w6[2] = w6[3]; + w6[3] = w7[0]; + w7[0] = w7[1]; + w7[1] = w7[2]; + w7[2] = w7[3]; + w7[3] = 0; + } + break; case 12: - w3[3] = amd_bytealign_S (w0[2], w0[3], offset); - w3[2] = amd_bytealign_S (w0[1], w0[2], offset); - w3[1] = amd_bytealign_S (w0[0], w0[1], offset); - w3[0] = amd_bytealign_S ( 0, w0[0], offset); + w7[3] = amd_bytealign_S (w4[3], w4[2], offset_minus_4); + w7[2] = amd_bytealign_S (w4[2], w4[1], offset_minus_4); + w7[1] = amd_bytealign_S (w4[1], w4[0], offset_minus_4); + w7[0] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); + w6[3] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); + w6[2] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); + w6[1] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); + w6[0] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); + w5[3] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); + w5[2] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); + w5[1] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); + w5[0] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); + w4[3] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); + w4[2] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); + w4[1] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); + w4[0] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); + w3[3] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); + w3[2] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); + w3[1] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); + w3[0] = amd_bytealign_S (w0[0], 0, offset_minus_4); w2[3] = 0; w2[2] = 0; w2[1] = 0; @@ -8648,12 +11337,52 @@ inline void switch_buffer_by_offset_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w w0[1] = 0; w0[0] = 0; + if (offset_mod_4 == 0) + { + w3[0] = w3[1]; + w3[1] = w3[2]; + w3[2] = w3[3]; + w3[3] = w4[0]; + w4[0] = w4[1]; + w4[1] = w4[2]; + w4[2] = w4[3]; + w4[3] = w5[0]; + w5[0] = w5[1]; + w5[1] = w5[2]; + w5[2] = w5[3]; + w5[3] = w6[0]; + w6[0] = w6[1]; + w6[1] = w6[2]; + w6[2] = w6[3]; + w6[3] = w7[0]; + w7[0] = w7[1]; + w7[1] = w7[2]; + w7[2] = w7[3]; + w7[3] = 0; + } + break; case 13: - w3[3] = amd_bytealign_S (w0[1], w0[2], offset); - w3[2] = amd_bytealign_S (w0[0], w0[1], offset); - w3[1] = amd_bytealign_S ( 0, w0[0], offset); + w7[3] = amd_bytealign_S (w4[2], w4[1], offset_minus_4); + w7[2] = amd_bytealign_S (w4[1], w4[0], offset_minus_4); + w7[1] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); + w7[0] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); + w6[3] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); + w6[2] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); + w6[1] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); + w6[0] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); + w5[3] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); + w5[2] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); + w5[1] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); + w5[0] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); + w4[3] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); + w4[2] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); + w4[1] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); + w4[0] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); + w3[3] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); + w3[2] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); + w3[1] = amd_bytealign_S (w0[0], 0, offset_minus_4); w3[0] = 0; w2[3] = 0; w2[2] = 0; @@ -8668,11 +11397,50 @@ inline void switch_buffer_by_offset_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w w0[1] = 0; w0[0] = 0; + if (offset_mod_4 == 0) + { + w3[1] = w3[2]; + w3[2] = w3[3]; + w3[3] = w4[0]; + w4[0] = w4[1]; + w4[1] = w4[2]; + w4[2] = w4[3]; + w4[3] = w5[0]; + w5[0] = w5[1]; + w5[1] = w5[2]; + w5[2] = w5[3]; + w5[3] = w6[0]; + w6[0] = w6[1]; + w6[1] = w6[2]; + w6[2] = w6[3]; + w6[3] = w7[0]; + w7[0] = w7[1]; + w7[1] = w7[2]; + w7[2] = w7[3]; + w7[3] = 0; + } + break; case 14: - w3[3] = amd_bytealign_S (w0[0], w0[1], offset); - w3[2] = amd_bytealign_S ( 0, w0[0], offset); + w7[3] = amd_bytealign_S (w4[1], w4[0], offset_minus_4); + w7[2] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); + w7[1] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); + w7[0] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); + w6[3] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); + w6[2] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); + w6[1] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); + w6[0] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); + w5[3] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); + w5[2] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); + w5[1] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); + w5[0] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); + w4[3] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); + w4[2] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); + w4[1] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); + w4[0] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); + w3[3] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); + w3[2] = amd_bytealign_S (w0[0], 0, offset_minus_4); w3[1] = 0; w3[0] = 0; w2[3] = 0; @@ -8688,10 +11456,48 @@ inline void switch_buffer_by_offset_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w w0[1] = 0; w0[0] = 0; + if (offset_mod_4 == 0) + { + w3[2] = w3[3]; + w3[3] = w4[0]; + w4[0] = w4[1]; + w4[1] = w4[2]; + w4[2] = w4[3]; + w4[3] = w5[0]; + w5[0] = w5[1]; + w5[1] = w5[2]; + w5[2] = w5[3]; + w5[3] = w6[0]; + w6[0] = w6[1]; + w6[1] = w6[2]; + w6[2] = w6[3]; + w6[3] = w7[0]; + w7[0] = w7[1]; + w7[1] = w7[2]; + w7[2] = w7[3]; + w7[3] = 0; + } + break; case 15: - w3[3] = amd_bytealign_S ( 0, w0[0], offset); + w7[3] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); + w7[2] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); + w7[1] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); + w7[0] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); + w6[3] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); + w6[2] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); + w6[1] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); + w6[0] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); + w5[3] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); + w5[2] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); + w5[1] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); + w5[0] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); + w4[3] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); + w4[2] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); + w4[1] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); + w4[0] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); + w3[3] = amd_bytealign_S (w0[0], 0, offset_minus_4); w3[2] = 0; w3[1] = 0; w3[0] = 0; @@ -8708,165 +11514,309 @@ inline void switch_buffer_by_offset_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w w0[1] = 0; w0[0] = 0; - break; - } - #endif - - #ifdef IS_NV - const int selector = (0x76543210 >> ((offset & 3) * 4)) & 0xffff; - - switch (offset / 4) - { - case 0: - w3[3] = __byte_perm_S (w3[3], w3[2], selector); - w3[2] = __byte_perm_S (w3[2], w3[1], selector); - w3[1] = __byte_perm_S (w3[1], w3[0], selector); - w3[0] = __byte_perm_S (w3[0], w2[3], selector); - w2[3] = __byte_perm_S (w2[3], w2[2], selector); - w2[2] = __byte_perm_S (w2[2], w2[1], selector); - w2[1] = __byte_perm_S (w2[1], w2[0], selector); - w2[0] = __byte_perm_S (w2[0], w1[3], selector); - w1[3] = __byte_perm_S (w1[3], w1[2], selector); - w1[2] = __byte_perm_S (w1[2], w1[1], selector); - w1[1] = __byte_perm_S (w1[1], w1[0], selector); - w1[0] = __byte_perm_S (w1[0], w0[3], selector); - w0[3] = __byte_perm_S (w0[3], w0[2], selector); - w0[2] = __byte_perm_S (w0[2], w0[1], selector); - w0[1] = __byte_perm_S (w0[1], w0[0], selector); - w0[0] = __byte_perm_S (w0[0], 0, selector); + if (offset_mod_4 == 0) + { + w3[3] = w4[0]; + w4[0] = w4[1]; + w4[1] = w4[2]; + w4[2] = w4[3]; + w4[3] = w5[0]; + w5[0] = w5[1]; + w5[1] = w5[2]; + w5[2] = w5[3]; + w5[3] = w6[0]; + w6[0] = w6[1]; + w6[1] = w6[2]; + w6[2] = w6[3]; + w6[3] = w7[0]; + w7[0] = w7[1]; + w7[1] = w7[2]; + w7[2] = w7[3]; + w7[3] = 0; + } break; + } + #endif - case 1: - w3[3] = __byte_perm_S (w3[2], w3[1], selector); - w3[2] = __byte_perm_S (w3[1], w3[0], selector); - w3[1] = __byte_perm_S (w3[0], w2[3], selector); - w3[0] = __byte_perm_S (w2[3], w2[2], selector); - w2[3] = __byte_perm_S (w2[2], w2[1], selector); - w2[2] = __byte_perm_S (w2[1], w2[0], selector); - w2[1] = __byte_perm_S (w2[0], w1[3], selector); - w2[0] = __byte_perm_S (w1[3], w1[2], selector); - w1[3] = __byte_perm_S (w1[2], w1[1], selector); - w1[2] = __byte_perm_S (w1[1], w1[0], selector); - w1[1] = __byte_perm_S (w1[0], w0[3], selector); - w1[0] = __byte_perm_S (w0[3], w0[2], selector); - w0[3] = __byte_perm_S (w0[2], w0[1], selector); - w0[2] = __byte_perm_S (w0[1], w0[0], selector); - w0[1] = __byte_perm_S (w0[0], 0, selector); - w0[0] = 0; + #ifdef IS_NV + const int offset_minus_4 = 4 - (offset % 4); + const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; + + switch (offset / 4) + { + case 0: + w7[3] = __byte_perm_S (w7[2], w7[3], selector); + w7[2] = __byte_perm_S (w7[1], w7[2], selector); + w7[1] = __byte_perm_S (w7[0], w7[1], selector); + w7[0] = __byte_perm_S (w6[3], w7[0], selector); + w6[3] = __byte_perm_S (w6[2], w6[3], selector); + w6[2] = __byte_perm_S (w6[1], w6[2], selector); + w6[1] = __byte_perm_S (w6[0], w6[1], selector); + w6[0] = __byte_perm_S (w5[3], w6[0], selector); + w5[3] = __byte_perm_S (w5[2], w5[3], selector); + w5[2] = __byte_perm_S (w5[1], w5[2], selector); + w5[1] = __byte_perm_S (w5[0], w5[1], selector); + w5[0] = __byte_perm_S (w4[3], w5[0], selector); + w4[3] = __byte_perm_S (w4[2], w4[3], selector); + w4[2] = __byte_perm_S (w4[1], w4[2], selector); + w4[1] = __byte_perm_S (w4[0], w4[1], selector); + w4[0] = __byte_perm_S (w3[3], w4[0], selector); + w3[3] = __byte_perm_S (w3[2], w3[3], selector); + w3[2] = __byte_perm_S (w3[1], w3[2], selector); + w3[1] = __byte_perm_S (w3[0], w3[1], selector); + w3[0] = __byte_perm_S (w2[3], w3[0], selector); + w2[3] = __byte_perm_S (w2[2], w2[3], selector); + w2[2] = __byte_perm_S (w2[1], w2[2], selector); + w2[1] = __byte_perm_S (w2[0], w2[1], selector); + w2[0] = __byte_perm_S (w1[3], w2[0], selector); + w1[3] = __byte_perm_S (w1[2], w1[3], selector); + w1[2] = __byte_perm_S (w1[1], w1[2], selector); + w1[1] = __byte_perm_S (w1[0], w1[1], selector); + w1[0] = __byte_perm_S (w0[3], w1[0], selector); + w0[3] = __byte_perm_S (w0[2], w0[3], selector); + w0[2] = __byte_perm_S (w0[1], w0[2], selector); + w0[1] = __byte_perm_S (w0[0], w0[1], selector); + w0[0] = __byte_perm_S ( 0, w0[0], selector); break; - case 2: - w3[3] = __byte_perm_S (w3[1], w3[0], selector); - w3[2] = __byte_perm_S (w3[0], w2[3], selector); - w3[1] = __byte_perm_S (w2[3], w2[2], selector); - w3[0] = __byte_perm_S (w2[2], w2[1], selector); - w2[3] = __byte_perm_S (w2[1], w2[0], selector); - w2[2] = __byte_perm_S (w2[0], w1[3], selector); - w2[1] = __byte_perm_S (w1[3], w1[2], selector); - w2[0] = __byte_perm_S (w1[2], w1[1], selector); - w1[3] = __byte_perm_S (w1[1], w1[0], selector); - w1[2] = __byte_perm_S (w1[0], w0[3], selector); - w1[1] = __byte_perm_S (w0[3], w0[2], selector); - w1[0] = __byte_perm_S (w0[2], w0[1], selector); - w0[3] = __byte_perm_S (w0[1], w0[0], selector); - w0[2] = __byte_perm_S (w0[0], 0, selector); - w0[1] = 0; + case 1: + w7[3] = __byte_perm_S (w7[1], w7[2], selector); + w7[2] = __byte_perm_S (w7[0], w7[1], selector); + w7[1] = __byte_perm_S (w6[3], w7[0], selector); + w7[0] = __byte_perm_S (w6[2], w6[3], selector); + w6[3] = __byte_perm_S (w6[1], w6[2], selector); + w6[2] = __byte_perm_S (w6[0], w6[1], selector); + w6[1] = __byte_perm_S (w5[3], w6[0], selector); + w6[0] = __byte_perm_S (w5[2], w5[3], selector); + w5[3] = __byte_perm_S (w5[1], w5[2], selector); + w5[2] = __byte_perm_S (w5[0], w5[1], selector); + w5[1] = __byte_perm_S (w4[3], w5[0], selector); + w5[0] = __byte_perm_S (w4[2], w4[3], selector); + w4[3] = __byte_perm_S (w4[1], w4[2], selector); + w4[2] = __byte_perm_S (w4[0], w4[1], selector); + w4[1] = __byte_perm_S (w3[3], w4[0], selector); + w4[0] = __byte_perm_S (w3[2], w3[3], selector); + w3[3] = __byte_perm_S (w3[1], w3[2], selector); + w3[2] = __byte_perm_S (w3[0], w3[1], selector); + w3[1] = __byte_perm_S (w2[3], w3[0], selector); + w3[0] = __byte_perm_S (w2[2], w2[3], selector); + w2[3] = __byte_perm_S (w2[1], w2[2], selector); + w2[2] = __byte_perm_S (w2[0], w2[1], selector); + w2[1] = __byte_perm_S (w1[3], w2[0], selector); + w2[0] = __byte_perm_S (w1[2], w1[3], selector); + w1[3] = __byte_perm_S (w1[1], w1[2], selector); + w1[2] = __byte_perm_S (w1[0], w1[1], selector); + w1[1] = __byte_perm_S (w0[3], w1[0], selector); + w1[0] = __byte_perm_S (w0[2], w0[3], selector); + w0[3] = __byte_perm_S (w0[1], w0[2], selector); + w0[2] = __byte_perm_S (w0[0], w0[1], selector); + w0[1] = __byte_perm_S ( 0, w0[0], selector); w0[0] = 0; + break; + case 2: + w7[3] = __byte_perm_S (w7[0], w7[1], selector); + w7[2] = __byte_perm_S (w6[3], w7[0], selector); + w7[1] = __byte_perm_S (w6[2], w6[3], selector); + w7[0] = __byte_perm_S (w6[1], w6[2], selector); + w6[3] = __byte_perm_S (w6[0], w6[1], selector); + w6[2] = __byte_perm_S (w5[3], w6[0], selector); + w6[1] = __byte_perm_S (w5[2], w5[3], selector); + w6[0] = __byte_perm_S (w5[1], w5[2], selector); + w5[3] = __byte_perm_S (w5[0], w5[1], selector); + w5[2] = __byte_perm_S (w4[3], w5[0], selector); + w5[1] = __byte_perm_S (w4[2], w4[3], selector); + w5[0] = __byte_perm_S (w4[1], w4[2], selector); + w4[3] = __byte_perm_S (w4[0], w4[1], selector); + w4[2] = __byte_perm_S (w3[3], w4[0], selector); + w4[1] = __byte_perm_S (w3[2], w3[3], selector); + w4[0] = __byte_perm_S (w3[1], w3[2], selector); + w3[3] = __byte_perm_S (w3[0], w3[1], selector); + w3[2] = __byte_perm_S (w2[3], w3[0], selector); + w3[1] = __byte_perm_S (w2[2], w2[3], selector); + w3[0] = __byte_perm_S (w2[1], w2[2], selector); + w2[3] = __byte_perm_S (w2[0], w2[1], selector); + w2[2] = __byte_perm_S (w1[3], w2[0], selector); + w2[1] = __byte_perm_S (w1[2], w1[3], selector); + w2[0] = __byte_perm_S (w1[1], w1[2], selector); + w1[3] = __byte_perm_S (w1[0], w1[1], selector); + w1[2] = __byte_perm_S (w0[3], w1[0], selector); + w1[1] = __byte_perm_S (w0[2], w0[3], selector); + w1[0] = __byte_perm_S (w0[1], w0[2], selector); + w0[3] = __byte_perm_S (w0[0], w0[1], selector); + w0[2] = __byte_perm_S ( 0, w0[0], selector); + w0[1] = 0; + w0[0] = 0; break; - case 3: - w3[3] = __byte_perm_S (w3[0], w2[3], selector); - w3[2] = __byte_perm_S (w2[3], w2[2], selector); - w3[1] = __byte_perm_S (w2[2], w2[1], selector); - w3[0] = __byte_perm_S (w2[1], w2[0], selector); - w2[3] = __byte_perm_S (w2[0], w1[3], selector); - w2[2] = __byte_perm_S (w1[3], w1[2], selector); - w2[1] = __byte_perm_S (w1[2], w1[1], selector); - w2[0] = __byte_perm_S (w1[1], w1[0], selector); - w1[3] = __byte_perm_S (w1[0], w0[3], selector); - w1[2] = __byte_perm_S (w0[3], w0[2], selector); - w1[1] = __byte_perm_S (w0[2], w0[1], selector); - w1[0] = __byte_perm_S (w0[1], w0[0], selector); - w0[3] = __byte_perm_S (w0[0], 0, selector); + case 3: + w7[3] = __byte_perm_S (w6[3], w7[0], selector); + w7[2] = __byte_perm_S (w6[2], w6[3], selector); + w7[1] = __byte_perm_S (w6[1], w6[2], selector); + w7[0] = __byte_perm_S (w6[0], w6[1], selector); + w6[3] = __byte_perm_S (w5[3], w6[0], selector); + w6[2] = __byte_perm_S (w5[2], w5[3], selector); + w6[1] = __byte_perm_S (w5[1], w5[2], selector); + w6[0] = __byte_perm_S (w5[0], w5[1], selector); + w5[3] = __byte_perm_S (w4[3], w5[0], selector); + w5[2] = __byte_perm_S (w4[2], w4[3], selector); + w5[1] = __byte_perm_S (w4[1], w4[2], selector); + w5[0] = __byte_perm_S (w4[0], w4[1], selector); + w4[3] = __byte_perm_S (w3[3], w4[0], selector); + w4[2] = __byte_perm_S (w3[2], w3[3], selector); + w4[1] = __byte_perm_S (w3[1], w3[2], selector); + w4[0] = __byte_perm_S (w3[0], w3[1], selector); + w3[3] = __byte_perm_S (w2[3], w3[0], selector); + w3[2] = __byte_perm_S (w2[2], w2[3], selector); + w3[1] = __byte_perm_S (w2[1], w2[2], selector); + w3[0] = __byte_perm_S (w2[0], w2[1], selector); + w2[3] = __byte_perm_S (w1[3], w2[0], selector); + w2[2] = __byte_perm_S (w1[2], w1[3], selector); + w2[1] = __byte_perm_S (w1[1], w1[2], selector); + w2[0] = __byte_perm_S (w1[0], w1[1], selector); + w1[3] = __byte_perm_S (w0[3], w1[0], selector); + w1[2] = __byte_perm_S (w0[2], w0[3], selector); + w1[1] = __byte_perm_S (w0[1], w0[2], selector); + w1[0] = __byte_perm_S (w0[0], w0[1], selector); + w0[3] = __byte_perm_S ( 0, w0[0], selector); w0[2] = 0; w0[1] = 0; w0[0] = 0; - break; - case 4: - w3[3] = __byte_perm_S (w2[3], w2[2], selector); - w3[2] = __byte_perm_S (w2[2], w2[1], selector); - w3[1] = __byte_perm_S (w2[1], w2[0], selector); - w3[0] = __byte_perm_S (w2[0], w1[3], selector); - w2[3] = __byte_perm_S (w1[3], w1[2], selector); - w2[2] = __byte_perm_S (w1[2], w1[1], selector); - w2[1] = __byte_perm_S (w1[1], w1[0], selector); - w2[0] = __byte_perm_S (w1[0], w0[3], selector); - w1[3] = __byte_perm_S (w0[3], w0[2], selector); - w1[2] = __byte_perm_S (w0[2], w0[1], selector); - w1[1] = __byte_perm_S (w0[1], w0[0], selector); - w1[0] = __byte_perm_S (w0[0], 0, selector); + case 4: + w7[3] = __byte_perm_S (w6[2], w6[3], selector); + w7[2] = __byte_perm_S (w6[1], w6[2], selector); + w7[1] = __byte_perm_S (w6[0], w6[1], selector); + w7[0] = __byte_perm_S (w5[3], w6[0], selector); + w6[3] = __byte_perm_S (w5[2], w5[3], selector); + w6[2] = __byte_perm_S (w5[1], w5[2], selector); + w6[1] = __byte_perm_S (w5[0], w5[1], selector); + w6[0] = __byte_perm_S (w4[3], w5[0], selector); + w5[3] = __byte_perm_S (w4[2], w4[3], selector); + w5[2] = __byte_perm_S (w4[1], w4[2], selector); + w5[1] = __byte_perm_S (w4[0], w4[1], selector); + w5[0] = __byte_perm_S (w3[3], w4[0], selector); + w4[3] = __byte_perm_S (w3[2], w3[3], selector); + w4[2] = __byte_perm_S (w3[1], w3[2], selector); + w4[1] = __byte_perm_S (w3[0], w3[1], selector); + w4[0] = __byte_perm_S (w2[3], w3[0], selector); + w3[3] = __byte_perm_S (w2[2], w2[3], selector); + w3[2] = __byte_perm_S (w2[1], w2[2], selector); + w3[1] = __byte_perm_S (w2[0], w2[1], selector); + w3[0] = __byte_perm_S (w1[3], w2[0], selector); + w2[3] = __byte_perm_S (w1[2], w1[3], selector); + w2[2] = __byte_perm_S (w1[1], w1[2], selector); + w2[1] = __byte_perm_S (w1[0], w1[1], selector); + w2[0] = __byte_perm_S (w0[3], w1[0], selector); + w1[3] = __byte_perm_S (w0[2], w0[3], selector); + w1[2] = __byte_perm_S (w0[1], w0[2], selector); + w1[1] = __byte_perm_S (w0[0], w0[1], selector); + w1[0] = __byte_perm_S ( 0, w0[0], selector); w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; - break; - case 5: - w3[3] = __byte_perm_S (w2[2], w2[1], selector); - w3[2] = __byte_perm_S (w2[1], w2[0], selector); - w3[1] = __byte_perm_S (w2[0], w1[3], selector); - w3[0] = __byte_perm_S (w1[3], w1[2], selector); - w2[3] = __byte_perm_S (w1[2], w1[1], selector); - w2[2] = __byte_perm_S (w1[1], w1[0], selector); - w2[1] = __byte_perm_S (w1[0], w0[3], selector); - w2[0] = __byte_perm_S (w0[3], w0[2], selector); - w1[3] = __byte_perm_S (w0[2], w0[1], selector); - w1[2] = __byte_perm_S (w0[1], w0[0], selector); - w1[1] = __byte_perm_S (w0[0], 0, selector); + case 5: + w7[3] = __byte_perm_S (w6[1], w6[2], selector); + w7[2] = __byte_perm_S (w6[0], w6[1], selector); + w7[1] = __byte_perm_S (w5[3], w6[0], selector); + w7[0] = __byte_perm_S (w5[2], w5[3], selector); + w6[3] = __byte_perm_S (w5[1], w5[2], selector); + w6[2] = __byte_perm_S (w5[0], w5[1], selector); + w6[1] = __byte_perm_S (w4[3], w5[0], selector); + w6[0] = __byte_perm_S (w4[2], w4[3], selector); + w5[3] = __byte_perm_S (w4[1], w4[2], selector); + w5[2] = __byte_perm_S (w4[0], w4[1], selector); + w5[1] = __byte_perm_S (w3[3], w4[0], selector); + w5[0] = __byte_perm_S (w3[2], w3[3], selector); + w4[3] = __byte_perm_S (w3[1], w3[2], selector); + w4[2] = __byte_perm_S (w3[0], w3[1], selector); + w4[1] = __byte_perm_S (w2[3], w3[0], selector); + w4[0] = __byte_perm_S (w2[2], w2[3], selector); + w3[3] = __byte_perm_S (w2[1], w2[2], selector); + w3[2] = __byte_perm_S (w2[0], w2[1], selector); + w3[1] = __byte_perm_S (w1[3], w2[0], selector); + w3[0] = __byte_perm_S (w1[2], w1[3], selector); + w2[3] = __byte_perm_S (w1[1], w1[2], selector); + w2[2] = __byte_perm_S (w1[0], w1[1], selector); + w2[1] = __byte_perm_S (w0[3], w1[0], selector); + w2[0] = __byte_perm_S (w0[2], w0[3], selector); + w1[3] = __byte_perm_S (w0[1], w0[2], selector); + w1[2] = __byte_perm_S (w0[0], w0[1], selector); + w1[1] = __byte_perm_S ( 0, w0[0], selector); w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; - break; - case 6: - w3[3] = __byte_perm_S (w2[1], w2[0], selector); - w3[2] = __byte_perm_S (w2[0], w1[3], selector); - w3[1] = __byte_perm_S (w1[3], w1[2], selector); - w3[0] = __byte_perm_S (w1[2], w1[1], selector); - w2[3] = __byte_perm_S (w1[1], w1[0], selector); - w2[2] = __byte_perm_S (w1[0], w0[3], selector); - w2[1] = __byte_perm_S (w0[3], w0[2], selector); - w2[0] = __byte_perm_S (w0[2], w0[1], selector); - w1[3] = __byte_perm_S (w0[1], w0[0], selector); - w1[2] = __byte_perm_S (w0[0], 0, selector); + case 6: + w7[3] = __byte_perm_S (w6[0], w6[1], selector); + w7[2] = __byte_perm_S (w5[3], w6[0], selector); + w7[1] = __byte_perm_S (w5[2], w5[3], selector); + w7[0] = __byte_perm_S (w5[1], w5[2], selector); + w6[3] = __byte_perm_S (w5[0], w5[1], selector); + w6[2] = __byte_perm_S (w4[3], w5[0], selector); + w6[1] = __byte_perm_S (w4[2], w4[3], selector); + w6[0] = __byte_perm_S (w4[1], w4[2], selector); + w5[3] = __byte_perm_S (w4[0], w4[1], selector); + w5[2] = __byte_perm_S (w3[3], w4[0], selector); + w5[1] = __byte_perm_S (w3[2], w3[3], selector); + w5[0] = __byte_perm_S (w3[1], w3[2], selector); + w4[3] = __byte_perm_S (w3[0], w3[1], selector); + w4[2] = __byte_perm_S (w2[3], w3[0], selector); + w4[1] = __byte_perm_S (w2[2], w2[3], selector); + w4[0] = __byte_perm_S (w2[1], w2[2], selector); + w3[3] = __byte_perm_S (w2[0], w2[1], selector); + w3[2] = __byte_perm_S (w1[3], w2[0], selector); + w3[1] = __byte_perm_S (w1[2], w1[3], selector); + w3[0] = __byte_perm_S (w1[1], w1[2], selector); + w2[3] = __byte_perm_S (w1[0], w1[1], selector); + w2[2] = __byte_perm_S (w0[3], w1[0], selector); + w2[1] = __byte_perm_S (w0[2], w0[3], selector); + w2[0] = __byte_perm_S (w0[1], w0[2], selector); + w1[3] = __byte_perm_S (w0[0], w0[1], selector); + w1[2] = __byte_perm_S ( 0, w0[0], selector); w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; - break; - case 7: - w3[3] = __byte_perm_S (w2[0], w1[3], selector); - w3[2] = __byte_perm_S (w1[3], w1[2], selector); - w3[1] = __byte_perm_S (w1[2], w1[1], selector); - w3[0] = __byte_perm_S (w1[1], w1[0], selector); - w2[3] = __byte_perm_S (w1[0], w0[3], selector); - w2[2] = __byte_perm_S (w0[3], w0[2], selector); - w2[1] = __byte_perm_S (w0[2], w0[1], selector); - w2[0] = __byte_perm_S (w0[1], w0[0], selector); - w1[3] = __byte_perm_S (w0[0], 0, selector); + case 7: + w7[3] = __byte_perm_S (w5[3], w6[0], selector); + w7[2] = __byte_perm_S (w5[2], w5[3], selector); + w7[1] = __byte_perm_S (w5[1], w5[2], selector); + w7[0] = __byte_perm_S (w5[0], w5[1], selector); + w6[3] = __byte_perm_S (w4[3], w5[0], selector); + w6[2] = __byte_perm_S (w4[2], w4[3], selector); + w6[1] = __byte_perm_S (w4[1], w4[2], selector); + w6[0] = __byte_perm_S (w4[0], w4[1], selector); + w5[3] = __byte_perm_S (w3[3], w4[0], selector); + w5[2] = __byte_perm_S (w3[2], w3[3], selector); + w5[1] = __byte_perm_S (w3[1], w3[2], selector); + w5[0] = __byte_perm_S (w3[0], w3[1], selector); + w4[3] = __byte_perm_S (w2[3], w3[0], selector); + w4[2] = __byte_perm_S (w2[2], w2[3], selector); + w4[1] = __byte_perm_S (w2[1], w2[2], selector); + w4[0] = __byte_perm_S (w2[0], w2[1], selector); + w3[3] = __byte_perm_S (w1[3], w2[0], selector); + w3[2] = __byte_perm_S (w1[2], w1[3], selector); + w3[1] = __byte_perm_S (w1[1], w1[2], selector); + w3[0] = __byte_perm_S (w1[0], w1[1], selector); + w2[3] = __byte_perm_S (w0[3], w1[0], selector); + w2[2] = __byte_perm_S (w0[2], w0[3], selector); + w2[1] = __byte_perm_S (w0[1], w0[2], selector); + w2[0] = __byte_perm_S (w0[0], w0[1], selector); + w1[3] = __byte_perm_S ( 0, w0[0], selector); w1[2] = 0; w1[1] = 0; w1[0] = 0; @@ -8874,18 +11824,33 @@ inline void switch_buffer_by_offset_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w w0[2] = 0; w0[1] = 0; w0[0] = 0; - break; - case 8: - w3[3] = __byte_perm_S (w1[3], w1[2], selector); - w3[2] = __byte_perm_S (w1[2], w1[1], selector); - w3[1] = __byte_perm_S (w1[1], w1[0], selector); - w3[0] = __byte_perm_S (w1[0], w0[3], selector); - w2[3] = __byte_perm_S (w0[3], w0[2], selector); - w2[2] = __byte_perm_S (w0[2], w0[1], selector); - w2[1] = __byte_perm_S (w0[1], w0[0], selector); - w2[0] = __byte_perm_S (w0[0], 0, selector); + case 8: + w7[3] = __byte_perm_S (w5[2], w5[3], selector); + w7[2] = __byte_perm_S (w5[1], w5[2], selector); + w7[1] = __byte_perm_S (w5[0], w5[1], selector); + w7[0] = __byte_perm_S (w4[3], w5[0], selector); + w6[3] = __byte_perm_S (w4[2], w4[3], selector); + w6[2] = __byte_perm_S (w4[1], w4[2], selector); + w6[1] = __byte_perm_S (w4[0], w4[1], selector); + w6[0] = __byte_perm_S (w3[3], w4[0], selector); + w5[3] = __byte_perm_S (w3[2], w3[3], selector); + w5[2] = __byte_perm_S (w3[1], w3[2], selector); + w5[1] = __byte_perm_S (w3[0], w3[1], selector); + w5[0] = __byte_perm_S (w2[3], w3[0], selector); + w4[3] = __byte_perm_S (w2[2], w2[3], selector); + w4[2] = __byte_perm_S (w2[1], w2[2], selector); + w4[1] = __byte_perm_S (w2[0], w2[1], selector); + w4[0] = __byte_perm_S (w1[3], w2[0], selector); + w3[3] = __byte_perm_S (w1[2], w1[3], selector); + w3[2] = __byte_perm_S (w1[1], w1[2], selector); + w3[1] = __byte_perm_S (w1[0], w1[1], selector); + w3[0] = __byte_perm_S (w0[3], w1[0], selector); + w2[3] = __byte_perm_S (w0[2], w0[3], selector); + w2[2] = __byte_perm_S (w0[1], w0[2], selector); + w2[1] = __byte_perm_S (w0[0], w0[1], selector); + w2[0] = __byte_perm_S ( 0, w0[0], selector); w1[3] = 0; w1[2] = 0; w1[1] = 0; @@ -8894,17 +11859,32 @@ inline void switch_buffer_by_offset_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w w0[2] = 0; w0[1] = 0; w0[0] = 0; - break; - case 9: - w3[3] = __byte_perm_S (w1[2], w1[1], selector); - w3[2] = __byte_perm_S (w1[1], w1[0], selector); - w3[1] = __byte_perm_S (w1[0], w0[3], selector); - w3[0] = __byte_perm_S (w0[3], w0[2], selector); - w2[3] = __byte_perm_S (w0[2], w0[1], selector); - w2[2] = __byte_perm_S (w0[1], w0[0], selector); - w2[1] = __byte_perm_S (w0[0], 0, selector); + case 9: + w7[3] = __byte_perm_S (w5[1], w5[2], selector); + w7[2] = __byte_perm_S (w5[0], w5[1], selector); + w7[1] = __byte_perm_S (w4[3], w5[0], selector); + w7[0] = __byte_perm_S (w4[2], w4[3], selector); + w6[3] = __byte_perm_S (w4[1], w4[2], selector); + w6[2] = __byte_perm_S (w4[0], w4[1], selector); + w6[1] = __byte_perm_S (w3[3], w4[0], selector); + w6[0] = __byte_perm_S (w3[2], w3[3], selector); + w5[3] = __byte_perm_S (w3[1], w3[2], selector); + w5[2] = __byte_perm_S (w3[0], w3[1], selector); + w5[1] = __byte_perm_S (w2[3], w3[0], selector); + w5[0] = __byte_perm_S (w2[2], w2[3], selector); + w4[3] = __byte_perm_S (w2[1], w2[2], selector); + w4[2] = __byte_perm_S (w2[0], w2[1], selector); + w4[1] = __byte_perm_S (w1[3], w2[0], selector); + w4[0] = __byte_perm_S (w1[2], w1[3], selector); + w3[3] = __byte_perm_S (w1[1], w1[2], selector); + w3[2] = __byte_perm_S (w1[0], w1[1], selector); + w3[1] = __byte_perm_S (w0[3], w1[0], selector); + w3[0] = __byte_perm_S (w0[2], w0[3], selector); + w2[3] = __byte_perm_S (w0[1], w0[2], selector); + w2[2] = __byte_perm_S (w0[0], w0[1], selector); + w2[1] = __byte_perm_S ( 0, w0[0], selector); w2[0] = 0; w1[3] = 0; w1[2] = 0; @@ -8914,16 +11894,31 @@ inline void switch_buffer_by_offset_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w w0[2] = 0; w0[1] = 0; w0[0] = 0; - break; case 10: - w3[3] = __byte_perm_S (w1[1], w1[0], selector); - w3[2] = __byte_perm_S (w1[0], w0[3], selector); - w3[1] = __byte_perm_S (w0[3], w0[2], selector); - w3[0] = __byte_perm_S (w0[2], w0[1], selector); - w2[3] = __byte_perm_S (w0[1], w0[0], selector); - w2[2] = __byte_perm_S (w0[0], 0, selector); + w7[3] = __byte_perm_S (w5[0], w5[1], selector); + w7[2] = __byte_perm_S (w4[3], w5[0], selector); + w7[1] = __byte_perm_S (w4[2], w4[3], selector); + w7[0] = __byte_perm_S (w4[1], w4[2], selector); + w6[3] = __byte_perm_S (w4[0], w4[1], selector); + w6[2] = __byte_perm_S (w3[3], w4[0], selector); + w6[1] = __byte_perm_S (w3[2], w3[3], selector); + w6[0] = __byte_perm_S (w3[1], w3[2], selector); + w5[3] = __byte_perm_S (w3[0], w3[1], selector); + w5[2] = __byte_perm_S (w2[3], w3[0], selector); + w5[1] = __byte_perm_S (w2[2], w2[3], selector); + w5[0] = __byte_perm_S (w2[1], w2[2], selector); + w4[3] = __byte_perm_S (w2[0], w2[1], selector); + w4[2] = __byte_perm_S (w1[3], w2[0], selector); + w4[1] = __byte_perm_S (w1[2], w1[3], selector); + w4[0] = __byte_perm_S (w1[1], w1[2], selector); + w3[3] = __byte_perm_S (w1[0], w1[1], selector); + w3[2] = __byte_perm_S (w0[3], w1[0], selector); + w3[1] = __byte_perm_S (w0[2], w0[3], selector); + w3[0] = __byte_perm_S (w0[1], w0[2], selector); + w2[3] = __byte_perm_S (w0[0], w0[1], selector); + w2[2] = __byte_perm_S ( 0, w0[0], selector); w2[1] = 0; w2[0] = 0; w1[3] = 0; @@ -8934,15 +11929,30 @@ inline void switch_buffer_by_offset_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w w0[2] = 0; w0[1] = 0; w0[0] = 0; - break; case 11: - w3[3] = __byte_perm_S (w1[0], w0[3], selector); - w3[2] = __byte_perm_S (w0[3], w0[2], selector); - w3[1] = __byte_perm_S (w0[2], w0[1], selector); - w3[0] = __byte_perm_S (w0[1], w0[0], selector); - w2[3] = __byte_perm_S (w0[0], 0, selector); + w7[3] = __byte_perm_S (w4[3], w5[0], selector); + w7[2] = __byte_perm_S (w4[2], w4[3], selector); + w7[1] = __byte_perm_S (w4[1], w4[2], selector); + w7[0] = __byte_perm_S (w4[0], w4[1], selector); + w6[3] = __byte_perm_S (w3[3], w4[0], selector); + w6[2] = __byte_perm_S (w3[2], w3[3], selector); + w6[1] = __byte_perm_S (w3[1], w3[2], selector); + w6[0] = __byte_perm_S (w3[0], w3[1], selector); + w5[3] = __byte_perm_S (w2[3], w3[0], selector); + w5[2] = __byte_perm_S (w2[2], w2[3], selector); + w5[1] = __byte_perm_S (w2[1], w2[2], selector); + w5[0] = __byte_perm_S (w2[0], w2[1], selector); + w4[3] = __byte_perm_S (w1[3], w2[0], selector); + w4[2] = __byte_perm_S (w1[2], w1[3], selector); + w4[1] = __byte_perm_S (w1[1], w1[2], selector); + w4[0] = __byte_perm_S (w1[0], w1[1], selector); + w3[3] = __byte_perm_S (w0[3], w1[0], selector); + w3[2] = __byte_perm_S (w0[2], w0[3], selector); + w3[1] = __byte_perm_S (w0[1], w0[2], selector); + w3[0] = __byte_perm_S (w0[0], w0[1], selector); + w2[3] = __byte_perm_S ( 0, w0[0], selector); w2[2] = 0; w2[1] = 0; w2[0] = 0; @@ -8954,14 +11964,29 @@ inline void switch_buffer_by_offset_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w w0[2] = 0; w0[1] = 0; w0[0] = 0; - break; case 12: - w3[3] = __byte_perm_S (w0[3], w0[2], selector); - w3[2] = __byte_perm_S (w0[2], w0[1], selector); - w3[1] = __byte_perm_S (w0[1], w0[0], selector); - w3[0] = __byte_perm_S (w0[0], 0, selector); + w7[3] = __byte_perm_S (w4[2], w4[3], selector); + w7[2] = __byte_perm_S (w4[1], w4[2], selector); + w7[1] = __byte_perm_S (w4[0], w4[1], selector); + w7[0] = __byte_perm_S (w3[3], w4[0], selector); + w6[3] = __byte_perm_S (w3[2], w3[3], selector); + w6[2] = __byte_perm_S (w3[1], w3[2], selector); + w6[1] = __byte_perm_S (w3[0], w3[1], selector); + w6[0] = __byte_perm_S (w2[3], w3[0], selector); + w5[3] = __byte_perm_S (w2[2], w2[3], selector); + w5[2] = __byte_perm_S (w2[1], w2[2], selector); + w5[1] = __byte_perm_S (w2[0], w2[1], selector); + w5[0] = __byte_perm_S (w1[3], w2[0], selector); + w4[3] = __byte_perm_S (w1[2], w1[3], selector); + w4[2] = __byte_perm_S (w1[1], w1[2], selector); + w4[1] = __byte_perm_S (w1[0], w1[1], selector); + w4[0] = __byte_perm_S (w0[3], w1[0], selector); + w3[3] = __byte_perm_S (w0[2], w0[3], selector); + w3[2] = __byte_perm_S (w0[1], w0[2], selector); + w3[1] = __byte_perm_S (w0[0], w0[1], selector); + w3[0] = __byte_perm_S ( 0, w0[0], selector); w2[3] = 0; w2[2] = 0; w2[1] = 0; @@ -8974,13 +11999,28 @@ inline void switch_buffer_by_offset_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w w0[2] = 0; w0[1] = 0; w0[0] = 0; - break; case 13: - w3[3] = __byte_perm_S (w0[2], w0[1], selector); - w3[2] = __byte_perm_S (w0[1], w0[0], selector); - w3[1] = __byte_perm_S (w0[0], 0, selector); + w7[3] = __byte_perm_S (w4[1], w4[2], selector); + w7[2] = __byte_perm_S (w4[0], w4[1], selector); + w7[1] = __byte_perm_S (w3[3], w4[0], selector); + w7[0] = __byte_perm_S (w3[2], w3[3], selector); + w6[3] = __byte_perm_S (w3[1], w3[2], selector); + w6[2] = __byte_perm_S (w3[0], w3[1], selector); + w6[1] = __byte_perm_S (w2[3], w3[0], selector); + w6[0] = __byte_perm_S (w2[2], w2[3], selector); + w5[3] = __byte_perm_S (w2[1], w2[2], selector); + w5[2] = __byte_perm_S (w2[0], w2[1], selector); + w5[1] = __byte_perm_S (w1[3], w2[0], selector); + w5[0] = __byte_perm_S (w1[2], w1[3], selector); + w4[3] = __byte_perm_S (w1[1], w1[2], selector); + w4[2] = __byte_perm_S (w1[0], w1[1], selector); + w4[1] = __byte_perm_S (w0[3], w1[0], selector); + w4[0] = __byte_perm_S (w0[2], w0[3], selector); + w3[3] = __byte_perm_S (w0[1], w0[2], selector); + w3[2] = __byte_perm_S (w0[0], w0[1], selector); + w3[1] = __byte_perm_S ( 0, w0[0], selector); w3[0] = 0; w2[3] = 0; w2[2] = 0; @@ -8994,12 +12034,27 @@ inline void switch_buffer_by_offset_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w w0[2] = 0; w0[1] = 0; w0[0] = 0; - break; case 14: - w3[3] = __byte_perm_S (w0[1], w0[0], selector); - w3[2] = __byte_perm_S (w0[0], 0, selector); + w7[3] = __byte_perm_S (w4[0], w4[1], selector); + w7[2] = __byte_perm_S (w3[3], w4[0], selector); + w7[1] = __byte_perm_S (w3[2], w3[3], selector); + w7[0] = __byte_perm_S (w3[1], w3[2], selector); + w6[3] = __byte_perm_S (w3[0], w3[1], selector); + w6[2] = __byte_perm_S (w2[3], w3[0], selector); + w6[1] = __byte_perm_S (w2[2], w2[3], selector); + w6[0] = __byte_perm_S (w2[1], w2[2], selector); + w5[3] = __byte_perm_S (w2[0], w2[1], selector); + w5[2] = __byte_perm_S (w1[3], w2[0], selector); + w5[1] = __byte_perm_S (w1[2], w1[3], selector); + w5[0] = __byte_perm_S (w1[1], w1[2], selector); + w4[3] = __byte_perm_S (w1[0], w1[1], selector); + w4[2] = __byte_perm_S (w0[3], w1[0], selector); + w4[1] = __byte_perm_S (w0[2], w0[3], selector); + w4[0] = __byte_perm_S (w0[1], w0[2], selector); + w3[3] = __byte_perm_S (w0[0], w0[1], selector); + w3[2] = __byte_perm_S ( 0, w0[0], selector); w3[1] = 0; w3[0] = 0; w2[3] = 0; @@ -9014,11 +12069,26 @@ inline void switch_buffer_by_offset_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w w0[2] = 0; w0[1] = 0; w0[0] = 0; - break; case 15: - w3[3] = __byte_perm_S (w0[0], 0, selector); + w7[3] = __byte_perm_S (w3[3], w4[0], selector); + w7[2] = __byte_perm_S (w3[2], w3[3], selector); + w7[1] = __byte_perm_S (w3[1], w3[2], selector); + w7[0] = __byte_perm_S (w3[0], w3[1], selector); + w6[3] = __byte_perm_S (w2[3], w3[0], selector); + w6[2] = __byte_perm_S (w2[2], w2[3], selector); + w6[1] = __byte_perm_S (w2[1], w2[2], selector); + w6[0] = __byte_perm_S (w2[0], w2[1], selector); + w5[3] = __byte_perm_S (w1[3], w2[0], selector); + w5[2] = __byte_perm_S (w1[2], w1[3], selector); + w5[1] = __byte_perm_S (w1[1], w1[2], selector); + w5[0] = __byte_perm_S (w1[0], w1[1], selector); + w4[3] = __byte_perm_S (w0[3], w1[0], selector); + w4[2] = __byte_perm_S (w0[2], w0[3], selector); + w4[1] = __byte_perm_S (w0[1], w0[2], selector); + w4[0] = __byte_perm_S (w0[0], w0[1], selector); + w3[3] = __byte_perm_S ( 0, w0[0], selector); w3[2] = 0; w3[1] = 0; w3[0] = 0; @@ -9034,19 +12104,33 @@ inline void switch_buffer_by_offset_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w w0[2] = 0; w0[1] = 0; w0[0] = 0; - break; } #endif } -inline void switch_buffer_by_offset_carry_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 c0[4], u32 c1[4], u32 c2[4], u32 c3[4], const u32 offset) +inline void switch_buffer_by_offset_8x4_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 w4[4], u32 w5[4], u32 w6[4], u32 w7[4], const u32 offset) { #if defined IS_AMD || defined IS_GENERIC switch (offset / 4) { case 0: - c0[0] = amd_bytealign_S (w3[3], 0, offset); + w7[3] = amd_bytealign_S (w7[2], w7[3], offset); + w7[2] = amd_bytealign_S (w7[1], w7[2], offset); + w7[1] = amd_bytealign_S (w7[0], w7[1], offset); + w7[0] = amd_bytealign_S (w6[3], w7[0], offset); + w6[3] = amd_bytealign_S (w6[2], w6[3], offset); + w6[2] = amd_bytealign_S (w6[1], w6[2], offset); + w6[1] = amd_bytealign_S (w6[0], w6[1], offset); + w6[0] = amd_bytealign_S (w5[3], w6[0], offset); + w5[3] = amd_bytealign_S (w5[2], w5[3], offset); + w5[2] = amd_bytealign_S (w5[1], w5[2], offset); + w5[1] = amd_bytealign_S (w5[0], w5[1], offset); + w5[0] = amd_bytealign_S (w4[3], w5[0], offset); + w4[3] = amd_bytealign_S (w4[2], w4[3], offset); + w4[2] = amd_bytealign_S (w4[1], w4[2], offset); + w4[1] = amd_bytealign_S (w4[0], w4[1], offset); + w4[0] = amd_bytealign_S (w3[3], w4[0], offset); w3[3] = amd_bytealign_S (w3[2], w3[3], offset); w3[2] = amd_bytealign_S (w3[1], w3[2], offset); w3[1] = amd_bytealign_S (w3[0], w3[1], offset); @@ -9067,8 +12151,22 @@ inline void switch_buffer_by_offset_carry_be_S (u32 w0[4], u32 w1[4], u32 w2[4], break; case 1: - c0[1] = amd_bytealign_S (w3[3], 0, offset); - c0[0] = amd_bytealign_S (w3[2], w3[3], offset); + w7[3] = amd_bytealign_S (w7[1], w7[2], offset); + w7[2] = amd_bytealign_S (w7[0], w7[1], offset); + w7[1] = amd_bytealign_S (w6[3], w7[0], offset); + w7[0] = amd_bytealign_S (w6[2], w6[3], offset); + w6[3] = amd_bytealign_S (w6[1], w6[2], offset); + w6[2] = amd_bytealign_S (w6[0], w6[1], offset); + w6[1] = amd_bytealign_S (w5[3], w6[0], offset); + w6[0] = amd_bytealign_S (w5[2], w5[3], offset); + w5[3] = amd_bytealign_S (w5[1], w5[2], offset); + w5[2] = amd_bytealign_S (w5[0], w5[1], offset); + w5[1] = amd_bytealign_S (w4[3], w5[0], offset); + w5[0] = amd_bytealign_S (w4[2], w4[3], offset); + w4[3] = amd_bytealign_S (w4[1], w4[2], offset); + w4[2] = amd_bytealign_S (w4[0], w4[1], offset); + w4[1] = amd_bytealign_S (w3[3], w4[0], offset); + w4[0] = amd_bytealign_S (w3[2], w3[3], offset); w3[3] = amd_bytealign_S (w3[1], w3[2], offset); w3[2] = amd_bytealign_S (w3[0], w3[1], offset); w3[1] = amd_bytealign_S (w2[3], w3[0], offset); @@ -9089,9 +12187,22 @@ inline void switch_buffer_by_offset_carry_be_S (u32 w0[4], u32 w1[4], u32 w2[4], break; case 2: - c0[2] = amd_bytealign_S (w3[3], 0, offset); - c0[1] = amd_bytealign_S (w3[2], w3[3], offset); - c0[0] = amd_bytealign_S (w3[1], w3[2], offset); + w7[3] = amd_bytealign_S (w7[0], w7[1], offset); + w7[2] = amd_bytealign_S (w6[3], w7[0], offset); + w7[1] = amd_bytealign_S (w6[2], w6[3], offset); + w7[0] = amd_bytealign_S (w6[1], w6[2], offset); + w6[3] = amd_bytealign_S (w6[0], w6[1], offset); + w6[2] = amd_bytealign_S (w5[3], w6[0], offset); + w6[1] = amd_bytealign_S (w5[2], w5[3], offset); + w6[0] = amd_bytealign_S (w5[1], w5[2], offset); + w5[3] = amd_bytealign_S (w5[0], w5[1], offset); + w5[2] = amd_bytealign_S (w4[3], w5[0], offset); + w5[1] = amd_bytealign_S (w4[2], w4[3], offset); + w5[0] = amd_bytealign_S (w4[1], w4[2], offset); + w4[3] = amd_bytealign_S (w4[0], w4[1], offset); + w4[2] = amd_bytealign_S (w3[3], w4[0], offset); + w4[1] = amd_bytealign_S (w3[2], w3[3], offset); + w4[0] = amd_bytealign_S (w3[1], w3[2], offset); w3[3] = amd_bytealign_S (w3[0], w3[1], offset); w3[2] = amd_bytealign_S (w2[3], w3[0], offset); w3[1] = amd_bytealign_S (w2[2], w2[3], offset); @@ -9112,10 +12223,22 @@ inline void switch_buffer_by_offset_carry_be_S (u32 w0[4], u32 w1[4], u32 w2[4], break; case 3: - c0[3] = amd_bytealign_S (w3[3], 0, offset); - c0[2] = amd_bytealign_S (w3[2], w3[3], offset); - c0[1] = amd_bytealign_S (w3[1], w3[2], offset); - c0[0] = amd_bytealign_S (w3[0], w3[1], offset); + w7[3] = amd_bytealign_S (w6[3], w7[0], offset); + w7[2] = amd_bytealign_S (w6[2], w6[3], offset); + w7[1] = amd_bytealign_S (w6[1], w6[2], offset); + w7[0] = amd_bytealign_S (w6[0], w6[1], offset); + w6[3] = amd_bytealign_S (w5[3], w6[0], offset); + w6[2] = amd_bytealign_S (w5[2], w5[3], offset); + w6[1] = amd_bytealign_S (w5[1], w5[2], offset); + w6[0] = amd_bytealign_S (w5[0], w5[1], offset); + w5[3] = amd_bytealign_S (w4[3], w5[0], offset); + w5[2] = amd_bytealign_S (w4[2], w4[3], offset); + w5[1] = amd_bytealign_S (w4[1], w4[2], offset); + w5[0] = amd_bytealign_S (w4[0], w4[1], offset); + w4[3] = amd_bytealign_S (w3[3], w4[0], offset); + w4[2] = amd_bytealign_S (w3[2], w3[3], offset); + w4[1] = amd_bytealign_S (w3[1], w3[2], offset); + w4[0] = amd_bytealign_S (w3[0], w3[1], offset); w3[3] = amd_bytealign_S (w2[3], w3[0], offset); w3[2] = amd_bytealign_S (w2[2], w2[3], offset); w3[1] = amd_bytealign_S (w2[1], w2[2], offset); @@ -9136,11 +12259,22 @@ inline void switch_buffer_by_offset_carry_be_S (u32 w0[4], u32 w1[4], u32 w2[4], break; case 4: - c1[0] = amd_bytealign_S (w3[3], 0, offset); - c0[3] = amd_bytealign_S (w3[2], w3[3], offset); - c0[2] = amd_bytealign_S (w3[1], w3[2], offset); - c0[1] = amd_bytealign_S (w3[0], w3[1], offset); - c0[0] = amd_bytealign_S (w2[3], w3[0], offset); + w7[3] = amd_bytealign_S (w6[2], w6[3], offset); + w7[2] = amd_bytealign_S (w6[1], w6[2], offset); + w7[1] = amd_bytealign_S (w6[0], w6[1], offset); + w7[0] = amd_bytealign_S (w5[3], w6[0], offset); + w6[3] = amd_bytealign_S (w5[2], w5[3], offset); + w6[2] = amd_bytealign_S (w5[1], w5[2], offset); + w6[1] = amd_bytealign_S (w5[0], w5[1], offset); + w6[0] = amd_bytealign_S (w4[3], w5[0], offset); + w5[3] = amd_bytealign_S (w4[2], w4[3], offset); + w5[2] = amd_bytealign_S (w4[1], w4[2], offset); + w5[1] = amd_bytealign_S (w4[0], w4[1], offset); + w5[0] = amd_bytealign_S (w3[3], w4[0], offset); + w4[3] = amd_bytealign_S (w3[2], w3[3], offset); + w4[2] = amd_bytealign_S (w3[1], w3[2], offset); + w4[1] = amd_bytealign_S (w3[0], w3[1], offset); + w4[0] = amd_bytealign_S (w2[3], w3[0], offset); w3[3] = amd_bytealign_S (w2[2], w2[3], offset); w3[2] = amd_bytealign_S (w2[1], w2[2], offset); w3[1] = amd_bytealign_S (w2[0], w2[1], offset); @@ -9161,12 +12295,22 @@ inline void switch_buffer_by_offset_carry_be_S (u32 w0[4], u32 w1[4], u32 w2[4], break; case 5: - c1[1] = amd_bytealign_S (w3[3], 0, offset); - c1[0] = amd_bytealign_S (w3[2], w3[3], offset); - c0[3] = amd_bytealign_S (w3[1], w3[2], offset); - c0[2] = amd_bytealign_S (w3[0], w3[1], offset); - c0[1] = amd_bytealign_S (w2[3], w3[0], offset); - c0[0] = amd_bytealign_S (w2[2], w2[3], offset); + w7[3] = amd_bytealign_S (w6[1], w6[2], offset); + w7[2] = amd_bytealign_S (w6[0], w6[1], offset); + w7[1] = amd_bytealign_S (w5[3], w6[0], offset); + w7[0] = amd_bytealign_S (w5[2], w5[3], offset); + w6[3] = amd_bytealign_S (w5[1], w5[2], offset); + w6[2] = amd_bytealign_S (w5[0], w5[1], offset); + w6[1] = amd_bytealign_S (w4[3], w5[0], offset); + w6[0] = amd_bytealign_S (w4[2], w4[3], offset); + w5[3] = amd_bytealign_S (w4[1], w4[2], offset); + w5[2] = amd_bytealign_S (w4[0], w4[1], offset); + w5[1] = amd_bytealign_S (w3[3], w4[0], offset); + w5[0] = amd_bytealign_S (w3[2], w3[3], offset); + w4[3] = amd_bytealign_S (w3[1], w3[2], offset); + w4[2] = amd_bytealign_S (w3[0], w3[1], offset); + w4[1] = amd_bytealign_S (w2[3], w3[0], offset); + w4[0] = amd_bytealign_S (w2[2], w2[3], offset); w3[3] = amd_bytealign_S (w2[1], w2[2], offset); w3[2] = amd_bytealign_S (w2[0], w2[1], offset); w3[1] = amd_bytealign_S (w1[3], w2[0], offset); @@ -9187,13 +12331,22 @@ inline void switch_buffer_by_offset_carry_be_S (u32 w0[4], u32 w1[4], u32 w2[4], break; case 6: - c1[2] = amd_bytealign_S (w3[3], 0, offset); - c1[1] = amd_bytealign_S (w3[2], w3[3], offset); - c1[0] = amd_bytealign_S (w3[1], w3[2], offset); - c0[3] = amd_bytealign_S (w3[0], w3[1], offset); - c0[2] = amd_bytealign_S (w2[3], w3[0], offset); - c0[1] = amd_bytealign_S (w2[2], w2[3], offset); - c0[0] = amd_bytealign_S (w2[1], w2[2], offset); + w7[3] = amd_bytealign_S (w6[0], w6[1], offset); + w7[2] = amd_bytealign_S (w5[3], w6[0], offset); + w7[1] = amd_bytealign_S (w5[2], w5[3], offset); + w7[0] = amd_bytealign_S (w5[1], w5[2], offset); + w6[3] = amd_bytealign_S (w5[0], w5[1], offset); + w6[2] = amd_bytealign_S (w4[3], w5[0], offset); + w6[1] = amd_bytealign_S (w4[2], w4[3], offset); + w6[0] = amd_bytealign_S (w4[1], w4[2], offset); + w5[3] = amd_bytealign_S (w4[0], w4[1], offset); + w5[2] = amd_bytealign_S (w3[3], w4[0], offset); + w5[1] = amd_bytealign_S (w3[2], w3[3], offset); + w5[0] = amd_bytealign_S (w3[1], w3[2], offset); + w4[3] = amd_bytealign_S (w3[0], w3[1], offset); + w4[2] = amd_bytealign_S (w2[3], w3[0], offset); + w4[1] = amd_bytealign_S (w2[2], w2[3], offset); + w4[0] = amd_bytealign_S (w2[1], w2[2], offset); w3[3] = amd_bytealign_S (w2[0], w2[1], offset); w3[2] = amd_bytealign_S (w1[3], w2[0], offset); w3[1] = amd_bytealign_S (w1[2], w1[3], offset); @@ -9214,14 +12367,22 @@ inline void switch_buffer_by_offset_carry_be_S (u32 w0[4], u32 w1[4], u32 w2[4], break; case 7: - c1[3] = amd_bytealign_S (w3[3], 0, offset); - c1[2] = amd_bytealign_S (w3[2], w3[3], offset); - c1[1] = amd_bytealign_S (w3[1], w3[2], offset); - c1[0] = amd_bytealign_S (w3[0], w3[1], offset); - c0[3] = amd_bytealign_S (w2[3], w3[0], offset); - c0[2] = amd_bytealign_S (w2[2], w2[3], offset); - c0[1] = amd_bytealign_S (w2[1], w2[2], offset); - c0[0] = amd_bytealign_S (w2[0], w2[1], offset); + w7[3] = amd_bytealign_S (w5[3], w6[0], offset); + w7[2] = amd_bytealign_S (w5[2], w5[3], offset); + w7[1] = amd_bytealign_S (w5[1], w5[2], offset); + w7[0] = amd_bytealign_S (w5[0], w5[1], offset); + w6[3] = amd_bytealign_S (w4[3], w5[0], offset); + w6[2] = amd_bytealign_S (w4[2], w4[3], offset); + w6[1] = amd_bytealign_S (w4[1], w4[2], offset); + w6[0] = amd_bytealign_S (w4[0], w4[1], offset); + w5[3] = amd_bytealign_S (w3[3], w4[0], offset); + w5[2] = amd_bytealign_S (w3[2], w3[3], offset); + w5[1] = amd_bytealign_S (w3[1], w3[2], offset); + w5[0] = amd_bytealign_S (w3[0], w3[1], offset); + w4[3] = amd_bytealign_S (w2[3], w3[0], offset); + w4[2] = amd_bytealign_S (w2[2], w2[3], offset); + w4[1] = amd_bytealign_S (w2[1], w2[2], offset); + w4[0] = amd_bytealign_S (w2[0], w2[1], offset); w3[3] = amd_bytealign_S (w1[3], w2[0], offset); w3[2] = amd_bytealign_S (w1[2], w1[3], offset); w3[1] = amd_bytealign_S (w1[1], w1[2], offset); @@ -9242,15 +12403,22 @@ inline void switch_buffer_by_offset_carry_be_S (u32 w0[4], u32 w1[4], u32 w2[4], break; case 8: - c2[0] = amd_bytealign_S (w3[3], 0, offset); - c1[3] = amd_bytealign_S (w3[2], w3[3], offset); - c1[2] = amd_bytealign_S (w3[1], w3[2], offset); - c1[1] = amd_bytealign_S (w3[0], w3[1], offset); - c1[0] = amd_bytealign_S (w2[3], w3[0], offset); - c0[3] = amd_bytealign_S (w2[2], w2[3], offset); - c0[2] = amd_bytealign_S (w2[1], w2[2], offset); - c0[1] = amd_bytealign_S (w2[0], w2[1], offset); - c0[0] = amd_bytealign_S (w1[3], w2[0], offset); + w7[3] = amd_bytealign_S (w5[2], w5[3], offset); + w7[2] = amd_bytealign_S (w5[1], w5[2], offset); + w7[1] = amd_bytealign_S (w5[0], w5[1], offset); + w7[0] = amd_bytealign_S (w4[3], w5[0], offset); + w6[3] = amd_bytealign_S (w4[2], w4[3], offset); + w6[2] = amd_bytealign_S (w4[1], w4[2], offset); + w6[1] = amd_bytealign_S (w4[0], w4[1], offset); + w6[0] = amd_bytealign_S (w3[3], w4[0], offset); + w5[3] = amd_bytealign_S (w3[2], w3[3], offset); + w5[2] = amd_bytealign_S (w3[1], w3[2], offset); + w5[1] = amd_bytealign_S (w3[0], w3[1], offset); + w5[0] = amd_bytealign_S (w2[3], w3[0], offset); + w4[3] = amd_bytealign_S (w2[2], w2[3], offset); + w4[2] = amd_bytealign_S (w2[1], w2[2], offset); + w4[1] = amd_bytealign_S (w2[0], w2[1], offset); + w4[0] = amd_bytealign_S (w1[3], w2[0], offset); w3[3] = amd_bytealign_S (w1[2], w1[3], offset); w3[2] = amd_bytealign_S (w1[1], w1[2], offset); w3[1] = amd_bytealign_S (w1[0], w1[1], offset); @@ -9271,16 +12439,22 @@ inline void switch_buffer_by_offset_carry_be_S (u32 w0[4], u32 w1[4], u32 w2[4], break; case 9: - c2[1] = amd_bytealign_S (w3[3], 0, offset); - c2[0] = amd_bytealign_S (w3[2], w3[3], offset); - c1[3] = amd_bytealign_S (w3[1], w3[2], offset); - c1[2] = amd_bytealign_S (w3[0], w3[1], offset); - c1[1] = amd_bytealign_S (w2[3], w3[0], offset); - c1[0] = amd_bytealign_S (w2[2], w2[3], offset); - c0[3] = amd_bytealign_S (w2[1], w2[2], offset); - c0[2] = amd_bytealign_S (w2[0], w2[1], offset); - c0[1] = amd_bytealign_S (w1[3], w2[0], offset); - c0[0] = amd_bytealign_S (w1[2], w1[3], offset); + w7[3] = amd_bytealign_S (w5[1], w5[2], offset); + w7[2] = amd_bytealign_S (w5[0], w5[1], offset); + w7[1] = amd_bytealign_S (w4[3], w5[0], offset); + w7[0] = amd_bytealign_S (w4[2], w4[3], offset); + w6[3] = amd_bytealign_S (w4[1], w4[2], offset); + w6[2] = amd_bytealign_S (w4[0], w4[1], offset); + w6[1] = amd_bytealign_S (w3[3], w4[0], offset); + w6[0] = amd_bytealign_S (w3[2], w3[3], offset); + w5[3] = amd_bytealign_S (w3[1], w3[2], offset); + w5[2] = amd_bytealign_S (w3[0], w3[1], offset); + w5[1] = amd_bytealign_S (w2[3], w3[0], offset); + w5[0] = amd_bytealign_S (w2[2], w2[3], offset); + w4[3] = amd_bytealign_S (w2[1], w2[2], offset); + w4[2] = amd_bytealign_S (w2[0], w2[1], offset); + w4[1] = amd_bytealign_S (w1[3], w2[0], offset); + w4[0] = amd_bytealign_S (w1[2], w1[3], offset); w3[3] = amd_bytealign_S (w1[1], w1[2], offset); w3[2] = amd_bytealign_S (w1[0], w1[1], offset); w3[1] = amd_bytealign_S (w0[3], w1[0], offset); @@ -9301,17 +12475,22 @@ inline void switch_buffer_by_offset_carry_be_S (u32 w0[4], u32 w1[4], u32 w2[4], break; case 10: - c2[2] = amd_bytealign_S (w3[3], 0, offset); - c2[1] = amd_bytealign_S (w3[2], w3[3], offset); - c2[0] = amd_bytealign_S (w3[1], w3[2], offset); - c1[3] = amd_bytealign_S (w3[0], w3[1], offset); - c1[2] = amd_bytealign_S (w2[3], w3[0], offset); - c1[1] = amd_bytealign_S (w2[2], w2[3], offset); - c1[0] = amd_bytealign_S (w2[1], w2[2], offset); - c0[3] = amd_bytealign_S (w2[0], w2[1], offset); - c0[2] = amd_bytealign_S (w1[3], w2[0], offset); - c0[1] = amd_bytealign_S (w1[2], w1[3], offset); - c0[0] = amd_bytealign_S (w1[1], w1[2], offset); + w7[3] = amd_bytealign_S (w5[0], w5[1], offset); + w7[2] = amd_bytealign_S (w4[3], w5[0], offset); + w7[1] = amd_bytealign_S (w4[2], w4[3], offset); + w7[0] = amd_bytealign_S (w4[1], w4[2], offset); + w6[3] = amd_bytealign_S (w4[0], w4[1], offset); + w6[2] = amd_bytealign_S (w3[3], w4[0], offset); + w6[1] = amd_bytealign_S (w3[2], w3[3], offset); + w6[0] = amd_bytealign_S (w3[1], w3[2], offset); + w5[3] = amd_bytealign_S (w3[0], w3[1], offset); + w5[2] = amd_bytealign_S (w2[3], w3[0], offset); + w5[1] = amd_bytealign_S (w2[2], w2[3], offset); + w5[0] = amd_bytealign_S (w2[1], w2[2], offset); + w4[3] = amd_bytealign_S (w2[0], w2[1], offset); + w4[2] = amd_bytealign_S (w1[3], w2[0], offset); + w4[1] = amd_bytealign_S (w1[2], w1[3], offset); + w4[0] = amd_bytealign_S (w1[1], w1[2], offset); w3[3] = amd_bytealign_S (w1[0], w1[1], offset); w3[2] = amd_bytealign_S (w0[3], w1[0], offset); w3[1] = amd_bytealign_S (w0[2], w0[3], offset); @@ -9332,18 +12511,22 @@ inline void switch_buffer_by_offset_carry_be_S (u32 w0[4], u32 w1[4], u32 w2[4], break; case 11: - c2[3] = amd_bytealign_S (w3[3], 0, offset); - c2[2] = amd_bytealign_S (w3[2], w3[3], offset); - c2[1] = amd_bytealign_S (w3[1], w3[2], offset); - c2[0] = amd_bytealign_S (w3[0], w3[1], offset); - c1[3] = amd_bytealign_S (w2[3], w3[0], offset); - c1[2] = amd_bytealign_S (w2[2], w2[3], offset); - c1[1] = amd_bytealign_S (w2[1], w2[2], offset); - c1[0] = amd_bytealign_S (w2[0], w2[1], offset); - c0[3] = amd_bytealign_S (w1[3], w2[0], offset); - c0[2] = amd_bytealign_S (w1[2], w1[3], offset); - c0[1] = amd_bytealign_S (w1[1], w1[2], offset); - c0[0] = amd_bytealign_S (w1[0], w1[1], offset); + w7[3] = amd_bytealign_S (w4[3], w5[0], offset); + w7[2] = amd_bytealign_S (w4[2], w4[3], offset); + w7[1] = amd_bytealign_S (w4[1], w4[2], offset); + w7[0] = amd_bytealign_S (w4[0], w4[1], offset); + w6[3] = amd_bytealign_S (w3[3], w4[0], offset); + w6[2] = amd_bytealign_S (w3[2], w3[3], offset); + w6[1] = amd_bytealign_S (w3[1], w3[2], offset); + w6[0] = amd_bytealign_S (w3[0], w3[1], offset); + w5[3] = amd_bytealign_S (w2[3], w3[0], offset); + w5[2] = amd_bytealign_S (w2[2], w2[3], offset); + w5[1] = amd_bytealign_S (w2[1], w2[2], offset); + w5[0] = amd_bytealign_S (w2[0], w2[1], offset); + w4[3] = amd_bytealign_S (w1[3], w2[0], offset); + w4[2] = amd_bytealign_S (w1[2], w1[3], offset); + w4[1] = amd_bytealign_S (w1[1], w1[2], offset); + w4[0] = amd_bytealign_S (w1[0], w1[1], offset); w3[3] = amd_bytealign_S (w0[3], w1[0], offset); w3[2] = amd_bytealign_S (w0[2], w0[3], offset); w3[1] = amd_bytealign_S (w0[1], w0[2], offset); @@ -9364,19 +12547,22 @@ inline void switch_buffer_by_offset_carry_be_S (u32 w0[4], u32 w1[4], u32 w2[4], break; case 12: - c3[0] = amd_bytealign_S (w3[3], 0, offset); - c2[3] = amd_bytealign_S (w3[2], w3[3], offset); - c2[2] = amd_bytealign_S (w3[1], w3[2], offset); - c2[1] = amd_bytealign_S (w3[0], w3[1], offset); - c2[0] = amd_bytealign_S (w2[3], w3[0], offset); - c1[3] = amd_bytealign_S (w2[2], w2[3], offset); - c1[2] = amd_bytealign_S (w2[1], w2[2], offset); - c1[1] = amd_bytealign_S (w2[0], w2[1], offset); - c1[0] = amd_bytealign_S (w1[3], w2[0], offset); - c0[3] = amd_bytealign_S (w1[2], w1[3], offset); - c0[2] = amd_bytealign_S (w1[1], w1[2], offset); - c0[1] = amd_bytealign_S (w1[0], w1[1], offset); - c0[0] = amd_bytealign_S (w0[3], w1[0], offset); + w7[3] = amd_bytealign_S (w4[2], w4[3], offset); + w7[2] = amd_bytealign_S (w4[1], w4[2], offset); + w7[1] = amd_bytealign_S (w4[0], w4[1], offset); + w7[0] = amd_bytealign_S (w3[3], w4[0], offset); + w6[3] = amd_bytealign_S (w3[2], w3[3], offset); + w6[2] = amd_bytealign_S (w3[1], w3[2], offset); + w6[1] = amd_bytealign_S (w3[0], w3[1], offset); + w6[0] = amd_bytealign_S (w2[3], w3[0], offset); + w5[3] = amd_bytealign_S (w2[2], w2[3], offset); + w5[2] = amd_bytealign_S (w2[1], w2[2], offset); + w5[1] = amd_bytealign_S (w2[0], w2[1], offset); + w5[0] = amd_bytealign_S (w1[3], w2[0], offset); + w4[3] = amd_bytealign_S (w1[2], w1[3], offset); + w4[2] = amd_bytealign_S (w1[1], w1[2], offset); + w4[1] = amd_bytealign_S (w1[0], w1[1], offset); + w4[0] = amd_bytealign_S (w0[3], w1[0], offset); w3[3] = amd_bytealign_S (w0[2], w0[3], offset); w3[2] = amd_bytealign_S (w0[1], w0[2], offset); w3[1] = amd_bytealign_S (w0[0], w0[1], offset); @@ -9397,20 +12583,22 @@ inline void switch_buffer_by_offset_carry_be_S (u32 w0[4], u32 w1[4], u32 w2[4], break; case 13: - c3[1] = amd_bytealign_S (w3[3], 0, offset); - c3[0] = amd_bytealign_S (w3[2], w3[3], offset); - c2[3] = amd_bytealign_S (w3[1], w3[2], offset); - c2[2] = amd_bytealign_S (w3[0], w3[1], offset); - c2[1] = amd_bytealign_S (w2[3], w3[0], offset); - c2[0] = amd_bytealign_S (w2[2], w2[3], offset); - c1[3] = amd_bytealign_S (w2[1], w2[2], offset); - c1[2] = amd_bytealign_S (w2[0], w2[1], offset); - c1[1] = amd_bytealign_S (w1[3], w2[0], offset); - c1[0] = amd_bytealign_S (w1[2], w1[3], offset); - c0[3] = amd_bytealign_S (w1[1], w1[2], offset); - c0[2] = amd_bytealign_S (w1[0], w1[1], offset); - c0[1] = amd_bytealign_S (w0[3], w1[0], offset); - c0[0] = amd_bytealign_S (w0[2], w0[3], offset); + w7[3] = amd_bytealign_S (w4[1], w4[2], offset); + w7[2] = amd_bytealign_S (w4[0], w4[1], offset); + w7[1] = amd_bytealign_S (w3[3], w4[0], offset); + w7[0] = amd_bytealign_S (w3[2], w3[3], offset); + w6[3] = amd_bytealign_S (w3[1], w3[2], offset); + w6[2] = amd_bytealign_S (w3[0], w3[1], offset); + w6[1] = amd_bytealign_S (w2[3], w3[0], offset); + w6[0] = amd_bytealign_S (w2[2], w2[3], offset); + w5[3] = amd_bytealign_S (w2[1], w2[2], offset); + w5[2] = amd_bytealign_S (w2[0], w2[1], offset); + w5[1] = amd_bytealign_S (w1[3], w2[0], offset); + w5[0] = amd_bytealign_S (w1[2], w1[3], offset); + w4[3] = amd_bytealign_S (w1[1], w1[2], offset); + w4[2] = amd_bytealign_S (w1[0], w1[1], offset); + w4[1] = amd_bytealign_S (w0[3], w1[0], offset); + w4[0] = amd_bytealign_S (w0[2], w0[3], offset); w3[3] = amd_bytealign_S (w0[1], w0[2], offset); w3[2] = amd_bytealign_S (w0[0], w0[1], offset); w3[1] = amd_bytealign_S ( 0, w0[0], offset); @@ -9430,24 +12618,169 @@ inline void switch_buffer_by_offset_carry_be_S (u32 w0[4], u32 w1[4], u32 w2[4], break; - case 14: - c3[2] = amd_bytealign_S (w3[3], 0, offset); - c3[1] = amd_bytealign_S (w3[2], w3[3], offset); - c3[0] = amd_bytealign_S (w3[1], w3[2], offset); - c2[3] = amd_bytealign_S (w3[0], w3[1], offset); - c2[2] = amd_bytealign_S (w2[3], w3[0], offset); - c2[1] = amd_bytealign_S (w2[2], w2[3], offset); - c2[0] = amd_bytealign_S (w2[1], w2[2], offset); - c1[3] = amd_bytealign_S (w2[0], w2[1], offset); - c1[2] = amd_bytealign_S (w1[3], w2[0], offset); - c1[1] = amd_bytealign_S (w1[2], w1[3], offset); - c1[0] = amd_bytealign_S (w1[1], w1[2], offset); - c0[3] = amd_bytealign_S (w1[0], w1[1], offset); - c0[2] = amd_bytealign_S (w0[3], w1[0], offset); - c0[1] = amd_bytealign_S (w0[2], w0[3], offset); - c0[0] = amd_bytealign_S (w0[1], w0[2], offset); - w3[3] = amd_bytealign_S (w0[0], w0[1], offset); - w3[2] = amd_bytealign_S ( 0, w0[0], offset); + case 14: + w7[3] = amd_bytealign_S (w4[0], w4[1], offset); + w7[2] = amd_bytealign_S (w3[3], w4[0], offset); + w7[1] = amd_bytealign_S (w3[2], w3[3], offset); + w7[0] = amd_bytealign_S (w3[1], w3[2], offset); + w6[3] = amd_bytealign_S (w3[0], w3[1], offset); + w6[2] = amd_bytealign_S (w2[3], w3[0], offset); + w6[1] = amd_bytealign_S (w2[2], w2[3], offset); + w6[0] = amd_bytealign_S (w2[1], w2[2], offset); + w5[3] = amd_bytealign_S (w2[0], w2[1], offset); + w5[2] = amd_bytealign_S (w1[3], w2[0], offset); + w5[1] = amd_bytealign_S (w1[2], w1[3], offset); + w5[0] = amd_bytealign_S (w1[1], w1[2], offset); + w4[3] = amd_bytealign_S (w1[0], w1[1], offset); + w4[2] = amd_bytealign_S (w0[3], w1[0], offset); + w4[1] = amd_bytealign_S (w0[2], w0[3], offset); + w4[0] = amd_bytealign_S (w0[1], w0[2], offset); + w3[3] = amd_bytealign_S (w0[0], w0[1], offset); + w3[2] = amd_bytealign_S ( 0, w0[0], offset); + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 15: + w7[3] = amd_bytealign_S (w3[3], w4[0], offset); + w7[2] = amd_bytealign_S (w3[2], w3[3], offset); + w7[1] = amd_bytealign_S (w3[1], w3[2], offset); + w7[0] = amd_bytealign_S (w3[0], w3[1], offset); + w6[3] = amd_bytealign_S (w2[3], w3[0], offset); + w6[2] = amd_bytealign_S (w2[2], w2[3], offset); + w6[1] = amd_bytealign_S (w2[1], w2[2], offset); + w6[0] = amd_bytealign_S (w2[0], w2[1], offset); + w5[3] = amd_bytealign_S (w1[3], w2[0], offset); + w5[2] = amd_bytealign_S (w1[2], w1[3], offset); + w5[1] = amd_bytealign_S (w1[1], w1[2], offset); + w5[0] = amd_bytealign_S (w1[0], w1[1], offset); + w4[3] = amd_bytealign_S (w0[3], w1[0], offset); + w4[2] = amd_bytealign_S (w0[2], w0[3], offset); + w4[1] = amd_bytealign_S (w0[1], w0[2], offset); + w4[0] = amd_bytealign_S (w0[0], w0[1], offset); + w3[3] = amd_bytealign_S ( 0, w0[0], offset); + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 16: + w7[3] = amd_bytealign_S (w3[2], w3[3], offset); + w7[2] = amd_bytealign_S (w3[1], w3[2], offset); + w7[1] = amd_bytealign_S (w3[0], w3[1], offset); + w7[0] = amd_bytealign_S (w2[3], w3[0], offset); + w6[3] = amd_bytealign_S (w2[2], w2[3], offset); + w6[2] = amd_bytealign_S (w2[1], w2[2], offset); + w6[1] = amd_bytealign_S (w2[0], w2[1], offset); + w6[0] = amd_bytealign_S (w1[3], w2[0], offset); + w5[3] = amd_bytealign_S (w1[2], w1[3], offset); + w5[2] = amd_bytealign_S (w1[1], w1[2], offset); + w5[1] = amd_bytealign_S (w1[0], w1[1], offset); + w5[0] = amd_bytealign_S (w0[3], w1[0], offset); + w4[3] = amd_bytealign_S (w0[2], w0[3], offset); + w4[2] = amd_bytealign_S (w0[1], w0[2], offset); + w4[1] = amd_bytealign_S (w0[0], w0[1], offset); + w4[0] = amd_bytealign_S ( 0, w0[0], offset); + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 17: + w7[3] = amd_bytealign_S (w3[1], w3[2], offset); + w7[2] = amd_bytealign_S (w3[0], w3[1], offset); + w7[1] = amd_bytealign_S (w2[3], w3[0], offset); + w7[0] = amd_bytealign_S (w2[2], w2[3], offset); + w6[3] = amd_bytealign_S (w2[1], w2[2], offset); + w6[2] = amd_bytealign_S (w2[0], w2[1], offset); + w6[1] = amd_bytealign_S (w1[3], w2[0], offset); + w6[0] = amd_bytealign_S (w1[2], w1[3], offset); + w5[3] = amd_bytealign_S (w1[1], w1[2], offset); + w5[2] = amd_bytealign_S (w1[0], w1[1], offset); + w5[1] = amd_bytealign_S (w0[3], w1[0], offset); + w5[0] = amd_bytealign_S (w0[2], w0[3], offset); + w4[3] = amd_bytealign_S (w0[1], w0[2], offset); + w4[2] = amd_bytealign_S (w0[0], w0[1], offset); + w4[1] = amd_bytealign_S ( 0, w0[0], offset); + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 18: + w7[3] = amd_bytealign_S (w3[0], w3[1], offset); + w7[2] = amd_bytealign_S (w2[3], w3[0], offset); + w7[1] = amd_bytealign_S (w2[2], w2[3], offset); + w7[0] = amd_bytealign_S (w2[1], w2[2], offset); + w6[3] = amd_bytealign_S (w2[0], w2[1], offset); + w6[2] = amd_bytealign_S (w1[3], w2[0], offset); + w6[1] = amd_bytealign_S (w1[2], w1[3], offset); + w6[0] = amd_bytealign_S (w1[1], w1[2], offset); + w5[3] = amd_bytealign_S (w1[0], w1[1], offset); + w5[2] = amd_bytealign_S (w0[3], w1[0], offset); + w5[1] = amd_bytealign_S (w0[2], w0[3], offset); + w5[0] = amd_bytealign_S (w0[1], w0[2], offset); + w4[3] = amd_bytealign_S (w0[0], w0[1], offset); + w4[2] = amd_bytealign_S ( 0, w0[0], offset); + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; @@ -9465,24 +12798,24 @@ inline void switch_buffer_by_offset_carry_be_S (u32 w0[4], u32 w1[4], u32 w2[4], break; - case 15: - c3[3] = amd_bytealign_S (w3[3], 0, offset); - c3[2] = amd_bytealign_S (w3[2], w3[3], offset); - c3[1] = amd_bytealign_S (w3[1], w3[2], offset); - c3[0] = amd_bytealign_S (w3[0], w3[1], offset); - c2[3] = amd_bytealign_S (w2[3], w3[0], offset); - c2[2] = amd_bytealign_S (w2[2], w2[3], offset); - c2[1] = amd_bytealign_S (w2[1], w2[2], offset); - c2[0] = amd_bytealign_S (w2[0], w2[1], offset); - c1[3] = amd_bytealign_S (w1[3], w2[0], offset); - c1[2] = amd_bytealign_S (w1[2], w1[3], offset); - c1[1] = amd_bytealign_S (w1[1], w1[2], offset); - c1[0] = amd_bytealign_S (w1[0], w1[1], offset); - c0[3] = amd_bytealign_S (w0[3], w1[0], offset); - c0[2] = amd_bytealign_S (w0[2], w0[3], offset); - c0[1] = amd_bytealign_S (w0[1], w0[2], offset); - c0[0] = amd_bytealign_S (w0[0], w0[1], offset); - w3[3] = amd_bytealign_S ( 0, w0[0], offset); + case 19: + w7[3] = amd_bytealign_S (w2[3], w3[0], offset); + w7[2] = amd_bytealign_S (w2[2], w2[3], offset); + w7[1] = amd_bytealign_S (w2[1], w2[2], offset); + w7[0] = amd_bytealign_S (w2[0], w2[1], offset); + w6[3] = amd_bytealign_S (w1[3], w2[0], offset); + w6[2] = amd_bytealign_S (w1[2], w1[3], offset); + w6[1] = amd_bytealign_S (w1[1], w1[2], offset); + w6[0] = amd_bytealign_S (w1[0], w1[1], offset); + w5[3] = amd_bytealign_S (w0[3], w1[0], offset); + w5[2] = amd_bytealign_S (w0[2], w0[3], offset); + w5[1] = amd_bytealign_S (w0[1], w0[2], offset); + w5[0] = amd_bytealign_S (w0[0], w0[1], offset); + w4[3] = amd_bytealign_S ( 0, w0[0], offset); + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; w3[2] = 0; w3[1] = 0; w3[0] = 0; @@ -9499,123 +12832,37 @@ inline void switch_buffer_by_offset_carry_be_S (u32 w0[4], u32 w1[4], u32 w2[4], w0[1] = 0; w0[0] = 0; - break; - } - #endif - - #ifdef IS_NV - const int selector = (0x76543210 >> ((offset & 3) * 4)) & 0xffff; - - switch (offset / 4) - { - case 0: - c0[0] = __byte_perm_S ( 0, w3[3], selector); - w3[3] = __byte_perm_S (w3[3], w3[2], selector); - w3[2] = __byte_perm_S (w3[2], w3[1], selector); - w3[1] = __byte_perm_S (w3[1], w3[0], selector); - w3[0] = __byte_perm_S (w3[0], w2[3], selector); - w2[3] = __byte_perm_S (w2[3], w2[2], selector); - w2[2] = __byte_perm_S (w2[2], w2[1], selector); - w2[1] = __byte_perm_S (w2[1], w2[0], selector); - w2[0] = __byte_perm_S (w2[0], w1[3], selector); - w1[3] = __byte_perm_S (w1[3], w1[2], selector); - w1[2] = __byte_perm_S (w1[2], w1[1], selector); - w1[1] = __byte_perm_S (w1[1], w1[0], selector); - w1[0] = __byte_perm_S (w1[0], w0[3], selector); - w0[3] = __byte_perm_S (w0[3], w0[2], selector); - w0[2] = __byte_perm_S (w0[2], w0[1], selector); - w0[1] = __byte_perm_S (w0[1], w0[0], selector); - w0[0] = __byte_perm_S (w0[0], 0, selector); - - break; - - case 1: - c0[1] = __byte_perm_S ( 0, w3[3], selector); - c0[0] = __byte_perm_S (w3[3], w3[2], selector); - w3[3] = __byte_perm_S (w3[2], w3[1], selector); - w3[2] = __byte_perm_S (w3[1], w3[0], selector); - w3[1] = __byte_perm_S (w3[0], w2[3], selector); - w3[0] = __byte_perm_S (w2[3], w2[2], selector); - w2[3] = __byte_perm_S (w2[2], w2[1], selector); - w2[2] = __byte_perm_S (w2[1], w2[0], selector); - w2[1] = __byte_perm_S (w2[0], w1[3], selector); - w2[0] = __byte_perm_S (w1[3], w1[2], selector); - w1[3] = __byte_perm_S (w1[2], w1[1], selector); - w1[2] = __byte_perm_S (w1[1], w1[0], selector); - w1[1] = __byte_perm_S (w1[0], w0[3], selector); - w1[0] = __byte_perm_S (w0[3], w0[2], selector); - w0[3] = __byte_perm_S (w0[2], w0[1], selector); - w0[2] = __byte_perm_S (w0[1], w0[0], selector); - w0[1] = __byte_perm_S (w0[0], 0, selector); - w0[0] = 0; - - break; - - case 2: - c0[2] = __byte_perm_S ( 0, w3[3], selector); - c0[1] = __byte_perm_S (w3[3], w3[2], selector); - c0[0] = __byte_perm_S (w3[2], w3[1], selector); - w3[3] = __byte_perm_S (w3[1], w3[0], selector); - w3[2] = __byte_perm_S (w3[0], w2[3], selector); - w3[1] = __byte_perm_S (w2[3], w2[2], selector); - w3[0] = __byte_perm_S (w2[2], w2[1], selector); - w2[3] = __byte_perm_S (w2[1], w2[0], selector); - w2[2] = __byte_perm_S (w2[0], w1[3], selector); - w2[1] = __byte_perm_S (w1[3], w1[2], selector); - w2[0] = __byte_perm_S (w1[2], w1[1], selector); - w1[3] = __byte_perm_S (w1[1], w1[0], selector); - w1[2] = __byte_perm_S (w1[0], w0[3], selector); - w1[1] = __byte_perm_S (w0[3], w0[2], selector); - w1[0] = __byte_perm_S (w0[2], w0[1], selector); - w0[3] = __byte_perm_S (w0[1], w0[0], selector); - w0[2] = __byte_perm_S (w0[0], 0, selector); - w0[1] = 0; - w0[0] = 0; - - break; - - case 3: - c0[3] = __byte_perm_S ( 0, w3[3], selector); - c0[2] = __byte_perm_S (w3[3], w3[2], selector); - c0[1] = __byte_perm_S (w3[2], w3[1], selector); - c0[0] = __byte_perm_S (w3[1], w3[0], selector); - w3[3] = __byte_perm_S (w3[0], w2[3], selector); - w3[2] = __byte_perm_S (w2[3], w2[2], selector); - w3[1] = __byte_perm_S (w2[2], w2[1], selector); - w3[0] = __byte_perm_S (w2[1], w2[0], selector); - w2[3] = __byte_perm_S (w2[0], w1[3], selector); - w2[2] = __byte_perm_S (w1[3], w1[2], selector); - w2[1] = __byte_perm_S (w1[2], w1[1], selector); - w2[0] = __byte_perm_S (w1[1], w1[0], selector); - w1[3] = __byte_perm_S (w1[0], w0[3], selector); - w1[2] = __byte_perm_S (w0[3], w0[2], selector); - w1[1] = __byte_perm_S (w0[2], w0[1], selector); - w1[0] = __byte_perm_S (w0[1], w0[0], selector); - w0[3] = __byte_perm_S (w0[0], 0, selector); - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; - break; - case 4: - c1[0] = __byte_perm_S ( 0, w3[3], selector); - c0[3] = __byte_perm_S (w3[3], w3[2], selector); - c0[2] = __byte_perm_S (w3[2], w3[1], selector); - c0[1] = __byte_perm_S (w3[1], w3[0], selector); - c0[0] = __byte_perm_S (w3[0], w2[3], selector); - w3[3] = __byte_perm_S (w2[3], w2[2], selector); - w3[2] = __byte_perm_S (w2[2], w2[1], selector); - w3[1] = __byte_perm_S (w2[1], w2[0], selector); - w3[0] = __byte_perm_S (w2[0], w1[3], selector); - w2[3] = __byte_perm_S (w1[3], w1[2], selector); - w2[2] = __byte_perm_S (w1[2], w1[1], selector); - w2[1] = __byte_perm_S (w1[1], w1[0], selector); - w2[0] = __byte_perm_S (w1[0], w0[3], selector); - w1[3] = __byte_perm_S (w0[3], w0[2], selector); - w1[2] = __byte_perm_S (w0[2], w0[1], selector); - w1[1] = __byte_perm_S (w0[1], w0[0], selector); - w1[0] = __byte_perm_S (w0[0], 0, selector); + case 20: + w7[3] = amd_bytealign_S (w2[2], w2[3], offset); + w7[2] = amd_bytealign_S (w2[1], w2[2], offset); + w7[1] = amd_bytealign_S (w2[0], w2[1], offset); + w7[0] = amd_bytealign_S (w1[3], w2[0], offset); + w6[3] = amd_bytealign_S (w1[2], w1[3], offset); + w6[2] = amd_bytealign_S (w1[1], w1[2], offset); + w6[1] = amd_bytealign_S (w1[0], w1[1], offset); + w6[0] = amd_bytealign_S (w0[3], w1[0], offset); + w5[3] = amd_bytealign_S (w0[2], w0[3], offset); + w5[2] = amd_bytealign_S (w0[1], w0[2], offset); + w5[1] = amd_bytealign_S (w0[0], w0[1], offset); + w5[0] = amd_bytealign_S ( 0, w0[0], offset); + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; @@ -9623,24 +12870,34 @@ inline void switch_buffer_by_offset_carry_be_S (u32 w0[4], u32 w1[4], u32 w2[4], break; - case 5: - c1[1] = __byte_perm_S ( 0, w3[3], selector); - c1[0] = __byte_perm_S (w3[3], w3[2], selector); - c0[3] = __byte_perm_S (w3[2], w3[1], selector); - c0[2] = __byte_perm_S (w3[1], w3[0], selector); - c0[1] = __byte_perm_S (w3[0], w2[3], selector); - c0[0] = __byte_perm_S (w2[3], w2[2], selector); - w3[3] = __byte_perm_S (w2[2], w2[1], selector); - w3[2] = __byte_perm_S (w2[1], w2[0], selector); - w3[1] = __byte_perm_S (w2[0], w1[3], selector); - w3[0] = __byte_perm_S (w1[3], w1[2], selector); - w2[3] = __byte_perm_S (w1[2], w1[1], selector); - w2[2] = __byte_perm_S (w1[1], w1[0], selector); - w2[1] = __byte_perm_S (w1[0], w0[3], selector); - w2[0] = __byte_perm_S (w0[3], w0[2], selector); - w1[3] = __byte_perm_S (w0[2], w0[1], selector); - w1[2] = __byte_perm_S (w0[1], w0[0], selector); - w1[1] = __byte_perm_S (w0[0], 0, selector); + case 21: + w7[3] = amd_bytealign_S (w2[1], w2[2], offset); + w7[2] = amd_bytealign_S (w2[0], w2[1], offset); + w7[1] = amd_bytealign_S (w1[3], w2[0], offset); + w7[0] = amd_bytealign_S (w1[2], w1[3], offset); + w6[3] = amd_bytealign_S (w1[1], w1[2], offset); + w6[2] = amd_bytealign_S (w1[0], w1[1], offset); + w6[1] = amd_bytealign_S (w0[3], w1[0], offset); + w6[0] = amd_bytealign_S (w0[2], w0[3], offset); + w5[3] = amd_bytealign_S (w0[1], w0[2], offset); + w5[2] = amd_bytealign_S (w0[0], w0[1], offset); + w5[1] = amd_bytealign_S ( 0, w0[0], offset); + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; @@ -9649,24 +12906,33 @@ inline void switch_buffer_by_offset_carry_be_S (u32 w0[4], u32 w1[4], u32 w2[4], break; - case 6: - c1[2] = __byte_perm_S ( 0, w3[3], selector); - c1[1] = __byte_perm_S (w3[3], w3[2], selector); - c1[0] = __byte_perm_S (w3[2], w3[1], selector); - c0[3] = __byte_perm_S (w3[1], w3[0], selector); - c0[2] = __byte_perm_S (w3[0], w2[3], selector); - c0[1] = __byte_perm_S (w2[3], w2[2], selector); - c0[0] = __byte_perm_S (w2[2], w2[1], selector); - w3[3] = __byte_perm_S (w2[1], w2[0], selector); - w3[2] = __byte_perm_S (w2[0], w1[3], selector); - w3[1] = __byte_perm_S (w1[3], w1[2], selector); - w3[0] = __byte_perm_S (w1[2], w1[1], selector); - w2[3] = __byte_perm_S (w1[1], w1[0], selector); - w2[2] = __byte_perm_S (w1[0], w0[3], selector); - w2[1] = __byte_perm_S (w0[3], w0[2], selector); - w2[0] = __byte_perm_S (w0[2], w0[1], selector); - w1[3] = __byte_perm_S (w0[1], w0[0], selector); - w1[2] = __byte_perm_S (w0[0], 0, selector); + case 22: + w7[3] = amd_bytealign_S (w2[0], w2[1], offset); + w7[2] = amd_bytealign_S (w1[3], w2[0], offset); + w7[1] = amd_bytealign_S (w1[2], w1[3], offset); + w7[0] = amd_bytealign_S (w1[1], w1[2], offset); + w6[3] = amd_bytealign_S (w1[0], w1[1], offset); + w6[2] = amd_bytealign_S (w0[3], w1[0], offset); + w6[1] = amd_bytealign_S (w0[2], w0[3], offset); + w6[0] = amd_bytealign_S (w0[1], w0[2], offset); + w5[3] = amd_bytealign_S (w0[0], w0[1], offset); + w5[2] = amd_bytealign_S ( 0, w0[0], offset); + w5[1] = 0; + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; @@ -9676,24 +12942,32 @@ inline void switch_buffer_by_offset_carry_be_S (u32 w0[4], u32 w1[4], u32 w2[4], break; - case 7: - c1[3] = __byte_perm_S ( 0, w3[3], selector); - c1[2] = __byte_perm_S (w3[3], w3[2], selector); - c1[1] = __byte_perm_S (w3[2], w3[1], selector); - c1[0] = __byte_perm_S (w3[1], w3[0], selector); - c0[3] = __byte_perm_S (w3[0], w2[3], selector); - c0[2] = __byte_perm_S (w2[3], w2[2], selector); - c0[1] = __byte_perm_S (w2[2], w2[1], selector); - c0[0] = __byte_perm_S (w2[1], w2[0], selector); - w3[3] = __byte_perm_S (w2[0], w1[3], selector); - w3[2] = __byte_perm_S (w1[3], w1[2], selector); - w3[1] = __byte_perm_S (w1[2], w1[1], selector); - w3[0] = __byte_perm_S (w1[1], w1[0], selector); - w2[3] = __byte_perm_S (w1[0], w0[3], selector); - w2[2] = __byte_perm_S (w0[3], w0[2], selector); - w2[1] = __byte_perm_S (w0[2], w0[1], selector); - w2[0] = __byte_perm_S (w0[1], w0[0], selector); - w1[3] = __byte_perm_S (w0[0], 0, selector); + case 23: + w7[3] = amd_bytealign_S (w1[3], w2[0], offset); + w7[2] = amd_bytealign_S (w1[2], w1[3], offset); + w7[1] = amd_bytealign_S (w1[1], w1[2], offset); + w7[0] = amd_bytealign_S (w1[0], w1[1], offset); + w6[3] = amd_bytealign_S (w0[3], w1[0], offset); + w6[2] = amd_bytealign_S (w0[2], w0[3], offset); + w6[1] = amd_bytealign_S (w0[1], w0[2], offset); + w6[0] = amd_bytealign_S (w0[0], w0[1], offset); + w5[3] = amd_bytealign_S ( 0, w0[0], offset); + w5[2] = 0; + w5[1] = 0; + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; @@ -9704,24 +12978,31 @@ inline void switch_buffer_by_offset_carry_be_S (u32 w0[4], u32 w1[4], u32 w2[4], break; - case 8: - c2[0] = __byte_perm_S ( 0, w3[3], selector); - c1[3] = __byte_perm_S (w3[3], w3[2], selector); - c1[2] = __byte_perm_S (w3[2], w3[1], selector); - c1[1] = __byte_perm_S (w3[1], w3[0], selector); - c1[0] = __byte_perm_S (w3[0], w2[3], selector); - c0[3] = __byte_perm_S (w2[3], w2[2], selector); - c0[2] = __byte_perm_S (w2[2], w2[1], selector); - c0[1] = __byte_perm_S (w2[1], w2[0], selector); - c0[0] = __byte_perm_S (w2[0], w1[3], selector); - w3[3] = __byte_perm_S (w1[3], w1[2], selector); - w3[2] = __byte_perm_S (w1[2], w1[1], selector); - w3[1] = __byte_perm_S (w1[1], w1[0], selector); - w3[0] = __byte_perm_S (w1[0], w0[3], selector); - w2[3] = __byte_perm_S (w0[3], w0[2], selector); - w2[2] = __byte_perm_S (w0[2], w0[1], selector); - w2[1] = __byte_perm_S (w0[1], w0[0], selector); - w2[0] = __byte_perm_S (w0[0], 0, selector); + case 24: + w7[3] = amd_bytealign_S (w1[2], w1[3], offset); + w7[2] = amd_bytealign_S (w1[1], w1[2], offset); + w7[1] = amd_bytealign_S (w1[0], w1[1], offset); + w7[0] = amd_bytealign_S (w0[3], w1[0], offset); + w6[3] = amd_bytealign_S (w0[2], w0[3], offset); + w6[2] = amd_bytealign_S (w0[1], w0[2], offset); + w6[1] = amd_bytealign_S (w0[0], w0[1], offset); + w6[0] = amd_bytealign_S ( 0, w0[0], offset); + w5[3] = 0; + w5[2] = 0; + w5[1] = 0; + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; @@ -9733,24 +13014,30 @@ inline void switch_buffer_by_offset_carry_be_S (u32 w0[4], u32 w1[4], u32 w2[4], break; - case 9: - c2[1] = __byte_perm_S ( 0, w3[3], selector); - c2[0] = __byte_perm_S (w3[3], w3[2], selector); - c1[3] = __byte_perm_S (w3[2], w3[1], selector); - c1[2] = __byte_perm_S (w3[1], w3[0], selector); - c1[1] = __byte_perm_S (w3[0], w2[3], selector); - c1[0] = __byte_perm_S (w2[3], w2[2], selector); - c0[3] = __byte_perm_S (w2[2], w2[1], selector); - c0[2] = __byte_perm_S (w2[1], w2[0], selector); - c0[1] = __byte_perm_S (w2[0], w1[3], selector); - c0[0] = __byte_perm_S (w1[3], w1[2], selector); - w3[3] = __byte_perm_S (w1[2], w1[1], selector); - w3[2] = __byte_perm_S (w1[1], w1[0], selector); - w3[1] = __byte_perm_S (w1[0], w0[3], selector); - w3[0] = __byte_perm_S (w0[3], w0[2], selector); - w2[3] = __byte_perm_S (w0[2], w0[1], selector); - w2[2] = __byte_perm_S (w0[1], w0[0], selector); - w2[1] = __byte_perm_S (w0[0], 0, selector); + case 25: + w7[3] = amd_bytealign_S (w1[1], w1[2], offset); + w7[2] = amd_bytealign_S (w1[0], w1[1], offset); + w7[1] = amd_bytealign_S (w0[3], w1[0], offset); + w7[0] = amd_bytealign_S (w0[2], w0[3], offset); + w6[3] = amd_bytealign_S (w0[1], w0[2], offset); + w6[2] = amd_bytealign_S (w0[0], w0[1], offset); + w6[1] = amd_bytealign_S ( 0, w0[0], offset); + w6[0] = 0; + w5[3] = 0; + w5[2] = 0; + w5[1] = 0; + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; @@ -9763,24 +13050,29 @@ inline void switch_buffer_by_offset_carry_be_S (u32 w0[4], u32 w1[4], u32 w2[4], break; - case 10: - c2[2] = __byte_perm_S ( 0, w3[3], selector); - c2[1] = __byte_perm_S (w3[3], w3[2], selector); - c2[0] = __byte_perm_S (w3[2], w3[1], selector); - c1[3] = __byte_perm_S (w3[1], w3[0], selector); - c1[2] = __byte_perm_S (w3[0], w2[3], selector); - c1[1] = __byte_perm_S (w2[3], w2[2], selector); - c1[0] = __byte_perm_S (w2[2], w2[1], selector); - c0[3] = __byte_perm_S (w2[1], w2[0], selector); - c0[2] = __byte_perm_S (w2[0], w1[3], selector); - c0[1] = __byte_perm_S (w1[3], w1[2], selector); - c0[0] = __byte_perm_S (w1[2], w1[1], selector); - w3[3] = __byte_perm_S (w1[1], w1[0], selector); - w3[2] = __byte_perm_S (w1[0], w0[3], selector); - w3[1] = __byte_perm_S (w0[3], w0[2], selector); - w3[0] = __byte_perm_S (w0[2], w0[1], selector); - w2[3] = __byte_perm_S (w0[1], w0[0], selector); - w2[2] = __byte_perm_S (w0[0], 0, selector); + case 26: + w7[3] = amd_bytealign_S (w1[0], w1[1], offset); + w7[2] = amd_bytealign_S (w0[3], w1[0], offset); + w7[1] = amd_bytealign_S (w0[2], w0[3], offset); + w7[0] = amd_bytealign_S (w0[1], w0[2], offset); + w6[3] = amd_bytealign_S (w0[0], w0[1], offset); + w6[2] = amd_bytealign_S ( 0, w0[0], offset); + w6[1] = 0; + w6[0] = 0; + w5[3] = 0; + w5[2] = 0; + w5[1] = 0; + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; @@ -9789,29 +13081,33 @@ inline void switch_buffer_by_offset_carry_be_S (u32 w0[4], u32 w1[4], u32 w2[4], w1[0] = 0; w0[3] = 0; w0[2] = 0; - w0[1] = 0; - w0[0] = 0; - - break; - - case 11: - c2[3] = __byte_perm_S ( 0, w3[3], selector); - c2[2] = __byte_perm_S (w3[3], w3[2], selector); - c2[1] = __byte_perm_S (w3[2], w3[1], selector); - c2[0] = __byte_perm_S (w3[1], w3[0], selector); - c1[3] = __byte_perm_S (w3[0], w2[3], selector); - c1[2] = __byte_perm_S (w2[3], w2[2], selector); - c1[1] = __byte_perm_S (w2[2], w2[1], selector); - c1[0] = __byte_perm_S (w2[1], w2[0], selector); - c0[3] = __byte_perm_S (w2[0], w1[3], selector); - c0[2] = __byte_perm_S (w1[3], w1[2], selector); - c0[1] = __byte_perm_S (w1[2], w1[1], selector); - c0[0] = __byte_perm_S (w1[1], w1[0], selector); - w3[3] = __byte_perm_S (w1[0], w0[3], selector); - w3[2] = __byte_perm_S (w0[3], w0[2], selector); - w3[1] = __byte_perm_S (w0[2], w0[1], selector); - w3[0] = __byte_perm_S (w0[1], w0[0], selector); - w2[3] = __byte_perm_S (w0[0], 0, selector); + w0[1] = 0; + w0[0] = 0; + + break; + + case 27: + w7[3] = amd_bytealign_S (w0[3], w1[0], offset); + w7[2] = amd_bytealign_S (w0[2], w0[3], offset); + w7[1] = amd_bytealign_S (w0[1], w0[2], offset); + w7[0] = amd_bytealign_S (w0[0], w0[1], offset); + w6[3] = amd_bytealign_S ( 0, w0[0], offset); + w6[2] = 0; + w6[1] = 0; + w6[0] = 0; + w5[3] = 0; + w5[2] = 0; + w5[1] = 0; + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; @@ -9826,24 +13122,27 @@ inline void switch_buffer_by_offset_carry_be_S (u32 w0[4], u32 w1[4], u32 w2[4], break; - case 12: - c3[0] = __byte_perm_S ( 0, w3[3], selector); - c2[3] = __byte_perm_S (w3[3], w3[2], selector); - c2[2] = __byte_perm_S (w3[2], w3[1], selector); - c2[1] = __byte_perm_S (w3[1], w3[0], selector); - c2[0] = __byte_perm_S (w3[0], w2[3], selector); - c1[3] = __byte_perm_S (w2[3], w2[2], selector); - c1[2] = __byte_perm_S (w2[2], w2[1], selector); - c1[1] = __byte_perm_S (w2[1], w2[0], selector); - c1[0] = __byte_perm_S (w2[0], w1[3], selector); - c0[3] = __byte_perm_S (w1[3], w1[2], selector); - c0[2] = __byte_perm_S (w1[2], w1[1], selector); - c0[1] = __byte_perm_S (w1[1], w1[0], selector); - c0[0] = __byte_perm_S (w1[0], w0[3], selector); - w3[3] = __byte_perm_S (w0[3], w0[2], selector); - w3[2] = __byte_perm_S (w0[2], w0[1], selector); - w3[1] = __byte_perm_S (w0[1], w0[0], selector); - w3[0] = __byte_perm_S (w0[0], 0, selector); + case 28: + w7[3] = amd_bytealign_S (w0[2], w0[3], offset); + w7[2] = amd_bytealign_S (w0[1], w0[2], offset); + w7[1] = amd_bytealign_S (w0[0], w0[1], offset); + w7[0] = amd_bytealign_S ( 0, w0[0], offset); + w6[3] = 0; + w6[2] = 0; + w6[1] = 0; + w6[0] = 0; + w5[3] = 0; + w5[2] = 0; + w5[1] = 0; + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; @@ -9859,24 +13158,26 @@ inline void switch_buffer_by_offset_carry_be_S (u32 w0[4], u32 w1[4], u32 w2[4], break; - case 13: - c3[1] = __byte_perm_S ( 0, w3[3], selector); - c3[0] = __byte_perm_S (w3[3], w3[2], selector); - c2[3] = __byte_perm_S (w3[2], w3[1], selector); - c2[2] = __byte_perm_S (w3[1], w3[0], selector); - c2[1] = __byte_perm_S (w3[0], w2[3], selector); - c2[0] = __byte_perm_S (w2[3], w2[2], selector); - c1[3] = __byte_perm_S (w2[2], w2[1], selector); - c1[2] = __byte_perm_S (w2[1], w2[0], selector); - c1[1] = __byte_perm_S (w2[0], w1[3], selector); - c1[0] = __byte_perm_S (w1[3], w1[2], selector); - c0[3] = __byte_perm_S (w1[2], w1[1], selector); - c0[2] = __byte_perm_S (w1[1], w1[0], selector); - c0[1] = __byte_perm_S (w1[0], w0[3], selector); - c0[0] = __byte_perm_S (w0[3], w0[2], selector); - w3[3] = __byte_perm_S (w0[2], w0[1], selector); - w3[2] = __byte_perm_S (w0[1], w0[0], selector); - w3[1] = __byte_perm_S (w0[0], 0, selector); + case 29: + w7[3] = amd_bytealign_S (w0[1], w0[2], offset); + w7[2] = amd_bytealign_S (w0[0], w0[1], offset); + w7[1] = amd_bytealign_S ( 0, w0[0], offset); + w7[0] = 0; + w6[3] = 0; + w6[2] = 0; + w6[1] = 0; + w6[0] = 0; + w5[3] = 0; + w5[2] = 0; + w5[1] = 0; + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; @@ -9893,24 +13194,25 @@ inline void switch_buffer_by_offset_carry_be_S (u32 w0[4], u32 w1[4], u32 w2[4], break; - case 14: - c3[2] = __byte_perm_S ( 0, w3[3], selector); - c3[1] = __byte_perm_S (w3[3], w3[2], selector); - c3[0] = __byte_perm_S (w3[2], w3[1], selector); - c2[3] = __byte_perm_S (w3[1], w3[0], selector); - c2[2] = __byte_perm_S (w3[0], w2[3], selector); - c2[1] = __byte_perm_S (w2[3], w2[2], selector); - c2[0] = __byte_perm_S (w2[2], w2[1], selector); - c1[3] = __byte_perm_S (w2[1], w2[0], selector); - c1[2] = __byte_perm_S (w2[0], w1[3], selector); - c1[1] = __byte_perm_S (w1[3], w1[2], selector); - c1[0] = __byte_perm_S (w1[2], w1[1], selector); - c0[3] = __byte_perm_S (w1[1], w1[0], selector); - c0[2] = __byte_perm_S (w1[0], w0[3], selector); - c0[1] = __byte_perm_S (w0[3], w0[2], selector); - c0[0] = __byte_perm_S (w0[2], w0[1], selector); - w3[3] = __byte_perm_S (w0[1], w0[0], selector); - w3[2] = __byte_perm_S (w0[0], 0, selector); + case 30: + w7[3] = amd_bytealign_S (w0[0], w0[1], offset); + w7[2] = amd_bytealign_S ( 0, w0[0], offset); + w7[1] = 0; + w7[0] = 0; + w6[3] = 0; + w6[2] = 0; + w6[1] = 0; + w6[0] = 0; + w5[3] = 0; + w5[2] = 0; + w5[1] = 0; + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; @@ -9928,24 +13230,24 @@ inline void switch_buffer_by_offset_carry_be_S (u32 w0[4], u32 w1[4], u32 w2[4], break; - case 15: - c3[3] = __byte_perm_S ( 0, w3[3], selector); - c3[2] = __byte_perm_S (w3[3], w3[2], selector); - c3[1] = __byte_perm_S (w3[2], w3[1], selector); - c3[0] = __byte_perm_S (w3[1], w3[0], selector); - c2[3] = __byte_perm_S (w3[0], w2[3], selector); - c2[2] = __byte_perm_S (w2[3], w2[2], selector); - c2[1] = __byte_perm_S (w2[2], w2[1], selector); - c2[0] = __byte_perm_S (w2[1], w2[0], selector); - c1[3] = __byte_perm_S (w2[0], w1[3], selector); - c1[2] = __byte_perm_S (w1[3], w1[2], selector); - c1[1] = __byte_perm_S (w1[2], w1[1], selector); - c1[0] = __byte_perm_S (w1[1], w1[0], selector); - c0[3] = __byte_perm_S (w1[0], w0[3], selector); - c0[2] = __byte_perm_S (w0[3], w0[2], selector); - c0[1] = __byte_perm_S (w0[2], w0[1], selector); - c0[0] = __byte_perm_S (w0[1], w0[0], selector); - w3[3] = __byte_perm_S (w0[0], 0, selector); + case 31: + w7[3] = amd_bytealign_S ( 0, w0[0], offset); + w7[2] = 0; + w7[1] = 0; + w7[0] = 0; + w6[3] = 0; + w6[2] = 0; + w6[1] = 0; + w6[0] = 0; + w5[3] = 0; + w5[2] = 0; + w5[1] = 0; + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; w3[2] = 0; w3[1] = 0; w3[0] = 0; @@ -9965,461 +13267,327 @@ inline void switch_buffer_by_offset_carry_be_S (u32 w0[4], u32 w1[4], u32 w2[4], break; } #endif -} - -inline void switch_buffer_by_offset_8x4_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 w4[4], u32 w5[4], u32 w6[4], u32 w7[4], const u32 offset) -{ - #if defined IS_AMD || defined IS_GENERIC - const int offset_mod_4 = offset & 3; - - const int offset_minus_4 = 4 - offset; - - switch (offset / 4) - { - case 0: - w7[3] = amd_bytealign_S (w7[3], w7[2], offset_minus_4); - w7[2] = amd_bytealign_S (w7[2], w7[1], offset_minus_4); - w7[1] = amd_bytealign_S (w7[1], w7[0], offset_minus_4); - w7[0] = amd_bytealign_S (w7[0], w6[3], offset_minus_4); - w6[3] = amd_bytealign_S (w6[3], w6[2], offset_minus_4); - w6[2] = amd_bytealign_S (w6[2], w6[1], offset_minus_4); - w6[1] = amd_bytealign_S (w6[1], w6[0], offset_minus_4); - w6[0] = amd_bytealign_S (w6[0], w5[3], offset_minus_4); - w5[3] = amd_bytealign_S (w5[3], w5[2], offset_minus_4); - w5[2] = amd_bytealign_S (w5[2], w5[1], offset_minus_4); - w5[1] = amd_bytealign_S (w5[1], w5[0], offset_minus_4); - w5[0] = amd_bytealign_S (w5[0], w5[3], offset_minus_4); - w4[3] = amd_bytealign_S (w4[3], w4[2], offset_minus_4); - w4[2] = amd_bytealign_S (w4[2], w4[1], offset_minus_4); - w4[1] = amd_bytealign_S (w4[1], w4[0], offset_minus_4); - w4[0] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); - w3[3] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); - w3[2] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); - w3[1] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); - w3[0] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); - w2[3] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); - w2[2] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); - w2[1] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - w2[0] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - w1[3] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w1[2] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w1[1] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w1[0] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w0[3] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w0[2] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w0[1] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w0[0] = amd_bytealign_S (w0[0], 0, offset_minus_4); - if (offset_mod_4 == 0) - { - w0[0] = w0[1]; - w0[1] = w0[2]; - w0[2] = w0[3]; - w0[3] = w1[0]; - w1[0] = w1[1]; - w1[1] = w1[2]; - w1[2] = w1[3]; - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = w4[0]; - w4[0] = w4[1]; - w4[1] = w4[2]; - w4[2] = w4[3]; - w4[3] = w5[0]; - w5[0] = w5[1]; - w5[1] = w5[2]; - w5[2] = w5[3]; - w5[3] = w6[0]; - w6[0] = w6[1]; - w6[1] = w6[2]; - w6[2] = w6[3]; - w6[3] = w7[0]; - w7[0] = w7[1]; - w7[1] = w7[2]; - w7[2] = w7[3]; - w7[3] = 0; - } + #ifdef IS_NV + const int selector = (0x76543210 >> ((offset & 3) * 4)) & 0xffff; + + switch (offset / 4) + { + case 0: + w7[3] = __byte_perm_S (w7[3], w7[2], selector); + w7[2] = __byte_perm_S (w7[2], w7[1], selector); + w7[1] = __byte_perm_S (w7[1], w7[0], selector); + w7[0] = __byte_perm_S (w7[0], w6[3], selector); + w6[3] = __byte_perm_S (w6[3], w6[2], selector); + w6[2] = __byte_perm_S (w6[2], w6[1], selector); + w6[1] = __byte_perm_S (w6[1], w6[0], selector); + w6[0] = __byte_perm_S (w6[0], w5[3], selector); + w5[3] = __byte_perm_S (w5[3], w5[2], selector); + w5[2] = __byte_perm_S (w5[2], w5[1], selector); + w5[1] = __byte_perm_S (w5[1], w5[0], selector); + w5[0] = __byte_perm_S (w5[0], w4[3], selector); + w4[3] = __byte_perm_S (w4[3], w4[2], selector); + w4[2] = __byte_perm_S (w4[2], w4[1], selector); + w4[1] = __byte_perm_S (w4[1], w4[0], selector); + w4[0] = __byte_perm_S (w4[0], w3[3], selector); + w3[3] = __byte_perm_S (w3[3], w3[2], selector); + w3[2] = __byte_perm_S (w3[2], w3[1], selector); + w3[1] = __byte_perm_S (w3[1], w3[0], selector); + w3[0] = __byte_perm_S (w3[0], w2[3], selector); + w2[3] = __byte_perm_S (w2[3], w2[2], selector); + w2[2] = __byte_perm_S (w2[2], w2[1], selector); + w2[1] = __byte_perm_S (w2[1], w2[0], selector); + w2[0] = __byte_perm_S (w2[0], w1[3], selector); + w1[3] = __byte_perm_S (w1[3], w1[2], selector); + w1[2] = __byte_perm_S (w1[2], w1[1], selector); + w1[1] = __byte_perm_S (w1[1], w1[0], selector); + w1[0] = __byte_perm_S (w1[0], w0[3], selector); + w0[3] = __byte_perm_S (w0[3], w0[2], selector); + w0[2] = __byte_perm_S (w0[2], w0[1], selector); + w0[1] = __byte_perm_S (w0[1], w0[0], selector); + w0[0] = __byte_perm_S (w0[0], 0, selector); break; - case 1: - w7[3] = amd_bytealign_S (w7[2], w7[1], offset_minus_4); - w7[2] = amd_bytealign_S (w7[1], w7[0], offset_minus_4); - w7[1] = amd_bytealign_S (w7[0], w6[3], offset_minus_4); - w7[0] = amd_bytealign_S (w6[3], w6[2], offset_minus_4); - w6[3] = amd_bytealign_S (w6[2], w6[1], offset_minus_4); - w6[2] = amd_bytealign_S (w6[1], w6[0], offset_minus_4); - w6[1] = amd_bytealign_S (w6[0], w5[3], offset_minus_4); - w6[0] = amd_bytealign_S (w5[3], w5[2], offset_minus_4); - w5[3] = amd_bytealign_S (w5[2], w5[1], offset_minus_4); - w5[2] = amd_bytealign_S (w5[1], w5[0], offset_minus_4); - w5[1] = amd_bytealign_S (w5[0], w5[3], offset_minus_4); - w5[0] = amd_bytealign_S (w4[3], w4[2], offset_minus_4); - w4[3] = amd_bytealign_S (w4[2], w4[1], offset_minus_4); - w4[2] = amd_bytealign_S (w4[1], w4[0], offset_minus_4); - w4[1] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); - w4[0] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); - w3[3] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); - w3[2] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); - w3[1] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); - w3[0] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); - w2[3] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); - w2[2] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - w2[1] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - w2[0] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w1[3] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w1[2] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w1[1] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w1[0] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w0[3] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w0[2] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w0[1] = amd_bytealign_S (w0[0], 0, offset_minus_4); + case 1: + w7[3] = __byte_perm_S (w7[2], w7[1], selector); + w7[2] = __byte_perm_S (w7[1], w7[0], selector); + w7[1] = __byte_perm_S (w7[0], w6[3], selector); + w7[0] = __byte_perm_S (w6[3], w6[2], selector); + w6[3] = __byte_perm_S (w6[2], w6[1], selector); + w6[2] = __byte_perm_S (w6[1], w6[0], selector); + w6[1] = __byte_perm_S (w6[0], w5[3], selector); + w6[0] = __byte_perm_S (w5[3], w5[2], selector); + w5[3] = __byte_perm_S (w5[2], w5[1], selector); + w5[2] = __byte_perm_S (w5[1], w5[0], selector); + w5[1] = __byte_perm_S (w5[0], w4[3], selector); + w5[0] = __byte_perm_S (w4[3], w4[2], selector); + w4[3] = __byte_perm_S (w4[2], w4[1], selector); + w4[2] = __byte_perm_S (w4[1], w4[0], selector); + w4[1] = __byte_perm_S (w4[0], w3[3], selector); + w4[0] = __byte_perm_S (w3[3], w3[2], selector); + w3[3] = __byte_perm_S (w3[2], w3[1], selector); + w3[2] = __byte_perm_S (w3[1], w3[0], selector); + w3[1] = __byte_perm_S (w3[0], w2[3], selector); + w3[0] = __byte_perm_S (w2[3], w2[2], selector); + w2[3] = __byte_perm_S (w2[2], w2[1], selector); + w2[2] = __byte_perm_S (w2[1], w2[0], selector); + w2[1] = __byte_perm_S (w2[0], w1[3], selector); + w2[0] = __byte_perm_S (w1[3], w1[2], selector); + w1[3] = __byte_perm_S (w1[2], w1[1], selector); + w1[2] = __byte_perm_S (w1[1], w1[0], selector); + w1[1] = __byte_perm_S (w1[0], w0[3], selector); + w1[0] = __byte_perm_S (w0[3], w0[2], selector); + w0[3] = __byte_perm_S (w0[2], w0[1], selector); + w0[2] = __byte_perm_S (w0[1], w0[0], selector); + w0[1] = __byte_perm_S (w0[0], 0, selector); w0[0] = 0; - if (offset_mod_4 == 0) - { - w0[1] = w0[2]; - w0[2] = w0[3]; - w0[3] = w1[0]; - w1[0] = w1[1]; - w1[1] = w1[2]; - w1[2] = w1[3]; - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = w4[0]; - w4[0] = w4[1]; - w4[1] = w4[2]; - w4[2] = w4[3]; - w4[3] = w5[0]; - w5[0] = w5[1]; - w5[1] = w5[2]; - w5[2] = w5[3]; - w5[3] = w6[0]; - w6[0] = w6[1]; - w6[1] = w6[2]; - w6[2] = w6[3]; - w6[3] = w7[0]; - w7[0] = w7[1]; - w7[1] = w7[2]; - w7[2] = w7[3]; - w7[3] = 0; - } - break; - case 2: - w7[3] = amd_bytealign_S (w7[1], w7[0], offset_minus_4); - w7[2] = amd_bytealign_S (w7[0], w6[3], offset_minus_4); - w7[1] = amd_bytealign_S (w6[3], w6[2], offset_minus_4); - w7[0] = amd_bytealign_S (w6[2], w6[1], offset_minus_4); - w6[3] = amd_bytealign_S (w6[1], w6[0], offset_minus_4); - w6[2] = amd_bytealign_S (w6[0], w5[3], offset_minus_4); - w6[1] = amd_bytealign_S (w5[3], w5[2], offset_minus_4); - w6[0] = amd_bytealign_S (w5[2], w5[1], offset_minus_4); - w5[3] = amd_bytealign_S (w5[1], w5[0], offset_minus_4); - w5[2] = amd_bytealign_S (w5[0], w5[3], offset_minus_4); - w5[1] = amd_bytealign_S (w4[3], w4[2], offset_minus_4); - w5[0] = amd_bytealign_S (w4[2], w4[1], offset_minus_4); - w4[3] = amd_bytealign_S (w4[1], w4[0], offset_minus_4); - w4[2] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); - w4[1] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); - w4[0] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); - w3[3] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); - w3[2] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); - w3[1] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); - w3[0] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); - w2[3] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - w2[2] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - w2[1] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w2[0] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w1[3] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w1[2] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w1[1] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w1[0] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w0[3] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w0[2] = amd_bytealign_S (w0[0], 0, offset_minus_4); + case 2: + w7[3] = __byte_perm_S (w7[1], w7[0], selector); + w7[2] = __byte_perm_S (w7[0], w6[3], selector); + w7[1] = __byte_perm_S (w6[3], w6[2], selector); + w7[0] = __byte_perm_S (w6[2], w6[1], selector); + w6[3] = __byte_perm_S (w6[1], w6[0], selector); + w6[2] = __byte_perm_S (w6[0], w5[3], selector); + w6[1] = __byte_perm_S (w5[3], w5[2], selector); + w6[0] = __byte_perm_S (w5[2], w5[1], selector); + w5[3] = __byte_perm_S (w5[1], w5[0], selector); + w5[2] = __byte_perm_S (w5[0], w4[3], selector); + w5[1] = __byte_perm_S (w4[3], w4[2], selector); + w5[0] = __byte_perm_S (w4[2], w4[1], selector); + w4[3] = __byte_perm_S (w4[1], w4[0], selector); + w4[2] = __byte_perm_S (w4[0], w3[3], selector); + w4[1] = __byte_perm_S (w3[3], w3[2], selector); + w4[0] = __byte_perm_S (w3[2], w3[1], selector); + w3[3] = __byte_perm_S (w3[1], w3[0], selector); + w3[2] = __byte_perm_S (w3[0], w2[3], selector); + w3[1] = __byte_perm_S (w2[3], w2[2], selector); + w3[0] = __byte_perm_S (w2[2], w2[1], selector); + w2[3] = __byte_perm_S (w2[1], w2[0], selector); + w2[2] = __byte_perm_S (w2[0], w1[3], selector); + w2[1] = __byte_perm_S (w1[3], w1[2], selector); + w2[0] = __byte_perm_S (w1[2], w1[1], selector); + w1[3] = __byte_perm_S (w1[1], w1[0], selector); + w1[2] = __byte_perm_S (w1[0], w0[3], selector); + w1[1] = __byte_perm_S (w0[3], w0[2], selector); + w1[0] = __byte_perm_S (w0[2], w0[1], selector); + w0[3] = __byte_perm_S (w0[1], w0[0], selector); + w0[2] = __byte_perm_S (w0[0], 0, selector); w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w0[2] = w0[3]; - w0[3] = w1[0]; - w1[0] = w1[1]; - w1[1] = w1[2]; - w1[2] = w1[3]; - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = w4[0]; - w4[0] = w4[1]; - w4[1] = w4[2]; - w4[2] = w4[3]; - w4[3] = w5[0]; - w5[0] = w5[1]; - w5[1] = w5[2]; - w5[2] = w5[3]; - w5[3] = w6[0]; - w6[0] = w6[1]; - w6[1] = w6[2]; - w6[2] = w6[3]; - w6[3] = w7[0]; - w7[0] = w7[1]; - w7[1] = w7[2]; - w7[2] = w7[3]; - w7[3] = 0; - } + break; + + case 3: + w7[3] = __byte_perm_S (w7[0], w6[3], selector); + w7[2] = __byte_perm_S (w6[3], w6[2], selector); + w7[1] = __byte_perm_S (w6[2], w6[1], selector); + w7[0] = __byte_perm_S (w6[1], w6[0], selector); + w6[3] = __byte_perm_S (w6[0], w5[3], selector); + w6[2] = __byte_perm_S (w5[3], w5[2], selector); + w6[1] = __byte_perm_S (w5[2], w5[1], selector); + w6[0] = __byte_perm_S (w5[1], w5[0], selector); + w5[3] = __byte_perm_S (w5[0], w4[3], selector); + w5[2] = __byte_perm_S (w4[3], w4[2], selector); + w5[1] = __byte_perm_S (w4[2], w4[1], selector); + w5[0] = __byte_perm_S (w4[1], w4[0], selector); + w4[3] = __byte_perm_S (w4[0], w3[3], selector); + w4[2] = __byte_perm_S (w3[3], w3[2], selector); + w4[1] = __byte_perm_S (w3[2], w3[1], selector); + w4[0] = __byte_perm_S (w3[1], w3[0], selector); + w3[3] = __byte_perm_S (w3[0], w2[3], selector); + w3[2] = __byte_perm_S (w2[3], w2[2], selector); + w3[1] = __byte_perm_S (w2[2], w2[1], selector); + w3[0] = __byte_perm_S (w2[1], w2[0], selector); + w2[3] = __byte_perm_S (w2[0], w1[3], selector); + w2[2] = __byte_perm_S (w1[3], w1[2], selector); + w2[1] = __byte_perm_S (w1[2], w1[1], selector); + w2[0] = __byte_perm_S (w1[1], w1[0], selector); + w1[3] = __byte_perm_S (w1[0], w0[3], selector); + w1[2] = __byte_perm_S (w0[3], w0[2], selector); + w1[1] = __byte_perm_S (w0[2], w0[1], selector); + w1[0] = __byte_perm_S (w0[1], w0[0], selector); + w0[3] = __byte_perm_S (w0[0], 0, selector); + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; break; - case 3: - w7[3] = amd_bytealign_S (w7[0], w6[3], offset_minus_4); - w7[2] = amd_bytealign_S (w6[3], w6[2], offset_minus_4); - w7[1] = amd_bytealign_S (w6[2], w6[1], offset_minus_4); - w7[0] = amd_bytealign_S (w6[1], w6[0], offset_minus_4); - w6[3] = amd_bytealign_S (w6[0], w5[3], offset_minus_4); - w6[2] = amd_bytealign_S (w5[3], w5[2], offset_minus_4); - w6[1] = amd_bytealign_S (w5[2], w5[1], offset_minus_4); - w6[0] = amd_bytealign_S (w5[1], w5[0], offset_minus_4); - w5[3] = amd_bytealign_S (w5[0], w5[3], offset_minus_4); - w5[2] = amd_bytealign_S (w4[3], w4[2], offset_minus_4); - w5[1] = amd_bytealign_S (w4[2], w4[1], offset_minus_4); - w5[0] = amd_bytealign_S (w4[1], w4[0], offset_minus_4); - w4[3] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); - w4[2] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); - w4[1] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); - w4[0] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); - w3[3] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); - w3[2] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); - w3[1] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); - w3[0] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - w2[3] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - w2[2] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w2[1] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w2[0] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w1[3] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w1[2] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w1[1] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w1[0] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w0[3] = amd_bytealign_S (w0[0], 0, offset_minus_4); + case 4: + w7[3] = __byte_perm_S (w6[3], w6[2], selector); + w7[2] = __byte_perm_S (w6[2], w6[1], selector); + w7[1] = __byte_perm_S (w6[1], w6[0], selector); + w7[0] = __byte_perm_S (w6[0], w5[3], selector); + w6[3] = __byte_perm_S (w5[3], w5[2], selector); + w6[2] = __byte_perm_S (w5[2], w5[1], selector); + w6[1] = __byte_perm_S (w5[1], w5[0], selector); + w6[0] = __byte_perm_S (w5[0], w4[3], selector); + w5[3] = __byte_perm_S (w4[3], w4[2], selector); + w5[2] = __byte_perm_S (w4[2], w4[1], selector); + w5[1] = __byte_perm_S (w4[1], w4[0], selector); + w5[0] = __byte_perm_S (w4[0], w3[3], selector); + w4[3] = __byte_perm_S (w3[3], w3[2], selector); + w4[2] = __byte_perm_S (w3[2], w3[1], selector); + w4[1] = __byte_perm_S (w3[1], w3[0], selector); + w4[0] = __byte_perm_S (w3[0], w2[3], selector); + w3[3] = __byte_perm_S (w2[3], w2[2], selector); + w3[2] = __byte_perm_S (w2[2], w2[1], selector); + w3[1] = __byte_perm_S (w2[1], w2[0], selector); + w3[0] = __byte_perm_S (w2[0], w1[3], selector); + w2[3] = __byte_perm_S (w1[3], w1[2], selector); + w2[2] = __byte_perm_S (w1[2], w1[1], selector); + w2[1] = __byte_perm_S (w1[1], w1[0], selector); + w2[0] = __byte_perm_S (w1[0], w0[3], selector); + w1[3] = __byte_perm_S (w0[3], w0[2], selector); + w1[2] = __byte_perm_S (w0[2], w0[1], selector); + w1[1] = __byte_perm_S (w0[1], w0[0], selector); + w1[0] = __byte_perm_S (w0[0], 0, selector); + w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w0[3] = w1[0]; - w1[0] = w1[1]; - w1[1] = w1[2]; - w1[2] = w1[3]; - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = w4[0]; - w4[0] = w4[1]; - w4[1] = w4[2]; - w4[2] = w4[3]; - w4[3] = w5[0]; - w5[0] = w5[1]; - w5[1] = w5[2]; - w5[2] = w5[3]; - w5[3] = w6[0]; - w6[0] = w6[1]; - w6[1] = w6[2]; - w6[2] = w6[3]; - w6[3] = w7[0]; - w7[0] = w7[1]; - w7[1] = w7[2]; - w7[2] = w7[3]; - w7[3] = 0; - } - break; - case 4: - w7[3] = amd_bytealign_S (w6[3], w6[2], offset_minus_4); - w7[2] = amd_bytealign_S (w6[2], w6[1], offset_minus_4); - w7[1] = amd_bytealign_S (w6[1], w6[0], offset_minus_4); - w7[0] = amd_bytealign_S (w6[0], w5[3], offset_minus_4); - w6[3] = amd_bytealign_S (w5[3], w5[2], offset_minus_4); - w6[2] = amd_bytealign_S (w5[2], w5[1], offset_minus_4); - w6[1] = amd_bytealign_S (w5[1], w5[0], offset_minus_4); - w6[0] = amd_bytealign_S (w5[0], w5[3], offset_minus_4); - w5[3] = amd_bytealign_S (w4[3], w4[2], offset_minus_4); - w5[2] = amd_bytealign_S (w4[2], w4[1], offset_minus_4); - w5[1] = amd_bytealign_S (w4[1], w4[0], offset_minus_4); - w5[0] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); - w4[3] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); - w4[2] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); - w4[1] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); - w4[0] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); - w3[3] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); - w3[2] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); - w3[1] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - w3[0] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - w2[3] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w2[2] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w2[1] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w2[0] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w1[3] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w1[2] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w1[1] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w1[0] = amd_bytealign_S (w0[0], 0, offset_minus_4); + case 5: + w7[3] = __byte_perm_S (w6[2], w6[1], selector); + w7[2] = __byte_perm_S (w6[1], w6[0], selector); + w7[1] = __byte_perm_S (w6[0], w5[3], selector); + w7[0] = __byte_perm_S (w5[3], w5[2], selector); + w6[3] = __byte_perm_S (w5[2], w5[1], selector); + w6[2] = __byte_perm_S (w5[1], w5[0], selector); + w6[1] = __byte_perm_S (w5[0], w4[3], selector); + w6[0] = __byte_perm_S (w4[3], w4[2], selector); + w5[3] = __byte_perm_S (w4[2], w4[1], selector); + w5[2] = __byte_perm_S (w4[1], w4[0], selector); + w5[1] = __byte_perm_S (w4[0], w3[3], selector); + w5[0] = __byte_perm_S (w3[3], w3[2], selector); + w4[3] = __byte_perm_S (w3[2], w3[1], selector); + w4[2] = __byte_perm_S (w3[1], w3[0], selector); + w4[1] = __byte_perm_S (w3[0], w2[3], selector); + w4[0] = __byte_perm_S (w2[3], w2[2], selector); + w3[3] = __byte_perm_S (w2[2], w2[1], selector); + w3[2] = __byte_perm_S (w2[1], w2[0], selector); + w3[1] = __byte_perm_S (w2[0], w1[3], selector); + w3[0] = __byte_perm_S (w1[3], w1[2], selector); + w2[3] = __byte_perm_S (w1[2], w1[1], selector); + w2[2] = __byte_perm_S (w1[1], w1[0], selector); + w2[1] = __byte_perm_S (w1[0], w0[3], selector); + w2[0] = __byte_perm_S (w0[3], w0[2], selector); + w1[3] = __byte_perm_S (w0[2], w0[1], selector); + w1[2] = __byte_perm_S (w0[1], w0[0], selector); + w1[1] = __byte_perm_S (w0[0], 0, selector); + w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w1[0] = w1[1]; - w1[1] = w1[2]; - w1[2] = w1[3]; - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = w4[0]; - w4[0] = w4[1]; - w4[1] = w4[2]; - w4[2] = w4[3]; - w4[3] = w5[0]; - w5[0] = w5[1]; - w5[1] = w5[2]; - w5[2] = w5[3]; - w5[3] = w6[0]; - w6[0] = w6[1]; - w6[1] = w6[2]; - w6[2] = w6[3]; - w6[3] = w7[0]; - w7[0] = w7[1]; - w7[1] = w7[2]; - w7[2] = w7[3]; - w7[3] = 0; - } - break; - case 5: - w7[3] = amd_bytealign_S (w6[2], w6[1], offset_minus_4); - w7[2] = amd_bytealign_S (w6[1], w6[0], offset_minus_4); - w7[1] = amd_bytealign_S (w6[0], w5[3], offset_minus_4); - w7[0] = amd_bytealign_S (w5[3], w5[2], offset_minus_4); - w6[3] = amd_bytealign_S (w5[2], w5[1], offset_minus_4); - w6[2] = amd_bytealign_S (w5[1], w5[0], offset_minus_4); - w6[1] = amd_bytealign_S (w5[0], w5[3], offset_minus_4); - w6[0] = amd_bytealign_S (w4[3], w4[2], offset_minus_4); - w5[3] = amd_bytealign_S (w4[2], w4[1], offset_minus_4); - w5[2] = amd_bytealign_S (w4[1], w4[0], offset_minus_4); - w5[1] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); - w5[0] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); - w4[3] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); - w4[2] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); - w4[1] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); - w4[0] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); - w3[3] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); - w3[2] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - w3[1] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - w3[0] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w2[3] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w2[2] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w2[1] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w2[0] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w1[3] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w1[2] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w1[1] = amd_bytealign_S (w0[0], 0, offset_minus_4); + case 6: + w7[3] = __byte_perm_S (w6[1], w6[0], selector); + w7[2] = __byte_perm_S (w6[0], w5[3], selector); + w7[1] = __byte_perm_S (w5[3], w5[2], selector); + w7[0] = __byte_perm_S (w5[2], w5[1], selector); + w6[3] = __byte_perm_S (w5[1], w5[0], selector); + w6[2] = __byte_perm_S (w5[0], w4[3], selector); + w6[1] = __byte_perm_S (w4[3], w4[2], selector); + w6[0] = __byte_perm_S (w4[2], w4[1], selector); + w5[3] = __byte_perm_S (w4[1], w4[0], selector); + w5[2] = __byte_perm_S (w4[0], w3[3], selector); + w5[1] = __byte_perm_S (w3[3], w3[2], selector); + w5[0] = __byte_perm_S (w3[2], w3[1], selector); + w4[3] = __byte_perm_S (w3[1], w3[0], selector); + w4[2] = __byte_perm_S (w3[0], w2[3], selector); + w4[1] = __byte_perm_S (w2[3], w2[2], selector); + w4[0] = __byte_perm_S (w2[2], w2[1], selector); + w3[3] = __byte_perm_S (w2[1], w2[0], selector); + w3[2] = __byte_perm_S (w2[0], w1[3], selector); + w3[1] = __byte_perm_S (w1[3], w1[2], selector); + w3[0] = __byte_perm_S (w1[2], w1[1], selector); + w2[3] = __byte_perm_S (w1[1], w1[0], selector); + w2[2] = __byte_perm_S (w1[0], w0[3], selector); + w2[1] = __byte_perm_S (w0[3], w0[2], selector); + w2[0] = __byte_perm_S (w0[2], w0[1], selector); + w1[3] = __byte_perm_S (w0[1], w0[0], selector); + w1[2] = __byte_perm_S (w0[0], 0, selector); + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 7: + w7[3] = __byte_perm_S (w6[0], w5[3], selector); + w7[2] = __byte_perm_S (w5[3], w5[2], selector); + w7[1] = __byte_perm_S (w5[2], w5[1], selector); + w7[0] = __byte_perm_S (w5[1], w5[0], selector); + w6[3] = __byte_perm_S (w5[0], w4[3], selector); + w6[2] = __byte_perm_S (w4[3], w4[2], selector); + w6[1] = __byte_perm_S (w4[2], w4[1], selector); + w6[0] = __byte_perm_S (w4[1], w4[0], selector); + w5[3] = __byte_perm_S (w4[0], w3[3], selector); + w5[2] = __byte_perm_S (w3[3], w3[2], selector); + w5[1] = __byte_perm_S (w3[2], w3[1], selector); + w5[0] = __byte_perm_S (w3[1], w3[0], selector); + w4[3] = __byte_perm_S (w3[0], w2[3], selector); + w4[2] = __byte_perm_S (w2[3], w2[2], selector); + w4[1] = __byte_perm_S (w2[2], w2[1], selector); + w4[0] = __byte_perm_S (w2[1], w2[0], selector); + w3[3] = __byte_perm_S (w2[0], w1[3], selector); + w3[2] = __byte_perm_S (w1[3], w1[2], selector); + w3[1] = __byte_perm_S (w1[2], w1[1], selector); + w3[0] = __byte_perm_S (w1[1], w1[0], selector); + w2[3] = __byte_perm_S (w1[0], w0[3], selector); + w2[2] = __byte_perm_S (w0[3], w0[2], selector); + w2[1] = __byte_perm_S (w0[2], w0[1], selector); + w2[0] = __byte_perm_S (w0[1], w0[0], selector); + w1[3] = __byte_perm_S (w0[0], 0, selector); + w1[2] = 0; + w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w1[1] = w1[2]; - w1[2] = w1[3]; - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = w4[0]; - w4[0] = w4[1]; - w4[1] = w4[2]; - w4[2] = w4[3]; - w4[3] = w5[0]; - w5[0] = w5[1]; - w5[1] = w5[2]; - w5[2] = w5[3]; - w5[3] = w6[0]; - w6[0] = w6[1]; - w6[1] = w6[2]; - w6[2] = w6[3]; - w6[3] = w7[0]; - w7[0] = w7[1]; - w7[1] = w7[2]; - w7[2] = w7[3]; - w7[3] = 0; - } - break; - case 6: - w7[3] = amd_bytealign_S (w6[1], w6[0], offset_minus_4); - w7[2] = amd_bytealign_S (w6[0], w5[3], offset_minus_4); - w7[1] = amd_bytealign_S (w5[3], w5[2], offset_minus_4); - w7[0] = amd_bytealign_S (w5[2], w5[1], offset_minus_4); - w6[3] = amd_bytealign_S (w5[1], w5[0], offset_minus_4); - w6[2] = amd_bytealign_S (w5[0], w5[3], offset_minus_4); - w6[1] = amd_bytealign_S (w4[3], w4[2], offset_minus_4); - w6[0] = amd_bytealign_S (w4[2], w4[1], offset_minus_4); - w5[3] = amd_bytealign_S (w4[1], w4[0], offset_minus_4); - w5[2] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); - w5[1] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); - w5[0] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); - w4[3] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); - w4[2] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); - w4[1] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); - w4[0] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); - w3[3] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - w3[2] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - w3[1] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w3[0] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w2[3] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w2[2] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w2[1] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w2[0] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w1[3] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w1[2] = amd_bytealign_S (w0[0], 0, offset_minus_4); + case 8: + w7[3] = __byte_perm_S (w5[3], w5[2], selector); + w7[2] = __byte_perm_S (w5[2], w5[1], selector); + w7[1] = __byte_perm_S (w5[1], w5[0], selector); + w7[0] = __byte_perm_S (w5[0], w4[3], selector); + w6[3] = __byte_perm_S (w4[3], w4[2], selector); + w6[2] = __byte_perm_S (w4[2], w4[1], selector); + w6[1] = __byte_perm_S (w4[1], w4[0], selector); + w6[0] = __byte_perm_S (w4[0], w3[3], selector); + w5[3] = __byte_perm_S (w3[3], w3[2], selector); + w5[2] = __byte_perm_S (w3[2], w3[1], selector); + w5[1] = __byte_perm_S (w3[1], w3[0], selector); + w5[0] = __byte_perm_S (w3[0], w2[3], selector); + w4[3] = __byte_perm_S (w2[3], w2[2], selector); + w4[2] = __byte_perm_S (w2[2], w2[1], selector); + w4[1] = __byte_perm_S (w2[1], w2[0], selector); + w4[0] = __byte_perm_S (w2[0], w1[3], selector); + w3[3] = __byte_perm_S (w1[3], w1[2], selector); + w3[2] = __byte_perm_S (w1[2], w1[1], selector); + w3[1] = __byte_perm_S (w1[1], w1[0], selector); + w3[0] = __byte_perm_S (w1[0], w0[3], selector); + w2[3] = __byte_perm_S (w0[3], w0[2], selector); + w2[2] = __byte_perm_S (w0[2], w0[1], selector); + w2[1] = __byte_perm_S (w0[1], w0[0], selector); + w2[0] = __byte_perm_S (w0[0], 0, selector); + w1[3] = 0; + w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; @@ -10427,64 +13595,34 @@ inline void switch_buffer_by_offset_8x4_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w1[2] = w1[3]; - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = w4[0]; - w4[0] = w4[1]; - w4[1] = w4[2]; - w4[2] = w4[3]; - w4[3] = w5[0]; - w5[0] = w5[1]; - w5[1] = w5[2]; - w5[2] = w5[3]; - w5[3] = w6[0]; - w6[0] = w6[1]; - w6[1] = w6[2]; - w6[2] = w6[3]; - w6[3] = w7[0]; - w7[0] = w7[1]; - w7[1] = w7[2]; - w7[2] = w7[3]; - w7[3] = 0; - } - break; - case 7: - w7[3] = amd_bytealign_S (w6[0], w5[3], offset_minus_4); - w7[2] = amd_bytealign_S (w5[3], w5[2], offset_minus_4); - w7[1] = amd_bytealign_S (w5[2], w5[1], offset_minus_4); - w7[0] = amd_bytealign_S (w5[1], w5[0], offset_minus_4); - w6[3] = amd_bytealign_S (w5[0], w5[3], offset_minus_4); - w6[2] = amd_bytealign_S (w4[3], w4[2], offset_minus_4); - w6[1] = amd_bytealign_S (w4[2], w4[1], offset_minus_4); - w6[0] = amd_bytealign_S (w4[1], w4[0], offset_minus_4); - w5[3] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); - w5[2] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); - w5[1] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); - w5[0] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); - w4[3] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); - w4[2] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); - w4[1] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); - w4[0] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - w3[3] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - w3[2] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w3[1] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w3[0] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w2[3] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w2[2] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w2[1] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w2[0] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w1[3] = amd_bytealign_S (w0[0], 0, offset_minus_4); + case 9: + w7[3] = __byte_perm_S (w5[2], w5[1], selector); + w7[2] = __byte_perm_S (w5[1], w5[0], selector); + w7[1] = __byte_perm_S (w5[0], w4[3], selector); + w7[0] = __byte_perm_S (w4[3], w4[2], selector); + w6[3] = __byte_perm_S (w4[2], w4[1], selector); + w6[2] = __byte_perm_S (w4[1], w4[0], selector); + w6[1] = __byte_perm_S (w4[0], w3[3], selector); + w6[0] = __byte_perm_S (w3[3], w3[2], selector); + w5[3] = __byte_perm_S (w3[2], w3[1], selector); + w5[2] = __byte_perm_S (w3[1], w3[0], selector); + w5[1] = __byte_perm_S (w3[0], w2[3], selector); + w5[0] = __byte_perm_S (w2[3], w2[2], selector); + w4[3] = __byte_perm_S (w2[2], w2[1], selector); + w4[2] = __byte_perm_S (w2[1], w2[0], selector); + w4[1] = __byte_perm_S (w2[0], w1[3], selector); + w4[0] = __byte_perm_S (w1[3], w1[2], selector); + w3[3] = __byte_perm_S (w1[2], w1[1], selector); + w3[2] = __byte_perm_S (w1[1], w1[0], selector); + w3[1] = __byte_perm_S (w1[0], w0[3], selector); + w3[0] = __byte_perm_S (w0[3], w0[2], selector); + w2[3] = __byte_perm_S (w0[2], w0[1], selector); + w2[2] = __byte_perm_S (w0[1], w0[0], selector); + w2[1] = __byte_perm_S (w0[0], 0, selector); + w2[0] = 0; + w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; @@ -10493,62 +13631,69 @@ inline void switch_buffer_by_offset_8x4_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = w4[0]; - w4[0] = w4[1]; - w4[1] = w4[2]; - w4[2] = w4[3]; - w4[3] = w5[0]; - w5[0] = w5[1]; - w5[1] = w5[2]; - w5[2] = w5[3]; - w5[3] = w6[0]; - w6[0] = w6[1]; - w6[1] = w6[2]; - w6[2] = w6[3]; - w6[3] = w7[0]; - w7[0] = w7[1]; - w7[1] = w7[2]; - w7[2] = w7[3]; - w7[3] = 0; - } + break; + + case 10: + w7[3] = __byte_perm_S (w5[1], w5[0], selector); + w7[2] = __byte_perm_S (w5[0], w4[3], selector); + w7[1] = __byte_perm_S (w4[3], w4[2], selector); + w7[0] = __byte_perm_S (w4[2], w4[1], selector); + w6[3] = __byte_perm_S (w4[1], w4[0], selector); + w6[2] = __byte_perm_S (w4[0], w3[3], selector); + w6[1] = __byte_perm_S (w3[3], w3[2], selector); + w6[0] = __byte_perm_S (w3[2], w3[1], selector); + w5[3] = __byte_perm_S (w3[1], w3[0], selector); + w5[2] = __byte_perm_S (w3[0], w2[3], selector); + w5[1] = __byte_perm_S (w2[3], w2[2], selector); + w5[0] = __byte_perm_S (w2[2], w2[1], selector); + w4[3] = __byte_perm_S (w2[1], w2[0], selector); + w4[2] = __byte_perm_S (w2[0], w1[3], selector); + w4[1] = __byte_perm_S (w1[3], w1[2], selector); + w4[0] = __byte_perm_S (w1[2], w1[1], selector); + w3[3] = __byte_perm_S (w1[1], w1[0], selector); + w3[2] = __byte_perm_S (w1[0], w0[3], selector); + w3[1] = __byte_perm_S (w0[3], w0[2], selector); + w3[0] = __byte_perm_S (w0[2], w0[1], selector); + w2[3] = __byte_perm_S (w0[1], w0[0], selector); + w2[2] = __byte_perm_S (w0[0], 0, selector); + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; break; - case 8: - w7[3] = amd_bytealign_S (w5[3], w5[2], offset_minus_4); - w7[2] = amd_bytealign_S (w5[2], w5[1], offset_minus_4); - w7[1] = amd_bytealign_S (w5[1], w5[0], offset_minus_4); - w7[0] = amd_bytealign_S (w5[0], w5[3], offset_minus_4); - w6[3] = amd_bytealign_S (w4[3], w4[2], offset_minus_4); - w6[2] = amd_bytealign_S (w4[2], w4[1], offset_minus_4); - w6[1] = amd_bytealign_S (w4[1], w4[0], offset_minus_4); - w6[0] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); - w5[3] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); - w5[2] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); - w5[1] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); - w5[0] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); - w4[3] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); - w4[2] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); - w4[1] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - w4[0] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - w3[3] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w3[2] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w3[1] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w3[0] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w2[3] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w2[2] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w2[1] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w2[0] = amd_bytealign_S (w0[0], 0, offset_minus_4); + case 11: + w7[3] = __byte_perm_S (w5[0], w4[3], selector); + w7[2] = __byte_perm_S (w4[3], w4[2], selector); + w7[1] = __byte_perm_S (w4[2], w4[1], selector); + w7[0] = __byte_perm_S (w4[1], w4[0], selector); + w6[3] = __byte_perm_S (w4[0], w3[3], selector); + w6[2] = __byte_perm_S (w3[3], w3[2], selector); + w6[1] = __byte_perm_S (w3[2], w3[1], selector); + w6[0] = __byte_perm_S (w3[1], w3[0], selector); + w5[3] = __byte_perm_S (w3[0], w2[3], selector); + w5[2] = __byte_perm_S (w2[3], w2[2], selector); + w5[1] = __byte_perm_S (w2[2], w2[1], selector); + w5[0] = __byte_perm_S (w2[1], w2[0], selector); + w4[3] = __byte_perm_S (w2[0], w1[3], selector); + w4[2] = __byte_perm_S (w1[3], w1[2], selector); + w4[1] = __byte_perm_S (w1[2], w1[1], selector); + w4[0] = __byte_perm_S (w1[1], w1[0], selector); + w3[3] = __byte_perm_S (w1[0], w0[3], selector); + w3[2] = __byte_perm_S (w0[3], w0[2], selector); + w3[1] = __byte_perm_S (w0[2], w0[1], selector); + w3[0] = __byte_perm_S (w0[1], w0[0], selector); + w2[3] = __byte_perm_S (w0[0], 0, selector); + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; @@ -10558,60 +13703,32 @@ inline void switch_buffer_by_offset_8x4_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = w4[0]; - w4[0] = w4[1]; - w4[1] = w4[2]; - w4[2] = w4[3]; - w4[3] = w5[0]; - w5[0] = w5[1]; - w5[1] = w5[2]; - w5[2] = w5[3]; - w5[3] = w6[0]; - w6[0] = w6[1]; - w6[1] = w6[2]; - w6[2] = w6[3]; - w6[3] = w7[0]; - w7[0] = w7[1]; - w7[1] = w7[2]; - w7[2] = w7[3]; - w7[3] = 0; - } - break; - case 9: - w7[3] = amd_bytealign_S (w5[2], w5[1], offset_minus_4); - w7[2] = amd_bytealign_S (w5[1], w5[0], offset_minus_4); - w7[1] = amd_bytealign_S (w5[0], w5[3], offset_minus_4); - w7[0] = amd_bytealign_S (w4[3], w4[2], offset_minus_4); - w6[3] = amd_bytealign_S (w4[2], w4[1], offset_minus_4); - w6[2] = amd_bytealign_S (w4[1], w4[0], offset_minus_4); - w6[1] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); - w6[0] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); - w5[3] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); - w5[2] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); - w5[1] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); - w5[0] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); - w4[3] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); - w4[2] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - w4[1] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - w4[0] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w3[3] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w3[2] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w3[1] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w3[0] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w2[3] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w2[2] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w2[1] = amd_bytealign_S (w0[0], 0, offset_minus_4); + case 12: + w7[3] = __byte_perm_S (w4[3], w4[2], selector); + w7[2] = __byte_perm_S (w4[2], w4[1], selector); + w7[1] = __byte_perm_S (w4[1], w4[0], selector); + w7[0] = __byte_perm_S (w4[0], w3[3], selector); + w6[3] = __byte_perm_S (w3[3], w3[2], selector); + w6[2] = __byte_perm_S (w3[2], w3[1], selector); + w6[1] = __byte_perm_S (w3[1], w3[0], selector); + w6[0] = __byte_perm_S (w3[0], w2[3], selector); + w5[3] = __byte_perm_S (w2[3], w2[2], selector); + w5[2] = __byte_perm_S (w2[2], w2[1], selector); + w5[1] = __byte_perm_S (w2[1], w2[0], selector); + w5[0] = __byte_perm_S (w2[0], w1[3], selector); + w4[3] = __byte_perm_S (w1[3], w1[2], selector); + w4[2] = __byte_perm_S (w1[2], w1[1], selector); + w4[1] = __byte_perm_S (w1[1], w1[0], selector); + w4[0] = __byte_perm_S (w1[0], w0[3], selector); + w3[3] = __byte_perm_S (w0[3], w0[2], selector); + w3[2] = __byte_perm_S (w0[2], w0[1], selector); + w3[1] = __byte_perm_S (w0[1], w0[0], selector); + w3[0] = __byte_perm_S (w0[0], 0, selector); + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; @@ -10622,58 +13739,31 @@ inline void switch_buffer_by_offset_8x4_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = w4[0]; - w4[0] = w4[1]; - w4[1] = w4[2]; - w4[2] = w4[3]; - w4[3] = w5[0]; - w5[0] = w5[1]; - w5[1] = w5[2]; - w5[2] = w5[3]; - w5[3] = w6[0]; - w6[0] = w6[1]; - w6[1] = w6[2]; - w6[2] = w6[3]; - w6[3] = w7[0]; - w7[0] = w7[1]; - w7[1] = w7[2]; - w7[2] = w7[3]; - w7[3] = 0; - } - break; - case 10: - w7[3] = amd_bytealign_S (w5[1], w5[0], offset_minus_4); - w7[2] = amd_bytealign_S (w5[0], w5[3], offset_minus_4); - w7[1] = amd_bytealign_S (w4[3], w4[2], offset_minus_4); - w7[0] = amd_bytealign_S (w4[2], w4[1], offset_minus_4); - w6[3] = amd_bytealign_S (w4[1], w4[0], offset_minus_4); - w6[2] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); - w6[1] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); - w6[0] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); - w5[3] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); - w5[2] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); - w5[1] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); - w5[0] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); - w4[3] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - w4[2] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - w4[1] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w4[0] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w3[3] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w3[2] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w3[1] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w3[0] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w2[3] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w2[2] = amd_bytealign_S (w0[0], 0, offset_minus_4); + case 13: + w7[3] = __byte_perm_S (w4[2], w4[1], selector); + w7[2] = __byte_perm_S (w4[1], w4[0], selector); + w7[1] = __byte_perm_S (w4[0], w3[3], selector); + w7[0] = __byte_perm_S (w3[3], w3[2], selector); + w6[3] = __byte_perm_S (w3[2], w3[1], selector); + w6[2] = __byte_perm_S (w3[1], w3[0], selector); + w6[1] = __byte_perm_S (w3[0], w2[3], selector); + w6[0] = __byte_perm_S (w2[3], w2[2], selector); + w5[3] = __byte_perm_S (w2[2], w2[1], selector); + w5[2] = __byte_perm_S (w2[1], w2[0], selector); + w5[1] = __byte_perm_S (w2[0], w1[3], selector); + w5[0] = __byte_perm_S (w1[3], w1[2], selector); + w4[3] = __byte_perm_S (w1[2], w1[1], selector); + w4[2] = __byte_perm_S (w1[1], w1[0], selector); + w4[1] = __byte_perm_S (w1[0], w0[3], selector); + w4[0] = __byte_perm_S (w0[3], w0[2], selector); + w3[3] = __byte_perm_S (w0[2], w0[1], selector); + w3[2] = __byte_perm_S (w0[1], w0[0], selector); + w3[1] = __byte_perm_S (w0[0], 0, selector); + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; @@ -10685,56 +13775,30 @@ inline void switch_buffer_by_offset_8x4_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = w4[0]; - w4[0] = w4[1]; - w4[1] = w4[2]; - w4[2] = w4[3]; - w4[3] = w5[0]; - w5[0] = w5[1]; - w5[1] = w5[2]; - w5[2] = w5[3]; - w5[3] = w6[0]; - w6[0] = w6[1]; - w6[1] = w6[2]; - w6[2] = w6[3]; - w6[3] = w7[0]; - w7[0] = w7[1]; - w7[1] = w7[2]; - w7[2] = w7[3]; - w7[3] = 0; - } - break; - case 11: - w7[3] = amd_bytealign_S (w5[0], w5[3], offset_minus_4); - w7[2] = amd_bytealign_S (w4[3], w4[2], offset_minus_4); - w7[1] = amd_bytealign_S (w4[2], w4[1], offset_minus_4); - w7[0] = amd_bytealign_S (w4[1], w4[0], offset_minus_4); - w6[3] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); - w6[2] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); - w6[1] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); - w6[0] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); - w5[3] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); - w5[2] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); - w5[1] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); - w5[0] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - w4[3] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - w4[2] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w4[1] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w4[0] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w3[3] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w3[2] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w3[1] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w3[0] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w2[3] = amd_bytealign_S (w0[0], 0, offset_minus_4); + case 14: + w7[3] = __byte_perm_S (w4[1], w4[0], selector); + w7[2] = __byte_perm_S (w4[0], w3[3], selector); + w7[1] = __byte_perm_S (w3[3], w3[2], selector); + w7[0] = __byte_perm_S (w3[2], w3[1], selector); + w6[3] = __byte_perm_S (w3[1], w3[0], selector); + w6[2] = __byte_perm_S (w3[0], w2[3], selector); + w6[1] = __byte_perm_S (w2[3], w2[2], selector); + w6[0] = __byte_perm_S (w2[2], w2[1], selector); + w5[3] = __byte_perm_S (w2[1], w2[0], selector); + w5[2] = __byte_perm_S (w2[0], w1[3], selector); + w5[1] = __byte_perm_S (w1[3], w1[2], selector); + w5[0] = __byte_perm_S (w1[2], w1[1], selector); + w4[3] = __byte_perm_S (w1[1], w1[0], selector); + w4[2] = __byte_perm_S (w1[0], w0[3], selector); + w4[1] = __byte_perm_S (w0[3], w0[2], selector); + w4[0] = __byte_perm_S (w0[2], w0[1], selector); + w3[3] = __byte_perm_S (w0[1], w0[0], selector); + w3[2] = __byte_perm_S (w0[0], 0, selector); + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; @@ -10747,54 +13811,29 @@ inline void switch_buffer_by_offset_8x4_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = w4[0]; - w4[0] = w4[1]; - w4[1] = w4[2]; - w4[2] = w4[3]; - w4[3] = w5[0]; - w5[0] = w5[1]; - w5[1] = w5[2]; - w5[2] = w5[3]; - w5[3] = w6[0]; - w6[0] = w6[1]; - w6[1] = w6[2]; - w6[2] = w6[3]; - w6[3] = w7[0]; - w7[0] = w7[1]; - w7[1] = w7[2]; - w7[2] = w7[3]; - w7[3] = 0; - } - break; - case 12: - w7[3] = amd_bytealign_S (w4[3], w4[2], offset_minus_4); - w7[2] = amd_bytealign_S (w4[2], w4[1], offset_minus_4); - w7[1] = amd_bytealign_S (w4[1], w4[0], offset_minus_4); - w7[0] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); - w6[3] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); - w6[2] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); - w6[1] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); - w6[0] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); - w5[3] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); - w5[2] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); - w5[1] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - w5[0] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - w4[3] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w4[2] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w4[1] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w4[0] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w3[3] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w3[2] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w3[1] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w3[0] = amd_bytealign_S (w0[0], 0, offset_minus_4); + case 15: + w7[3] = __byte_perm_S (w4[0], w3[3], selector); + w7[2] = __byte_perm_S (w3[3], w3[2], selector); + w7[1] = __byte_perm_S (w3[2], w3[1], selector); + w7[0] = __byte_perm_S (w3[1], w3[0], selector); + w6[3] = __byte_perm_S (w3[0], w2[3], selector); + w6[2] = __byte_perm_S (w2[3], w2[2], selector); + w6[1] = __byte_perm_S (w2[2], w2[1], selector); + w6[0] = __byte_perm_S (w2[1], w2[0], selector); + w5[3] = __byte_perm_S (w2[0], w1[3], selector); + w5[2] = __byte_perm_S (w1[3], w1[2], selector); + w5[1] = __byte_perm_S (w1[2], w1[1], selector); + w5[0] = __byte_perm_S (w1[1], w1[0], selector); + w4[3] = __byte_perm_S (w1[0], w0[3], selector); + w4[2] = __byte_perm_S (w0[3], w0[2], selector); + w4[1] = __byte_perm_S (w0[2], w0[1], selector); + w4[0] = __byte_perm_S (w0[1], w0[0], selector); + w3[3] = __byte_perm_S (w0[0], 0, selector); + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; @@ -10808,52 +13847,28 @@ inline void switch_buffer_by_offset_8x4_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = w4[0]; - w4[0] = w4[1]; - w4[1] = w4[2]; - w4[2] = w4[3]; - w4[3] = w5[0]; - w5[0] = w5[1]; - w5[1] = w5[2]; - w5[2] = w5[3]; - w5[3] = w6[0]; - w6[0] = w6[1]; - w6[1] = w6[2]; - w6[2] = w6[3]; - w6[3] = w7[0]; - w7[0] = w7[1]; - w7[1] = w7[2]; - w7[2] = w7[3]; - w7[3] = 0; - } - break; - case 13: - w7[3] = amd_bytealign_S (w4[2], w4[1], offset_minus_4); - w7[2] = amd_bytealign_S (w4[1], w4[0], offset_minus_4); - w7[1] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); - w7[0] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); - w6[3] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); - w6[2] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); - w6[1] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); - w6[0] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); - w5[3] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); - w5[2] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - w5[1] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - w5[0] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w4[3] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w4[2] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w4[1] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w4[0] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w3[3] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w3[2] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w3[1] = amd_bytealign_S (w0[0], 0, offset_minus_4); + case 16: + w7[3] = __byte_perm_S (w3[3], w3[2], selector); + w7[2] = __byte_perm_S (w3[2], w3[1], selector); + w7[1] = __byte_perm_S (w3[1], w3[0], selector); + w7[0] = __byte_perm_S (w3[0], w2[3], selector); + w6[3] = __byte_perm_S (w2[3], w2[2], selector); + w6[2] = __byte_perm_S (w2[2], w2[1], selector); + w6[1] = __byte_perm_S (w2[1], w2[0], selector); + w6[0] = __byte_perm_S (w2[0], w1[3], selector); + w5[3] = __byte_perm_S (w1[3], w1[2], selector); + w5[2] = __byte_perm_S (w1[2], w1[1], selector); + w5[1] = __byte_perm_S (w1[1], w1[0], selector); + w5[0] = __byte_perm_S (w1[0], w0[3], selector); + w4[3] = __byte_perm_S (w0[3], w0[2], selector); + w4[2] = __byte_perm_S (w0[2], w0[1], selector); + w4[1] = __byte_perm_S (w0[1], w0[0], selector); + w4[0] = __byte_perm_S (w0[0], 0, selector); + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; @@ -10868,50 +13883,27 @@ inline void switch_buffer_by_offset_8x4_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = w4[0]; - w4[0] = w4[1]; - w4[1] = w4[2]; - w4[2] = w4[3]; - w4[3] = w5[0]; - w5[0] = w5[1]; - w5[1] = w5[2]; - w5[2] = w5[3]; - w5[3] = w6[0]; - w6[0] = w6[1]; - w6[1] = w6[2]; - w6[2] = w6[3]; - w6[3] = w7[0]; - w7[0] = w7[1]; - w7[1] = w7[2]; - w7[2] = w7[3]; - w7[3] = 0; - } - break; - case 14: - w7[3] = amd_bytealign_S (w4[1], w4[0], offset_minus_4); - w7[2] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); - w7[1] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); - w7[0] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); - w6[3] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); - w6[2] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); - w6[1] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); - w6[0] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); - w5[3] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - w5[2] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - w5[1] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w5[0] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w4[3] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w4[2] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w4[1] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w4[0] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w3[3] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w3[2] = amd_bytealign_S (w0[0], 0, offset_minus_4); + case 17: + w7[3] = __byte_perm_S (w3[2], w3[1], selector); + w7[2] = __byte_perm_S (w3[1], w3[0], selector); + w7[1] = __byte_perm_S (w3[0], w2[3], selector); + w7[0] = __byte_perm_S (w2[3], w2[2], selector); + w6[3] = __byte_perm_S (w2[2], w2[1], selector); + w6[2] = __byte_perm_S (w2[1], w2[0], selector); + w6[1] = __byte_perm_S (w2[0], w1[3], selector); + w6[0] = __byte_perm_S (w1[3], w1[2], selector); + w5[3] = __byte_perm_S (w1[2], w1[1], selector); + w5[2] = __byte_perm_S (w1[1], w1[0], selector); + w5[1] = __byte_perm_S (w1[0], w0[3], selector); + w5[0] = __byte_perm_S (w0[3], w0[2], selector); + w4[3] = __byte_perm_S (w0[2], w0[1], selector); + w4[2] = __byte_perm_S (w0[1], w0[0], selector); + w4[1] = __byte_perm_S (w0[0], 0, selector); + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; @@ -10927,48 +13919,26 @@ inline void switch_buffer_by_offset_8x4_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w3[2] = w3[3]; - w3[3] = w4[0]; - w4[0] = w4[1]; - w4[1] = w4[2]; - w4[2] = w4[3]; - w4[3] = w5[0]; - w5[0] = w5[1]; - w5[1] = w5[2]; - w5[2] = w5[3]; - w5[3] = w6[0]; - w6[0] = w6[1]; - w6[1] = w6[2]; - w6[2] = w6[3]; - w6[3] = w7[0]; - w7[0] = w7[1]; - w7[1] = w7[2]; - w7[2] = w7[3]; - w7[3] = 0; - } - break; - case 15: - w7[3] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); - w7[2] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); - w7[1] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); - w7[0] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); - w6[3] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); - w6[2] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); - w6[1] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); - w6[0] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - w5[3] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - w5[2] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w5[1] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w5[0] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w4[3] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w4[2] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w4[1] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w4[0] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w3[3] = amd_bytealign_S (w0[0], 0, offset_minus_4); + case 18: + w7[3] = __byte_perm_S (w3[1], w3[0], selector); + w7[2] = __byte_perm_S (w3[0], w2[3], selector); + w7[1] = __byte_perm_S (w2[3], w2[2], selector); + w7[0] = __byte_perm_S (w2[2], w2[1], selector); + w6[3] = __byte_perm_S (w2[1], w2[0], selector); + w6[2] = __byte_perm_S (w2[0], w1[3], selector); + w6[1] = __byte_perm_S (w1[3], w1[2], selector); + w6[0] = __byte_perm_S (w1[2], w1[1], selector); + w5[3] = __byte_perm_S (w1[1], w1[0], selector); + w5[2] = __byte_perm_S (w1[0], w0[3], selector); + w5[1] = __byte_perm_S (w0[3], w0[2], selector); + w5[0] = __byte_perm_S (w0[2], w0[1], selector); + w4[3] = __byte_perm_S (w0[1], w0[0], selector); + w4[2] = __byte_perm_S (w0[0], 0, selector); + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; w3[2] = 0; w3[1] = 0; w3[0] = 0; @@ -10985,309 +13955,178 @@ inline void switch_buffer_by_offset_8x4_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w3[3] = w4[0]; - w4[0] = w4[1]; - w4[1] = w4[2]; - w4[2] = w4[3]; - w4[3] = w5[0]; - w5[0] = w5[1]; - w5[1] = w5[2]; - w5[2] = w5[3]; - w5[3] = w6[0]; - w6[0] = w6[1]; - w6[1] = w6[2]; - w6[2] = w6[3]; - w6[3] = w7[0]; - w7[0] = w7[1]; - w7[1] = w7[2]; - w7[2] = w7[3]; - w7[3] = 0; - } - - break; - } - #endif - - #ifdef IS_NV - const int offset_minus_4 = 4 - (offset % 4); - - const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; - - switch (offset / 4) - { - case 0: - w7[3] = __byte_perm_S (w7[2], w7[3], selector); - w7[2] = __byte_perm_S (w7[1], w7[2], selector); - w7[1] = __byte_perm_S (w7[0], w7[1], selector); - w7[0] = __byte_perm_S (w6[3], w7[0], selector); - w6[3] = __byte_perm_S (w6[2], w6[3], selector); - w6[2] = __byte_perm_S (w6[1], w6[2], selector); - w6[1] = __byte_perm_S (w6[0], w6[1], selector); - w6[0] = __byte_perm_S (w5[3], w6[0], selector); - w5[3] = __byte_perm_S (w5[2], w5[3], selector); - w5[2] = __byte_perm_S (w5[1], w5[2], selector); - w5[1] = __byte_perm_S (w5[0], w5[1], selector); - w5[0] = __byte_perm_S (w4[3], w5[0], selector); - w4[3] = __byte_perm_S (w4[2], w4[3], selector); - w4[2] = __byte_perm_S (w4[1], w4[2], selector); - w4[1] = __byte_perm_S (w4[0], w4[1], selector); - w4[0] = __byte_perm_S (w3[3], w4[0], selector); - w3[3] = __byte_perm_S (w3[2], w3[3], selector); - w3[2] = __byte_perm_S (w3[1], w3[2], selector); - w3[1] = __byte_perm_S (w3[0], w3[1], selector); - w3[0] = __byte_perm_S (w2[3], w3[0], selector); - w2[3] = __byte_perm_S (w2[2], w2[3], selector); - w2[2] = __byte_perm_S (w2[1], w2[2], selector); - w2[1] = __byte_perm_S (w2[0], w2[1], selector); - w2[0] = __byte_perm_S (w1[3], w2[0], selector); - w1[3] = __byte_perm_S (w1[2], w1[3], selector); - w1[2] = __byte_perm_S (w1[1], w1[2], selector); - w1[1] = __byte_perm_S (w1[0], w1[1], selector); - w1[0] = __byte_perm_S (w0[3], w1[0], selector); - w0[3] = __byte_perm_S (w0[2], w0[3], selector); - w0[2] = __byte_perm_S (w0[1], w0[2], selector); - w0[1] = __byte_perm_S (w0[0], w0[1], selector); - w0[0] = __byte_perm_S ( 0, w0[0], selector); - break; - - case 1: - w7[3] = __byte_perm_S (w7[1], w7[2], selector); - w7[2] = __byte_perm_S (w7[0], w7[1], selector); - w7[1] = __byte_perm_S (w6[3], w7[0], selector); - w7[0] = __byte_perm_S (w6[2], w6[3], selector); - w6[3] = __byte_perm_S (w6[1], w6[2], selector); - w6[2] = __byte_perm_S (w6[0], w6[1], selector); - w6[1] = __byte_perm_S (w5[3], w6[0], selector); - w6[0] = __byte_perm_S (w5[2], w5[3], selector); - w5[3] = __byte_perm_S (w5[1], w5[2], selector); - w5[2] = __byte_perm_S (w5[0], w5[1], selector); - w5[1] = __byte_perm_S (w4[3], w5[0], selector); - w5[0] = __byte_perm_S (w4[2], w4[3], selector); - w4[3] = __byte_perm_S (w4[1], w4[2], selector); - w4[2] = __byte_perm_S (w4[0], w4[1], selector); - w4[1] = __byte_perm_S (w3[3], w4[0], selector); - w4[0] = __byte_perm_S (w3[2], w3[3], selector); - w3[3] = __byte_perm_S (w3[1], w3[2], selector); - w3[2] = __byte_perm_S (w3[0], w3[1], selector); - w3[1] = __byte_perm_S (w2[3], w3[0], selector); - w3[0] = __byte_perm_S (w2[2], w2[3], selector); - w2[3] = __byte_perm_S (w2[1], w2[2], selector); - w2[2] = __byte_perm_S (w2[0], w2[1], selector); - w2[1] = __byte_perm_S (w1[3], w2[0], selector); - w2[0] = __byte_perm_S (w1[2], w1[3], selector); - w1[3] = __byte_perm_S (w1[1], w1[2], selector); - w1[2] = __byte_perm_S (w1[0], w1[1], selector); - w1[1] = __byte_perm_S (w0[3], w1[0], selector); - w1[0] = __byte_perm_S (w0[2], w0[3], selector); - w0[3] = __byte_perm_S (w0[1], w0[2], selector); - w0[2] = __byte_perm_S (w0[0], w0[1], selector); - w0[1] = __byte_perm_S ( 0, w0[0], selector); - w0[0] = 0; - break; - - case 2: - w7[3] = __byte_perm_S (w7[0], w7[1], selector); - w7[2] = __byte_perm_S (w6[3], w7[0], selector); - w7[1] = __byte_perm_S (w6[2], w6[3], selector); - w7[0] = __byte_perm_S (w6[1], w6[2], selector); - w6[3] = __byte_perm_S (w6[0], w6[1], selector); - w6[2] = __byte_perm_S (w5[3], w6[0], selector); - w6[1] = __byte_perm_S (w5[2], w5[3], selector); - w6[0] = __byte_perm_S (w5[1], w5[2], selector); - w5[3] = __byte_perm_S (w5[0], w5[1], selector); - w5[2] = __byte_perm_S (w4[3], w5[0], selector); - w5[1] = __byte_perm_S (w4[2], w4[3], selector); - w5[0] = __byte_perm_S (w4[1], w4[2], selector); - w4[3] = __byte_perm_S (w4[0], w4[1], selector); - w4[2] = __byte_perm_S (w3[3], w4[0], selector); - w4[1] = __byte_perm_S (w3[2], w3[3], selector); - w4[0] = __byte_perm_S (w3[1], w3[2], selector); - w3[3] = __byte_perm_S (w3[0], w3[1], selector); - w3[2] = __byte_perm_S (w2[3], w3[0], selector); - w3[1] = __byte_perm_S (w2[2], w2[3], selector); - w3[0] = __byte_perm_S (w2[1], w2[2], selector); - w2[3] = __byte_perm_S (w2[0], w2[1], selector); - w2[2] = __byte_perm_S (w1[3], w2[0], selector); - w2[1] = __byte_perm_S (w1[2], w1[3], selector); - w2[0] = __byte_perm_S (w1[1], w1[2], selector); - w1[3] = __byte_perm_S (w1[0], w1[1], selector); - w1[2] = __byte_perm_S (w0[3], w1[0], selector); - w1[1] = __byte_perm_S (w0[2], w0[3], selector); - w1[0] = __byte_perm_S (w0[1], w0[2], selector); - w0[3] = __byte_perm_S (w0[0], w0[1], selector); - w0[2] = __byte_perm_S ( 0, w0[0], selector); - w0[1] = 0; - w0[0] = 0; break; - case 3: - w7[3] = __byte_perm_S (w6[3], w7[0], selector); - w7[2] = __byte_perm_S (w6[2], w6[3], selector); - w7[1] = __byte_perm_S (w6[1], w6[2], selector); - w7[0] = __byte_perm_S (w6[0], w6[1], selector); - w6[3] = __byte_perm_S (w5[3], w6[0], selector); - w6[2] = __byte_perm_S (w5[2], w5[3], selector); - w6[1] = __byte_perm_S (w5[1], w5[2], selector); - w6[0] = __byte_perm_S (w5[0], w5[1], selector); - w5[3] = __byte_perm_S (w4[3], w5[0], selector); - w5[2] = __byte_perm_S (w4[2], w4[3], selector); - w5[1] = __byte_perm_S (w4[1], w4[2], selector); - w5[0] = __byte_perm_S (w4[0], w4[1], selector); - w4[3] = __byte_perm_S (w3[3], w4[0], selector); - w4[2] = __byte_perm_S (w3[2], w3[3], selector); - w4[1] = __byte_perm_S (w3[1], w3[2], selector); - w4[0] = __byte_perm_S (w3[0], w3[1], selector); - w3[3] = __byte_perm_S (w2[3], w3[0], selector); - w3[2] = __byte_perm_S (w2[2], w2[3], selector); - w3[1] = __byte_perm_S (w2[1], w2[2], selector); - w3[0] = __byte_perm_S (w2[0], w2[1], selector); - w2[3] = __byte_perm_S (w1[3], w2[0], selector); - w2[2] = __byte_perm_S (w1[2], w1[3], selector); - w2[1] = __byte_perm_S (w1[1], w1[2], selector); - w2[0] = __byte_perm_S (w1[0], w1[1], selector); - w1[3] = __byte_perm_S (w0[3], w1[0], selector); - w1[2] = __byte_perm_S (w0[2], w0[3], selector); - w1[1] = __byte_perm_S (w0[1], w0[2], selector); - w1[0] = __byte_perm_S (w0[0], w0[1], selector); - w0[3] = __byte_perm_S ( 0, w0[0], selector); + case 19: + w7[3] = __byte_perm_S (w3[0], w2[3], selector); + w7[2] = __byte_perm_S (w2[3], w2[2], selector); + w7[1] = __byte_perm_S (w2[2], w2[1], selector); + w7[0] = __byte_perm_S (w2[1], w2[0], selector); + w6[3] = __byte_perm_S (w2[0], w1[3], selector); + w6[2] = __byte_perm_S (w1[3], w1[2], selector); + w6[1] = __byte_perm_S (w1[2], w1[1], selector); + w6[0] = __byte_perm_S (w1[1], w1[0], selector); + w5[3] = __byte_perm_S (w1[0], w0[3], selector); + w5[2] = __byte_perm_S (w0[3], w0[2], selector); + w5[1] = __byte_perm_S (w0[2], w0[1], selector); + w5[0] = __byte_perm_S (w0[1], w0[0], selector); + w4[3] = __byte_perm_S (w0[0], 0, selector); + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; + break; - case 4: - w7[3] = __byte_perm_S (w6[2], w6[3], selector); - w7[2] = __byte_perm_S (w6[1], w6[2], selector); - w7[1] = __byte_perm_S (w6[0], w6[1], selector); - w7[0] = __byte_perm_S (w5[3], w6[0], selector); - w6[3] = __byte_perm_S (w5[2], w5[3], selector); - w6[2] = __byte_perm_S (w5[1], w5[2], selector); - w6[1] = __byte_perm_S (w5[0], w5[1], selector); - w6[0] = __byte_perm_S (w4[3], w5[0], selector); - w5[3] = __byte_perm_S (w4[2], w4[3], selector); - w5[2] = __byte_perm_S (w4[1], w4[2], selector); - w5[1] = __byte_perm_S (w4[0], w4[1], selector); - w5[0] = __byte_perm_S (w3[3], w4[0], selector); - w4[3] = __byte_perm_S (w3[2], w3[3], selector); - w4[2] = __byte_perm_S (w3[1], w3[2], selector); - w4[1] = __byte_perm_S (w3[0], w3[1], selector); - w4[0] = __byte_perm_S (w2[3], w3[0], selector); - w3[3] = __byte_perm_S (w2[2], w2[3], selector); - w3[2] = __byte_perm_S (w2[1], w2[2], selector); - w3[1] = __byte_perm_S (w2[0], w2[1], selector); - w3[0] = __byte_perm_S (w1[3], w2[0], selector); - w2[3] = __byte_perm_S (w1[2], w1[3], selector); - w2[2] = __byte_perm_S (w1[1], w1[2], selector); - w2[1] = __byte_perm_S (w1[0], w1[1], selector); - w2[0] = __byte_perm_S (w0[3], w1[0], selector); - w1[3] = __byte_perm_S (w0[2], w0[3], selector); - w1[2] = __byte_perm_S (w0[1], w0[2], selector); - w1[1] = __byte_perm_S (w0[0], w0[1], selector); - w1[0] = __byte_perm_S ( 0, w0[0], selector); + case 20: + w7[3] = __byte_perm_S (w2[3], w2[2], selector); + w7[2] = __byte_perm_S (w2[2], w2[1], selector); + w7[1] = __byte_perm_S (w2[1], w2[0], selector); + w7[0] = __byte_perm_S (w2[0], w1[3], selector); + w6[3] = __byte_perm_S (w1[3], w1[2], selector); + w6[2] = __byte_perm_S (w1[2], w1[1], selector); + w6[1] = __byte_perm_S (w1[1], w1[0], selector); + w6[0] = __byte_perm_S (w1[0], w0[3], selector); + w5[3] = __byte_perm_S (w0[3], w0[2], selector); + w5[2] = __byte_perm_S (w0[2], w0[1], selector); + w5[1] = __byte_perm_S (w0[1], w0[0], selector); + w5[0] = __byte_perm_S (w0[0], 0, selector); + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; + break; - case 5: - w7[3] = __byte_perm_S (w6[1], w6[2], selector); - w7[2] = __byte_perm_S (w6[0], w6[1], selector); - w7[1] = __byte_perm_S (w5[3], w6[0], selector); - w7[0] = __byte_perm_S (w5[2], w5[3], selector); - w6[3] = __byte_perm_S (w5[1], w5[2], selector); - w6[2] = __byte_perm_S (w5[0], w5[1], selector); - w6[1] = __byte_perm_S (w4[3], w5[0], selector); - w6[0] = __byte_perm_S (w4[2], w4[3], selector); - w5[3] = __byte_perm_S (w4[1], w4[2], selector); - w5[2] = __byte_perm_S (w4[0], w4[1], selector); - w5[1] = __byte_perm_S (w3[3], w4[0], selector); - w5[0] = __byte_perm_S (w3[2], w3[3], selector); - w4[3] = __byte_perm_S (w3[1], w3[2], selector); - w4[2] = __byte_perm_S (w3[0], w3[1], selector); - w4[1] = __byte_perm_S (w2[3], w3[0], selector); - w4[0] = __byte_perm_S (w2[2], w2[3], selector); - w3[3] = __byte_perm_S (w2[1], w2[2], selector); - w3[2] = __byte_perm_S (w2[0], w2[1], selector); - w3[1] = __byte_perm_S (w1[3], w2[0], selector); - w3[0] = __byte_perm_S (w1[2], w1[3], selector); - w2[3] = __byte_perm_S (w1[1], w1[2], selector); - w2[2] = __byte_perm_S (w1[0], w1[1], selector); - w2[1] = __byte_perm_S (w0[3], w1[0], selector); - w2[0] = __byte_perm_S (w0[2], w0[3], selector); - w1[3] = __byte_perm_S (w0[1], w0[2], selector); - w1[2] = __byte_perm_S (w0[0], w0[1], selector); - w1[1] = __byte_perm_S ( 0, w0[0], selector); + case 21: + w7[3] = __byte_perm_S (w2[2], w2[1], selector); + w7[2] = __byte_perm_S (w2[1], w2[0], selector); + w7[1] = __byte_perm_S (w2[0], w1[3], selector); + w7[0] = __byte_perm_S (w1[3], w1[2], selector); + w6[3] = __byte_perm_S (w1[2], w1[1], selector); + w6[2] = __byte_perm_S (w1[1], w1[0], selector); + w6[1] = __byte_perm_S (w1[0], w0[3], selector); + w6[0] = __byte_perm_S (w0[3], w0[2], selector); + w5[3] = __byte_perm_S (w0[2], w0[1], selector); + w5[2] = __byte_perm_S (w0[1], w0[0], selector); + w5[1] = __byte_perm_S (w0[0], 0, selector); + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; + break; - case 6: - w7[3] = __byte_perm_S (w6[0], w6[1], selector); - w7[2] = __byte_perm_S (w5[3], w6[0], selector); - w7[1] = __byte_perm_S (w5[2], w5[3], selector); - w7[0] = __byte_perm_S (w5[1], w5[2], selector); - w6[3] = __byte_perm_S (w5[0], w5[1], selector); - w6[2] = __byte_perm_S (w4[3], w5[0], selector); - w6[1] = __byte_perm_S (w4[2], w4[3], selector); - w6[0] = __byte_perm_S (w4[1], w4[2], selector); - w5[3] = __byte_perm_S (w4[0], w4[1], selector); - w5[2] = __byte_perm_S (w3[3], w4[0], selector); - w5[1] = __byte_perm_S (w3[2], w3[3], selector); - w5[0] = __byte_perm_S (w3[1], w3[2], selector); - w4[3] = __byte_perm_S (w3[0], w3[1], selector); - w4[2] = __byte_perm_S (w2[3], w3[0], selector); - w4[1] = __byte_perm_S (w2[2], w2[3], selector); - w4[0] = __byte_perm_S (w2[1], w2[2], selector); - w3[3] = __byte_perm_S (w2[0], w2[1], selector); - w3[2] = __byte_perm_S (w1[3], w2[0], selector); - w3[1] = __byte_perm_S (w1[2], w1[3], selector); - w3[0] = __byte_perm_S (w1[1], w1[2], selector); - w2[3] = __byte_perm_S (w1[0], w1[1], selector); - w2[2] = __byte_perm_S (w0[3], w1[0], selector); - w2[1] = __byte_perm_S (w0[2], w0[3], selector); - w2[0] = __byte_perm_S (w0[1], w0[2], selector); - w1[3] = __byte_perm_S (w0[0], w0[1], selector); - w1[2] = __byte_perm_S ( 0, w0[0], selector); + case 22: + w7[3] = __byte_perm_S (w2[1], w2[0], selector); + w7[2] = __byte_perm_S (w2[0], w1[3], selector); + w7[1] = __byte_perm_S (w1[3], w1[2], selector); + w7[0] = __byte_perm_S (w1[2], w1[1], selector); + w6[3] = __byte_perm_S (w1[1], w1[0], selector); + w6[2] = __byte_perm_S (w1[0], w0[3], selector); + w6[1] = __byte_perm_S (w0[3], w0[2], selector); + w6[0] = __byte_perm_S (w0[2], w0[1], selector); + w5[3] = __byte_perm_S (w0[1], w0[0], selector); + w5[2] = __byte_perm_S (w0[0], 0, selector); + w5[1] = 0; + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; + break; - case 7: - w7[3] = __byte_perm_S (w5[3], w6[0], selector); - w7[2] = __byte_perm_S (w5[2], w5[3], selector); - w7[1] = __byte_perm_S (w5[1], w5[2], selector); - w7[0] = __byte_perm_S (w5[0], w5[1], selector); - w6[3] = __byte_perm_S (w4[3], w5[0], selector); - w6[2] = __byte_perm_S (w4[2], w4[3], selector); - w6[1] = __byte_perm_S (w4[1], w4[2], selector); - w6[0] = __byte_perm_S (w4[0], w4[1], selector); - w5[3] = __byte_perm_S (w3[3], w4[0], selector); - w5[2] = __byte_perm_S (w3[2], w3[3], selector); - w5[1] = __byte_perm_S (w3[1], w3[2], selector); - w5[0] = __byte_perm_S (w3[0], w3[1], selector); - w4[3] = __byte_perm_S (w2[3], w3[0], selector); - w4[2] = __byte_perm_S (w2[2], w2[3], selector); - w4[1] = __byte_perm_S (w2[1], w2[2], selector); - w4[0] = __byte_perm_S (w2[0], w2[1], selector); - w3[3] = __byte_perm_S (w1[3], w2[0], selector); - w3[2] = __byte_perm_S (w1[2], w1[3], selector); - w3[1] = __byte_perm_S (w1[1], w1[2], selector); - w3[0] = __byte_perm_S (w1[0], w1[1], selector); - w2[3] = __byte_perm_S (w0[3], w1[0], selector); - w2[2] = __byte_perm_S (w0[2], w0[3], selector); - w2[1] = __byte_perm_S (w0[1], w0[2], selector); - w2[0] = __byte_perm_S (w0[0], w0[1], selector); - w1[3] = __byte_perm_S ( 0, w0[0], selector); + case 23: + w7[3] = __byte_perm_S (w2[0], w1[3], selector); + w7[2] = __byte_perm_S (w1[3], w1[2], selector); + w7[1] = __byte_perm_S (w1[2], w1[1], selector); + w7[0] = __byte_perm_S (w1[1], w1[0], selector); + w6[3] = __byte_perm_S (w1[0], w0[3], selector); + w6[2] = __byte_perm_S (w0[3], w0[2], selector); + w6[1] = __byte_perm_S (w0[2], w0[1], selector); + w6[0] = __byte_perm_S (w0[1], w0[0], selector); + w5[3] = __byte_perm_S (w0[0], 0, selector); + w5[2] = 0; + w5[1] = 0; + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; @@ -11295,33 +14134,34 @@ inline void switch_buffer_by_offset_8x4_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u w0[2] = 0; w0[1] = 0; w0[0] = 0; + break; - case 8: - w7[3] = __byte_perm_S (w5[2], w5[3], selector); - w7[2] = __byte_perm_S (w5[1], w5[2], selector); - w7[1] = __byte_perm_S (w5[0], w5[1], selector); - w7[0] = __byte_perm_S (w4[3], w5[0], selector); - w6[3] = __byte_perm_S (w4[2], w4[3], selector); - w6[2] = __byte_perm_S (w4[1], w4[2], selector); - w6[1] = __byte_perm_S (w4[0], w4[1], selector); - w6[0] = __byte_perm_S (w3[3], w4[0], selector); - w5[3] = __byte_perm_S (w3[2], w3[3], selector); - w5[2] = __byte_perm_S (w3[1], w3[2], selector); - w5[1] = __byte_perm_S (w3[0], w3[1], selector); - w5[0] = __byte_perm_S (w2[3], w3[0], selector); - w4[3] = __byte_perm_S (w2[2], w2[3], selector); - w4[2] = __byte_perm_S (w2[1], w2[2], selector); - w4[1] = __byte_perm_S (w2[0], w2[1], selector); - w4[0] = __byte_perm_S (w1[3], w2[0], selector); - w3[3] = __byte_perm_S (w1[2], w1[3], selector); - w3[2] = __byte_perm_S (w1[1], w1[2], selector); - w3[1] = __byte_perm_S (w1[0], w1[1], selector); - w3[0] = __byte_perm_S (w0[3], w1[0], selector); - w2[3] = __byte_perm_S (w0[2], w0[3], selector); - w2[2] = __byte_perm_S (w0[1], w0[2], selector); - w2[1] = __byte_perm_S (w0[0], w0[1], selector); - w2[0] = __byte_perm_S ( 0, w0[0], selector); + case 24: + w7[3] = __byte_perm_S (w1[3], w1[2], selector); + w7[2] = __byte_perm_S (w1[2], w1[1], selector); + w7[1] = __byte_perm_S (w1[1], w1[0], selector); + w7[0] = __byte_perm_S (w1[0], w0[3], selector); + w6[3] = __byte_perm_S (w0[3], w0[2], selector); + w6[2] = __byte_perm_S (w0[2], w0[1], selector); + w6[1] = __byte_perm_S (w0[1], w0[0], selector); + w6[0] = __byte_perm_S (w0[0], 0, selector); + w5[3] = 0; + w5[2] = 0; + w5[1] = 0; + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; @@ -11330,32 +14170,33 @@ inline void switch_buffer_by_offset_8x4_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u w0[2] = 0; w0[1] = 0; w0[0] = 0; + break; - case 9: - w7[3] = __byte_perm_S (w5[1], w5[2], selector); - w7[2] = __byte_perm_S (w5[0], w5[1], selector); - w7[1] = __byte_perm_S (w4[3], w5[0], selector); - w7[0] = __byte_perm_S (w4[2], w4[3], selector); - w6[3] = __byte_perm_S (w4[1], w4[2], selector); - w6[2] = __byte_perm_S (w4[0], w4[1], selector); - w6[1] = __byte_perm_S (w3[3], w4[0], selector); - w6[0] = __byte_perm_S (w3[2], w3[3], selector); - w5[3] = __byte_perm_S (w3[1], w3[2], selector); - w5[2] = __byte_perm_S (w3[0], w3[1], selector); - w5[1] = __byte_perm_S (w2[3], w3[0], selector); - w5[0] = __byte_perm_S (w2[2], w2[3], selector); - w4[3] = __byte_perm_S (w2[1], w2[2], selector); - w4[2] = __byte_perm_S (w2[0], w2[1], selector); - w4[1] = __byte_perm_S (w1[3], w2[0], selector); - w4[0] = __byte_perm_S (w1[2], w1[3], selector); - w3[3] = __byte_perm_S (w1[1], w1[2], selector); - w3[2] = __byte_perm_S (w1[0], w1[1], selector); - w3[1] = __byte_perm_S (w0[3], w1[0], selector); - w3[0] = __byte_perm_S (w0[2], w0[3], selector); - w2[3] = __byte_perm_S (w0[1], w0[2], selector); - w2[2] = __byte_perm_S (w0[0], w0[1], selector); - w2[1] = __byte_perm_S ( 0, w0[0], selector); + case 25: + w7[3] = __byte_perm_S (w1[2], w1[1], selector); + w7[2] = __byte_perm_S (w1[1], w1[0], selector); + w7[1] = __byte_perm_S (w1[0], w0[3], selector); + w7[0] = __byte_perm_S (w0[3], w0[2], selector); + w6[3] = __byte_perm_S (w0[2], w0[1], selector); + w6[2] = __byte_perm_S (w0[1], w0[0], selector); + w6[1] = __byte_perm_S (w0[0], 0, selector); + w6[0] = 0; + w5[3] = 0; + w5[2] = 0; + w5[1] = 0; + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; @@ -11365,31 +14206,32 @@ inline void switch_buffer_by_offset_8x4_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u w0[2] = 0; w0[1] = 0; w0[0] = 0; + break; - case 10: - w7[3] = __byte_perm_S (w5[0], w5[1], selector); - w7[2] = __byte_perm_S (w4[3], w5[0], selector); - w7[1] = __byte_perm_S (w4[2], w4[3], selector); - w7[0] = __byte_perm_S (w4[1], w4[2], selector); - w6[3] = __byte_perm_S (w4[0], w4[1], selector); - w6[2] = __byte_perm_S (w3[3], w4[0], selector); - w6[1] = __byte_perm_S (w3[2], w3[3], selector); - w6[0] = __byte_perm_S (w3[1], w3[2], selector); - w5[3] = __byte_perm_S (w3[0], w3[1], selector); - w5[2] = __byte_perm_S (w2[3], w3[0], selector); - w5[1] = __byte_perm_S (w2[2], w2[3], selector); - w5[0] = __byte_perm_S (w2[1], w2[2], selector); - w4[3] = __byte_perm_S (w2[0], w2[1], selector); - w4[2] = __byte_perm_S (w1[3], w2[0], selector); - w4[1] = __byte_perm_S (w1[2], w1[3], selector); - w4[0] = __byte_perm_S (w1[1], w1[2], selector); - w3[3] = __byte_perm_S (w1[0], w1[1], selector); - w3[2] = __byte_perm_S (w0[3], w1[0], selector); - w3[1] = __byte_perm_S (w0[2], w0[3], selector); - w3[0] = __byte_perm_S (w0[1], w0[2], selector); - w2[3] = __byte_perm_S (w0[0], w0[1], selector); - w2[2] = __byte_perm_S ( 0, w0[0], selector); + case 26: + w7[3] = __byte_perm_S (w1[1], w1[0], selector); + w7[2] = __byte_perm_S (w1[0], w0[3], selector); + w7[1] = __byte_perm_S (w0[3], w0[2], selector); + w7[0] = __byte_perm_S (w0[2], w0[1], selector); + w6[3] = __byte_perm_S (w0[1], w0[0], selector); + w6[2] = __byte_perm_S (w0[0], 0, selector); + w6[1] = 0; + w6[0] = 0; + w5[3] = 0; + w5[2] = 0; + w5[1] = 0; + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; @@ -11400,30 +14242,31 @@ inline void switch_buffer_by_offset_8x4_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u w0[2] = 0; w0[1] = 0; w0[0] = 0; + break; - case 11: - w7[3] = __byte_perm_S (w4[3], w5[0], selector); - w7[2] = __byte_perm_S (w4[2], w4[3], selector); - w7[1] = __byte_perm_S (w4[1], w4[2], selector); - w7[0] = __byte_perm_S (w4[0], w4[1], selector); - w6[3] = __byte_perm_S (w3[3], w4[0], selector); - w6[2] = __byte_perm_S (w3[2], w3[3], selector); - w6[1] = __byte_perm_S (w3[1], w3[2], selector); - w6[0] = __byte_perm_S (w3[0], w3[1], selector); - w5[3] = __byte_perm_S (w2[3], w3[0], selector); - w5[2] = __byte_perm_S (w2[2], w2[3], selector); - w5[1] = __byte_perm_S (w2[1], w2[2], selector); - w5[0] = __byte_perm_S (w2[0], w2[1], selector); - w4[3] = __byte_perm_S (w1[3], w2[0], selector); - w4[2] = __byte_perm_S (w1[2], w1[3], selector); - w4[1] = __byte_perm_S (w1[1], w1[2], selector); - w4[0] = __byte_perm_S (w1[0], w1[1], selector); - w3[3] = __byte_perm_S (w0[3], w1[0], selector); - w3[2] = __byte_perm_S (w0[2], w0[3], selector); - w3[1] = __byte_perm_S (w0[1], w0[2], selector); - w3[0] = __byte_perm_S (w0[0], w0[1], selector); - w2[3] = __byte_perm_S ( 0, w0[0], selector); + case 27: + w7[3] = __byte_perm_S (w1[0], w0[3], selector); + w7[2] = __byte_perm_S (w0[3], w0[2], selector); + w7[1] = __byte_perm_S (w0[2], w0[1], selector); + w7[0] = __byte_perm_S (w0[1], w0[0], selector); + w6[3] = __byte_perm_S (w0[0], 0, selector); + w6[2] = 0; + w6[1] = 0; + w6[0] = 0; + w5[3] = 0; + w5[2] = 0; + w5[1] = 0; + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; @@ -11435,29 +14278,30 @@ inline void switch_buffer_by_offset_8x4_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u w0[2] = 0; w0[1] = 0; w0[0] = 0; + break; - case 12: - w7[3] = __byte_perm_S (w4[2], w4[3], selector); - w7[2] = __byte_perm_S (w4[1], w4[2], selector); - w7[1] = __byte_perm_S (w4[0], w4[1], selector); - w7[0] = __byte_perm_S (w3[3], w4[0], selector); - w6[3] = __byte_perm_S (w3[2], w3[3], selector); - w6[2] = __byte_perm_S (w3[1], w3[2], selector); - w6[1] = __byte_perm_S (w3[0], w3[1], selector); - w6[0] = __byte_perm_S (w2[3], w3[0], selector); - w5[3] = __byte_perm_S (w2[2], w2[3], selector); - w5[2] = __byte_perm_S (w2[1], w2[2], selector); - w5[1] = __byte_perm_S (w2[0], w2[1], selector); - w5[0] = __byte_perm_S (w1[3], w2[0], selector); - w4[3] = __byte_perm_S (w1[2], w1[3], selector); - w4[2] = __byte_perm_S (w1[1], w1[2], selector); - w4[1] = __byte_perm_S (w1[0], w1[1], selector); - w4[0] = __byte_perm_S (w0[3], w1[0], selector); - w3[3] = __byte_perm_S (w0[2], w0[3], selector); - w3[2] = __byte_perm_S (w0[1], w0[2], selector); - w3[1] = __byte_perm_S (w0[0], w0[1], selector); - w3[0] = __byte_perm_S ( 0, w0[0], selector); + case 28: + w7[3] = __byte_perm_S (w0[3], w0[2], selector); + w7[2] = __byte_perm_S (w0[2], w0[1], selector); + w7[1] = __byte_perm_S (w0[1], w0[0], selector); + w7[0] = __byte_perm_S (w0[0], 0, selector); + w6[3] = 0; + w6[2] = 0; + w6[1] = 0; + w6[0] = 0; + w5[3] = 0; + w5[2] = 0; + w5[1] = 0; + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; @@ -11470,28 +14314,29 @@ inline void switch_buffer_by_offset_8x4_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u w0[2] = 0; w0[1] = 0; w0[0] = 0; + break; - case 13: - w7[3] = __byte_perm_S (w4[1], w4[2], selector); - w7[2] = __byte_perm_S (w4[0], w4[1], selector); - w7[1] = __byte_perm_S (w3[3], w4[0], selector); - w7[0] = __byte_perm_S (w3[2], w3[3], selector); - w6[3] = __byte_perm_S (w3[1], w3[2], selector); - w6[2] = __byte_perm_S (w3[0], w3[1], selector); - w6[1] = __byte_perm_S (w2[3], w3[0], selector); - w6[0] = __byte_perm_S (w2[2], w2[3], selector); - w5[3] = __byte_perm_S (w2[1], w2[2], selector); - w5[2] = __byte_perm_S (w2[0], w2[1], selector); - w5[1] = __byte_perm_S (w1[3], w2[0], selector); - w5[0] = __byte_perm_S (w1[2], w1[3], selector); - w4[3] = __byte_perm_S (w1[1], w1[2], selector); - w4[2] = __byte_perm_S (w1[0], w1[1], selector); - w4[1] = __byte_perm_S (w0[3], w1[0], selector); - w4[0] = __byte_perm_S (w0[2], w0[3], selector); - w3[3] = __byte_perm_S (w0[1], w0[2], selector); - w3[2] = __byte_perm_S (w0[0], w0[1], selector); - w3[1] = __byte_perm_S ( 0, w0[0], selector); + case 29: + w7[3] = __byte_perm_S (w0[2], w0[1], selector); + w7[2] = __byte_perm_S (w0[1], w0[0], selector); + w7[1] = __byte_perm_S (w0[0], 0, selector); + w7[0] = 0; + w6[3] = 0; + w6[2] = 0; + w6[1] = 0; + w6[0] = 0; + w5[3] = 0; + w5[2] = 0; + w5[1] = 0; + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; @@ -11505,27 +14350,28 @@ inline void switch_buffer_by_offset_8x4_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u w0[2] = 0; w0[1] = 0; w0[0] = 0; + break; - case 14: - w7[3] = __byte_perm_S (w4[0], w4[1], selector); - w7[2] = __byte_perm_S (w3[3], w4[0], selector); - w7[1] = __byte_perm_S (w3[2], w3[3], selector); - w7[0] = __byte_perm_S (w3[1], w3[2], selector); - w6[3] = __byte_perm_S (w3[0], w3[1], selector); - w6[2] = __byte_perm_S (w2[3], w3[0], selector); - w6[1] = __byte_perm_S (w2[2], w2[3], selector); - w6[0] = __byte_perm_S (w2[1], w2[2], selector); - w5[3] = __byte_perm_S (w2[0], w2[1], selector); - w5[2] = __byte_perm_S (w1[3], w2[0], selector); - w5[1] = __byte_perm_S (w1[2], w1[3], selector); - w5[0] = __byte_perm_S (w1[1], w1[2], selector); - w4[3] = __byte_perm_S (w1[0], w1[1], selector); - w4[2] = __byte_perm_S (w0[3], w1[0], selector); - w4[1] = __byte_perm_S (w0[2], w0[3], selector); - w4[0] = __byte_perm_S (w0[1], w0[2], selector); - w3[3] = __byte_perm_S (w0[0], w0[1], selector); - w3[2] = __byte_perm_S ( 0, w0[0], selector); + case 30: + w7[3] = __byte_perm_S (w0[1], w0[0], selector); + w7[2] = __byte_perm_S (w0[0], 0, selector); + w7[1] = 0; + w7[0] = 0; + w6[3] = 0; + w6[2] = 0; + w6[1] = 0; + w6[0] = 0; + w5[3] = 0; + w5[2] = 0; + w5[1] = 0; + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; @@ -11540,26 +14386,27 @@ inline void switch_buffer_by_offset_8x4_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u w0[2] = 0; w0[1] = 0; w0[0] = 0; + break; - case 15: - w7[3] = __byte_perm_S (w3[3], w4[0], selector); - w7[2] = __byte_perm_S (w3[2], w3[3], selector); - w7[1] = __byte_perm_S (w3[1], w3[2], selector); - w7[0] = __byte_perm_S (w3[0], w3[1], selector); - w6[3] = __byte_perm_S (w2[3], w3[0], selector); - w6[2] = __byte_perm_S (w2[2], w2[3], selector); - w6[1] = __byte_perm_S (w2[1], w2[2], selector); - w6[0] = __byte_perm_S (w2[0], w2[1], selector); - w5[3] = __byte_perm_S (w1[3], w2[0], selector); - w5[2] = __byte_perm_S (w1[2], w1[3], selector); - w5[1] = __byte_perm_S (w1[1], w1[2], selector); - w5[0] = __byte_perm_S (w1[0], w1[1], selector); - w4[3] = __byte_perm_S (w0[3], w1[0], selector); - w4[2] = __byte_perm_S (w0[2], w0[3], selector); - w4[1] = __byte_perm_S (w0[1], w0[2], selector); - w4[0] = __byte_perm_S (w0[0], w0[1], selector); - w3[3] = __byte_perm_S ( 0, w0[0], selector); + case 31: + w7[3] = __byte_perm_S (w0[0], 0, selector); + w7[2] = 0; + w7[1] = 0; + w7[0] = 0; + w6[3] = 0; + w6[2] = 0; + w6[1] = 0; + w6[0] = 0; + w5[3] = 0; + w5[2] = 0; + w5[1] = 0; + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; w3[2] = 0; w3[1] = 0; w3[0] = 0; @@ -11575,17 +14422,19 @@ inline void switch_buffer_by_offset_8x4_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u w0[2] = 0; w0[1] = 0; w0[0] = 0; + break; } #endif } -inline void switch_buffer_by_offset_8x4_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 w4[4], u32 w5[4], u32 w6[4], u32 w7[4], const u32 offset) +inline void switch_buffer_by_offset_8x4_carry_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 w4[4], u32 w5[4], u32 w6[4], u32 w7[4], const u32 offset) { #if defined IS_AMD || defined IS_GENERIC switch (offset / 4) { case 0: + c0[0] = amd_bytealign_S (w7[3], 0, offset); w7[3] = amd_bytealign_S (w7[2], w7[3], offset); w7[2] = amd_bytealign_S (w7[1], w7[2], offset); w7[1] = amd_bytealign_S (w7[0], w7[1], offset); @@ -11622,6 +14471,8 @@ inline void switch_buffer_by_offset_8x4_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u break; case 1: + c0[1] = amd_bytealign_S (w7[3], 0, offset); + c0[0] = amd_bytealign_S (w7[2], w7[3], offset); w7[3] = amd_bytealign_S (w7[1], w7[2], offset); w7[2] = amd_bytealign_S (w7[0], w7[1], offset); w7[1] = amd_bytealign_S (w6[3], w7[0], offset); @@ -11658,6 +14509,9 @@ inline void switch_buffer_by_offset_8x4_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u break; case 2: + c0[2] = amd_bytealign_S (w7[3], 0, offset); + c0[1] = amd_bytealign_S (w7[2], w7[3], offset); + c0[0] = amd_bytealign_S (w7[1], w7[2], offset); w7[3] = amd_bytealign_S (w7[0], w7[1], offset); w7[2] = amd_bytealign_S (w6[3], w7[0], offset); w7[1] = amd_bytealign_S (w6[2], w6[3], offset); @@ -11694,6 +14548,10 @@ inline void switch_buffer_by_offset_8x4_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u break; case 3: + c0[3] = amd_bytealign_S (w7[3], 0, offset); + c0[2] = amd_bytealign_S (w7[2], w7[3], offset); + c0[1] = amd_bytealign_S (w7[1], w7[2], offset); + c0[0] = amd_bytealign_S (w7[0], w7[1], offset); w7[3] = amd_bytealign_S (w6[3], w7[0], offset); w7[2] = amd_bytealign_S (w6[2], w6[3], offset); w7[1] = amd_bytealign_S (w6[1], w6[2], offset); @@ -11730,6 +14588,11 @@ inline void switch_buffer_by_offset_8x4_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u break; case 4: + c1[0] = amd_bytealign_S (w7[3], 0, offset); + c0[3] = amd_bytealign_S (w7[2], w7[3], offset); + c0[2] = amd_bytealign_S (w7[1], w7[2], offset); + c0[1] = amd_bytealign_S (w7[0], w7[1], offset); + c0[0] = amd_bytealign_S (w6[3], w7[0], offset); w7[3] = amd_bytealign_S (w6[2], w6[3], offset); w7[2] = amd_bytealign_S (w6[1], w6[2], offset); w7[1] = amd_bytealign_S (w6[0], w6[1], offset); @@ -11766,6 +14629,12 @@ inline void switch_buffer_by_offset_8x4_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u break; case 5: + c1[1] = amd_bytealign_S (w7[3], 0, offset); + c1[0] = amd_bytealign_S (w7[2], w7[3], offset); + c0[3] = amd_bytealign_S (w7[1], w7[2], offset); + c0[2] = amd_bytealign_S (w7[0], w7[1], offset); + c0[1] = amd_bytealign_S (w6[3], w7[0], offset); + c0[0] = amd_bytealign_S (w6[2], w6[3], offset); w7[3] = amd_bytealign_S (w6[1], w6[2], offset); w7[2] = amd_bytealign_S (w6[0], w6[1], offset); w7[1] = amd_bytealign_S (w5[3], w6[0], offset); @@ -11802,6 +14671,13 @@ inline void switch_buffer_by_offset_8x4_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u break; case 6: + c1[2] = amd_bytealign_S (w7[3], 0, offset); + c1[1] = amd_bytealign_S (w7[2], w7[3], offset); + c1[0] = amd_bytealign_S (w7[1], w7[2], offset); + c0[3] = amd_bytealign_S (w7[0], w7[1], offset); + c0[2] = amd_bytealign_S (w6[3], w7[0], offset); + c0[1] = amd_bytealign_S (w6[2], w6[3], offset); + c0[0] = amd_bytealign_S (w6[1], w6[2], offset); w7[3] = amd_bytealign_S (w6[0], w6[1], offset); w7[2] = amd_bytealign_S (w5[3], w6[0], offset); w7[1] = amd_bytealign_S (w5[2], w5[3], offset); @@ -11838,6 +14714,14 @@ inline void switch_buffer_by_offset_8x4_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u break; case 7: + c1[3] = amd_bytealign_S (w7[3], 0, offset); + c1[2] = amd_bytealign_S (w7[2], w7[3], offset); + c1[1] = amd_bytealign_S (w7[1], w7[2], offset); + c1[0] = amd_bytealign_S (w7[0], w7[1], offset); + c0[3] = amd_bytealign_S (w6[3], w7[0], offset); + c0[2] = amd_bytealign_S (w6[2], w6[3], offset); + c0[1] = amd_bytealign_S (w6[1], w6[2], offset); + c0[0] = amd_bytealign_S (w6[0], w6[1], offset); w7[3] = amd_bytealign_S (w5[3], w6[0], offset); w7[2] = amd_bytealign_S (w5[2], w5[3], offset); w7[1] = amd_bytealign_S (w5[1], w5[2], offset); @@ -11874,6 +14758,15 @@ inline void switch_buffer_by_offset_8x4_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u break; case 8: + c2[0] = amd_bytealign_S (w7[3], 0, offset); + c1[3] = amd_bytealign_S (w7[2], w7[3], offset); + c1[2] = amd_bytealign_S (w7[1], w7[2], offset); + c1[1] = amd_bytealign_S (w7[0], w7[1], offset); + c1[0] = amd_bytealign_S (w6[3], w7[0], offset); + c0[3] = amd_bytealign_S (w6[2], w6[3], offset); + c0[2] = amd_bytealign_S (w6[1], w6[2], offset); + c0[1] = amd_bytealign_S (w6[0], w6[1], offset); + c0[0] = amd_bytealign_S (w5[3], w6[0], offset); w7[3] = amd_bytealign_S (w5[2], w5[3], offset); w7[2] = amd_bytealign_S (w5[1], w5[2], offset); w7[1] = amd_bytealign_S (w5[0], w5[1], offset); @@ -11910,6 +14803,16 @@ inline void switch_buffer_by_offset_8x4_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u break; case 9: + c2[1] = amd_bytealign_S (w7[3], 0, offset); + c2[0] = amd_bytealign_S (w7[2], w7[3], offset); + c1[3] = amd_bytealign_S (w7[1], w7[2], offset); + c1[2] = amd_bytealign_S (w7[0], w7[1], offset); + c1[1] = amd_bytealign_S (w6[3], w7[0], offset); + c1[0] = amd_bytealign_S (w6[2], w6[3], offset); + c0[3] = amd_bytealign_S (w6[1], w6[2], offset); + c0[2] = amd_bytealign_S (w6[0], w6[1], offset); + c0[1] = amd_bytealign_S (w5[3], w6[0], offset); + c0[0] = amd_bytealign_S (w5[2], w5[3], offset); w7[3] = amd_bytealign_S (w5[1], w5[2], offset); w7[2] = amd_bytealign_S (w5[0], w5[1], offset); w7[1] = amd_bytealign_S (w4[3], w5[0], offset); @@ -11946,6 +14849,17 @@ inline void switch_buffer_by_offset_8x4_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u break; case 10: + c2[2] = amd_bytealign_S (w7[3], 0, offset); + c2[1] = amd_bytealign_S (w7[2], w7[3], offset); + c2[0] = amd_bytealign_S (w7[1], w7[2], offset); + c1[3] = amd_bytealign_S (w7[0], w7[1], offset); + c1[2] = amd_bytealign_S (w6[3], w7[0], offset); + c1[1] = amd_bytealign_S (w6[2], w6[3], offset); + c1[0] = amd_bytealign_S (w6[1], w6[2], offset); + c0[3] = amd_bytealign_S (w6[0], w6[1], offset); + c0[2] = amd_bytealign_S (w5[3], w6[0], offset); + c0[1] = amd_bytealign_S (w5[2], w5[3], offset); + c0[0] = amd_bytealign_S (w5[1], w5[2], offset); w7[3] = amd_bytealign_S (w5[0], w5[1], offset); w7[2] = amd_bytealign_S (w4[3], w5[0], offset); w7[1] = amd_bytealign_S (w4[2], w4[3], offset); @@ -11982,6 +14896,18 @@ inline void switch_buffer_by_offset_8x4_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u break; case 11: + c2[3] = amd_bytealign_S (w7[3], 0, offset); + c2[2] = amd_bytealign_S (w7[2], w7[3], offset); + c2[1] = amd_bytealign_S (w7[1], w7[2], offset); + c2[0] = amd_bytealign_S (w7[0], w7[1], offset); + c1[3] = amd_bytealign_S (w6[3], w7[0], offset); + c1[2] = amd_bytealign_S (w6[2], w6[3], offset); + c1[1] = amd_bytealign_S (w6[1], w6[2], offset); + c1[0] = amd_bytealign_S (w6[0], w6[1], offset); + c0[3] = amd_bytealign_S (w5[3], w6[0], offset); + c0[2] = amd_bytealign_S (w5[2], w5[3], offset); + c0[1] = amd_bytealign_S (w5[1], w5[2], offset); + c0[0] = amd_bytealign_S (w5[0], w5[1], offset); w7[3] = amd_bytealign_S (w4[3], w5[0], offset); w7[2] = amd_bytealign_S (w4[2], w4[3], offset); w7[1] = amd_bytealign_S (w4[1], w4[2], offset); @@ -12018,6 +14944,19 @@ inline void switch_buffer_by_offset_8x4_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u break; case 12: + c3[0] = amd_bytealign_S (w7[3], 0, offset); + c2[3] = amd_bytealign_S (w7[2], w7[3], offset); + c2[2] = amd_bytealign_S (w7[1], w7[2], offset); + c2[1] = amd_bytealign_S (w7[0], w7[1], offset); + c2[0] = amd_bytealign_S (w6[3], w7[0], offset); + c1[3] = amd_bytealign_S (w6[2], w6[3], offset); + c1[2] = amd_bytealign_S (w6[1], w6[2], offset); + c1[1] = amd_bytealign_S (w6[0], w6[1], offset); + c1[0] = amd_bytealign_S (w5[3], w6[0], offset); + c0[3] = amd_bytealign_S (w5[2], w5[3], offset); + c0[2] = amd_bytealign_S (w5[1], w5[2], offset); + c0[1] = amd_bytealign_S (w5[0], w5[1], offset); + c0[0] = amd_bytealign_S (w4[3], w5[0], offset); w7[3] = amd_bytealign_S (w4[2], w4[3], offset); w7[2] = amd_bytealign_S (w4[1], w4[2], offset); w7[1] = amd_bytealign_S (w4[0], w4[1], offset); @@ -12054,6 +14993,20 @@ inline void switch_buffer_by_offset_8x4_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u break; case 13: + c3[1] = amd_bytealign_S (w7[3], 0, offset); + c3[0] = amd_bytealign_S (w7[2], w7[3], offset); + c2[3] = amd_bytealign_S (w7[1], w7[2], offset); + c2[2] = amd_bytealign_S (w7[0], w7[1], offset); + c2[1] = amd_bytealign_S (w6[3], w7[0], offset); + c2[0] = amd_bytealign_S (w6[2], w6[3], offset); + c1[3] = amd_bytealign_S (w6[1], w6[2], offset); + c1[2] = amd_bytealign_S (w6[0], w6[1], offset); + c1[1] = amd_bytealign_S (w5[3], w6[0], offset); + c1[0] = amd_bytealign_S (w5[2], w5[3], offset); + c0[3] = amd_bytealign_S (w5[1], w5[2], offset); + c0[2] = amd_bytealign_S (w5[0], w5[1], offset); + c0[1] = amd_bytealign_S (w4[3], w5[0], offset); + c0[0] = amd_bytealign_S (w4[2], w4[3], offset); w7[3] = amd_bytealign_S (w4[1], w4[2], offset); w7[2] = amd_bytealign_S (w4[0], w4[1], offset); w7[1] = amd_bytealign_S (w3[3], w4[0], offset); @@ -12090,6 +15043,21 @@ inline void switch_buffer_by_offset_8x4_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u break; case 14: + c3[2] = amd_bytealign_S (w7[3], 0, offset); + c3[1] = amd_bytealign_S (w7[2], w7[3], offset); + c3[0] = amd_bytealign_S (w7[1], w7[2], offset); + c2[3] = amd_bytealign_S (w7[0], w7[1], offset); + c2[2] = amd_bytealign_S (w6[3], w7[0], offset); + c2[1] = amd_bytealign_S (w6[2], w6[3], offset); + c2[0] = amd_bytealign_S (w6[1], w6[2], offset); + c1[3] = amd_bytealign_S (w6[0], w6[1], offset); + c1[2] = amd_bytealign_S (w5[3], w6[0], offset); + c1[1] = amd_bytealign_S (w5[2], w5[3], offset); + c1[0] = amd_bytealign_S (w5[1], w5[2], offset); + c0[3] = amd_bytealign_S (w5[0], w5[1], offset); + c0[2] = amd_bytealign_S (w4[3], w5[0], offset); + c0[1] = amd_bytealign_S (w4[2], w4[3], offset); + c0[0] = amd_bytealign_S (w4[1], w4[2], offset); w7[3] = amd_bytealign_S (w4[0], w4[1], offset); w7[2] = amd_bytealign_S (w3[3], w4[0], offset); w7[1] = amd_bytealign_S (w3[2], w3[3], offset); @@ -12126,6 +15094,22 @@ inline void switch_buffer_by_offset_8x4_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u break; case 15: + c3[3] = amd_bytealign_S (w7[3], 0, offset); + c3[2] = amd_bytealign_S (w7[2], w7[3], offset); + c3[1] = amd_bytealign_S (w7[1], w7[2], offset); + c3[0] = amd_bytealign_S (w7[0], w7[1], offset); + c2[3] = amd_bytealign_S (w6[3], w7[0], offset); + c2[2] = amd_bytealign_S (w6[2], w6[3], offset); + c2[1] = amd_bytealign_S (w6[1], w6[2], offset); + c2[0] = amd_bytealign_S (w6[0], w6[1], offset); + c1[3] = amd_bytealign_S (w5[3], w6[0], offset); + c1[2] = amd_bytealign_S (w5[2], w5[3], offset); + c1[1] = amd_bytealign_S (w5[1], w5[2], offset); + c1[0] = amd_bytealign_S (w5[0], w5[1], offset); + c0[3] = amd_bytealign_S (w4[3], w5[0], offset); + c0[2] = amd_bytealign_S (w4[2], w4[3], offset); + c0[1] = amd_bytealign_S (w4[1], w4[2], offset); + c0[0] = amd_bytealign_S (w4[0], w4[1], offset); w7[3] = amd_bytealign_S (w3[3], w4[0], offset); w7[2] = amd_bytealign_S (w3[2], w3[3], offset); w7[1] = amd_bytealign_S (w3[1], w3[2], offset); @@ -12162,6 +15146,23 @@ inline void switch_buffer_by_offset_8x4_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u break; case 16: + c4[0] = amd_bytealign_S (w7[3], 0, offset); + c3[3] = amd_bytealign_S (w7[2], w7[3], offset); + c3[2] = amd_bytealign_S (w7[1], w7[2], offset); + c3[1] = amd_bytealign_S (w7[0], w7[1], offset); + c3[0] = amd_bytealign_S (w6[3], w7[0], offset); + c2[3] = amd_bytealign_S (w6[2], w6[3], offset); + c2[2] = amd_bytealign_S (w6[1], w6[2], offset); + c2[1] = amd_bytealign_S (w6[0], w6[1], offset); + c2[0] = amd_bytealign_S (w5[3], w6[0], offset); + c1[3] = amd_bytealign_S (w5[2], w5[3], offset); + c1[2] = amd_bytealign_S (w5[1], w5[2], offset); + c1[1] = amd_bytealign_S (w5[0], w5[1], offset); + c1[0] = amd_bytealign_S (w4[3], w5[0], offset); + c0[3] = amd_bytealign_S (w4[2], w4[3], offset); + c0[2] = amd_bytealign_S (w4[1], w4[2], offset); + c0[1] = amd_bytealign_S (w4[0], w4[1], offset); + c0[0] = amd_bytealign_S (w3[3], w4[0], offset); w7[3] = amd_bytealign_S (w3[2], w3[3], offset); w7[2] = amd_bytealign_S (w3[1], w3[2], offset); w7[1] = amd_bytealign_S (w3[0], w3[1], offset); @@ -12198,6 +15199,24 @@ inline void switch_buffer_by_offset_8x4_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u break; case 17: + c4[1] = amd_bytealign_S (w7[3], 0, offset); + c4[0] = amd_bytealign_S (w7[2], w7[3], offset); + c3[3] = amd_bytealign_S (w7[1], w7[2], offset); + c3[2] = amd_bytealign_S (w7[0], w7[1], offset); + c3[1] = amd_bytealign_S (w6[3], w7[0], offset); + c3[0] = amd_bytealign_S (w6[2], w6[3], offset); + c2[3] = amd_bytealign_S (w6[1], w6[2], offset); + c2[2] = amd_bytealign_S (w6[0], w6[1], offset); + c2[1] = amd_bytealign_S (w5[3], w6[0], offset); + c2[0] = amd_bytealign_S (w5[2], w5[3], offset); + c1[3] = amd_bytealign_S (w5[1], w5[2], offset); + c1[2] = amd_bytealign_S (w5[0], w5[1], offset); + c1[1] = amd_bytealign_S (w4[3], w5[0], offset); + c1[0] = amd_bytealign_S (w4[2], w4[3], offset); + c0[3] = amd_bytealign_S (w4[1], w4[2], offset); + c0[2] = amd_bytealign_S (w4[0], w4[1], offset); + c0[1] = amd_bytealign_S (w3[3], w4[0], offset); + c0[0] = amd_bytealign_S (w3[2], w3[3], offset); w7[3] = amd_bytealign_S (w3[1], w3[2], offset); w7[2] = amd_bytealign_S (w3[0], w3[1], offset); w7[1] = amd_bytealign_S (w2[3], w3[0], offset); @@ -12234,6 +15253,25 @@ inline void switch_buffer_by_offset_8x4_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u break; case 18: + c4[2] = amd_bytealign_S (w7[3], 0, offset); + c4[1] = amd_bytealign_S (w7[2], w7[3], offset); + c4[0] = amd_bytealign_S (w7[1], w7[2], offset); + c3[3] = amd_bytealign_S (w7[0], w7[1], offset); + c3[2] = amd_bytealign_S (w6[3], w7[0], offset); + c3[1] = amd_bytealign_S (w6[2], w6[3], offset); + c3[0] = amd_bytealign_S (w6[1], w6[2], offset); + c2[3] = amd_bytealign_S (w6[0], w6[1], offset); + c2[2] = amd_bytealign_S (w5[3], w6[0], offset); + c2[1] = amd_bytealign_S (w5[2], w5[3], offset); + c2[0] = amd_bytealign_S (w5[1], w5[2], offset); + c1[3] = amd_bytealign_S (w5[0], w5[1], offset); + c1[2] = amd_bytealign_S (w4[3], w5[0], offset); + c1[1] = amd_bytealign_S (w4[2], w4[3], offset); + c1[0] = amd_bytealign_S (w4[1], w4[2], offset); + c0[3] = amd_bytealign_S (w4[0], w4[1], offset); + c0[2] = amd_bytealign_S (w3[3], w4[0], offset); + c0[1] = amd_bytealign_S (w3[2], w3[3], offset); + c0[0] = amd_bytealign_S (w3[1], w3[2], offset); w7[3] = amd_bytealign_S (w3[0], w3[1], offset); w7[2] = amd_bytealign_S (w2[3], w3[0], offset); w7[1] = amd_bytealign_S (w2[2], w2[3], offset); @@ -12270,6 +15308,26 @@ inline void switch_buffer_by_offset_8x4_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u break; case 19: + c4[3] = amd_bytealign_S (w7[3], 0, offset); + c4[2] = amd_bytealign_S (w7[2], w7[3], offset); + c4[1] = amd_bytealign_S (w7[1], w7[2], offset); + c4[0] = amd_bytealign_S (w7[0], w7[1], offset); + c3[3] = amd_bytealign_S (w6[3], w7[0], offset); + c3[2] = amd_bytealign_S (w6[2], w6[3], offset); + c3[1] = amd_bytealign_S (w6[1], w6[2], offset); + c3[0] = amd_bytealign_S (w6[0], w6[1], offset); + c2[3] = amd_bytealign_S (w5[3], w6[0], offset); + c2[2] = amd_bytealign_S (w5[2], w5[3], offset); + c2[1] = amd_bytealign_S (w5[1], w5[2], offset); + c2[0] = amd_bytealign_S (w5[0], w5[1], offset); + c1[3] = amd_bytealign_S (w4[3], w5[0], offset); + c1[2] = amd_bytealign_S (w4[2], w4[3], offset); + c1[1] = amd_bytealign_S (w4[1], w4[2], offset); + c1[0] = amd_bytealign_S (w4[0], w4[1], offset); + c0[3] = amd_bytealign_S (w3[3], w4[0], offset); + c0[2] = amd_bytealign_S (w3[2], w3[3], offset); + c0[1] = amd_bytealign_S (w3[1], w3[2], offset); + c0[0] = amd_bytealign_S (w3[0], w3[1], offset); w7[3] = amd_bytealign_S (w2[3], w3[0], offset); w7[2] = amd_bytealign_S (w2[2], w2[3], offset); w7[1] = amd_bytealign_S (w2[1], w2[2], offset); @@ -12306,6 +15364,27 @@ inline void switch_buffer_by_offset_8x4_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u break; case 20: + c5[0] = amd_bytealign_S (w7[3], 0, offset); + c4[3] = amd_bytealign_S (w7[2], w7[3], offset); + c4[2] = amd_bytealign_S (w7[1], w7[2], offset); + c4[1] = amd_bytealign_S (w7[0], w7[1], offset); + c4[0] = amd_bytealign_S (w6[3], w7[0], offset); + c3[3] = amd_bytealign_S (w6[2], w6[3], offset); + c3[2] = amd_bytealign_S (w6[1], w6[2], offset); + c3[1] = amd_bytealign_S (w6[0], w6[1], offset); + c3[0] = amd_bytealign_S (w5[3], w6[0], offset); + c2[3] = amd_bytealign_S (w5[2], w5[3], offset); + c2[2] = amd_bytealign_S (w5[1], w5[2], offset); + c2[1] = amd_bytealign_S (w5[0], w5[1], offset); + c2[0] = amd_bytealign_S (w4[3], w5[0], offset); + c1[3] = amd_bytealign_S (w4[2], w4[3], offset); + c1[2] = amd_bytealign_S (w4[1], w4[2], offset); + c1[1] = amd_bytealign_S (w4[0], w4[1], offset); + c1[0] = amd_bytealign_S (w3[3], w4[0], offset); + c0[3] = amd_bytealign_S (w3[2], w3[3], offset); + c0[2] = amd_bytealign_S (w3[1], w3[2], offset); + c0[1] = amd_bytealign_S (w3[0], w3[1], offset); + c0[0] = amd_bytealign_S (w2[3], w3[0], offset); w7[3] = amd_bytealign_S (w2[2], w2[3], offset); w7[2] = amd_bytealign_S (w2[1], w2[2], offset); w7[1] = amd_bytealign_S (w2[0], w2[1], offset); @@ -12342,6 +15421,28 @@ inline void switch_buffer_by_offset_8x4_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u break; case 21: + c5[1] = amd_bytealign_S (w7[3], 0, offset); + c5[0] = amd_bytealign_S (w7[2], w7[3], offset); + c4[3] = amd_bytealign_S (w7[1], w7[2], offset); + c4[2] = amd_bytealign_S (w7[0], w7[1], offset); + c4[1] = amd_bytealign_S (w6[3], w7[0], offset); + c4[0] = amd_bytealign_S (w6[2], w6[3], offset); + c3[3] = amd_bytealign_S (w6[1], w6[2], offset); + c3[2] = amd_bytealign_S (w6[0], w6[1], offset); + c3[1] = amd_bytealign_S (w5[3], w6[0], offset); + c3[0] = amd_bytealign_S (w5[2], w5[3], offset); + c2[3] = amd_bytealign_S (w5[1], w5[2], offset); + c2[2] = amd_bytealign_S (w5[0], w5[1], offset); + c2[1] = amd_bytealign_S (w4[3], w5[0], offset); + c2[0] = amd_bytealign_S (w4[2], w4[3], offset); + c1[3] = amd_bytealign_S (w4[1], w4[2], offset); + c1[2] = amd_bytealign_S (w4[0], w4[1], offset); + c1[1] = amd_bytealign_S (w3[3], w4[0], offset); + c1[0] = amd_bytealign_S (w3[2], w3[3], offset); + c0[3] = amd_bytealign_S (w3[1], w3[2], offset); + c0[2] = amd_bytealign_S (w3[0], w3[1], offset); + c0[1] = amd_bytealign_S (w2[3], w3[0], offset); + c0[0] = amd_bytealign_S (w2[2], w2[3], offset); w7[3] = amd_bytealign_S (w2[1], w2[2], offset); w7[2] = amd_bytealign_S (w2[0], w2[1], offset); w7[1] = amd_bytealign_S (w1[3], w2[0], offset); @@ -12378,6 +15479,29 @@ inline void switch_buffer_by_offset_8x4_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u break; case 22: + c5[2] = amd_bytealign_S (w7[3], 0, offset); + c5[1] = amd_bytealign_S (w7[2], w7[3], offset); + c5[0] = amd_bytealign_S (w7[1], w7[2], offset); + c4[3] = amd_bytealign_S (w7[0], w7[1], offset); + c4[2] = amd_bytealign_S (w6[3], w7[0], offset); + c4[1] = amd_bytealign_S (w6[2], w6[3], offset); + c4[0] = amd_bytealign_S (w6[1], w6[2], offset); + c3[3] = amd_bytealign_S (w6[0], w6[1], offset); + c3[2] = amd_bytealign_S (w5[3], w6[0], offset); + c3[1] = amd_bytealign_S (w5[2], w5[3], offset); + c3[0] = amd_bytealign_S (w5[1], w5[2], offset); + c2[3] = amd_bytealign_S (w5[0], w5[1], offset); + c2[2] = amd_bytealign_S (w4[3], w5[0], offset); + c2[1] = amd_bytealign_S (w4[2], w4[3], offset); + c2[0] = amd_bytealign_S (w4[1], w4[2], offset); + c1[3] = amd_bytealign_S (w4[0], w4[1], offset); + c1[2] = amd_bytealign_S (w3[3], w4[0], offset); + c1[1] = amd_bytealign_S (w3[2], w3[3], offset); + c1[0] = amd_bytealign_S (w3[1], w3[2], offset); + c0[3] = amd_bytealign_S (w3[0], w3[1], offset); + c0[2] = amd_bytealign_S (w2[3], w3[0], offset); + c0[1] = amd_bytealign_S (w2[2], w2[3], offset); + c0[0] = amd_bytealign_S (w2[1], w2[2], offset); w7[3] = amd_bytealign_S (w2[0], w2[1], offset); w7[2] = amd_bytealign_S (w1[3], w2[0], offset); w7[1] = amd_bytealign_S (w1[2], w1[3], offset); @@ -12414,6 +15538,30 @@ inline void switch_buffer_by_offset_8x4_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u break; case 23: + c5[3] = amd_bytealign_S (w7[3], 0, offset); + c5[2] = amd_bytealign_S (w7[2], w7[3], offset); + c5[1] = amd_bytealign_S (w7[1], w7[2], offset); + c5[0] = amd_bytealign_S (w7[0], w7[1], offset); + c4[3] = amd_bytealign_S (w6[3], w7[0], offset); + c4[2] = amd_bytealign_S (w6[2], w6[3], offset); + c4[1] = amd_bytealign_S (w6[1], w6[2], offset); + c4[0] = amd_bytealign_S (w6[0], w6[1], offset); + c3[3] = amd_bytealign_S (w5[3], w6[0], offset); + c3[2] = amd_bytealign_S (w5[2], w5[3], offset); + c3[1] = amd_bytealign_S (w5[1], w5[2], offset); + c3[0] = amd_bytealign_S (w5[0], w5[1], offset); + c2[3] = amd_bytealign_S (w4[3], w5[0], offset); + c2[2] = amd_bytealign_S (w4[2], w4[3], offset); + c2[1] = amd_bytealign_S (w4[1], w4[2], offset); + c2[0] = amd_bytealign_S (w4[0], w4[1], offset); + c1[3] = amd_bytealign_S (w3[3], w4[0], offset); + c1[2] = amd_bytealign_S (w3[2], w3[3], offset); + c1[1] = amd_bytealign_S (w3[1], w3[2], offset); + c1[0] = amd_bytealign_S (w3[0], w3[1], offset); + c0[3] = amd_bytealign_S (w2[3], w3[0], offset); + c0[2] = amd_bytealign_S (w2[2], w2[3], offset); + c0[1] = amd_bytealign_S (w2[1], w2[2], offset); + c0[0] = amd_bytealign_S (w2[0], w2[1], offset); w7[3] = amd_bytealign_S (w1[3], w2[0], offset); w7[2] = amd_bytealign_S (w1[2], w1[3], offset); w7[1] = amd_bytealign_S (w1[1], w1[2], offset); @@ -12450,6 +15598,31 @@ inline void switch_buffer_by_offset_8x4_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u break; case 24: + c6[0] = amd_bytealign_S (w7[3], 0, offset); + c5[3] = amd_bytealign_S (w7[2], w7[3], offset); + c5[2] = amd_bytealign_S (w7[1], w7[2], offset); + c5[1] = amd_bytealign_S (w7[0], w7[1], offset); + c5[0] = amd_bytealign_S (w6[3], w7[0], offset); + c4[3] = amd_bytealign_S (w6[2], w6[3], offset); + c4[2] = amd_bytealign_S (w6[1], w6[2], offset); + c4[1] = amd_bytealign_S (w6[0], w6[1], offset); + c4[0] = amd_bytealign_S (w5[3], w6[0], offset); + c3[3] = amd_bytealign_S (w5[2], w5[3], offset); + c3[2] = amd_bytealign_S (w5[1], w5[2], offset); + c3[1] = amd_bytealign_S (w5[0], w5[1], offset); + c3[0] = amd_bytealign_S (w4[3], w5[0], offset); + c2[3] = amd_bytealign_S (w4[2], w4[3], offset); + c2[2] = amd_bytealign_S (w4[1], w4[2], offset); + c2[1] = amd_bytealign_S (w4[0], w4[1], offset); + c2[0] = amd_bytealign_S (w3[3], w4[0], offset); + c1[3] = amd_bytealign_S (w3[2], w3[3], offset); + c1[2] = amd_bytealign_S (w3[1], w3[2], offset); + c1[1] = amd_bytealign_S (w3[0], w3[1], offset); + c1[0] = amd_bytealign_S (w2[3], w3[0], offset); + c0[3] = amd_bytealign_S (w2[2], w2[3], offset); + c0[2] = amd_bytealign_S (w2[1], w2[2], offset); + c0[1] = amd_bytealign_S (w2[0], w2[1], offset); + c0[0] = amd_bytealign_S (w1[3], w2[0], offset); w7[3] = amd_bytealign_S (w1[2], w1[3], offset); w7[2] = amd_bytealign_S (w1[1], w1[2], offset); w7[1] = amd_bytealign_S (w1[0], w1[1], offset); @@ -12486,6 +15659,32 @@ inline void switch_buffer_by_offset_8x4_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u break; case 25: + c6[1] = amd_bytealign_S (w7[3], 0, offset); + c6[0] = amd_bytealign_S (w7[2], w7[3], offset); + c5[3] = amd_bytealign_S (w7[1], w7[2], offset); + c5[2] = amd_bytealign_S (w7[0], w7[1], offset); + c5[1] = amd_bytealign_S (w6[3], w7[0], offset); + c5[0] = amd_bytealign_S (w6[2], w6[3], offset); + c4[3] = amd_bytealign_S (w6[1], w6[2], offset); + c4[2] = amd_bytealign_S (w6[0], w6[1], offset); + c4[1] = amd_bytealign_S (w5[3], w6[0], offset); + c4[0] = amd_bytealign_S (w5[2], w5[3], offset); + c3[3] = amd_bytealign_S (w5[1], w5[2], offset); + c3[2] = amd_bytealign_S (w5[0], w5[1], offset); + c3[1] = amd_bytealign_S (w4[3], w5[0], offset); + c3[0] = amd_bytealign_S (w4[2], w4[3], offset); + c2[3] = amd_bytealign_S (w4[1], w4[2], offset); + c2[2] = amd_bytealign_S (w4[0], w4[1], offset); + c2[1] = amd_bytealign_S (w3[3], w4[0], offset); + c2[0] = amd_bytealign_S (w3[2], w3[3], offset); + c1[3] = amd_bytealign_S (w3[1], w3[2], offset); + c1[2] = amd_bytealign_S (w3[0], w3[1], offset); + c1[1] = amd_bytealign_S (w2[3], w3[0], offset); + c1[0] = amd_bytealign_S (w2[2], w2[3], offset); + c0[3] = amd_bytealign_S (w2[1], w2[2], offset); + c0[2] = amd_bytealign_S (w2[0], w2[1], offset); + c0[1] = amd_bytealign_S (w1[3], w2[0], offset); + c0[0] = amd_bytealign_S (w1[2], w1[3], offset); w7[3] = amd_bytealign_S (w1[1], w1[2], offset); w7[2] = amd_bytealign_S (w1[0], w1[1], offset); w7[1] = amd_bytealign_S (w0[3], w1[0], offset); @@ -12522,6 +15721,33 @@ inline void switch_buffer_by_offset_8x4_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u break; case 26: + c6[2] = amd_bytealign_S (w7[3], 0, offset); + c6[1] = amd_bytealign_S (w7[2], w7[3], offset); + c6[0] = amd_bytealign_S (w7[1], w7[2], offset); + c5[3] = amd_bytealign_S (w7[0], w7[1], offset); + c5[2] = amd_bytealign_S (w6[3], w7[0], offset); + c5[1] = amd_bytealign_S (w6[2], w6[3], offset); + c5[0] = amd_bytealign_S (w6[1], w6[2], offset); + c4[3] = amd_bytealign_S (w6[0], w6[1], offset); + c4[2] = amd_bytealign_S (w5[3], w6[0], offset); + c4[1] = amd_bytealign_S (w5[2], w5[3], offset); + c4[0] = amd_bytealign_S (w5[1], w5[2], offset); + c3[3] = amd_bytealign_S (w5[0], w5[1], offset); + c3[2] = amd_bytealign_S (w4[3], w5[0], offset); + c3[1] = amd_bytealign_S (w4[2], w4[3], offset); + c3[0] = amd_bytealign_S (w4[1], w4[2], offset); + c2[3] = amd_bytealign_S (w4[0], w4[1], offset); + c2[2] = amd_bytealign_S (w3[3], w4[0], offset); + c2[1] = amd_bytealign_S (w3[2], w3[3], offset); + c2[0] = amd_bytealign_S (w3[1], w3[2], offset); + c1[3] = amd_bytealign_S (w3[0], w3[1], offset); + c1[2] = amd_bytealign_S (w2[3], w3[0], offset); + c1[1] = amd_bytealign_S (w2[2], w2[3], offset); + c1[0] = amd_bytealign_S (w2[1], w2[2], offset); + c0[3] = amd_bytealign_S (w2[0], w2[1], offset); + c0[2] = amd_bytealign_S (w1[3], w2[0], offset); + c0[1] = amd_bytealign_S (w1[2], w1[3], offset); + c0[0] = amd_bytealign_S (w1[1], w1[2], offset); w7[3] = amd_bytealign_S (w1[0], w1[1], offset); w7[2] = amd_bytealign_S (w0[3], w1[0], offset); w7[1] = amd_bytealign_S (w0[2], w0[3], offset); @@ -12558,6 +15784,34 @@ inline void switch_buffer_by_offset_8x4_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u break; case 27: + c6[3] = amd_bytealign_S (w7[3], 0, offset); + c6[2] = amd_bytealign_S (w7[2], w7[3], offset); + c6[1] = amd_bytealign_S (w7[1], w7[2], offset); + c6[0] = amd_bytealign_S (w7[0], w7[1], offset); + c5[3] = amd_bytealign_S (w6[3], w7[0], offset); + c5[2] = amd_bytealign_S (w6[2], w6[3], offset); + c5[1] = amd_bytealign_S (w6[1], w6[2], offset); + c5[0] = amd_bytealign_S (w6[0], w6[1], offset); + c4[3] = amd_bytealign_S (w5[3], w6[0], offset); + c4[2] = amd_bytealign_S (w5[2], w5[3], offset); + c4[1] = amd_bytealign_S (w5[1], w5[2], offset); + c4[0] = amd_bytealign_S (w5[0], w5[1], offset); + c3[3] = amd_bytealign_S (w4[3], w5[0], offset); + c3[2] = amd_bytealign_S (w4[2], w4[3], offset); + c3[1] = amd_bytealign_S (w4[1], w4[2], offset); + c3[0] = amd_bytealign_S (w4[0], w4[1], offset); + c2[3] = amd_bytealign_S (w3[3], w4[0], offset); + c2[2] = amd_bytealign_S (w3[2], w3[3], offset); + c2[1] = amd_bytealign_S (w3[1], w3[2], offset); + c2[0] = amd_bytealign_S (w3[0], w3[1], offset); + c1[3] = amd_bytealign_S (w2[3], w3[0], offset); + c1[2] = amd_bytealign_S (w2[2], w2[3], offset); + c1[1] = amd_bytealign_S (w2[1], w2[2], offset); + c1[0] = amd_bytealign_S (w2[0], w2[1], offset); + c0[3] = amd_bytealign_S (w1[3], w2[0], offset); + c0[2] = amd_bytealign_S (w1[2], w1[3], offset); + c0[1] = amd_bytealign_S (w1[1], w1[2], offset); + c0[0] = amd_bytealign_S (w1[0], w1[1], offset); w7[3] = amd_bytealign_S (w0[3], w1[0], offset); w7[2] = amd_bytealign_S (w0[2], w0[3], offset); w7[1] = amd_bytealign_S (w0[1], w0[2], offset); @@ -12594,6 +15848,35 @@ inline void switch_buffer_by_offset_8x4_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u break; case 28: + c7[0] = amd_bytealign_S (w7[3], 0, offset); + c6[3] = amd_bytealign_S (w7[2], w7[3], offset); + c6[2] = amd_bytealign_S (w7[1], w7[2], offset); + c6[1] = amd_bytealign_S (w7[0], w7[1], offset); + c6[0] = amd_bytealign_S (w6[3], w7[0], offset); + c5[3] = amd_bytealign_S (w6[2], w6[3], offset); + c5[2] = amd_bytealign_S (w6[1], w6[2], offset); + c5[1] = amd_bytealign_S (w6[0], w6[1], offset); + c5[0] = amd_bytealign_S (w5[3], w6[0], offset); + c4[3] = amd_bytealign_S (w5[2], w5[3], offset); + c4[2] = amd_bytealign_S (w5[1], w5[2], offset); + c4[1] = amd_bytealign_S (w5[0], w5[1], offset); + c4[0] = amd_bytealign_S (w4[3], w5[0], offset); + c3[3] = amd_bytealign_S (w4[2], w4[3], offset); + c3[2] = amd_bytealign_S (w4[1], w4[2], offset); + c3[1] = amd_bytealign_S (w4[0], w4[1], offset); + c3[0] = amd_bytealign_S (w3[3], w4[0], offset); + c2[3] = amd_bytealign_S (w3[2], w3[3], offset); + c2[2] = amd_bytealign_S (w3[1], w3[2], offset); + c2[1] = amd_bytealign_S (w3[0], w3[1], offset); + c2[0] = amd_bytealign_S (w2[3], w3[0], offset); + c1[3] = amd_bytealign_S (w2[2], w2[3], offset); + c1[2] = amd_bytealign_S (w2[1], w2[2], offset); + c1[1] = amd_bytealign_S (w2[0], w2[1], offset); + c1[0] = amd_bytealign_S (w1[3], w2[0], offset); + c0[3] = amd_bytealign_S (w1[2], w1[3], offset); + c0[2] = amd_bytealign_S (w1[1], w1[2], offset); + c0[1] = amd_bytealign_S (w1[0], w1[1], offset); + c0[0] = amd_bytealign_S (w0[3], w1[0], offset); w7[3] = amd_bytealign_S (w0[2], w0[3], offset); w7[2] = amd_bytealign_S (w0[1], w0[2], offset); w7[1] = amd_bytealign_S (w0[0], w0[1], offset); @@ -12630,6 +15913,36 @@ inline void switch_buffer_by_offset_8x4_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u break; case 29: + c7[1] = amd_bytealign_S (w7[3], 0, offset); + c7[0] = amd_bytealign_S (w7[2], w7[3], offset); + c6[3] = amd_bytealign_S (w7[1], w7[2], offset); + c6[2] = amd_bytealign_S (w7[0], w7[1], offset); + c6[1] = amd_bytealign_S (w6[3], w7[0], offset); + c6[0] = amd_bytealign_S (w6[2], w6[3], offset); + c5[3] = amd_bytealign_S (w6[1], w6[2], offset); + c5[2] = amd_bytealign_S (w6[0], w6[1], offset); + c5[1] = amd_bytealign_S (w5[3], w6[0], offset); + c5[0] = amd_bytealign_S (w5[2], w5[3], offset); + c4[3] = amd_bytealign_S (w5[1], w5[2], offset); + c4[2] = amd_bytealign_S (w5[0], w5[1], offset); + c4[1] = amd_bytealign_S (w4[3], w5[0], offset); + c4[0] = amd_bytealign_S (w4[2], w4[3], offset); + c3[3] = amd_bytealign_S (w4[1], w4[2], offset); + c3[2] = amd_bytealign_S (w4[0], w4[1], offset); + c3[1] = amd_bytealign_S (w3[3], w4[0], offset); + c3[0] = amd_bytealign_S (w3[2], w3[3], offset); + c2[3] = amd_bytealign_S (w3[1], w3[2], offset); + c2[2] = amd_bytealign_S (w3[0], w3[1], offset); + c2[1] = amd_bytealign_S (w2[3], w3[0], offset); + c2[0] = amd_bytealign_S (w2[2], w2[3], offset); + c1[3] = amd_bytealign_S (w2[1], w2[2], offset); + c1[2] = amd_bytealign_S (w2[0], w2[1], offset); + c1[1] = amd_bytealign_S (w1[3], w2[0], offset); + c1[0] = amd_bytealign_S (w1[2], w1[3], offset); + c0[3] = amd_bytealign_S (w1[1], w1[2], offset); + c0[2] = amd_bytealign_S (w1[0], w1[1], offset); + c0[1] = amd_bytealign_S (w0[3], w1[0], offset); + c0[0] = amd_bytealign_S (w0[2], w0[3], offset); w7[3] = amd_bytealign_S (w0[1], w0[2], offset); w7[2] = amd_bytealign_S (w0[0], w0[1], offset); w7[1] = amd_bytealign_S ( 0, w0[0], offset); @@ -12666,6 +15979,37 @@ inline void switch_buffer_by_offset_8x4_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u break; case 30: + c7[2] = amd_bytealign_S (w7[3], 0, offset); + c7[1] = amd_bytealign_S (w7[2], w7[3], offset); + c7[0] = amd_bytealign_S (w7[1], w7[2], offset); + c6[3] = amd_bytealign_S (w7[0], w7[1], offset); + c6[2] = amd_bytealign_S (w6[3], w7[0], offset); + c6[1] = amd_bytealign_S (w6[2], w6[3], offset); + c6[0] = amd_bytealign_S (w6[1], w6[2], offset); + c5[3] = amd_bytealign_S (w6[0], w6[1], offset); + c5[2] = amd_bytealign_S (w5[3], w6[0], offset); + c5[1] = amd_bytealign_S (w5[2], w5[3], offset); + c5[0] = amd_bytealign_S (w5[1], w5[2], offset); + c4[3] = amd_bytealign_S (w5[0], w5[1], offset); + c4[2] = amd_bytealign_S (w4[3], w5[0], offset); + c4[1] = amd_bytealign_S (w4[2], w4[3], offset); + c4[0] = amd_bytealign_S (w4[1], w4[2], offset); + c3[3] = amd_bytealign_S (w4[0], w4[1], offset); + c3[2] = amd_bytealign_S (w3[3], w4[0], offset); + c3[1] = amd_bytealign_S (w3[2], w3[3], offset); + c3[0] = amd_bytealign_S (w3[1], w3[2], offset); + c2[3] = amd_bytealign_S (w3[0], w3[1], offset); + c2[2] = amd_bytealign_S (w2[3], w3[0], offset); + c2[1] = amd_bytealign_S (w2[2], w2[3], offset); + c2[0] = amd_bytealign_S (w2[1], w2[2], offset); + c1[3] = amd_bytealign_S (w2[0], w2[1], offset); + c1[2] = amd_bytealign_S (w1[3], w2[0], offset); + c1[1] = amd_bytealign_S (w1[2], w1[3], offset); + c1[0] = amd_bytealign_S (w1[1], w1[2], offset); + c0[3] = amd_bytealign_S (w1[0], w1[1], offset); + c0[2] = amd_bytealign_S (w0[3], w1[0], offset); + c0[1] = amd_bytealign_S (w0[2], w0[3], offset); + c0[0] = amd_bytealign_S (w0[1], w0[2], offset); w7[3] = amd_bytealign_S (w0[0], w0[1], offset); w7[2] = amd_bytealign_S ( 0, w0[0], offset); w7[1] = 0; @@ -12702,6 +16046,38 @@ inline void switch_buffer_by_offset_8x4_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u break; case 31: + c7[3] = amd_bytealign_S (w7[3], 0, offset); + c7[2] = amd_bytealign_S (w7[2], w7[3], offset); + c7[1] = amd_bytealign_S (w7[1], w7[2], offset); + c7[0] = amd_bytealign_S (w7[0], w7[1], offset); + c6[3] = amd_bytealign_S (w6[3], w7[0], offset); + c6[2] = amd_bytealign_S (w6[2], w6[3], offset); + c6[1] = amd_bytealign_S (w6[1], w6[2], offset); + c6[0] = amd_bytealign_S (w6[0], w6[1], offset); + c5[3] = amd_bytealign_S (w5[3], w6[0], offset); + c5[2] = amd_bytealign_S (w5[2], w5[3], offset); + c5[1] = amd_bytealign_S (w5[1], w5[2], offset); + c5[0] = amd_bytealign_S (w5[0], w5[1], offset); + c4[3] = amd_bytealign_S (w4[3], w5[0], offset); + c4[2] = amd_bytealign_S (w4[2], w4[3], offset); + c4[1] = amd_bytealign_S (w4[1], w4[2], offset); + c4[0] = amd_bytealign_S (w4[0], w4[1], offset); + c3[3] = amd_bytealign_S (w3[3], w4[0], offset); + c3[2] = amd_bytealign_S (w3[2], w3[3], offset); + c3[1] = amd_bytealign_S (w3[1], w3[2], offset); + c3[0] = amd_bytealign_S (w3[0], w3[1], offset); + c2[3] = amd_bytealign_S (w2[3], w3[0], offset); + c2[2] = amd_bytealign_S (w2[2], w2[3], offset); + c2[1] = amd_bytealign_S (w2[1], w2[2], offset); + c2[0] = amd_bytealign_S (w2[0], w2[1], offset); + c1[3] = amd_bytealign_S (w1[3], w2[0], offset); + c1[2] = amd_bytealign_S (w1[2], w1[3], offset); + c1[1] = amd_bytealign_S (w1[1], w1[2], offset); + c1[0] = amd_bytealign_S (w1[0], w1[1], offset); + c0[3] = amd_bytealign_S (w0[3], w1[0], offset); + c0[2] = amd_bytealign_S (w0[2], w0[3], offset); + c0[1] = amd_bytealign_S (w0[1], w0[2], offset); + c0[0] = amd_bytealign_S (w0[0], w0[1], offset); w7[3] = amd_bytealign_S ( 0, w0[0], offset); w7[2] = 0; w7[1] = 0; @@ -12745,6 +16121,7 @@ inline void switch_buffer_by_offset_8x4_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u switch (offset / 4) { case 0: + c0[0] = __byte_perm_S ( 0, w7[3], selector); w7[3] = __byte_perm_S (w7[3], w7[2], selector); w7[2] = __byte_perm_S (w7[2], w7[1], selector); w7[1] = __byte_perm_S (w7[1], w7[0], selector); @@ -12781,6 +16158,8 @@ inline void switch_buffer_by_offset_8x4_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u break; case 1: + c0[1] = __byte_perm_S ( 0, w7[3], selector); + c0[0] = __byte_perm_S (w7[3], w7[2], selector); w7[3] = __byte_perm_S (w7[2], w7[1], selector); w7[2] = __byte_perm_S (w7[1], w7[0], selector); w7[1] = __byte_perm_S (w7[0], w6[3], selector); @@ -12817,6 +16196,9 @@ inline void switch_buffer_by_offset_8x4_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u break; case 2: + c0[2] = __byte_perm_S ( 0, w7[3], selector); + c0[1] = __byte_perm_S (w7[3], w7[2], selector); + c0[0] = __byte_perm_S (w7[2], w7[1], selector); w7[3] = __byte_perm_S (w7[1], w7[0], selector); w7[2] = __byte_perm_S (w7[0], w6[3], selector); w7[1] = __byte_perm_S (w6[3], w6[2], selector); @@ -12853,6 +16235,10 @@ inline void switch_buffer_by_offset_8x4_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u break; case 3: + c0[3] = __byte_perm_S ( 0, w7[3], selector); + c0[2] = __byte_perm_S (w7[3], w7[2], selector); + c0[1] = __byte_perm_S (w7[2], w7[1], selector); + c0[0] = __byte_perm_S (w7[1], w7[0], selector); w7[3] = __byte_perm_S (w7[0], w6[3], selector); w7[2] = __byte_perm_S (w6[3], w6[2], selector); w7[1] = __byte_perm_S (w6[2], w6[1], selector); @@ -12889,6 +16275,11 @@ inline void switch_buffer_by_offset_8x4_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u break; case 4: + c1[0] = __byte_perm_S ( 0, w7[3], selector); + c0[3] = __byte_perm_S (w7[3], w7[2], selector); + c0[2] = __byte_perm_S (w7[2], w7[1], selector); + c0[1] = __byte_perm_S (w7[1], w7[0], selector); + c0[0] = __byte_perm_S (w7[0], w6[3], selector); w7[3] = __byte_perm_S (w6[3], w6[2], selector); w7[2] = __byte_perm_S (w6[2], w6[1], selector); w7[1] = __byte_perm_S (w6[1], w6[0], selector); @@ -12925,6 +16316,12 @@ inline void switch_buffer_by_offset_8x4_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u break; case 5: + c1[1] = __byte_perm_S ( 0, w7[3], selector); + c1[0] = __byte_perm_S (w7[3], w7[2], selector); + c0[3] = __byte_perm_S (w7[2], w7[1], selector); + c0[2] = __byte_perm_S (w7[1], w7[0], selector); + c0[1] = __byte_perm_S (w7[0], w6[3], selector); + c0[0] = __byte_perm_S (w6[3], w6[2], selector); w7[3] = __byte_perm_S (w6[2], w6[1], selector); w7[2] = __byte_perm_S (w6[1], w6[0], selector); w7[1] = __byte_perm_S (w6[0], w5[3], selector); @@ -12961,6 +16358,13 @@ inline void switch_buffer_by_offset_8x4_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u break; case 6: + c1[2] = __byte_perm_S ( 0, w7[3], selector); + c1[1] = __byte_perm_S (w7[3], w7[2], selector); + c1[0] = __byte_perm_S (w7[2], w7[1], selector); + c0[3] = __byte_perm_S (w7[1], w7[0], selector); + c0[2] = __byte_perm_S (w7[0], w6[3], selector); + c0[1] = __byte_perm_S (w6[3], w6[2], selector); + c0[0] = __byte_perm_S (w6[2], w6[1], selector); w7[3] = __byte_perm_S (w6[1], w6[0], selector); w7[2] = __byte_perm_S (w6[0], w5[3], selector); w7[1] = __byte_perm_S (w5[3], w5[2], selector); @@ -12997,6 +16401,14 @@ inline void switch_buffer_by_offset_8x4_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u break; case 7: + c1[3] = __byte_perm_S ( 0, w7[3], selector); + c1[2] = __byte_perm_S (w7[3], w7[2], selector); + c1[1] = __byte_perm_S (w7[2], w7[1], selector); + c1[0] = __byte_perm_S (w7[1], w7[0], selector); + c0[3] = __byte_perm_S (w7[0], w6[3], selector); + c0[2] = __byte_perm_S (w6[3], w6[2], selector); + c0[1] = __byte_perm_S (w6[2], w6[1], selector); + c0[0] = __byte_perm_S (w6[1], w6[0], selector); w7[3] = __byte_perm_S (w6[0], w5[3], selector); w7[2] = __byte_perm_S (w5[3], w5[2], selector); w7[1] = __byte_perm_S (w5[2], w5[1], selector); @@ -13033,6 +16445,15 @@ inline void switch_buffer_by_offset_8x4_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u break; case 8: + c2[0] = __byte_perm_S ( 0, w7[3], selector); + c1[3] = __byte_perm_S (w7[3], w7[2], selector); + c1[2] = __byte_perm_S (w7[2], w7[1], selector); + c1[1] = __byte_perm_S (w7[1], w7[0], selector); + c1[0] = __byte_perm_S (w7[0], w6[3], selector); + c0[3] = __byte_perm_S (w6[3], w6[2], selector); + c0[2] = __byte_perm_S (w6[2], w6[1], selector); + c0[1] = __byte_perm_S (w6[1], w6[0], selector); + c0[0] = __byte_perm_S (w6[0], w5[3], selector); w7[3] = __byte_perm_S (w5[3], w5[2], selector); w7[2] = __byte_perm_S (w5[2], w5[1], selector); w7[1] = __byte_perm_S (w5[1], w5[0], selector); @@ -13069,6 +16490,16 @@ inline void switch_buffer_by_offset_8x4_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u break; case 9: + c2[1] = __byte_perm_S ( 0, w7[3], selector); + c2[0] = __byte_perm_S (w7[3], w7[2], selector); + c1[3] = __byte_perm_S (w7[2], w7[1], selector); + c1[2] = __byte_perm_S (w7[1], w7[0], selector); + c1[1] = __byte_perm_S (w7[0], w6[3], selector); + c1[0] = __byte_perm_S (w6[3], w6[2], selector); + c0[3] = __byte_perm_S (w6[2], w6[1], selector); + c0[2] = __byte_perm_S (w6[1], w6[0], selector); + c0[1] = __byte_perm_S (w6[0], w5[3], selector); + c0[0] = __byte_perm_S (w5[3], w5[2], selector); w7[3] = __byte_perm_S (w5[2], w5[1], selector); w7[2] = __byte_perm_S (w5[1], w5[0], selector); w7[1] = __byte_perm_S (w5[0], w4[3], selector); @@ -13105,6 +16536,17 @@ inline void switch_buffer_by_offset_8x4_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u break; case 10: + c2[2] = __byte_perm_S ( 0, w7[3], selector); + c2[1] = __byte_perm_S (w7[3], w7[2], selector); + c2[0] = __byte_perm_S (w7[2], w7[1], selector); + c1[3] = __byte_perm_S (w7[1], w7[0], selector); + c1[2] = __byte_perm_S (w7[0], w6[3], selector); + c1[1] = __byte_perm_S (w6[3], w6[2], selector); + c1[0] = __byte_perm_S (w6[2], w6[1], selector); + c0[3] = __byte_perm_S (w6[1], w6[0], selector); + c0[2] = __byte_perm_S (w6[0], w5[3], selector); + c0[1] = __byte_perm_S (w5[3], w5[2], selector); + c0[0] = __byte_perm_S (w5[2], w5[1], selector); w7[3] = __byte_perm_S (w5[1], w5[0], selector); w7[2] = __byte_perm_S (w5[0], w4[3], selector); w7[1] = __byte_perm_S (w4[3], w4[2], selector); @@ -13141,6 +16583,18 @@ inline void switch_buffer_by_offset_8x4_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u break; case 11: + c2[3] = __byte_perm_S ( 0, w7[3], selector); + c2[2] = __byte_perm_S (w7[3], w7[2], selector); + c2[1] = __byte_perm_S (w7[2], w7[1], selector); + c2[0] = __byte_perm_S (w7[1], w7[0], selector); + c1[3] = __byte_perm_S (w7[0], w6[3], selector); + c1[2] = __byte_perm_S (w6[3], w6[2], selector); + c1[1] = __byte_perm_S (w6[2], w6[1], selector); + c1[0] = __byte_perm_S (w6[1], w6[0], selector); + c0[3] = __byte_perm_S (w6[0], w5[3], selector); + c0[2] = __byte_perm_S (w5[3], w5[2], selector); + c0[1] = __byte_perm_S (w5[2], w5[1], selector); + c0[0] = __byte_perm_S (w5[1], w5[0], selector); w7[3] = __byte_perm_S (w5[0], w4[3], selector); w7[2] = __byte_perm_S (w4[3], w4[2], selector); w7[1] = __byte_perm_S (w4[2], w4[1], selector); @@ -13177,6 +16631,19 @@ inline void switch_buffer_by_offset_8x4_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u break; case 12: + c3[0] = __byte_perm_S ( 0, w7[3], selector); + c2[3] = __byte_perm_S (w7[3], w7[2], selector); + c2[2] = __byte_perm_S (w7[2], w7[1], selector); + c2[1] = __byte_perm_S (w7[1], w7[0], selector); + c2[0] = __byte_perm_S (w7[0], w6[3], selector); + c1[3] = __byte_perm_S (w6[3], w6[2], selector); + c1[2] = __byte_perm_S (w6[2], w6[1], selector); + c1[1] = __byte_perm_S (w6[1], w6[0], selector); + c1[0] = __byte_perm_S (w6[0], w5[3], selector); + c0[3] = __byte_perm_S (w5[3], w5[2], selector); + c0[2] = __byte_perm_S (w5[2], w5[1], selector); + c0[1] = __byte_perm_S (w5[1], w5[0], selector); + c0[0] = __byte_perm_S (w5[0], w4[3], selector); w7[3] = __byte_perm_S (w4[3], w4[2], selector); w7[2] = __byte_perm_S (w4[2], w4[1], selector); w7[1] = __byte_perm_S (w4[1], w4[0], selector); @@ -13213,6 +16680,20 @@ inline void switch_buffer_by_offset_8x4_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u break; case 13: + c3[1] = __byte_perm_S ( 0, w7[3], selector); + c3[0] = __byte_perm_S (w7[3], w7[2], selector); + c2[3] = __byte_perm_S (w7[2], w7[1], selector); + c2[2] = __byte_perm_S (w7[1], w7[0], selector); + c2[1] = __byte_perm_S (w7[0], w6[3], selector); + c2[0] = __byte_perm_S (w6[3], w6[2], selector); + c1[3] = __byte_perm_S (w6[2], w6[1], selector); + c1[2] = __byte_perm_S (w6[1], w6[0], selector); + c1[1] = __byte_perm_S (w6[0], w5[3], selector); + c1[0] = __byte_perm_S (w5[3], w5[2], selector); + c0[3] = __byte_perm_S (w5[2], w5[1], selector); + c0[2] = __byte_perm_S (w5[1], w5[0], selector); + c0[1] = __byte_perm_S (w5[0], w4[3], selector); + c0[0] = __byte_perm_S (w4[3], w4[2], selector); w7[3] = __byte_perm_S (w4[2], w4[1], selector); w7[2] = __byte_perm_S (w4[1], w4[0], selector); w7[1] = __byte_perm_S (w4[0], w3[3], selector); @@ -13249,6 +16730,21 @@ inline void switch_buffer_by_offset_8x4_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u break; case 14: + c3[2] = __byte_perm_S ( 0, w7[3], selector); + c3[1] = __byte_perm_S (w7[3], w7[2], selector); + c3[0] = __byte_perm_S (w7[2], w7[1], selector); + c2[3] = __byte_perm_S (w7[1], w7[0], selector); + c2[2] = __byte_perm_S (w7[0], w6[3], selector); + c2[1] = __byte_perm_S (w6[3], w6[2], selector); + c2[0] = __byte_perm_S (w6[2], w6[1], selector); + c1[3] = __byte_perm_S (w6[1], w6[0], selector); + c1[2] = __byte_perm_S (w6[0], w5[3], selector); + c1[1] = __byte_perm_S (w5[3], w5[2], selector); + c1[0] = __byte_perm_S (w5[2], w5[1], selector); + c0[3] = __byte_perm_S (w5[1], w5[0], selector); + c0[2] = __byte_perm_S (w5[0], w4[3], selector); + c0[1] = __byte_perm_S (w4[3], w4[2], selector); + c0[0] = __byte_perm_S (w4[2], w4[1], selector); w7[3] = __byte_perm_S (w4[1], w4[0], selector); w7[2] = __byte_perm_S (w4[0], w3[3], selector); w7[1] = __byte_perm_S (w3[3], w3[2], selector); @@ -13285,6 +16781,22 @@ inline void switch_buffer_by_offset_8x4_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u break; case 15: + c3[3] = __byte_perm_S ( 0, w7[3], selector); + c3[2] = __byte_perm_S (w7[3], w7[2], selector); + c3[1] = __byte_perm_S (w7[2], w7[1], selector); + c3[0] = __byte_perm_S (w7[1], w7[0], selector); + c2[3] = __byte_perm_S (w7[0], w6[3], selector); + c2[2] = __byte_perm_S (w6[3], w6[2], selector); + c2[1] = __byte_perm_S (w6[2], w6[1], selector); + c2[0] = __byte_perm_S (w6[1], w6[0], selector); + c1[3] = __byte_perm_S (w6[0], w5[3], selector); + c1[2] = __byte_perm_S (w5[3], w5[2], selector); + c1[1] = __byte_perm_S (w5[2], w5[1], selector); + c1[0] = __byte_perm_S (w5[1], w5[0], selector); + c0[3] = __byte_perm_S (w5[0], w4[3], selector); + c0[2] = __byte_perm_S (w4[3], w4[2], selector); + c0[1] = __byte_perm_S (w4[2], w4[1], selector); + c0[0] = __byte_perm_S (w4[1], w4[0], selector); w7[3] = __byte_perm_S (w4[0], w3[3], selector); w7[2] = __byte_perm_S (w3[3], w3[2], selector); w7[1] = __byte_perm_S (w3[2], w3[1], selector); @@ -13321,6 +16833,23 @@ inline void switch_buffer_by_offset_8x4_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u break; case 16: + c4[0] = __byte_perm_S ( 0, w7[3], selector); + c3[3] = __byte_perm_S (w7[3], w7[2], selector); + c3[2] = __byte_perm_S (w7[2], w7[1], selector); + c3[1] = __byte_perm_S (w7[1], w7[0], selector); + c3[0] = __byte_perm_S (w7[0], w6[3], selector); + c2[3] = __byte_perm_S (w6[3], w6[2], selector); + c2[2] = __byte_perm_S (w6[2], w6[1], selector); + c2[1] = __byte_perm_S (w6[1], w6[0], selector); + c2[0] = __byte_perm_S (w6[0], w5[3], selector); + c1[3] = __byte_perm_S (w5[3], w5[2], selector); + c1[2] = __byte_perm_S (w5[2], w5[1], selector); + c1[1] = __byte_perm_S (w5[1], w5[0], selector); + c1[0] = __byte_perm_S (w5[0], w4[3], selector); + c0[3] = __byte_perm_S (w4[3], w4[2], selector); + c0[2] = __byte_perm_S (w4[2], w4[1], selector); + c0[1] = __byte_perm_S (w4[1], w4[0], selector); + c0[0] = __byte_perm_S (w4[0], w3[3], selector); w7[3] = __byte_perm_S (w3[3], w3[2], selector); w7[2] = __byte_perm_S (w3[2], w3[1], selector); w7[1] = __byte_perm_S (w3[1], w3[0], selector); @@ -13357,6 +16886,24 @@ inline void switch_buffer_by_offset_8x4_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u break; case 17: + c4[1] = __byte_perm_S ( 0, w7[3], selector); + c4[0] = __byte_perm_S (w7[3], w7[2], selector); + c3[3] = __byte_perm_S (w7[2], w7[1], selector); + c3[2] = __byte_perm_S (w7[1], w7[0], selector); + c3[1] = __byte_perm_S (w7[0], w6[3], selector); + c3[0] = __byte_perm_S (w6[3], w6[2], selector); + c2[3] = __byte_perm_S (w6[2], w6[1], selector); + c2[2] = __byte_perm_S (w6[1], w6[0], selector); + c2[1] = __byte_perm_S (w6[0], w5[3], selector); + c2[0] = __byte_perm_S (w5[3], w5[2], selector); + c1[3] = __byte_perm_S (w5[2], w5[1], selector); + c1[2] = __byte_perm_S (w5[1], w5[0], selector); + c1[1] = __byte_perm_S (w5[0], w4[3], selector); + c1[0] = __byte_perm_S (w4[3], w4[2], selector); + c0[3] = __byte_perm_S (w4[2], w4[1], selector); + c0[2] = __byte_perm_S (w4[1], w4[0], selector); + c0[1] = __byte_perm_S (w4[0], w3[3], selector); + c0[0] = __byte_perm_S (w3[3], w3[2], selector); w7[3] = __byte_perm_S (w3[2], w3[1], selector); w7[2] = __byte_perm_S (w3[1], w3[0], selector); w7[1] = __byte_perm_S (w3[0], w2[3], selector); @@ -13393,6 +16940,25 @@ inline void switch_buffer_by_offset_8x4_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u break; case 18: + c4[2] = __byte_perm_S ( 0, w7[3], selector); + c4[1] = __byte_perm_S (w7[3], w7[2], selector); + c4[0] = __byte_perm_S (w7[2], w7[1], selector); + c3[3] = __byte_perm_S (w7[1], w7[0], selector); + c3[2] = __byte_perm_S (w7[0], w6[3], selector); + c3[1] = __byte_perm_S (w6[3], w6[2], selector); + c3[0] = __byte_perm_S (w6[2], w6[1], selector); + c2[3] = __byte_perm_S (w6[1], w6[0], selector); + c2[2] = __byte_perm_S (w6[0], w5[3], selector); + c2[1] = __byte_perm_S (w5[3], w5[2], selector); + c2[0] = __byte_perm_S (w5[2], w5[1], selector); + c1[3] = __byte_perm_S (w5[1], w5[0], selector); + c1[2] = __byte_perm_S (w5[0], w4[3], selector); + c1[1] = __byte_perm_S (w4[3], w4[2], selector); + c1[0] = __byte_perm_S (w4[2], w4[1], selector); + c0[3] = __byte_perm_S (w4[1], w4[0], selector); + c0[2] = __byte_perm_S (w4[0], w3[3], selector); + c0[1] = __byte_perm_S (w3[3], w3[2], selector); + c0[0] = __byte_perm_S (w3[2], w3[1], selector); w7[3] = __byte_perm_S (w3[1], w3[0], selector); w7[2] = __byte_perm_S (w3[0], w2[3], selector); w7[1] = __byte_perm_S (w2[3], w2[2], selector); @@ -13429,6 +16995,26 @@ inline void switch_buffer_by_offset_8x4_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u break; case 19: + c4[3] = __byte_perm_S ( 0, w7[3], selector); + c4[2] = __byte_perm_S (w7[3], w7[2], selector); + c4[1] = __byte_perm_S (w7[2], w7[1], selector); + c4[0] = __byte_perm_S (w7[1], w7[0], selector); + c3[3] = __byte_perm_S (w7[0], w6[3], selector); + c3[2] = __byte_perm_S (w6[3], w6[2], selector); + c3[1] = __byte_perm_S (w6[2], w6[1], selector); + c3[0] = __byte_perm_S (w6[1], w6[0], selector); + c2[3] = __byte_perm_S (w6[0], w5[3], selector); + c2[2] = __byte_perm_S (w5[3], w5[2], selector); + c2[1] = __byte_perm_S (w5[2], w5[1], selector); + c2[0] = __byte_perm_S (w5[1], w5[0], selector); + c1[3] = __byte_perm_S (w5[0], w4[3], selector); + c1[2] = __byte_perm_S (w4[3], w4[2], selector); + c1[1] = __byte_perm_S (w4[2], w4[1], selector); + c1[0] = __byte_perm_S (w4[1], w4[0], selector); + c0[3] = __byte_perm_S (w4[0], w3[3], selector); + c0[2] = __byte_perm_S (w3[3], w3[2], selector); + c0[1] = __byte_perm_S (w3[2], w3[1], selector); + c0[0] = __byte_perm_S (w3[1], w3[0], selector); w7[3] = __byte_perm_S (w3[0], w2[3], selector); w7[2] = __byte_perm_S (w2[3], w2[2], selector); w7[1] = __byte_perm_S (w2[2], w2[1], selector); @@ -13465,6 +17051,27 @@ inline void switch_buffer_by_offset_8x4_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u break; case 20: + c5[0] = __byte_perm_S ( 0, w7[3], selector); + c4[3] = __byte_perm_S (w7[3], w7[2], selector); + c4[2] = __byte_perm_S (w7[2], w7[1], selector); + c4[1] = __byte_perm_S (w7[1], w7[0], selector); + c4[0] = __byte_perm_S (w7[0], w6[3], selector); + c3[3] = __byte_perm_S (w6[3], w6[2], selector); + c3[2] = __byte_perm_S (w6[2], w6[1], selector); + c3[1] = __byte_perm_S (w6[1], w6[0], selector); + c3[0] = __byte_perm_S (w6[0], w5[3], selector); + c2[3] = __byte_perm_S (w5[3], w5[2], selector); + c2[2] = __byte_perm_S (w5[2], w5[1], selector); + c2[1] = __byte_perm_S (w5[1], w5[0], selector); + c2[0] = __byte_perm_S (w5[0], w4[3], selector); + c1[3] = __byte_perm_S (w4[3], w4[2], selector); + c1[2] = __byte_perm_S (w4[2], w4[1], selector); + c1[1] = __byte_perm_S (w4[1], w4[0], selector); + c1[0] = __byte_perm_S (w4[0], w3[3], selector); + c0[3] = __byte_perm_S (w3[3], w3[2], selector); + c0[2] = __byte_perm_S (w3[2], w3[1], selector); + c0[1] = __byte_perm_S (w3[1], w3[0], selector); + c0[0] = __byte_perm_S (w3[0], w2[3], selector); w7[3] = __byte_perm_S (w2[3], w2[2], selector); w7[2] = __byte_perm_S (w2[2], w2[1], selector); w7[1] = __byte_perm_S (w2[1], w2[0], selector); @@ -13501,6 +17108,28 @@ inline void switch_buffer_by_offset_8x4_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u break; case 21: + c5[1] = __byte_perm_S ( 0, w7[3], selector); + c5[0] = __byte_perm_S (w7[3], w7[2], selector); + c4[3] = __byte_perm_S (w7[2], w7[1], selector); + c4[2] = __byte_perm_S (w7[1], w7[0], selector); + c4[1] = __byte_perm_S (w7[0], w6[3], selector); + c4[0] = __byte_perm_S (w6[3], w6[2], selector); + c3[3] = __byte_perm_S (w6[2], w6[1], selector); + c3[2] = __byte_perm_S (w6[1], w6[0], selector); + c3[1] = __byte_perm_S (w6[0], w5[3], selector); + c3[0] = __byte_perm_S (w5[3], w5[2], selector); + c2[3] = __byte_perm_S (w5[2], w5[1], selector); + c2[2] = __byte_perm_S (w5[1], w5[0], selector); + c2[1] = __byte_perm_S (w5[0], w4[3], selector); + c2[0] = __byte_perm_S (w4[3], w4[2], selector); + c1[3] = __byte_perm_S (w4[2], w4[1], selector); + c1[2] = __byte_perm_S (w4[1], w4[0], selector); + c1[1] = __byte_perm_S (w4[0], w3[3], selector); + c1[0] = __byte_perm_S (w3[3], w3[2], selector); + c0[3] = __byte_perm_S (w3[2], w3[1], selector); + c0[2] = __byte_perm_S (w3[1], w3[0], selector); + c0[1] = __byte_perm_S (w3[0], w2[3], selector); + c0[0] = __byte_perm_S (w2[3], w2[2], selector); w7[3] = __byte_perm_S (w2[2], w2[1], selector); w7[2] = __byte_perm_S (w2[1], w2[0], selector); w7[1] = __byte_perm_S (w2[0], w1[3], selector); @@ -13537,6 +17166,29 @@ inline void switch_buffer_by_offset_8x4_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u break; case 22: + c5[2] = __byte_perm_S ( 0, w7[3], selector); + c5[1] = __byte_perm_S (w7[3], w7[2], selector); + c5[0] = __byte_perm_S (w7[2], w7[1], selector); + c4[3] = __byte_perm_S (w7[1], w7[0], selector); + c4[2] = __byte_perm_S (w7[0], w6[3], selector); + c4[1] = __byte_perm_S (w6[3], w6[2], selector); + c4[0] = __byte_perm_S (w6[2], w6[1], selector); + c3[3] = __byte_perm_S (w6[1], w6[0], selector); + c3[2] = __byte_perm_S (w6[0], w5[3], selector); + c3[1] = __byte_perm_S (w5[3], w5[2], selector); + c3[0] = __byte_perm_S (w5[2], w5[1], selector); + c2[3] = __byte_perm_S (w5[1], w5[0], selector); + c2[2] = __byte_perm_S (w5[0], w4[3], selector); + c2[1] = __byte_perm_S (w4[3], w4[2], selector); + c2[0] = __byte_perm_S (w4[2], w4[1], selector); + c1[3] = __byte_perm_S (w4[1], w4[0], selector); + c1[2] = __byte_perm_S (w4[0], w3[3], selector); + c1[1] = __byte_perm_S (w3[3], w3[2], selector); + c1[0] = __byte_perm_S (w3[2], w3[1], selector); + c0[3] = __byte_perm_S (w3[1], w3[0], selector); + c0[2] = __byte_perm_S (w3[0], w2[3], selector); + c0[1] = __byte_perm_S (w2[3], w2[2], selector); + c0[0] = __byte_perm_S (w2[2], w2[1], selector); w7[3] = __byte_perm_S (w2[1], w2[0], selector); w7[2] = __byte_perm_S (w2[0], w1[3], selector); w7[1] = __byte_perm_S (w1[3], w1[2], selector); @@ -13573,6 +17225,30 @@ inline void switch_buffer_by_offset_8x4_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u break; case 23: + c5[3] = __byte_perm_S ( 0, w7[3], selector); + c5[2] = __byte_perm_S (w7[3], w7[2], selector); + c5[1] = __byte_perm_S (w7[2], w7[1], selector); + c5[0] = __byte_perm_S (w7[1], w7[0], selector); + c4[3] = __byte_perm_S (w7[0], w6[3], selector); + c4[2] = __byte_perm_S (w6[3], w6[2], selector); + c4[1] = __byte_perm_S (w6[2], w6[1], selector); + c4[0] = __byte_perm_S (w6[1], w6[0], selector); + c3[3] = __byte_perm_S (w6[0], w5[3], selector); + c3[2] = __byte_perm_S (w5[3], w5[2], selector); + c3[1] = __byte_perm_S (w5[2], w5[1], selector); + c3[0] = __byte_perm_S (w5[1], w5[0], selector); + c2[3] = __byte_perm_S (w5[0], w4[3], selector); + c2[2] = __byte_perm_S (w4[3], w4[2], selector); + c2[1] = __byte_perm_S (w4[2], w4[1], selector); + c2[0] = __byte_perm_S (w4[1], w4[0], selector); + c1[3] = __byte_perm_S (w4[0], w3[3], selector); + c1[2] = __byte_perm_S (w3[3], w3[2], selector); + c1[1] = __byte_perm_S (w3[2], w3[1], selector); + c1[0] = __byte_perm_S (w3[1], w3[0], selector); + c0[3] = __byte_perm_S (w3[0], w2[3], selector); + c0[2] = __byte_perm_S (w2[3], w2[2], selector); + c0[1] = __byte_perm_S (w2[2], w2[1], selector); + c0[0] = __byte_perm_S (w2[1], w2[0], selector); w7[3] = __byte_perm_S (w2[0], w1[3], selector); w7[2] = __byte_perm_S (w1[3], w1[2], selector); w7[1] = __byte_perm_S (w1[2], w1[1], selector); @@ -13609,6 +17285,31 @@ inline void switch_buffer_by_offset_8x4_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u break; case 24: + c6[0] = __byte_perm_S ( 0, w7[3], selector); + c5[3] = __byte_perm_S (w7[3], w7[2], selector); + c5[2] = __byte_perm_S (w7[2], w7[1], selector); + c5[1] = __byte_perm_S (w7[1], w7[0], selector); + c5[0] = __byte_perm_S (w7[0], w6[3], selector); + c4[3] = __byte_perm_S (w6[3], w6[2], selector); + c4[2] = __byte_perm_S (w6[2], w6[1], selector); + c4[1] = __byte_perm_S (w6[1], w6[0], selector); + c4[0] = __byte_perm_S (w6[0], w5[3], selector); + c3[3] = __byte_perm_S (w5[3], w5[2], selector); + c3[2] = __byte_perm_S (w5[2], w5[1], selector); + c3[1] = __byte_perm_S (w5[1], w5[0], selector); + c3[0] = __byte_perm_S (w5[0], w4[3], selector); + c2[3] = __byte_perm_S (w4[3], w4[2], selector); + c2[2] = __byte_perm_S (w4[2], w4[1], selector); + c2[1] = __byte_perm_S (w4[1], w4[0], selector); + c2[0] = __byte_perm_S (w4[0], w3[3], selector); + c1[3] = __byte_perm_S (w3[3], w3[2], selector); + c1[2] = __byte_perm_S (w3[2], w3[1], selector); + c1[1] = __byte_perm_S (w3[1], w3[0], selector); + c1[0] = __byte_perm_S (w3[0], w2[3], selector); + c0[3] = __byte_perm_S (w2[3], w2[2], selector); + c0[2] = __byte_perm_S (w2[2], w2[1], selector); + c0[1] = __byte_perm_S (w2[1], w2[0], selector); + c0[0] = __byte_perm_S (w2[0], w1[3], selector); w7[3] = __byte_perm_S (w1[3], w1[2], selector); w7[2] = __byte_perm_S (w1[2], w1[1], selector); w7[1] = __byte_perm_S (w1[1], w1[0], selector); @@ -13645,6 +17346,32 @@ inline void switch_buffer_by_offset_8x4_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u break; case 25: + c6[1] = __byte_perm_S ( 0, w7[3], selector); + c6[0] = __byte_perm_S (w7[3], w7[2], selector); + c5[3] = __byte_perm_S (w7[2], w7[1], selector); + c5[2] = __byte_perm_S (w7[1], w7[0], selector); + c5[1] = __byte_perm_S (w7[0], w6[3], selector); + c5[0] = __byte_perm_S (w6[3], w6[2], selector); + c4[3] = __byte_perm_S (w6[2], w6[1], selector); + c4[2] = __byte_perm_S (w6[1], w6[0], selector); + c4[1] = __byte_perm_S (w6[0], w5[3], selector); + c4[0] = __byte_perm_S (w5[3], w5[2], selector); + c3[3] = __byte_perm_S (w5[2], w5[1], selector); + c3[2] = __byte_perm_S (w5[1], w5[0], selector); + c3[1] = __byte_perm_S (w5[0], w4[3], selector); + c3[0] = __byte_perm_S (w4[3], w4[2], selector); + c2[3] = __byte_perm_S (w4[2], w4[1], selector); + c2[2] = __byte_perm_S (w4[1], w4[0], selector); + c2[1] = __byte_perm_S (w4[0], w3[3], selector); + c2[0] = __byte_perm_S (w3[3], w3[2], selector); + c1[3] = __byte_perm_S (w3[2], w3[1], selector); + c1[2] = __byte_perm_S (w3[1], w3[0], selector); + c1[1] = __byte_perm_S (w3[0], w2[3], selector); + c1[0] = __byte_perm_S (w2[3], w2[2], selector); + c0[3] = __byte_perm_S (w2[2], w2[1], selector); + c0[2] = __byte_perm_S (w2[1], w2[0], selector); + c0[1] = __byte_perm_S (w2[0], w1[3], selector); + c0[0] = __byte_perm_S (w1[3], w1[2], selector); w7[3] = __byte_perm_S (w1[2], w1[1], selector); w7[2] = __byte_perm_S (w1[1], w1[0], selector); w7[1] = __byte_perm_S (w1[0], w0[3], selector); @@ -13681,6 +17408,33 @@ inline void switch_buffer_by_offset_8x4_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u break; case 26: + c6[2] = __byte_perm_S ( 0, w7[3], selector); + c6[1] = __byte_perm_S (w7[3], w7[2], selector); + c6[0] = __byte_perm_S (w7[2], w7[1], selector); + c5[3] = __byte_perm_S (w7[1], w7[0], selector); + c5[2] = __byte_perm_S (w7[0], w6[3], selector); + c5[1] = __byte_perm_S (w6[3], w6[2], selector); + c5[0] = __byte_perm_S (w6[2], w6[1], selector); + c4[3] = __byte_perm_S (w6[1], w6[0], selector); + c4[2] = __byte_perm_S (w6[0], w5[3], selector); + c4[1] = __byte_perm_S (w5[3], w5[2], selector); + c4[0] = __byte_perm_S (w5[2], w5[1], selector); + c3[3] = __byte_perm_S (w5[1], w5[0], selector); + c3[2] = __byte_perm_S (w5[0], w4[3], selector); + c3[1] = __byte_perm_S (w4[3], w4[2], selector); + c3[0] = __byte_perm_S (w4[2], w4[1], selector); + c2[3] = __byte_perm_S (w4[1], w4[0], selector); + c2[2] = __byte_perm_S (w4[0], w3[3], selector); + c2[1] = __byte_perm_S (w3[3], w3[2], selector); + c2[0] = __byte_perm_S (w3[2], w3[1], selector); + c1[3] = __byte_perm_S (w3[1], w3[0], selector); + c1[2] = __byte_perm_S (w3[0], w2[3], selector); + c1[1] = __byte_perm_S (w2[3], w2[2], selector); + c1[0] = __byte_perm_S (w2[2], w2[1], selector); + c0[3] = __byte_perm_S (w2[1], w2[0], selector); + c0[2] = __byte_perm_S (w2[0], w1[3], selector); + c0[1] = __byte_perm_S (w1[3], w1[2], selector); + c0[0] = __byte_perm_S (w1[2], w1[1], selector); w7[3] = __byte_perm_S (w1[1], w1[0], selector); w7[2] = __byte_perm_S (w1[0], w0[3], selector); w7[1] = __byte_perm_S (w0[3], w0[2], selector); @@ -13717,6 +17471,34 @@ inline void switch_buffer_by_offset_8x4_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u break; case 27: + c6[3] = __byte_perm_S ( 0, w7[3], selector); + c6[2] = __byte_perm_S (w7[3], w7[2], selector); + c6[1] = __byte_perm_S (w7[2], w7[1], selector); + c6[0] = __byte_perm_S (w7[1], w7[0], selector); + c5[3] = __byte_perm_S (w7[0], w6[3], selector); + c5[2] = __byte_perm_S (w6[3], w6[2], selector); + c5[1] = __byte_perm_S (w6[2], w6[1], selector); + c5[0] = __byte_perm_S (w6[1], w6[0], selector); + c4[3] = __byte_perm_S (w6[0], w5[3], selector); + c4[2] = __byte_perm_S (w5[3], w5[2], selector); + c4[1] = __byte_perm_S (w5[2], w5[1], selector); + c4[0] = __byte_perm_S (w5[1], w5[0], selector); + c3[3] = __byte_perm_S (w5[0], w4[3], selector); + c3[2] = __byte_perm_S (w4[3], w4[2], selector); + c3[1] = __byte_perm_S (w4[2], w4[1], selector); + c3[0] = __byte_perm_S (w4[1], w4[0], selector); + c2[3] = __byte_perm_S (w4[0], w3[3], selector); + c2[2] = __byte_perm_S (w3[3], w3[2], selector); + c2[1] = __byte_perm_S (w3[2], w3[1], selector); + c2[0] = __byte_perm_S (w3[1], w3[0], selector); + c1[3] = __byte_perm_S (w3[0], w2[3], selector); + c1[2] = __byte_perm_S (w2[3], w2[2], selector); + c1[1] = __byte_perm_S (w2[2], w2[1], selector); + c1[0] = __byte_perm_S (w2[1], w2[0], selector); + c0[3] = __byte_perm_S (w2[0], w1[3], selector); + c0[2] = __byte_perm_S (w1[3], w1[2], selector); + c0[1] = __byte_perm_S (w1[2], w1[1], selector); + c0[0] = __byte_perm_S (w1[1], w1[0], selector); w7[3] = __byte_perm_S (w1[0], w0[3], selector); w7[2] = __byte_perm_S (w0[3], w0[2], selector); w7[1] = __byte_perm_S (w0[2], w0[1], selector); @@ -13753,6 +17535,35 @@ inline void switch_buffer_by_offset_8x4_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u break; case 28: + c7[0] = __byte_perm_S ( 0, w7[3], selector); + c6[3] = __byte_perm_S (w7[3], w7[2], selector); + c6[2] = __byte_perm_S (w7[2], w7[1], selector); + c6[1] = __byte_perm_S (w7[1], w7[0], selector); + c6[0] = __byte_perm_S (w7[0], w6[3], selector); + c5[3] = __byte_perm_S (w6[3], w6[2], selector); + c5[2] = __byte_perm_S (w6[2], w6[1], selector); + c5[1] = __byte_perm_S (w6[1], w6[0], selector); + c5[0] = __byte_perm_S (w6[0], w5[3], selector); + c4[3] = __byte_perm_S (w5[3], w5[2], selector); + c4[2] = __byte_perm_S (w5[2], w5[1], selector); + c4[1] = __byte_perm_S (w5[1], w5[0], selector); + c4[0] = __byte_perm_S (w5[0], w4[3], selector); + c3[3] = __byte_perm_S (w4[3], w4[2], selector); + c3[2] = __byte_perm_S (w4[2], w4[1], selector); + c3[1] = __byte_perm_S (w4[1], w4[0], selector); + c3[0] = __byte_perm_S (w4[0], w3[3], selector); + c2[3] = __byte_perm_S (w3[3], w3[2], selector); + c2[2] = __byte_perm_S (w3[2], w3[1], selector); + c2[1] = __byte_perm_S (w3[1], w3[0], selector); + c2[0] = __byte_perm_S (w3[0], w2[3], selector); + c1[3] = __byte_perm_S (w2[3], w2[2], selector); + c1[2] = __byte_perm_S (w2[2], w2[1], selector); + c1[1] = __byte_perm_S (w2[1], w2[0], selector); + c1[0] = __byte_perm_S (w2[0], w1[3], selector); + c0[3] = __byte_perm_S (w1[3], w1[2], selector); + c0[2] = __byte_perm_S (w1[2], w1[1], selector); + c0[1] = __byte_perm_S (w1[1], w1[0], selector); + c0[0] = __byte_perm_S (w1[0], w0[3], selector); w7[3] = __byte_perm_S (w0[3], w0[2], selector); w7[2] = __byte_perm_S (w0[2], w0[1], selector); w7[1] = __byte_perm_S (w0[1], w0[0], selector); @@ -13789,6 +17600,36 @@ inline void switch_buffer_by_offset_8x4_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u break; case 29: + c7[1] = __byte_perm_S ( 0, w7[3], selector); + c7[0] = __byte_perm_S (w7[3], w7[2], selector); + c6[3] = __byte_perm_S (w7[2], w7[1], selector); + c6[2] = __byte_perm_S (w7[1], w7[0], selector); + c6[1] = __byte_perm_S (w7[0], w6[3], selector); + c6[0] = __byte_perm_S (w6[3], w6[2], selector); + c5[3] = __byte_perm_S (w6[2], w6[1], selector); + c5[2] = __byte_perm_S (w6[1], w6[0], selector); + c5[1] = __byte_perm_S (w6[0], w5[3], selector); + c5[0] = __byte_perm_S (w5[3], w5[2], selector); + c4[3] = __byte_perm_S (w5[2], w5[1], selector); + c4[2] = __byte_perm_S (w5[1], w5[0], selector); + c4[1] = __byte_perm_S (w5[0], w4[3], selector); + c4[0] = __byte_perm_S (w4[3], w4[2], selector); + c3[3] = __byte_perm_S (w4[2], w4[1], selector); + c3[2] = __byte_perm_S (w4[1], w4[0], selector); + c3[1] = __byte_perm_S (w4[0], w3[3], selector); + c3[0] = __byte_perm_S (w3[3], w3[2], selector); + c2[3] = __byte_perm_S (w3[2], w3[1], selector); + c2[2] = __byte_perm_S (w3[1], w3[0], selector); + c2[1] = __byte_perm_S (w3[0], w2[3], selector); + c2[0] = __byte_perm_S (w2[3], w2[2], selector); + c1[3] = __byte_perm_S (w2[2], w2[1], selector); + c1[2] = __byte_perm_S (w2[1], w2[0], selector); + c1[1] = __byte_perm_S (w2[0], w1[3], selector); + c1[0] = __byte_perm_S (w1[3], w1[2], selector); + c0[3] = __byte_perm_S (w1[2], w1[1], selector); + c0[2] = __byte_perm_S (w1[1], w1[0], selector); + c0[1] = __byte_perm_S (w1[0], w0[3], selector); + c0[0] = __byte_perm_S (w0[3], w0[2], selector); w7[3] = __byte_perm_S (w0[2], w0[1], selector); w7[2] = __byte_perm_S (w0[1], w0[0], selector); w7[1] = __byte_perm_S (w0[0], 0, selector); @@ -13825,6 +17666,37 @@ inline void switch_buffer_by_offset_8x4_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u break; case 30: + c7[2] = __byte_perm_S ( 0, w7[3], selector); + c7[1] = __byte_perm_S (w7[3], w7[2], selector); + c7[0] = __byte_perm_S (w7[2], w7[1], selector); + c6[3] = __byte_perm_S (w7[1], w7[0], selector); + c6[2] = __byte_perm_S (w7[0], w6[3], selector); + c6[1] = __byte_perm_S (w6[3], w6[2], selector); + c6[0] = __byte_perm_S (w6[2], w6[1], selector); + c5[3] = __byte_perm_S (w6[1], w6[0], selector); + c5[2] = __byte_perm_S (w6[0], w5[3], selector); + c5[1] = __byte_perm_S (w5[3], w5[2], selector); + c5[0] = __byte_perm_S (w5[2], w5[1], selector); + c4[3] = __byte_perm_S (w5[1], w5[0], selector); + c4[2] = __byte_perm_S (w5[0], w4[3], selector); + c4[1] = __byte_perm_S (w4[3], w4[2], selector); + c4[0] = __byte_perm_S (w4[2], w4[1], selector); + c3[3] = __byte_perm_S (w4[1], w4[0], selector); + c3[2] = __byte_perm_S (w4[0], w3[3], selector); + c3[1] = __byte_perm_S (w3[3], w3[2], selector); + c3[0] = __byte_perm_S (w3[2], w3[1], selector); + c2[3] = __byte_perm_S (w3[1], w3[0], selector); + c2[2] = __byte_perm_S (w3[0], w2[3], selector); + c2[1] = __byte_perm_S (w2[3], w2[2], selector); + c2[0] = __byte_perm_S (w2[2], w2[1], selector); + c1[3] = __byte_perm_S (w2[1], w2[0], selector); + c1[2] = __byte_perm_S (w2[0], w1[3], selector); + c1[1] = __byte_perm_S (w1[3], w1[2], selector); + c1[0] = __byte_perm_S (w1[2], w1[1], selector); + c0[3] = __byte_perm_S (w1[1], w1[0], selector); + c0[2] = __byte_perm_S (w1[0], w0[3], selector); + c0[1] = __byte_perm_S (w0[3], w0[2], selector); + c0[0] = __byte_perm_S (w0[2], w0[1], selector); w7[3] = __byte_perm_S (w0[1], w0[0], selector); w7[2] = __byte_perm_S (w0[0], 0, selector); w7[1] = 0; @@ -13861,6 +17733,38 @@ inline void switch_buffer_by_offset_8x4_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u break; case 31: + c7[3] = __byte_perm_S ( 0, w7[3], selector); + c7[2] = __byte_perm_S (w7[3], w7[2], selector); + c7[1] = __byte_perm_S (w7[2], w7[1], selector); + c7[0] = __byte_perm_S (w7[1], w7[0], selector); + c6[3] = __byte_perm_S (w7[0], w6[3], selector); + c6[2] = __byte_perm_S (w6[3], w6[2], selector); + c6[1] = __byte_perm_S (w6[2], w6[1], selector); + c6[0] = __byte_perm_S (w6[1], w6[0], selector); + c5[3] = __byte_perm_S (w6[0], w5[3], selector); + c5[2] = __byte_perm_S (w5[3], w5[2], selector); + c5[1] = __byte_perm_S (w5[2], w5[1], selector); + c5[0] = __byte_perm_S (w5[1], w5[0], selector); + c4[3] = __byte_perm_S (w5[0], w4[3], selector); + c4[2] = __byte_perm_S (w4[3], w4[2], selector); + c4[1] = __byte_perm_S (w4[2], w4[1], selector); + c4[0] = __byte_perm_S (w4[1], w4[0], selector); + c3[3] = __byte_perm_S (w4[0], w3[3], selector); + c3[2] = __byte_perm_S (w3[3], w3[2], selector); + c3[1] = __byte_perm_S (w3[2], w3[1], selector); + c3[0] = __byte_perm_S (w3[1], w3[0], selector); + c2[3] = __byte_perm_S (w3[0], w2[3], selector); + c2[2] = __byte_perm_S (w2[3], w2[2], selector); + c2[1] = __byte_perm_S (w2[2], w2[1], selector); + c2[0] = __byte_perm_S (w2[1], w2[0], selector); + c1[3] = __byte_perm_S (w2[0], w1[3], selector); + c1[2] = __byte_perm_S (w1[3], w1[2], selector); + c1[1] = __byte_perm_S (w1[2], w1[1], selector); + c1[0] = __byte_perm_S (w1[1], w1[0], selector); + c0[3] = __byte_perm_S (w1[0], w0[3], selector); + c0[2] = __byte_perm_S (w0[3], w0[2], selector); + c0[1] = __byte_perm_S (w0[2], w0[1], selector); + c0[0] = __byte_perm_S (w0[1], w0[0], selector); w7[3] = __byte_perm_S (w0[0], 0, selector); w7[2] = 0; w7[1] = 0;